diff --git a/gradle/java/modules.gradle b/gradle/java/modules.gradle
index 0855423c983..5e334ab2db2 100644
--- a/gradle/java/modules.gradle
+++ b/gradle/java/modules.gradle
@@ -214,7 +214,7 @@ allprojects {
   }
 
   // Configure (tasks.test, sourceSets.test)
-  tasks.matching { it.name == "test" }.all { Test task ->
+  tasks.matching { it.name ==~ /test(_[0-9]+)?/ }.all { Test task ->
     configureTestTaskForSourceSet(task, task.project.sourceSets.test)
   }
 
diff --git a/gradle/validation/rat-sources.gradle b/gradle/validation/rat-sources.gradle
index 3bc0d35c660..4d9759188cb 100644
--- a/gradle/validation/rat-sources.gradle
+++ b/gradle/validation/rat-sources.gradle
@@ -102,6 +102,7 @@ allprojects {
             break
 
           case ":lucene:analysis:common":
+          case ":lucene:analysis.tests":
             exclude "src/**/*.aff"
             exclude "src/**/*.dic"
             exclude "src/**/*.good"
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index a964e732160..8d18a409ca6 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -153,6 +153,12 @@ Bug Fixes
 * LUCENE-10349: Fix all analyzers to behave according to their documentation:
   getDefaultStopSet() methods now return unmodifiable CharArraySets.
   (Uwe Schindler)
 
+* LUCENE-10352: Add missing service provider entries: KoreanNumberFilterFactory,
+  DaitchMokotoffSoundexFilterFactory (Uwe Schindler, Robert Muir)
+
+* LUCENE-10352: Fixed ctor argument checks: JapaneseKatakanaStemFilter,
+  DoubleMetaphoneFilter (Uwe Schindler, Robert Muir)
+
 Other
 ---------------------
 
@@ -163,6 +169,13 @@ Other
 * LUCENE-10310: TestXYDocValuesQueries#doRandomDistanceTest does not produce
   random circles with radius with '0' value any longer.
 
+* LUCENE-10352: Removed duplicate instances of StringMockResourceLoader and migrated class to
+  test-framework. (Uwe Schindler, Robert Muir)
+
+* LUCENE-10352: Convert TestAllAnalyzersHaveFactories and TestRandomChains to a global integration test
+  and discover classes to check from module system. The test now checks all analyzer modules,
+  so it may discover new bugs outside of analysis:common module. (Uwe Schindler, Robert Muir)
+
 ======================= Lucene 9.0.0 =======================
 
 New Features
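The two Bug Fixes entries above exist because analysis factories are discovered at runtime through the JDK service-provider mechanism: a factory that is not listed in `META-INF/services` is silently invisible to name-based lookup. A minimal sketch of that lookup path (not part of this patch; it assumes the nori module is on the classpath and that `koreanNumber` is the factory's SPI name):

```java
import java.util.HashMap;
import org.apache.lucene.analysis.TokenFilterFactory;

public class SpiLookupSketch {
  public static void main(String[] args) {
    // Before this patch, a lookup like the one below failed with an
    // IllegalArgumentException, because KoreanNumberFilterFactory was missing
    // from META-INF/services/org.apache.lucene.analysis.TokenFilterFactory.
    // The args map must be mutable: factories consume entries from it.
    TokenFilterFactory factory = TokenFilterFactory.forName("koreanNumber", new HashMap<>());
    System.out.println(factory.getClass().getName());
  }
}
```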
diff --git a/lucene/analysis.tests/build.gradle b/lucene/analysis.tests/build.gradle
new file mode 100644
index 00000000000..be1c51fecac
--- /dev/null
+++ b/lucene/analysis.tests/build.gradle
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+apply plugin: 'java-library'
+
+description = 'Module integration tests for all :lucene:analysis modules'
+
+dependencies {
+  moduleTestImplementation project(':lucene:analysis:common')
+  moduleTestImplementation project(':lucene:analysis:icu')
+  moduleTestImplementation project(':lucene:analysis:kuromoji')
+  moduleTestImplementation project(':lucene:analysis:morfologik')
+  moduleTestImplementation project(':lucene:analysis:nori')
+  moduleTestImplementation project(':lucene:analysis:opennlp')
+  moduleTestImplementation project(':lucene:analysis:phonetic')
+  moduleTestImplementation project(':lucene:analysis:smartcn')
+  moduleTestImplementation project(':lucene:analysis:stempel')
+  moduleTestImplementation project(':lucene:test-framework')
+}
diff --git a/lucene/analysis.tests/src/test/module-info.java b/lucene/analysis.tests/src/test/module-info.java
new file mode 100644
index 00000000000..502611624a0
--- /dev/null
+++ b/lucene/analysis.tests/src/test/module-info.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Test module for global integration tests of all {@code org.apache.lucene.analysis}
+ * packages/modules.
+ */
+@SuppressWarnings({"requires-automatic"})
+module org.apache.lucene.analysis.tests {
+  requires java.xml;
+  requires org.apache.lucene.core;
+  requires org.apache.lucene.analysis.common;
+  requires org.apache.lucene.analysis.icu;
+  requires org.apache.lucene.analysis.kuromoji;
+  requires org.apache.lucene.analysis.morfologik;
+  requires org.apache.lucene.analysis.nori;
+  requires org.apache.lucene.analysis.opennlp;
+  requires org.apache.lucene.analysis.phonetic;
+  requires org.apache.lucene.analysis.smartcn;
+  requires org.apache.lucene.analysis.stempel;
+  requires org.apache.lucene.test_framework;
+  requires junit;
+
+  exports org.apache.lucene.analysis.tests;
+}
diff --git a/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/ModuleClassDiscovery.java b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/ModuleClassDiscovery.java
new file mode 100644
index 00000000000..28b90e58691
--- /dev/null
+++ b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/ModuleClassDiscovery.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.tests;
+
+import java.io.IOException;
+import java.lang.module.ResolvedModule;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.function.Predicate;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.junit.Assert;
+
+/** Discovers all classes from the module graph and loads them (without initialization) */
+abstract class ModuleClassDiscovery {
+
+  private static final Module THIS_MODULE = ModuleClassDiscovery.class.getModule();
+  private static final ModuleLayer LAYER = THIS_MODULE.getLayer();
+  private static final SortedMap<String, ResolvedModule> ALL_ANALYSIS_MODULES;
+
+  private static final Predicate<String> ALLOW_MODULES =
+      name ->
+          name.equals("org.apache.lucene.core") || name.startsWith("org.apache.lucene.analysis.");
+
+  static {
+    Assert.assertTrue(
+        "Analysis integration tests must run in Java Module System as named module",
+        THIS_MODULE.isNamed());
+    Assert.assertNotNull("Module layer is missing", LAYER);
+
+    var mods = new TreeMap<String, ResolvedModule>();
+    discoverAnalysisModules(LAYER, mods);
+    ALL_ANALYSIS_MODULES = Collections.unmodifiableSortedMap(mods);
+    if (LuceneTestCase.VERBOSE) {
+      System.out.println(
+          "Discovered the following analysis modules: " + ALL_ANALYSIS_MODULES.keySet());
+    }
+  }
+
+  private static void discoverAnalysisModules(
+      ModuleLayer layer, Map<String, ResolvedModule> result) {
+    for (var mod : layer.configuration().modules()) {
+      String name = mod.name();
+      if (ALLOW_MODULES.test(name) && !Objects.equals(name, THIS_MODULE.getName())) {
+        result.put(name, mod);
+      }
+    }
+    for (var parent : layer.parents()) {
+      discoverAnalysisModules(parent, result);
+    }
+  }
+
+  /** Finds all classes in package across all analysis modules */
+  public static List<Class<?>> getClassesForPackage(String pkgname) throws IOException {
+    final var prefix = pkgname.concat(".");
+    final var classes = new ArrayList<Class<?>>();
+    for (var resolvedModule : ALL_ANALYSIS_MODULES.values()) {
+      final var module = LAYER.findModule(resolvedModule.name()).orElseThrow();
+      try (var reader = resolvedModule.reference().open()) {
+        reader
+            .list()
+            .filter(entry -> entry.endsWith(".class"))
+            .map(entry -> entry.substring(0, entry.length() - 6).replace('/', '.'))
+            .filter(clazzname -> clazzname.startsWith(prefix))
+            .sorted()
+            .map(
+                clazzname ->
+                    Objects.requireNonNull(
+                        Class.forName(module, clazzname),
+                        "Class '"
+                            + clazzname
+                            + "' not found in module '"
+                            + module.getName()
+                            + "'"))
+            .forEach(classes::add);
+      }
+    }
+    Assert.assertFalse("No classes found in package:" + pkgname, classes.isEmpty());
+    return classes;
+  }
+}
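ModuleClassDiscovery walks the module layer's configuration instead of scanning the classpath, which is what lets one test see classes from every analysis module. A stripped-down sketch of the same idea (illustrative names, not the test's code; it assumes the class runs as a named module, since `getLayer()` returns null for class-path code):

```java
import java.io.IOException;
import java.lang.module.ResolvedModule;

public class ListModuleClasses {
  public static void main(String[] args) throws IOException {
    // Only works for named modules; on the class path getLayer() is null.
    ModuleLayer layer = ListModuleClasses.class.getModule().getLayer();
    for (ResolvedModule mod : layer.configuration().modules()) {
      // A ModuleReader lists every resource in a module; *.class entries
      // translate 1:1 into loadable binary class names.
      try (var reader = mod.reference().open()) {
        reader
            .list()
            .filter(entry -> entry.endsWith(".class"))
            .map(entry -> entry.substring(0, entry.length() - 6).replace('/', '.'))
            .forEach(name -> System.out.println(mod.name() + ": " + name));
      }
    }
  }
}
```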
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestAllAnalyzersHaveFactories.java
similarity index 69%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
rename to lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestAllAnalyzersHaveFactories.java
index 945177b69af..c7df6e16f93 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
+++ b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestAllAnalyzersHaveFactories.java
@@ -14,15 +14,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.analysis.core;
+package org.apache.lucene.analysis.tests;
 
 import java.io.Reader;
 import java.io.StringReader;
 import java.lang.reflect.Modifier;
-import java.util.Collections;
 import java.util.HashMap;
-import java.util.HashSet;
-import java.util.IdentityHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -34,27 +31,17 @@ import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.TokenizerFactory;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer;
 import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
 import org.apache.lucene.analysis.sinks.TeeSinkTokenFilter;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.sr.SerbianNormalizationRegularFilter;
-import org.apache.lucene.analysis.util.StringMockResourceLoader;
-import org.apache.lucene.tests.analysis.CrankyTokenFilter;
-import org.apache.lucene.tests.analysis.MockCharFilter;
-import org.apache.lucene.tests.analysis.MockFixedLengthPayloadFilter;
-import org.apache.lucene.tests.analysis.MockGraphTokenFilter;
-import org.apache.lucene.tests.analysis.MockHoleInjectingTokenFilter;
-import org.apache.lucene.tests.analysis.MockLowerCaseFilter;
-import org.apache.lucene.tests.analysis.MockRandomLookaheadTokenFilter;
-import org.apache.lucene.tests.analysis.MockSynonymFilter;
-import org.apache.lucene.tests.analysis.MockTokenFilter;
-import org.apache.lucene.tests.analysis.MockTokenizer;
-import org.apache.lucene.tests.analysis.MockVariableLengthPayloadFilter;
-import org.apache.lucene.tests.analysis.SimplePayloadFilter;
-import org.apache.lucene.tests.analysis.ValidatingTokenFilter;
+import org.apache.lucene.analysis.stempel.StempelFilter;
 import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
 import org.apache.lucene.util.ResourceLoader;
 import org.apache.lucene.util.ResourceLoaderAware;
 import org.apache.lucene.util.Version;
@@ -65,71 +52,37 @@ import org.apache.lucene.util.Version;
  */
 public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
 
-  // these are test-only components (e.g. test-framework)
-  private static final Set<Class<?>> testComponents =
-      Collections.newSetFromMap(new IdentityHashMap<Class<?>, Boolean>());
-
-  static {
-    Collections.<Class<?>>addAll(
-        testComponents,
-        MockTokenizer.class,
-        MockCharFilter.class,
-        MockFixedLengthPayloadFilter.class,
-        MockGraphTokenFilter.class,
-        MockHoleInjectingTokenFilter.class,
-        MockLowerCaseFilter.class,
-        MockRandomLookaheadTokenFilter.class,
-        MockSynonymFilter.class,
-        MockTokenFilter.class,
-        MockVariableLengthPayloadFilter.class,
-        ValidatingTokenFilter.class,
-        CrankyTokenFilter.class,
-        SimplePayloadFilter.class);
-  }
-
   // these are 'crazy' components like cachingtokenfilter. does it make sense to add factories for
   // these?
   private static final Set<Class<?>> crazyComponents =
-      Collections.newSetFromMap(new IdentityHashMap<Class<?>, Boolean>());
-
-  static {
-    Collections.<Class<?>>addAll(
-        crazyComponents, CachingTokenFilter.class, TeeSinkTokenFilter.class);
-  }
+      Set.of(CachingTokenFilter.class, TeeSinkTokenFilter.class);
 
   // these are oddly-named (either the actual analyzer, or its factory)
   // they do actually have factories.
   // TODO: clean this up!
   private static final Set<Class<?>> oddlyNamedComponents =
-      Collections.newSetFromMap(new IdentityHashMap<Class<?>, Boolean>());
-
-  static {
-    Collections.<Class<?>>addAll(
-        oddlyNamedComponents,
-        // this is supported via an option to PathHierarchyTokenizer's factory
-        ReversePathHierarchyTokenizer.class,
-        SnowballFilter.class, // this is called SnowballPorterFilterFactory
-        PatternKeywordMarkerFilter.class,
-        SetKeywordMarkerFilter.class,
-        UnicodeWhitespaceTokenizer.class, // a supported option via WhitespaceTokenizerFactory
-        // class from core, but StopFilterFactory creates one from this module
-        org.apache.lucene.analysis.StopFilter.class,
-        // class from core, but LowerCaseFilterFactory creates one from this module
-        org.apache.lucene.analysis.LowerCaseFilter.class);
-  }
+      Set.of(
+          // this is supported via an option to PathHierarchyTokenizer's factory
+          ReversePathHierarchyTokenizer.class,
+          SnowballFilter.class, // this is called SnowballPorterFilterFactory
+          StempelFilter.class, // this is called StempelPolishStemFilterFactory
+          PatternKeywordMarkerFilter.class,
+          SetKeywordMarkerFilter.class,
+          UnicodeWhitespaceTokenizer.class, // a supported option via WhitespaceTokenizerFactory
+          // class from core, but StopFilterFactory creates one from this module
+          org.apache.lucene.analysis.StopFilter.class,
+          // class from core, but LowerCaseFilterFactory creates one from this module
+          org.apache.lucene.analysis.LowerCaseFilter.class);
 
   // The following token filters are excused from having their factory.
-  private static final Set<Class<?>> tokenFiltersWithoutFactory = new HashSet<>();
-
-  static {
-    tokenFiltersWithoutFactory.add(SerbianNormalizationRegularFilter.class);
-  }
+  private static final Set<Class<?>> tokenFiltersWithoutFactory =
+      Set.of(SerbianNormalizationRegularFilter.class);
 
   private static final ResourceLoader loader = new StringMockResourceLoader("");
 
   public void test() throws Exception {
     List<Class<?>> analysisClasses =
-        TestRandomChains.getClassesForPackage("org.apache.lucene.analysis");
+        ModuleClassDiscovery.getClassesForPackage("org.apache.lucene.analysis");
 
     for (final Class<?> c : analysisClasses) {
       final int modifiers = c.getModifiers();
@@ -141,7 +94,6 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
           || c.isAnonymousClass()
           || c.isMemberClass()
           || c.isInterface()
-          || testComponents.contains(c)
           || crazyComponents.contains(c)
           || oddlyNamedComponents.contains(c)
           || tokenFiltersWithoutFactory.contains(c)
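The test above approaches the invariant from the class side: every concrete analysis component must be reachable through some factory. The factory side of the same SPI registry can be inspected directly; an illustrative loop, not part of the patch:

```java
import org.apache.lucene.analysis.TokenFilterFactory;

public class ListTokenFilterFactories {
  public static void main(String[] args) {
    // availableTokenFilters() reflects exactly the META-INF/services entries
    // visible at runtime, which is why the entries fixed by this patch also
    // made their components invisible here.
    for (String name : TokenFilterFactory.availableTokenFilters()) {
      System.out.println(name + " -> " + TokenFilterFactory.lookupClass(name).getName());
    }
  }
}
```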
diff --git a/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestRandomChains.java b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestRandomChains.java
new file mode 100644
index 00000000000..208c882532c
--- /dev/null
+++ b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestRandomChains.java
@@ -0,0 +1,961 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.tests;
+
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.Transliterator;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Modifier;
+import java.text.DateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.IdentityHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import org.apache.commons.codec.Encoder;
+import org.apache.commons.codec.language.Caverphone2;
+import org.apache.commons.codec.language.ColognePhonetic;
+import org.apache.commons.codec.language.DoubleMetaphone;
+import org.apache.commons.codec.language.Metaphone;
+import org.apache.commons.codec.language.Nysiis;
+import org.apache.commons.codec.language.RefinedSoundex;
+import org.apache.commons.codec.language.Soundex;
+import org.apache.commons.codec.language.bm.PhoneticEngine;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
+import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
+import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+import org.apache.lucene.analysis.core.FlattenGraphFilter;
+import org.apache.lucene.analysis.hunspell.Dictionary;
+import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
+import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerConfig;
+import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
+import org.apache.lucene.analysis.ja.JapaneseTokenizer;
+import org.apache.lucene.analysis.ko.KoreanTokenizer;
+import org.apache.lucene.analysis.minhash.MinHashFilter;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
+import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
+import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
+import org.apache.lucene.analysis.pattern.PatternTypingFilter;
+import org.apache.lucene.analysis.payloads.IdentityEncoder;
+import org.apache.lucene.analysis.payloads.PayloadEncoder;
+import org.apache.lucene.analysis.pl.PolishAnalyzer;
+import org.apache.lucene.analysis.shingle.FixedShingleFilter;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.stempel.StempelStemmer;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.store.ByteBuffersDirectory;
+import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.analysis.MockTokenFilter;
+import org.apache.lucene.tests.analysis.MockTokenizer;
+import org.apache.lucene.tests.analysis.ValidatingTokenFilter;
+import org.apache.lucene.tests.util.Rethrow;
+import org.apache.lucene.tests.util.TestUtil;
+import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IgnoreRandomChains;
+import org.apache.lucene.util.Version;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.RegExp;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.tartarus.snowball.SnowballStemmer;
+import org.xml.sax.InputSource;
+
+/** tests random analysis chains */
+public class TestRandomChains extends BaseTokenStreamTestCase {
+
+  static List<Constructor<? extends Tokenizer>> tokenizers;
+  static List<Constructor<? extends TokenFilter>> tokenfilters;
+  static List<Constructor<? extends CharFilter>> charfilters;
+
+  static List<Class<? extends SnowballStemmer>> snowballStemmers;
+
+  private static final Set<Class<?>> avoidConditionals =
+      Set.of(
+          FingerprintFilter.class,
+          MinHashFilter.class,
+          ConcatenateGraphFilter.class,
+          // ShingleFilter doesn't handle input graphs correctly, so wrapping it in a condition can
+          // expose inconsistent offsets
+          // https://issues.apache.org/jira/browse/LUCENE-4170
+          ShingleFilter.class,
+          FixedShingleFilter.class,
+          // FlattenGraphFilter changes the output graph entirely, so wrapping it in a condition
+          // can break position lengths
+          FlattenGraphFilter.class,
+          // LimitToken*Filters don't set end offsets correctly
+          LimitTokenOffsetFilter.class,
+          LimitTokenCountFilter.class,
+          LimitTokenPositionFilter.class);
+
+  private static final Map<Constructor<?>, Predicate<Object[]>> brokenConstructors;
+
+  static {
+    try {
+      final Map<Constructor<?>, Predicate<Object[]>> map = new HashMap<>();
+      // LimitToken*Filter can only use special ctor when last arg is true
+      for (final var c :
+          List.of(
+              LimitTokenCountFilter.class,
+              LimitTokenOffsetFilter.class,
+              LimitTokenPositionFilter.class)) {
+        map.put(
+            c.getConstructor(TokenStream.class, int.class, boolean.class),
+            args -> {
+              assert args.length == 3;
+              return false == ((Boolean) args[2]); // args are broken if consumeAllTokens is false
+            });
+      }
+      brokenConstructors = Collections.unmodifiableMap(map);
+    } catch (Exception e) {
+      throw new Error(e);
+    }
+  }
+
+  private static final Map<Class<?>, Function<Random, Object>> argProducers =
+      Collections.unmodifiableMap(
+          new IdentityHashMap<Class<?>, Function<Random, Object>>() {
+            {
+              put(
+                  int.class,
+                  random -> {
+                    // TODO: could cause huge ram usage to use full int range for some filters
+                    // (e.g. allocate enormous arrays)
+                    // return Integer.valueOf(random.nextInt());
+                    return Integer.valueOf(TestUtil.nextInt(random, -50, 50));
+                  });
+              put(
+                  char.class,
+                  random -> {
+                    // TODO: fix any filters that care to throw IAE instead.
+                    // also add a unicode validating filter to validate termAtt?
+ // return Character.valueOf((char)random.nextInt(65536)); + while (true) { + char c = (char) random.nextInt(65536); + if (c < '\uD800' || c > '\uDFFF') { + return Character.valueOf(c); + } + } + }); + put(float.class, Random::nextFloat); + put(boolean.class, Random::nextBoolean); + put(byte.class, random -> (byte) random.nextInt(256)); + put( + byte[].class, + random -> { + byte[] bytes = new byte[random.nextInt(256)]; + random.nextBytes(bytes); + return bytes; + }); + put(Random.class, random -> new Random(random.nextLong())); + put(Version.class, random -> Version.LATEST); + put(AttributeFactory.class, BaseTokenStreamTestCase::newAttributeFactory); + put(AttributeSource.class, random -> null); // force IAE/NPE + put( + Set.class, + random -> { + // TypeTokenFilter + Set set = new HashSet<>(); + int num = random.nextInt(5); + for (int i = 0; i < num; i++) { + set.add( + StandardTokenizer.TOKEN_TYPES[ + random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]); + } + return set; + }); + put( + Collection.class, + random -> { + // CapitalizationFilter + Collection col = new ArrayList<>(); + int num = random.nextInt(5); + for (int i = 0; i < num; i++) { + col.add(TestUtil.randomSimpleString(random).toCharArray()); + } + return col; + }); + put( + CharArraySet.class, + random -> { + int num = random.nextInt(10); + CharArraySet set = new CharArraySet(num, random.nextBoolean()); + for (int i = 0; i < num; i++) { + // TODO: make nastier + set.add(TestUtil.randomSimpleString(random)); + } + return set; + }); + // TODO: don't want to make the exponentially slow ones Dawid documents + // in TestPatternReplaceFilter, so dont use truly random patterns (for now) + put(Pattern.class, random -> Pattern.compile("a")); + put( + Pattern[].class, + random -> + new Pattern[] {Pattern.compile("([a-z]+)"), Pattern.compile("([0-9]+)")}); + put( + PayloadEncoder.class, + random -> + new IdentityEncoder()); // the other encoders will throw exceptions if tokens + // arent numbers? 
+ put( + Dictionary.class, + random -> { + // TODO: make nastier + InputStream affixStream = + TestRandomChains.class.getResourceAsStream("simple.aff"); + InputStream dictStream = + TestRandomChains.class.getResourceAsStream("simple.dic"); + try { + return new Dictionary( + new ByteBuffersDirectory(), "dictionary", affixStream, dictStream); + } catch (Exception ex) { + Rethrow.rethrow(ex); + return null; // unreachable code + } + }); + put( + HyphenationTree.class, + random -> { + // TODO: make nastier + try { + InputSource is = + new InputSource( + TestRandomChains.class.getResource("da_UTF8.xml").toExternalForm()); + HyphenationTree hyphenator = + HyphenationCompoundWordTokenFilter.getHyphenationTree(is); + return hyphenator; + } catch (Exception ex) { + Rethrow.rethrow(ex); + return null; // unreachable code + } + }); + put( + SnowballStemmer.class, + random -> { + try { + var clazz = snowballStemmers.get(random.nextInt(snowballStemmers.size())); + return clazz.getConstructor().newInstance(); + } catch (Exception ex) { + Rethrow.rethrow(ex); + return null; // unreachable code + } + }); + put( + String.class, + random -> { + // TODO: make nastier + if (random.nextBoolean()) { + // a token type + return StandardTokenizer.TOKEN_TYPES[ + random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]; + } else { + return TestUtil.randomSimpleString(random); + } + }); + put( + NormalizeCharMap.class, + random -> { + NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); + // we can't add duplicate keys, or NormalizeCharMap gets angry + Set keys = new HashSet<>(); + int num = random.nextInt(5); + // System.out.println("NormalizeCharMap="); + for (int i = 0; i < num; i++) { + String key = TestUtil.randomSimpleString(random); + if (!keys.contains(key) && key.length() > 0) { + String value = TestUtil.randomSimpleString(random); + builder.add(key, value); + keys.add(key); + // System.out.println("mapping: '" + key + "' => '" + value + "'"); + } + } + return builder.build(); + }); + put( + CharacterRunAutomaton.class, + random -> { + // TODO: could probably use a purely random automaton + switch (random.nextInt(5)) { + case 0: + return MockTokenizer.KEYWORD; + case 1: + return MockTokenizer.SIMPLE; + case 2: + return MockTokenizer.WHITESPACE; + case 3: + return MockTokenFilter.EMPTY_STOPSET; + default: + return MockTokenFilter.ENGLISH_STOPSET; + } + }); + put( + CharArrayMap.class, + random -> { + int num = random.nextInt(10); + CharArrayMap map = new CharArrayMap<>(num, random.nextBoolean()); + for (int i = 0; i < num; i++) { + // TODO: make nastier + map.put( + TestUtil.randomSimpleString(random), TestUtil.randomSimpleString(random)); + } + return map; + }); + put( + StemmerOverrideMap.class, + random -> { + int num = random.nextInt(10); + StemmerOverrideFilter.Builder builder = + new StemmerOverrideFilter.Builder(random.nextBoolean()); + for (int i = 0; i < num; i++) { + String input = ""; + do { + input = TestUtil.randomRealisticUnicodeString(random); + } while (input.isEmpty()); + String out = ""; + TestUtil.randomSimpleString(random); + do { + out = TestUtil.randomRealisticUnicodeString(random); + } while (out.isEmpty()); + builder.add(input, out); + } + try { + return builder.build(); + } catch (Exception ex) { + Rethrow.rethrow(ex); + return null; // unreachable code + } + }); + put( + SynonymMap.class, + new Function() { + @Override + public Object apply(Random random) { + SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean()); + final int numEntries = atLeast(10); + for 
(int j = 0; j < numEntries; j++) { + addSyn( + b, + randomNonEmptyString(random), + randomNonEmptyString(random), + random.nextBoolean()); + } + try { + return b.build(); + } catch (Exception ex) { + Rethrow.rethrow(ex); + return null; // unreachable code + } + } + + private void addSyn( + SynonymMap.Builder b, String input, String output, boolean keepOrig) { + b.add( + new CharsRef(input.replaceAll(" +", "\u0000")), + new CharsRef(output.replaceAll(" +", "\u0000")), + keepOrig); + } + + private String randomNonEmptyString(Random random) { + while (true) { + final String s = TestUtil.randomUnicodeString(random).trim(); + if (s.length() != 0 && s.indexOf('\u0000') == -1) { + return s; + } + } + } + }); + put( + DateFormat.class, + random -> { + if (random.nextBoolean()) return null; + return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random)); + }); + put( + Automaton.class, + random -> { + return Operations.determinize( + new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE) + .toAutomaton(), + Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); + }); + put( + PatternTypingFilter.PatternTypingRule[].class, + random -> { + int numRules = TestUtil.nextInt(random, 1, 3); + PatternTypingFilter.PatternTypingRule[] patternTypingRules = + new PatternTypingFilter.PatternTypingRule[numRules]; + for (int i = 0; i < patternTypingRules.length; i++) { + String s = TestUtil.randomSimpleString(random, 1, 2); + // random regex with one group + String regex = s + "(.*)"; + // pattern rule with a template that accepts one group. + patternTypingRules[i] = + new PatternTypingFilter.PatternTypingRule( + Pattern.compile(regex), TestUtil.nextInt(random, 1, 8), s + "_$1"); + } + return patternTypingRules; + }); + + // ICU: + put( + Normalizer2.class, + random -> { + switch (random.nextInt(5)) { + case 0: + return Normalizer2.getNFCInstance(); + case 1: + return Normalizer2.getNFDInstance(); + case 2: + return Normalizer2.getNFKCInstance(); + case 3: + return Normalizer2.getNFKDInstance(); + default: + return Normalizer2.getNFKCCasefoldInstance(); + } + }); + final var icuTransliterators = Collections.list(Transliterator.getAvailableIDs()); + Collections.sort(icuTransliterators); + put( + Transliterator.class, + random -> + Transliterator.getInstance( + icuTransliterators.get(random.nextInt(icuTransliterators.size())))); + put( + ICUTokenizerConfig.class, + random -> + new DefaultICUTokenizerConfig(random.nextBoolean(), random.nextBoolean())); + + // Kuromoji: + final var jaComplFilterModes = JapaneseCompletionFilter.Mode.values(); + put( + JapaneseCompletionFilter.Mode.class, + random -> jaComplFilterModes[random.nextInt(jaComplFilterModes.length)]); + final var jaTokModes = JapaneseTokenizer.Mode.values(); + put( + JapaneseTokenizer.Mode.class, + random -> jaTokModes[random.nextInt(jaTokModes.length)]); + put(org.apache.lucene.analysis.ja.dict.UserDictionary.class, random -> null); + + // Nori: + final var koComplFilterModes = KoreanTokenizer.DecompoundMode.values(); + put( + KoreanTokenizer.DecompoundMode.class, + random -> koComplFilterModes[random.nextInt(koComplFilterModes.length)]); + put(org.apache.lucene.analysis.ko.dict.UserDictionary.class, random -> null); + + // Phonetic: + final var bmNameTypes = org.apache.commons.codec.language.bm.NameType.values(); + final var bmRuleTypes = + Stream.of(org.apache.commons.codec.language.bm.RuleType.values()) + .filter(e -> e != org.apache.commons.codec.language.bm.RuleType.RULES) + .toArray(org.apache.commons.codec.language.bm.RuleType[]::new); + 
put( + PhoneticEngine.class, + random -> + new PhoneticEngine( + bmNameTypes[random.nextInt(bmNameTypes.length)], + bmRuleTypes[random.nextInt(bmRuleTypes.length)], + random.nextBoolean())); + put( + Encoder.class, + random -> { + switch (random.nextInt(7)) { + case 0: + return new DoubleMetaphone(); + case 1: + return new Metaphone(); + case 2: + return new Soundex(); + case 3: + return new RefinedSoundex(); + case 4: + return new Caverphone2(); + case 5: + return new ColognePhonetic(); + default: + return new Nysiis(); + } + }); + + // Stempel + put( + StempelStemmer.class, + random -> new StempelStemmer(PolishAnalyzer.getDefaultTable())); + } + }); + + static final Set> allowedTokenizerArgs = argProducers.keySet(), + allowedTokenFilterArgs = + union(argProducers.keySet(), List.of(TokenStream.class, CommonGramsFilter.class)), + allowedCharFilterArgs = union(argProducers.keySet(), List.of(Reader.class)); + + @BeforeClass + public static void beforeClass() throws Exception { + List> analysisClasses = + ModuleClassDiscovery.getClassesForPackage("org.apache.lucene.analysis"); + tokenizers = new ArrayList<>(); + tokenfilters = new ArrayList<>(); + charfilters = new ArrayList<>(); + for (final Class c : analysisClasses) { + final int modifiers = c.getModifiers(); + if ( + // don't waste time with abstract classes, deprecated, or @IgnoreRandomChains annotated + // classes: + Modifier.isAbstract(modifiers) + || !Modifier.isPublic(modifiers) + || c.isSynthetic() + || c.isAnonymousClass() + || c.isMemberClass() + || c.isInterface() + || c.isAnnotationPresent(Deprecated.class) + || c.isAnnotationPresent(IgnoreRandomChains.class) + || !(Tokenizer.class.isAssignableFrom(c) + || TokenFilter.class.isAssignableFrom(c) + || CharFilter.class.isAssignableFrom(c))) { + continue; + } + + for (final Constructor ctor : c.getConstructors()) { + // don't test synthetic, deprecated, or @IgnoreRandomChains annotated ctors, they likely + // have known bugs: + if (ctor.isSynthetic() + || ctor.isAnnotationPresent(Deprecated.class) + || ctor.isAnnotationPresent(IgnoreRandomChains.class)) { + continue; + } + // conditional filters are tested elsewhere + if (ConditionalTokenFilter.class.isAssignableFrom(c)) { + continue; + } + if (Tokenizer.class.isAssignableFrom(c)) { + assertTrue( + ctor.toGenericString() + " has unsupported parameter types", + allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); + tokenizers.add(castConstructor(Tokenizer.class, ctor)); + } else if (TokenFilter.class.isAssignableFrom(c)) { + assertTrue( + ctor.toGenericString() + " has unsupported parameter types", + allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); + tokenfilters.add(castConstructor(TokenFilter.class, ctor)); + } else if (CharFilter.class.isAssignableFrom(c)) { + assertTrue( + ctor.toGenericString() + " has unsupported parameter types", + allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); + charfilters.add(castConstructor(CharFilter.class, ctor)); + } else { + fail("Cannot get here"); + } + } + } + + final Comparator> ctorComp = Comparator.comparing(Constructor::toGenericString); + Collections.sort(tokenizers, ctorComp); + Collections.sort(tokenfilters, ctorComp); + Collections.sort(charfilters, ctorComp); + if (VERBOSE) { + System.out.println("tokenizers = " + tokenizers); + System.out.println("tokenfilters = " + tokenfilters); + System.out.println("charfilters = " + charfilters); + } + + // TODO: Eclipse does not get that cast right, so make explicit: + 
final Function, Class> stemmerCast = + c -> c.asSubclass(SnowballStemmer.class); + snowballStemmers = + ModuleClassDiscovery.getClassesForPackage("org.tartarus.snowball.ext").stream() + .filter(c -> c.getName().endsWith("Stemmer")) + .map(stemmerCast) + .sorted(Comparator.comparing(Class::getName)) + .collect(Collectors.toList()); + if (VERBOSE) { + System.out.println("snowballStemmers = " + snowballStemmers); + } + } + + @AfterClass + public static void afterClass() { + tokenizers = null; + tokenfilters = null; + charfilters = null; + snowballStemmers = null; + } + + /** Creates a static/unmodifiable set from 2 collections as union. */ + private static Set union(Collection c1, Collection c2) { + return Stream.concat(c1.stream(), c2.stream()).collect(Collectors.toUnmodifiableSet()); + } + + /** + * Hack to work around the stupidness of Oracle's strict Java backwards compatibility. {@code + * Class#getConstructors()} should return unmodifiable {@code List>} not array! + */ + @SuppressWarnings("unchecked") + private static Constructor castConstructor(Class instanceClazz, Constructor ctor) { + return (Constructor) ctor; + } + + @SuppressWarnings("unchecked") + static T newRandomArg(Random random, Class paramType) { + final Function producer = argProducers.get(paramType); + assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer); + return (T) producer.apply(random); + } + + static Object[] newTokenizerArgs(Random random, Class[] paramTypes) { + Object[] args = new Object[paramTypes.length]; + for (int i = 0; i < args.length; i++) { + Class paramType = paramTypes[i]; + args[i] = newRandomArg(random, paramType); + } + return args; + } + + static Object[] newCharFilterArgs(Random random, Reader reader, Class[] paramTypes) { + Object[] args = new Object[paramTypes.length]; + for (int i = 0; i < args.length; i++) { + Class paramType = paramTypes[i]; + if (paramType == Reader.class) { + args[i] = reader; + } else { + args[i] = newRandomArg(random, paramType); + } + } + return args; + } + + static Object[] newFilterArgs(Random random, TokenStream stream, Class[] paramTypes) { + Object[] args = new Object[paramTypes.length]; + for (int i = 0; i < args.length; i++) { + Class paramType = paramTypes[i]; + if (paramType == TokenStream.class) { + args[i] = stream; + } else if (paramType == CommonGramsFilter.class) { + // TODO: fix this one, thats broken: CommonGramsQueryFilter takes this one explicitly + args[i] = new CommonGramsFilter(stream, newRandomArg(random, CharArraySet.class)); + } else { + args[i] = newRandomArg(random, paramType); + } + } + return args; + } + + static class MockRandomAnalyzer extends Analyzer { + final long seed; + + MockRandomAnalyzer(long seed) { + this.seed = seed; + } + + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Random random = new Random(seed); + TokenizerSpec tokenizerSpec = newTokenizer(random); + // System.out.println("seed=" + seed + ",create tokenizer=" + tokenizerSpec.toString); + TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer); + // System.out.println("seed=" + seed + ",create filter=" + filterSpec.toString); + return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream); + } + + @Override + protected Reader initReader(String fieldName, Reader reader) { + Random random = new Random(seed); + CharFilterSpec charfilterspec = newCharFilterChain(random, reader); + return charfilterspec.reader; + } + + @Override + public String toString() { + Random 
random = new Random(seed); + StringBuilder sb = new StringBuilder(); + CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader("")); + sb.append("\ncharfilters="); + sb.append(charFilterSpec.toString); + // intentional: initReader gets its own separate random + random = new Random(seed); + TokenizerSpec tokenizerSpec = newTokenizer(random); + sb.append("\n"); + sb.append("tokenizer="); + sb.append(tokenizerSpec.toString); + TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer); + sb.append("\n"); + sb.append("filters="); + sb.append(tokenFilterSpec.toString); + return sb.toString(); + } + + private T createComponent( + Constructor ctor, Object[] args, StringBuilder descr, boolean isConditional) { + try { + final T instance = ctor.newInstance(args); + /* + if (descr.length() > 0) { + descr.append(","); + } + */ + descr.append("\n "); + if (isConditional) { + descr.append("Conditional:"); + } + descr.append(ctor.getDeclaringClass().getName()); + String params = Arrays.deepToString(args); + params = params.substring(1, params.length() - 1); + descr.append("(").append(params).append(")"); + return instance; + } catch (InvocationTargetException ite) { + final Throwable cause = ite.getCause(); + if (cause instanceof IllegalArgumentException + || cause instanceof UnsupportedOperationException) { + // thats ok, ignore + if (VERBOSE) { + System.err.println("Ignoring IAE/UOE from ctor:"); + cause.printStackTrace(System.err); + } + } else { + Rethrow.rethrow(cause); + } + } catch (IllegalAccessException | InstantiationException iae) { + Rethrow.rethrow(iae); + } + return null; // no success + } + + private boolean broken(Constructor ctor, Object[] args) { + final Predicate pred = brokenConstructors.get(ctor); + return pred != null && pred.test(args); + } + + // create a new random tokenizer from classpath + private TokenizerSpec newTokenizer(Random random) { + TokenizerSpec spec = new TokenizerSpec(); + while (spec.tokenizer == null) { + final Constructor ctor = + tokenizers.get(random.nextInt(tokenizers.size())); + final StringBuilder descr = new StringBuilder(); + final Object[] args = newTokenizerArgs(random, ctor.getParameterTypes()); + if (broken(ctor, args)) { + continue; + } + spec.tokenizer = createComponent(ctor, args, descr, false); + if (spec.tokenizer != null) { + spec.toString = descr.toString(); + } + } + return spec; + } + + private CharFilterSpec newCharFilterChain(Random random, Reader reader) { + CharFilterSpec spec = new CharFilterSpec(); + spec.reader = reader; + StringBuilder descr = new StringBuilder(); + int numFilters = random.nextInt(3); + for (int i = 0; i < numFilters; i++) { + while (true) { + final Constructor ctor = + charfilters.get(random.nextInt(charfilters.size())); + final Object[] args = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes()); + if (broken(ctor, args)) { + continue; + } + reader = createComponent(ctor, args, descr, false); + if (reader != null) { + spec.reader = reader; + break; + } + } + } + spec.toString = descr.toString(); + return spec; + } + + private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) { + TokenFilterSpec spec = new TokenFilterSpec(); + spec.stream = tokenizer; + StringBuilder descr = new StringBuilder(); + int numFilters = random.nextInt(5); + for (int i = 0; i < numFilters; i++) { + + // Insert ValidatingTF after each stage so we can + // catch problems right after the TF that "caused" + // them: + spec.stream = new ValidatingTokenFilter(spec.stream, 
"stage " + i); + + while (true) { + final Constructor ctor = + tokenfilters.get(random.nextInt(tokenfilters.size())); + if (random.nextBoolean() + && avoidConditionals.contains(ctor.getDeclaringClass()) == false) { + long seed = random.nextLong(); + spec.stream = + new ConditionalTokenFilter( + spec.stream, + in -> { + final Object[] args = newFilterArgs(random, in, ctor.getParameterTypes()); + if (broken(ctor, args)) { + return in; + } + TokenStream ts = createComponent(ctor, args, descr, true); + if (ts == null) { + return in; + } + return ts; + }) { + Random random = new Random(seed); + + @Override + public void reset() throws IOException { + super.reset(); + random = new Random(seed); + } + + @Override + protected boolean shouldFilter() throws IOException { + return random.nextBoolean(); + } + }; + break; + } else { + final Object[] args = newFilterArgs(random, spec.stream, ctor.getParameterTypes()); + if (broken(ctor, args)) { + continue; + } + final TokenFilter flt = createComponent(ctor, args, descr, false); + if (flt != null) { + spec.stream = flt; + break; + } + } + } + } + + // Insert ValidatingTF after each stage so we can + // catch problems right after the TF that "caused" + // them: + spec.stream = new ValidatingTokenFilter(spec.stream, "last stage"); + + spec.toString = descr.toString(); + return spec; + } + } + + static class TokenizerSpec { + Tokenizer tokenizer; + String toString; + } + + static class TokenFilterSpec { + TokenStream stream; + String toString; + } + + static class CharFilterSpec { + Reader reader; + String toString; + } + + public void testRandomChains() throws Throwable { + int numIterations = TEST_NIGHTLY ? atLeast(20) : 3; + Random random = random(); + for (int i = 0; i < numIterations; i++) { + try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) { + if (VERBOSE) { + System.out.println("Creating random analyzer:" + a); + } + try { + checkNormalize(a); + checkRandomData( + random, + a, + 500 * RANDOM_MULTIPLIER, + 20, + false, + false /* We already validate our own offsets... */); + } catch (Throwable e) { + System.err.println("Exception from random analyzer: " + a); + throw e; + } + } + } + } + + public void checkNormalize(Analyzer a) { + // normalization should not modify characters that may be used for wildcards + // or regular expressions + String s = "([0-9]+)?*"; + assertEquals(s, a.normalize("dummy", s).utf8ToString()); + } + + // we might regret this decision... + public void testRandomChainsWithLargeStrings() throws Throwable { + int numIterations = TEST_NIGHTLY ? atLeast(20) : 3; + Random random = random(); + for (int i = 0; i < numIterations; i++) { + try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) { + if (VERBOSE) { + System.out.println("Creating random analyzer:" + a); + } + try { + checkRandomData( + random, + a, + 50 * RANDOM_MULTIPLIER, + 80, + false, + false /* We already validate our own offsets... 
*/); + } catch (Throwable e) { + System.err.println("Exception from random analyzer: " + a); + throw e; + } + } + } + } +} diff --git a/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/da_UTF8.xml b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/da_UTF8.xml new file mode 100644 index 00000000000..2c8d203be68 --- /dev/null +++ b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/da_UTF8.xml @@ -0,0 +1,1208 @@ + + + + + + + + + + +aA +bB +cC +dD +eE +fF +gG +hH +iI +jJ +kK +lL +mM +nN +oO +pP +qQ +rR +sS +tT +uU +vV +wW +xX +yY +zZ +æÆ +øØ +åÅ + + + +.ae3 +.an3k +.an1s +.be5la +.be1t +.bi4tr +.der3i +.diagno5 +.her3 +.hoved3 +.ne4t5 +.om1 +.ove4 +.po1 +.til3 +.yd5r +ab5le +3abst +a3c +ade5la +5adg +a1e +5afg +5a4f1l +af3r +af4ri +5afs +a4gef +a4gi +ag5in +ag5si +3agti +a4gy +a3h +ais5t +a3j +a5ka +a3ke +a5kr +aku5 +a3la +a1le +a1li +al3k +4alkv +a1lo +al5si +a3lu +a1ly +am4pa +3analy +an4k5r +a3nu +3anv +a5o +a5pe +a3pi +a5po +a1ra +ar5af +1arb +a1re +5arg +a1ri +a3ro +a3sa +a3sc +a1si +a3sk +a3so +3a3sp +a3ste +a3sti +a1ta1 +a1te +a1ti +a4t5in +a1to +ato5v +a5tr +a1tu +a5va +a1ve +a5z +1ba +ba4ti +4bd +1be +be1k +be3ro +be5ru +be1s4 +be1tr +1bi +bi5sk +b1j +4b1n +1bo +bo4gr +bo3ra +bo5re +1br4 +4bs +bs5k +b3so +b1st +b5t +3bu +bu4s5tr +b5w +1by +by5s +4c1c +1ce +ce5ro +3ch +4ch. +ci4o +ck3 +5cy +3da +4d3af +d5anta +da4s +d1b +d1d4 +1de +de5d +4de4lem +der5eri +de4rig +de5sk +d1f +d1g +d3h +1di +di1e +di5l +d3j +d1k +d1l +d1m +4d1n +3do +4dop +d5ov +d1p +4drett +5d4reve +3drif +3driv +d5ros +d5ru +ds5an +ds5in +d1ski +d4sm +d4su +dsu5l +ds5vi +d3ta +d1te +dt5o +d5tr +dt5u +1du +dub5 +d1v +3dy +e5ad +e3af +e5ag +e3ak +e1al +ea4la +e3an +e5ap +e3at +e3bl +ebs3 +e1ci +ed5ar +edde4 +eddel5 +e4do +ed5ra +ed3re +ed3rin +ed4str +e3e +3eff +e3fr +3eft +e3gu +e1h +e3in +ei5s +e3je +e4j5el +e1ka +e3ke +e3kl +4e1ko +e5kr +ek5sa +3eksem +3eksp +e3ku +e1kv +e5ky +e3lad +el3ak +el3ar +e1las +e3le +e4lek +3elem +e1li +5elim +e3lo +el5sa +e5lu +e3ly +e4mad +em4p5le +em1s +en5ak +e4nan +4enn +e4no +en3so +e5nu +e5ol +e3op +e1or +e3ov +epi3 +e1pr +e3ra +er3af +e4rag +e4rak +e1re +e4ref +er5ege +5erhv +e1ri +e4rib +er1k +ero5d +er5ov +er3s +er5tr +e3rum +er5un +e5ry +e1ta +e1te +etek4s +e1ti +e3tj +e1to +e3tr +e3tu +e1ty +e3um +e3un +3eur +e1va +e3ve +e4v3erf +e1vi +e5x +1fa +fa4ce +fags3 +f1b +f1d +1fe +fej4 +fejl1 +f1f +f1g +f1h +1fi +f1k +3fl +1fo +for1en +fo4ri +f1p +f1s4 +4ft +f3ta +f1te +f1ti +f5to +f5tvi +1fu +f1v +3fy +1ga +g3art +g1b +g1d +1ge +4g5enden +ger3in +ge3s +g3f +g1g +g1h +1gi +gi4b +gi3st +5gj +g3k +g1l +g1m +3go +4g5om +g5ov +g3p +1gr +gs1a +gsde4len +g4se +gsha4 +g5sla +gs3or +gs1p +g5s4tide +g4str +gs1v +g3ta +g1te +g1ti +g5to +g3tr +gt4s +g3ud +gun5 +g3v +1gy +g5yd +4ha. +heds3 +he5s +4het +hi4e +hi4n5 +hi3s +ho5ko +ho5ve +4h3t +hun4 +hund3 +hvo4 +i1a +i3b +i4ble +i1c +i3dr +ids5k +i1el +i1en +i3er +i3et. +if3r +i3gu +i3h +i5i +i5j +i1ka +i1ke +ik1l +i5ko +ik3re +ik5ri +iks5t +ik4tu +i3ku +ik3v +i3lag +il3eg +il5ej +il5el +i3li +i4l5id +il3k +i1lo +il5u +i3mu +ind3t +5inf +ings1 +in3s +in4sv +inter1 +i3nu +i3od +i3og +i5ok +i3ol +ion4 +ions1 +i5o5r +i3ot +i5pi +i3pli +i5pr +i3re +i3ri +ir5t +i3sc +i3si +i4sm +is3p +i1ster +i3sti +i5sua +i1ta +i1te +i1ti +i3to +i3tr +it5re. +i1tu +i3ty +i1u +i1va +i1ve +i1vi +j3ag +jde4rer +jds1 +jek4to +4j5en. 
+j5k +j3le +j3li +jlmeld5 +jlmel4di +j3r +jre5 +ju3s +5kap +k5au +5kav +k5b +kel5s +ke3sk +ke5st +ke4t5a +k3h +ki3e +ki3st +k1k +k5lak +k1le +3klu +k4ny +5kod +1kon +ko3ra +3kort +ko3v +1kra +5kry +ks3an +k1si +ks3k +ks1p +k3ste +k5stu +ks5v +k1t +k4tar +k4terh +kti4e +kt5re +kt5s +3kur +1kus +3kut +k4vo +k4vu +5lab +lad3r +5lagd +la4g3r +5lam +1lat +l1b +ldiagnos5 +l3dr +ld3st +1le. +5led +4lele +le4mo +3len +1ler +1les +4leu +l1f +lfin4 +lfind5 +l1go1 +l3h +li4ga +4l5ins +4l3int +li5o +l3j +l1ke +l1ko +l3ky +l1l +l5mu +lo4du +l3op +4l5or +3lov +4l3p +l4ps +l3r +4ls +lses1 +ls5in +l5sj +l1ta +l4taf +l1te +l4t5erf +l3ti +lt3o +l3tr +l3tu +lu5l +l3ve +l3vi +1ma +m1b +m3d +1me +4m5ej +m3f +m1g +m3h +1mi +mi3k +m5ing +mi4o +mi5sty +m3k +m1l +m1m +mmen5 +m1n +3mo +mo4da +4mop +4m5ov +m1pe +m3pi +m3pl +m1po +m3pr +m1r +mse5s +ms5in +m5sk +ms3p +m3ste +ms5v +m3ta +m3te +m3ti +m3tr +m1ud +1mul +mu1li +3my +3na +4nak +1nal +n1b +n1c +4nd +n3dr +nd5si +nd5sk +nd5sp +1ne +ne5a +ne4da +nemen4 +nement5e +neo4 +n3erk +n5erl +ne5sl +ne5st +n1f +n4go +4n1h +1ni +4nim +ni5o +ni3st +n1ke +n1ko +n3kr +n3ku +n5kv +4n1l +n1m +n1n +1no +n3ord +n5p +n3r +4ns +n3si +n1sku +ns3po +n1sta +n5sti +n1ta +nta4le +n1te +n1ti +ntiali4 +n3to +n1tr +nt4s5t +nt4su +n3tu +n3ty +4n1v +3ny +n3z +o3a +o4as +ob3li +o1c +o4din +od5ri +od5s +od5un +o1e +of5r +o4gek +o4gel +o4g5o +og5re +og5sk +o5h +o5in +oi6s5e +o1j +o3ka +o1ke +o3ku +o3la +o3le +o1li +o1lo +o3lu +o5ly +1omr +on3k +ook5 +o3or +o5ov +o3pi +op3l +op3r +op3s +3opta +4or. +or1an +3ordn +ord5s +o3re. +o3reg +o3rek +o3rer +o3re3s +o3ret +o3ri +3orient +or5im +o4r5in +or3k +or5o +or3sl +or3st +o3si +o3so +o3t +o1te +o5un +ov4s +3pa +pa5gh +p5anl +p3d +4pec +3pen +1per +pe1ra +pe5s +pe3u +p3f +4p5h +1pla +p4lan +4ple. +4pler +4ples +p3m +p3n +5pok +4po3re +3pot +4p5p4 +p4ro +1proc +p3sk +p5so +ps4p +p3st +p1t +1pu +pu5b +p5ule +p5v +5py3 +qu4 +4raf +ra5is +4rarb +r1b +r4d5ar +r3dr +rd4s3 +4reks +1rel +re5la +r5enss +5rese +re5spo +4ress +re3st +re5s4u +5rett +r1f +r1gu +r1h +ri1e +ri5la +4rimo +r4ing +ringse4 +ringso4r +4rinp +4rint +r3ka +r1ke +r1ki +rk3so +r3ku +r1l +rmo4 +r5mu +r1n +ro1b +ro3p +r3or +r3p +r1r +rre5s +rro4n5 +r1sa +r1si +r5skr +r4sk5v +rs4n +r3sp +r5stu +r5su +r3sv +r5tal +r1te +r4teli +r1ti +r3to +r4t5or +rt5rat +rt3re +r5tri +r5tro +rt3s +r5ty +r3ud +run4da +5rut +r3va +r1ve +r3vi +ry4s +s3af +1sam +sa4ma +s3ap +s1ar +1sat +4s1b +s1d +sdy4 +1se +s4ed +5s4er +se4se +s1f +4s1g4 +4s3h +si4bl +1sig +s5int +5sis +5sit +5siu +s5ju +4sk. +1skab +1ske +s3kl +sk5s4 +5sky +s1le +s1li +slo3 +5slu +s5ly +s1m +s4my +4snin +s4nit +so5k +5sol +5som. +3somm +s5oms +5somt +3son +4s1op +sp4 +3spec +4sper +3s4pi +s1pl +3sprog. +s5r4 +s1s4 +4st. +5s4tam +1stan +st5as +3stat +1stav +1ste. +1sted +3stel +5stemo +1sten +5step +3ster. +3stes +5stet +5stj +3sto +st5om +1str +s1ud +3sul +s3un +3sur +s3ve +3s4y +1sy1s +5ta. 
+1tag +tands3 +4tanv +4tb +tede4l +teds5 +3teg +5tekn +teo1 +5term +te5ro +4t1f +6t3g +t1h +tialis5t +3tid +ti4en +ti3st +4t3k +4t1l +tli4s5 +t1m +t1n +to5ra +to1re +to1ri +tor4m +4t3p +t4ra +4tres +tro5v +1try +4ts +t3si +ts4pa +ts5pr +t3st +ts5ul +4t1t +t5uds +5tur +t5ve +1typ +u1a +5udl +ud5r +ud3s +3udv +u1e +ue4t5 +uge4ri +ugs3 +u5gu +u3i +u5kl +uk4ta +uk4tr +u1la +u1le +u5ly +u5pe +up5l +u5q +u3ra +u3re +u4r3eg +u1rer +u3ro +us5a +u3si +u5ska +u5so +us5v +u1te +u1ti +u1to +ut5r +ut5s4 +5u5v +va5d +3varm +1ved +ve4l5e +ve4reg +ve3s +5vet +v5h +vi4l3in +1vis +v5j +v5k +vl4 +v3le +v5li +vls1 +1vo +4v5om +v5p +v5re +v3st +v5su +v5t +3vu +y3a +y5dr +y3e +y3ke +y5ki +yk3li +y3ko +yk4s5 +y3kv +y5li +y5lo +y5mu +yns5 +y5o +y1pe +y3pi +y3re +yr3ek +y3ri +y3si +y3ti +y5t3r +y5ve +zi5o + +.så3 +.ær5i +.øv3r +a3tø +a5væ +brød3 +5bæ +5drøv +dstå4 +3dæ +3dø +e3læ +e3lø +e3rø +er5øn +e5tæ +e5tø +e1væ +e3æ +e5å +3fæ +3fø +fø4r5en +giø4 +g4sø +g5så +3gæ +3gø1 +3gå +i5tæ +i3ø +3kø +3kå +lingeniø4 +l3væ +5løs +m5tå +1mæ +3mø +3må +n3kæ +n5tæ +3næ +4n5æb +5nø +o5læ +or3ø +o5å +5præ +5pæd +på3 +r5kæ +r5tæ +r5tø +r3væ +r5æl +4røn +5rør +3råd +r5år +s4kå +3slå +s4næ +5stø +1stå +1sæ +4s5æn +1sø +s5øk +så4r5 +ti4ø +3træk. +t4sø +t5så +t3væ +u3læ +3værd +1værk +5vå +y5væ +æb3l +æ3c +æ3e +æg5a +æ4gek +æ4g5r +ægs5 +æ5i +æ5kv +ælle4 +æn1dr +æ5o +æ1re +ær4g5r +æ3ri +ær4ma +ær4mo +ær5s +æ5si +æ3so +æ3ste +æ3ve +øde5 +ø3e +ø1je +ø3ke +ø3le +øms5 +øn3st +øn4t3 +ø1re +ø3ri +ørne3 +ør5o +ø1ve +å1d +å1e +å5h +å3l +å3re +års5t +å5sk +å3t + + diff --git a/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/simple.aff b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/simple.aff new file mode 100644 index 00000000000..aaf4a6cdf22 --- /dev/null +++ b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/simple.aff @@ -0,0 +1,20 @@ +SET UTF-8 +TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ + +SFX A Y 3 +SFX A 0 e n +SFX A 0 e t +SFX A 0 e h + +SFX C Y 2 +SFX C 0 d/C c +SFX C 0 c b + +SFX D Y 1 +SFX D 0 s o + +SFX E Y 1 +SFX E 0 d o + +PFX B Y 1 +PFX B 0 s o diff --git a/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/simple.dic b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/simple.dic new file mode 100644 index 00000000000..2809611b876 --- /dev/null +++ b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/simple.dic @@ -0,0 +1,11 @@ +9 +ab/C +apach/A +foo/D +foo/E +lucen/A +lucene +mahout/A +moo/E +olr/B +db \ No newline at end of file diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java index b70768e65d3..9e693ca8710 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.search.BoostAttribute; +import org.apache.lucene.util.IgnoreRandomChains; /** * Characters before the delimiter are the "token", those after are the boost. @@ -30,6 +31,8 @@ import org.apache.lucene.search.BoostAttribute; * *

Note make sure your Tokenizer doesn't split on the delimiter, or this won't work */ +@IgnoreRandomChains( + reason = "requires a special encoded token value, so it may fail with random data") public final class DelimitedBoostTokenFilter extends TokenFilter { private final char delimiter; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java index 2f9337d242f..a384dba2b85 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java @@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.IgnoreRandomChains; /** * Forms bigrams of CJK terms that are generated from StandardTokenizer or ICUTokenizer. @@ -47,6 +48,7 @@ import org.apache.lucene.util.ArrayUtil; * *
In all cases, all non-CJK input is passed thru unmodified. */ +@IgnoreRandomChains(reason = "LUCENE-8092: doesn't handle graph inputs") public final class CJKBigramFilter extends TokenFilter { // configuration /** bigram flag for Han Ideographs */ diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java index d1a81c17631..0979ade78c8 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.IgnoreRandomChains; /* * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors @@ -43,10 +44,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute; * "the-quick" has a term.type() of "gram" * */ - -/* - * Constructors and makeCommonSet based on similar code in StopFilter - */ +@IgnoreRandomChains(reason = "LUCENE-4983") public final class CommonGramsFilter extends TokenFilter { public static final String GRAM_TYPE = "gram"; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java index 80a638112fd..7a5ba1322ec 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.IgnoreRandomChains; /** * Wrap a CommonGramsFilter optimizing phrase queries by only returning single words when they are @@ -42,6 +43,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute; * See:http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/TokenStream.html and * http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/analysis/package.html?revision=718798 */ +@IgnoreRandomChains(reason = "TODO: doesn't handle graph inputs") public final class CommonGramsQueryFilter extends TokenFilter { private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java index 144fe069c13..fafdec7c72c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java @@ -17,6 +17,7 @@ package org.apache.lucene.analysis.core; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.util.IgnoreRandomChains; /** * Normalizes 
token text to lower case. @@ -27,6 +28,7 @@ import org.apache.lucene.analysis.TokenStream; * @see org.apache.lucene.analysis.LowerCaseFilter * @see LowerCaseFilterFactory */ +@IgnoreRandomChains(reason = "clones of core's filters") public final class LowerCaseFilter extends org.apache.lucene.analysis.LowerCaseFilter { /** diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java index 22b756138d6..08b170f3f51 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis.core; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.util.IgnoreRandomChains; /** * Removes stop words from a token stream. @@ -28,6 +29,7 @@ import org.apache.lucene.analysis.TokenStream; * @see org.apache.lucene.analysis.StopFilter * @see StopFilterFactory */ +@IgnoreRandomChains(reason = "clones of core's filters") public final class StopFilter extends org.apache.lucene.analysis.StopFilter { /** diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java index 417602c0297..a42c988ab8a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.IgnoreRandomChains; /** * Characters before the delimiter are the "token", the textual integer after is the term frequency. @@ -36,6 +37,8 @@ import org.apache.lucene.util.ArrayUtil; * *
Note make sure your Tokenizer doesn't split on the delimiter, or this won't work */ +@IgnoreRandomChains( + reason = "requires a special encoded token value, so it may fail with random data") public final class DelimitedTermFrequencyTokenFilter extends TokenFilter { public static final char DEFAULT_DELIMITER = '|'; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java index 47fef0937e8..68216359eb0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java @@ -20,6 +20,7 @@ import java.io.IOException; import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.util.IgnoreRandomChains; /** * When the plain text is extracted from documents, we will often have many words hyphenated and @@ -50,6 +51,8 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; * </fieldtype> * */ +@IgnoreRandomChains( + reason = "TODO: doesn't handle graph inputs (or even look at positionIncrement)") public final class HyphenatedWordsFilter extends TokenFilter { private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.java index 4627f9ea4e3..80863ee2839 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.java @@ -19,6 +19,7 @@ package org.apache.lucene.analysis.miscellaneous; import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.util.IgnoreRandomChains; /** * This TokenFilter limits the number of tokens while indexing. 
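// --- Illustrative aside (not part of the patch): the annotation above sits on
// the two-arg constructor only, because that form defaults consumeAllTokens to
// false. A sketch of the three-arg form that stays eligible for random chains
// (the limit value is illustrative):
TokenStream stream = new WhitespaceTokenizer();
// emit at most 10 tokens, but keep draining the wrapped stream so that
// end() still observes the true final offset
stream = new LimitTokenCountFilter(stream, 10, true);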
It is a replacement for the maximum @@ -45,6 +46,7 @@ public final class LimitTokenCountFilter extends TokenFilter { * * @see #LimitTokenCountFilter(TokenStream,int,boolean) */ + @IgnoreRandomChains(reason = "all tokens must be consumed") public LimitTokenCountFilter(TokenStream in, int maxTokenCount) { this(in, maxTokenCount, false); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenOffsetFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenOffsetFilter.java index 757fa96c9a7..0a2db1df243 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenOffsetFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenOffsetFilter.java @@ -20,6 +20,7 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.util.IgnoreRandomChains; /** * Lets all tokens pass through until it sees one with a start offset <= a configured limit, @@ -46,6 +47,7 @@ public final class LimitTokenOffsetFilter extends TokenFilter { * * @param maxStartOffset the maximum start offset allowed */ + @IgnoreRandomChains(reason = "all tokens must be consumed") public LimitTokenOffsetFilter(TokenStream input, int maxStartOffset) { this(input, maxStartOffset, false); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java index 6230ee7f7cb..edbee58bbb8 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java @@ -20,6 +20,7 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.util.IgnoreRandomChains; /** * This TokenFilter limits its emitted tokens to those with positions that are not greater than the @@ -50,6 +51,7 @@ public final class LimitTokenPositionFilter extends TokenFilter { * @param maxTokenPosition max position of tokens to produce (1st token always has position 1) * @see #LimitTokenPositionFilter(TokenStream,int,boolean) */ + @IgnoreRandomChains(reason = "all tokens must be consumed") public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition) { this(in, maxTokenPosition, false); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java index 2971704297b..8b871d3f2e9 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.IgnoreRandomChains; import org.apache.lucene.util.InPlaceMergeSorter; import 
org.apache.lucene.util.RamUsageEstimator; @@ -83,6 +84,7 @@ import org.apache.lucene.util.RamUsageEstimator; * StandardTokenizer} immediately removes many intra-word delimiters, it is recommended that this * filter be used after a tokenizer that does not do this (such as {@link WhitespaceTokenizer}). */ +@IgnoreRandomChains(reason = "Cannot correct offsets when a char filter had changed them") public final class WordDelimiterGraphFilter extends TokenFilter { /** diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java index 57fe65bbb98..e7dfa320b6b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.AttributeFactory; +import org.apache.lucene.util.IgnoreRandomChains; /** * Tokenizer for path-like hierarchies. @@ -40,6 +41,7 @@ import org.apache.lucene.util.AttributeFactory; * /something/something/else * */ +@IgnoreRandomChains(reason = "broken offsets") public class PathHierarchyTokenizer extends Tokenizer { public PathHierarchyTokenizer() { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java index d1cdb3a9386..7b1f60f51c4 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.AttributeFactory; +import org.apache.lucene.util.IgnoreRandomChains; /** * Tokenizer for domain-like hierarchies. @@ -43,6 +44,7 @@ import org.apache.lucene.util.AttributeFactory; * uk * */ +@IgnoreRandomChains(reason = "broken offsets") public class ReversePathHierarchyTokenizer extends Tokenizer { public ReversePathHierarchyTokenizer() { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java index 76ef11be8ed..bdb8799dbcf 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java @@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.AttributeFactory; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.IgnoreRandomChains; /** * Extension of StandardTokenizer that is aware of Wikipedia syntax. 
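// --- Illustrative aside (not part of the patch): the "broken offsets" reason
// on the two path tokenizers refers to the prefix tokens they emit, all
// anchored at start offset 0. A sketch with an illustrative input:
Tokenizer path = new PathHierarchyTokenizer();
path.setReader(new StringReader("/usr/local/bin"));
// emits "/usr" (0,4), "/usr/local" (0,10), "/usr/local/bin" (0,14) --
// overlapping offsets like these trip the offset checks in random chains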
It is based off of the @@ -34,6 +35,7 @@ import org.apache.lucene.util.AttributeSource; * * @lucene.experimental */ +@IgnoreRandomChains(reason = "TODO: it seems to mess up offsets!?") public final class WikipediaTokenizer extends Tokenizer { public static final String INTERNAL_LINK = "il"; public static final String EXTERNAL_LINK = "el"; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java index 28777e8fdb6..0ee336aeb25 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java @@ -69,7 +69,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase { protected Reader initReader(String fieldName, Reader reader) { reader = new MockCharFilter(reader, 0); reader = new MappingCharFilter(map, reader); - reader = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader); + reader = new CheckThatYouDidntReadAnythingReaderWrapper(reader); return reader; } }; @@ -137,7 +137,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase { }; public void testWrapping() throws Exception { - CharFilter cs = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream); + CharFilter cs = new CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream); Exception expected = expectThrows( Exception.class, @@ -221,6 +221,69 @@ public class TestBugInSomething extends BaseTokenStreamTestCase { // todo: test framework? + static class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter { + boolean readSomething; + + CheckThatYouDidntReadAnythingReaderWrapper(Reader in) { + super(in); + } + + @Override + public int correct(int currentOff) { + return currentOff; // we don't change any offsets + } + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + readSomething = true; + return input.read(cbuf, off, len); + } + + @Override + public int read() throws IOException { + readSomething = true; + return input.read(); + } + + @Override + public int read(CharBuffer target) throws IOException { + readSomething = true; + return input.read(target); + } + + @Override + public int read(char[] cbuf) throws IOException { + readSomething = true; + return input.read(cbuf); + } + + @Override + public long skip(long n) throws IOException { + readSomething = true; + return input.skip(n); + } + + @Override + public void mark(int readAheadLimit) throws IOException { + input.mark(readAheadLimit); + } + + @Override + public boolean markSupported() { + return input.markSupported(); + } + + @Override + public boolean ready() throws IOException { + return input.ready(); + } + + @Override + public void reset() throws IOException { + input.reset(); + } + } + static final class SopTokenFilter extends TokenFilter { SopTokenFilter(TokenStream input) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java index 95b8bddb114..5ae7b111024 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java @@ -33,10 +33,10 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.TokenizerFactory; import 
org.apache.lucene.analysis.boost.DelimitedBoostTokenFilterFactory; import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory; -import org.apache.lucene.analysis.util.StringMockResourceLoader; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; import org.apache.lucene.tests.analysis.MockTokenizer; import org.apache.lucene.tests.util.LuceneTestCase.Nightly; +import org.apache.lucene.tests.util.StringMockResourceLoader; import org.apache.lucene.util.AttributeFactory; import org.apache.lucene.util.ResourceLoaderAware; import org.apache.lucene.util.Version; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java deleted file mode 100644 index 98256b3b6f2..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ /dev/null @@ -1,1045 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.analysis.core; - -import java.io.IOException; -import java.io.InputStream; -import java.io.Reader; -import java.io.StringReader; -import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; -import java.lang.reflect.Modifier; -import java.net.URI; -import java.net.URL; -import java.nio.CharBuffer; -import java.nio.file.DirectoryStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.text.DateFormat; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.Enumeration; -import java.util.HashMap; -import java.util.HashSet; -import java.util.IdentityHashMap; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.Set; -import java.util.function.Function; -import java.util.function.Predicate; -import java.util.regex.Pattern; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.CachingTokenFilter; -import org.apache.lucene.analysis.CharArrayMap; -import org.apache.lucene.analysis.CharArraySet; -import org.apache.lucene.analysis.CharFilter; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.boost.DelimitedBoostTokenFilter; -import org.apache.lucene.analysis.charfilter.NormalizeCharMap; -import org.apache.lucene.analysis.cjk.CJKBigramFilter; -import org.apache.lucene.analysis.commongrams.CommonGramsFilter; -import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter; -import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; -import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter; -import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; -import org.apache.lucene.analysis.hunspell.Dictionary; -import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter; -import org.apache.lucene.analysis.minhash.MinHashFilter; -import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; -import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter; -import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter; -import org.apache.lucene.analysis.miscellaneous.FingerprintFilter; -import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter; -import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; -import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter; -import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter; -import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; -import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap; -import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; -import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; -import org.apache.lucene.analysis.path.PathHierarchyTokenizer; -import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; -import org.apache.lucene.analysis.pattern.PatternTypingFilter; -import org.apache.lucene.analysis.payloads.IdentityEncoder; -import org.apache.lucene.analysis.payloads.PayloadEncoder; -import org.apache.lucene.analysis.shingle.FixedShingleFilter; -import org.apache.lucene.analysis.shingle.ShingleFilter; -import org.apache.lucene.analysis.snowball.TestSnowball; -import 
org.apache.lucene.analysis.standard.StandardTokenizer; -import org.apache.lucene.analysis.synonym.SynonymMap; -import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; -import org.apache.lucene.store.ByteBuffersDirectory; -import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.tests.analysis.CrankyTokenFilter; -import org.apache.lucene.tests.analysis.MockTokenFilter; -import org.apache.lucene.tests.analysis.MockTokenizer; -import org.apache.lucene.tests.analysis.ValidatingTokenFilter; -import org.apache.lucene.tests.util.Rethrow; -import org.apache.lucene.tests.util.TestUtil; -import org.apache.lucene.tests.util.automaton.AutomatonTestUtil; -import org.apache.lucene.util.AttributeFactory; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.CharsRef; -import org.apache.lucene.util.Version; -import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; -import org.apache.lucene.util.automaton.Operations; -import org.apache.lucene.util.automaton.RegExp; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.tartarus.snowball.SnowballStemmer; -import org.xml.sax.InputSource; - -/** tests random analysis chains */ -public class TestRandomChains extends BaseTokenStreamTestCase { - - static List> tokenizers; - static List> tokenfilters; - static List> charfilters; - - private static final Predicate ALWAYS = (objects -> true); - - private static final Set> avoidConditionals = new HashSet<>(); - - static { - // These filters needs to consume the whole tokenstream, so conditionals don't make sense here - avoidConditionals.add(FingerprintFilter.class); - avoidConditionals.add(MinHashFilter.class); - avoidConditionals.add(ConcatenateGraphFilter.class); - // ShingleFilter doesn't handle input graphs correctly, so wrapping it in a condition can - // expose inconsistent offsets - // https://issues.apache.org/jira/browse/LUCENE-4170 - avoidConditionals.add(ShingleFilter.class); - avoidConditionals.add(FixedShingleFilter.class); - // FlattenGraphFilter changes the output graph entirely, so wrapping it in a condition - // can break position lengths - avoidConditionals.add(FlattenGraphFilter.class); - // LimitToken*Filters don't set end offsets correctly - avoidConditionals.add(LimitTokenOffsetFilter.class); - avoidConditionals.add(LimitTokenCountFilter.class); - avoidConditionals.add(LimitTokenPositionFilter.class); - } - - private static final Map, Predicate> brokenConstructors = - new HashMap<>(); - - static { - initBrokenConstructors(); - } - - @SuppressWarnings("deprecation") - private static void initBrokenConstructors() { - try { - brokenConstructors.put( - LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class), ALWAYS); - brokenConstructors.put( - LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class, boolean.class), - args -> { - assert args.length == 3; - return !((Boolean) args[2]); // args are broken if consumeAllTokens is false - }); - brokenConstructors.put( - LimitTokenOffsetFilter.class.getConstructor(TokenStream.class, int.class), ALWAYS); - brokenConstructors.put( - LimitTokenOffsetFilter.class.getConstructor(TokenStream.class, int.class, boolean.class), - args -> { - assert args.length == 3; - return !((Boolean) args[2]); // args are broken if consumeAllTokens is false - }); - brokenConstructors.put( - LimitTokenPositionFilter.class.getConstructor(TokenStream.class, int.class), ALWAYS); - brokenConstructors.put( - 
LimitTokenPositionFilter.class.getConstructor( - TokenStream.class, int.class, boolean.class), - args -> { - assert args.length == 3; - return !((Boolean) args[2]); // args are broken if consumeAllTokens is false - }); - for (Class c : - Arrays.>asList( - // doesn't actual reset itself! TODO this statement is probably obsolete as of - // LUCENE-6121 ? - CachingTokenFilter.class, - // LUCENE-8092: doesn't handle graph inputs - CJKBigramFilter.class, - // TODO: LUCENE-4983 - CommonGramsFilter.class, - // TODO: doesn't handle graph inputs - CommonGramsQueryFilter.class, - // Not broken, simulates brokenness: - CrankyTokenFilter.class, - // TODO: doesn't handle graph inputs (or even look at positionIncrement) - HyphenatedWordsFilter.class, - // broken offsets - PathHierarchyTokenizer.class, - // broken offsets - ReversePathHierarchyTokenizer.class, - // Not broken: we forcefully add this, so we shouldn't - // also randomly pick it: - ValidatingTokenFilter.class, - // TODO: it seems to mess up offsets!? - WikipediaTokenizer.class, - // TODO: needs to be a tokenizer, doesnt handle graph inputs properly (a shingle or - // similar following will then cause pain) - WordDelimiterFilter.class, - // Cannot correct offsets when a char filter had changed them: - WordDelimiterGraphFilter.class, - // requires a special encoded token value, so it may fail with random data: - DelimitedTermFrequencyTokenFilter.class, - // requires a special encoded token value, so it may fail with random data: - DelimitedBoostTokenFilter.class, - // clones of core's filters: - org.apache.lucene.analysis.core.StopFilter.class, - org.apache.lucene.analysis.core.LowerCaseFilter.class)) { - for (Constructor ctor : c.getConstructors()) { - brokenConstructors.put(ctor, ALWAYS); - } - } - } catch (Exception e) { - throw new Error(e); - } - } - - @BeforeClass - public static void beforeClass() throws Exception { - List> analysisClasses = getClassesForPackage("org.apache.lucene.analysis"); - tokenizers = new ArrayList<>(); - tokenfilters = new ArrayList<>(); - charfilters = new ArrayList<>(); - for (final Class c : analysisClasses) { - final int modifiers = c.getModifiers(); - if ( - // don't waste time with abstract classes or deprecated known-buggy ones - Modifier.isAbstract(modifiers) - || !Modifier.isPublic(modifiers) - || c.isSynthetic() - || c.isAnonymousClass() - || c.isMemberClass() - || c.isInterface() - || c.isAnnotationPresent(Deprecated.class) - || !(Tokenizer.class.isAssignableFrom(c) - || TokenFilter.class.isAssignableFrom(c) - || CharFilter.class.isAssignableFrom(c))) { - continue; - } - - for (final Constructor ctor : c.getConstructors()) { - // don't test synthetic or deprecated ctors, they likely have known bugs: - if (ctor.isSynthetic() - || ctor.isAnnotationPresent(Deprecated.class) - || brokenConstructors.get(ctor) == ALWAYS) { - continue; - } - // conditional filters are tested elsewhere - if (ConditionalTokenFilter.class.isAssignableFrom(c)) { - continue; - } - if (Tokenizer.class.isAssignableFrom(c)) { - assertTrue( - ctor.toGenericString() + " has unsupported parameter types", - allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); - tokenizers.add(castConstructor(Tokenizer.class, ctor)); - } else if (TokenFilter.class.isAssignableFrom(c)) { - assertTrue( - ctor.toGenericString() + " has unsupported parameter types", - allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); - tokenfilters.add(castConstructor(TokenFilter.class, ctor)); - } else if 
(CharFilter.class.isAssignableFrom(c)) { - assertTrue( - ctor.toGenericString() + " has unsupported parameter types", - allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); - charfilters.add(castConstructor(CharFilter.class, ctor)); - } else { - fail("Cannot get here"); - } - } - } - - final Comparator> ctorComp = - (arg0, arg1) -> arg0.toGenericString().compareTo(arg1.toGenericString()); - Collections.sort(tokenizers, ctorComp); - Collections.sort(tokenfilters, ctorComp); - Collections.sort(charfilters, ctorComp); - if (VERBOSE) { - System.out.println("tokenizers = " + tokenizers); - System.out.println("tokenfilters = " + tokenfilters); - System.out.println("charfilters = " + charfilters); - } - } - - @AfterClass - public static void afterClass() { - tokenizers = null; - tokenfilters = null; - charfilters = null; - } - - /** - * Hack to work around the stupidness of Oracle's strict Java backwards compatibility. {@code - * Class#getConstructors()} should return unmodifiable {@code List>} not array! - */ - @SuppressWarnings("unchecked") - private static Constructor castConstructor(Class instanceClazz, Constructor ctor) { - return (Constructor) ctor; - } - - public static List> getClassesForPackage(String pckgname) throws Exception { - final List> classes = new ArrayList<>(); - collectClassesForPackage(pckgname, classes); - assertFalse( - "No classes found in package '" - + pckgname - + "'; maybe your test classes are packaged as JAR file?", - classes.isEmpty()); - return classes; - } - - private static void collectClassesForPackage(String pckgname, List> classes) - throws Exception { - final ClassLoader cld = TestRandomChains.class.getClassLoader(); - final String path = pckgname.replace('.', '/'); - final Enumeration resources = cld.getResources(path); - while (resources.hasMoreElements()) { - final URI uri = resources.nextElement().toURI(); - if (!"file".equalsIgnoreCase(uri.getScheme())) continue; - final Path directory = Paths.get(uri); - if (Files.exists(directory)) { - try (DirectoryStream stream = Files.newDirectoryStream(directory)) { - for (Path file : stream) { - if (Files.isDirectory(file)) { - // recurse - String subPackage = pckgname + "." + file.getFileName().toString(); - collectClassesForPackage(subPackage, classes); - } - String fname = file.getFileName().toString(); - if (fname.endsWith(".class")) { - String clazzName = fname.substring(0, fname.length() - 6); - // exclude Test classes that happen to be in these packages. - // class.ForName'ing some of them can cause trouble. - if (!clazzName.endsWith("Test") && !clazzName.startsWith("Test")) { - // Don't run static initializers, as we won't use most of them. - // Java will do that automatically once accessed/instantiated. - classes.add(Class.forName(pckgname + '.' + clazzName, false, cld)); - } - } - } - } - } - } - } - - private static final Map, Function> argProducers = - new IdentityHashMap, Function>() { - { - put( - int.class, - random -> { - // TODO: could cause huge ram usage to use full int range for some filters - // (e.g. allocate enormous arrays) - // return Integer.valueOf(random.nextInt()); - return Integer.valueOf(TestUtil.nextInt(random, -50, 50)); - }); - put( - char.class, - random -> { - // TODO: fix any filters that care to throw IAE instead. - // also add a unicode validating filter to validate termAtt? 
- // return Character.valueOf((char)random.nextInt(65536)); - while (true) { - char c = (char) random.nextInt(65536); - if (c < '\uD800' || c > '\uDFFF') { - return Character.valueOf(c); - } - } - }); - put(float.class, Random::nextFloat); - put(boolean.class, Random::nextBoolean); - put(byte.class, random -> (byte) random.nextInt(256)); - put( - byte[].class, - random -> { - byte[] bytes = new byte[random.nextInt(256)]; - random.nextBytes(bytes); - return bytes; - }); - put(Random.class, random -> new Random(random.nextLong())); - put(Version.class, random -> Version.LATEST); - put(AttributeFactory.class, BaseTokenStreamTestCase::newAttributeFactory); - put( - Set.class, - random -> { - // TypeTokenFilter - Set set = new HashSet<>(); - int num = random.nextInt(5); - for (int i = 0; i < num; i++) { - set.add( - StandardTokenizer.TOKEN_TYPES[ - random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]); - } - return set; - }); - put( - Collection.class, - random -> { - // CapitalizationFilter - Collection col = new ArrayList<>(); - int num = random.nextInt(5); - for (int i = 0; i < num; i++) { - col.add(TestUtil.randomSimpleString(random).toCharArray()); - } - return col; - }); - put( - CharArraySet.class, - random -> { - int num = random.nextInt(10); - CharArraySet set = new CharArraySet(num, random.nextBoolean()); - for (int i = 0; i < num; i++) { - // TODO: make nastier - set.add(TestUtil.randomSimpleString(random)); - } - return set; - }); - // TODO: don't want to make the exponentially slow ones Dawid documents - // in TestPatternReplaceFilter, so dont use truly random patterns (for now) - put(Pattern.class, random -> Pattern.compile("a")); - put( - Pattern[].class, - random -> new Pattern[] {Pattern.compile("([a-z]+)"), Pattern.compile("([0-9]+)")}); - put( - PayloadEncoder.class, - random -> - new IdentityEncoder()); // the other encoders will throw exceptions if tokens - // arent numbers? - put( - Dictionary.class, - random -> { - // TODO: make nastier - InputStream affixStream = - TestHunspellStemFilter.class.getResourceAsStream("simple.aff"); - InputStream dictStream = - TestHunspellStemFilter.class.getResourceAsStream("simple.dic"); - try { - return new Dictionary( - new ByteBuffersDirectory(), "dictionary", affixStream, dictStream); - } catch (Exception ex) { - Rethrow.rethrow(ex); - return null; // unreachable code - } - }); - put( - HyphenationTree.class, - random -> { - // TODO: make nastier - try { - InputSource is = - new InputSource( - TestCompoundWordTokenFilter.class - .getResource("da_UTF8.xml") - .toExternalForm()); - HyphenationTree hyphenator = - HyphenationCompoundWordTokenFilter.getHyphenationTree(is); - return hyphenator; - } catch (Exception ex) { - Rethrow.rethrow(ex); - return null; // unreachable code - } - }); - put( - SnowballStemmer.class, - random -> { - try { - String lang = - TestSnowball.SNOWBALL_LANGS.get( - random.nextInt(TestSnowball.SNOWBALL_LANGS.size())); - Class clazz = - Class.forName("org.tartarus.snowball.ext." 
+ lang + "Stemmer") - .asSubclass(SnowballStemmer.class); - return clazz.getConstructor().newInstance(); - } catch (Exception ex) { - Rethrow.rethrow(ex); - return null; // unreachable code - } - }); - put( - String.class, - random -> { - // TODO: make nastier - if (random.nextBoolean()) { - // a token type - return StandardTokenizer.TOKEN_TYPES[ - random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]; - } else { - return TestUtil.randomSimpleString(random); - } - }); - put( - NormalizeCharMap.class, - random -> { - NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); - // we can't add duplicate keys, or NormalizeCharMap gets angry - Set keys = new HashSet<>(); - int num = random.nextInt(5); - // System.out.println("NormalizeCharMap="); - for (int i = 0; i < num; i++) { - String key = TestUtil.randomSimpleString(random); - if (!keys.contains(key) && key.length() > 0) { - String value = TestUtil.randomSimpleString(random); - builder.add(key, value); - keys.add(key); - // System.out.println("mapping: '" + key + "' => '" + value + "'"); - } - } - return builder.build(); - }); - put( - CharacterRunAutomaton.class, - random -> { - // TODO: could probably use a purely random automaton - switch (random.nextInt(5)) { - case 0: - return MockTokenizer.KEYWORD; - case 1: - return MockTokenizer.SIMPLE; - case 2: - return MockTokenizer.WHITESPACE; - case 3: - return MockTokenFilter.EMPTY_STOPSET; - default: - return MockTokenFilter.ENGLISH_STOPSET; - } - }); - put( - CharArrayMap.class, - random -> { - int num = random.nextInt(10); - CharArrayMap map = new CharArrayMap<>(num, random.nextBoolean()); - for (int i = 0; i < num; i++) { - // TODO: make nastier - map.put(TestUtil.randomSimpleString(random), TestUtil.randomSimpleString(random)); - } - return map; - }); - put( - StemmerOverrideMap.class, - random -> { - int num = random.nextInt(10); - StemmerOverrideFilter.Builder builder = - new StemmerOverrideFilter.Builder(random.nextBoolean()); - for (int i = 0; i < num; i++) { - String input = ""; - do { - input = TestUtil.randomRealisticUnicodeString(random); - } while (input.isEmpty()); - String out = ""; - TestUtil.randomSimpleString(random); - do { - out = TestUtil.randomRealisticUnicodeString(random); - } while (out.isEmpty()); - builder.add(input, out); - } - try { - return builder.build(); - } catch (Exception ex) { - Rethrow.rethrow(ex); - return null; // unreachable code - } - }); - put( - SynonymMap.class, - new Function() { - @Override - public Object apply(Random random) { - SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean()); - final int numEntries = atLeast(10); - for (int j = 0; j < numEntries; j++) { - addSyn( - b, - randomNonEmptyString(random), - randomNonEmptyString(random), - random.nextBoolean()); - } - try { - return b.build(); - } catch (Exception ex) { - Rethrow.rethrow(ex); - return null; // unreachable code - } - } - - private void addSyn( - SynonymMap.Builder b, String input, String output, boolean keepOrig) { - b.add( - new CharsRef(input.replaceAll(" +", "\u0000")), - new CharsRef(output.replaceAll(" +", "\u0000")), - keepOrig); - } - - private String randomNonEmptyString(Random random) { - while (true) { - final String s = TestUtil.randomUnicodeString(random).trim(); - if (s.length() != 0 && s.indexOf('\u0000') == -1) { - return s; - } - } - } - }); - put( - DateFormat.class, - random -> { - if (random.nextBoolean()) return null; - return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random)); - }); - put( - Automaton.class, - 
random -> { - return Operations.determinize( - new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton(), - Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); - }); - put( - PatternTypingFilter.PatternTypingRule[].class, - random -> { - int numRules = TestUtil.nextInt(random, 1, 3); - PatternTypingFilter.PatternTypingRule[] patternTypingRules = - new PatternTypingFilter.PatternTypingRule[numRules]; - for (int i = 0; i < patternTypingRules.length; i++) { - String s = TestUtil.randomSimpleString(random, 1, 2); - // random regex with one group - String regex = s + "(.*)"; - // pattern rule with a template that accepts one group. - patternTypingRules[i] = - new PatternTypingFilter.PatternTypingRule( - Pattern.compile(regex), TestUtil.nextInt(random, 1, 8), s + "_$1"); - } - return patternTypingRules; - }); - } - }; - - static final Set> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs; - - static { - allowedTokenizerArgs = Collections.newSetFromMap(new IdentityHashMap, Boolean>()); - allowedTokenizerArgs.addAll(argProducers.keySet()); - allowedTokenizerArgs.add(Reader.class); - allowedTokenizerArgs.add(AttributeFactory.class); - allowedTokenizerArgs.add(AttributeSource.class); - allowedTokenizerArgs.add(Automaton.class); - - allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap, Boolean>()); - allowedTokenFilterArgs.addAll(argProducers.keySet()); - allowedTokenFilterArgs.add(TokenStream.class); - // TODO: fix this one, thats broken: - allowedTokenFilterArgs.add(CommonGramsFilter.class); - - allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap, Boolean>()); - allowedCharFilterArgs.addAll(argProducers.keySet()); - allowedCharFilterArgs.add(Reader.class); - } - - @SuppressWarnings("unchecked") - static T newRandomArg(Random random, Class paramType) { - final Function producer = argProducers.get(paramType); - assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer); - return (T) producer.apply(random); - } - - static Object[] newTokenizerArgs(Random random, Class[] paramTypes) { - Object[] args = new Object[paramTypes.length]; - for (int i = 0; i < args.length; i++) { - Class paramType = paramTypes[i]; - if (paramType == AttributeSource.class) { - // TODO: args[i] = new AttributeSource(); - // this is currently too scary to deal with! 
- args[i] = null; // force IAE - } else { - args[i] = newRandomArg(random, paramType); - } - } - return args; - } - - static Object[] newCharFilterArgs(Random random, Reader reader, Class[] paramTypes) { - Object[] args = new Object[paramTypes.length]; - for (int i = 0; i < args.length; i++) { - Class paramType = paramTypes[i]; - if (paramType == Reader.class) { - args[i] = reader; - } else { - args[i] = newRandomArg(random, paramType); - } - } - return args; - } - - static Object[] newFilterArgs(Random random, TokenStream stream, Class[] paramTypes) { - Object[] args = new Object[paramTypes.length]; - for (int i = 0; i < args.length; i++) { - Class paramType = paramTypes[i]; - if (paramType == TokenStream.class) { - args[i] = stream; - } else if (paramType == CommonGramsFilter.class) { - // TODO: fix this one, thats broken: CommonGramsQueryFilter takes this one explicitly - args[i] = new CommonGramsFilter(stream, newRandomArg(random, CharArraySet.class)); - } else { - args[i] = newRandomArg(random, paramType); - } - } - return args; - } - - static class MockRandomAnalyzer extends Analyzer { - final long seed; - - MockRandomAnalyzer(long seed) { - this.seed = seed; - } - - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Random random = new Random(seed); - TokenizerSpec tokenizerSpec = newTokenizer(random); - // System.out.println("seed=" + seed + ",create tokenizer=" + tokenizerSpec.toString); - TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer); - // System.out.println("seed=" + seed + ",create filter=" + filterSpec.toString); - return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream); - } - - @Override - protected Reader initReader(String fieldName, Reader reader) { - Random random = new Random(seed); - CharFilterSpec charfilterspec = newCharFilterChain(random, reader); - return charfilterspec.reader; - } - - @Override - public String toString() { - Random random = new Random(seed); - StringBuilder sb = new StringBuilder(); - CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader("")); - sb.append("\ncharfilters="); - sb.append(charFilterSpec.toString); - // intentional: initReader gets its own separate random - random = new Random(seed); - TokenizerSpec tokenizerSpec = newTokenizer(random); - sb.append("\n"); - sb.append("tokenizer="); - sb.append(tokenizerSpec.toString); - TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer); - sb.append("\n"); - sb.append("filters="); - sb.append(tokenFilterSpec.toString); - return sb.toString(); - } - - private T createComponent( - Constructor ctor, Object[] args, StringBuilder descr, boolean isConditional) { - try { - final T instance = ctor.newInstance(args); - /* - if (descr.length() > 0) { - descr.append(","); - } - */ - descr.append("\n "); - if (isConditional) { - descr.append("Conditional:"); - } - descr.append(ctor.getDeclaringClass().getName()); - String params = Arrays.deepToString(args); - params = params.substring(1, params.length() - 1); - descr.append("(").append(params).append(")"); - return instance; - } catch (InvocationTargetException ite) { - final Throwable cause = ite.getCause(); - if (cause instanceof IllegalArgumentException - || cause instanceof UnsupportedOperationException) { - // thats ok, ignore - if (VERBOSE) { - System.err.println("Ignoring IAE/UOE from ctor:"); - cause.printStackTrace(System.err); - } - } else { - Rethrow.rethrow(cause); - } - } catch (IllegalAccessException | 
InstantiationException iae) { - Rethrow.rethrow(iae); - } - return null; // no success - } - - private boolean broken(Constructor ctor, Object[] args) { - final Predicate pred = brokenConstructors.get(ctor); - return pred != null && pred.test(args); - } - - // create a new random tokenizer from classpath - private TokenizerSpec newTokenizer(Random random) { - TokenizerSpec spec = new TokenizerSpec(); - while (spec.tokenizer == null) { - final Constructor ctor = - tokenizers.get(random.nextInt(tokenizers.size())); - final StringBuilder descr = new StringBuilder(); - final Object[] args = newTokenizerArgs(random, ctor.getParameterTypes()); - if (broken(ctor, args)) { - continue; - } - spec.tokenizer = createComponent(ctor, args, descr, false); - if (spec.tokenizer != null) { - spec.toString = descr.toString(); - } - } - return spec; - } - - private CharFilterSpec newCharFilterChain(Random random, Reader reader) { - CharFilterSpec spec = new CharFilterSpec(); - spec.reader = reader; - StringBuilder descr = new StringBuilder(); - int numFilters = random.nextInt(3); - for (int i = 0; i < numFilters; i++) { - while (true) { - final Constructor ctor = - charfilters.get(random.nextInt(charfilters.size())); - final Object[] args = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes()); - if (broken(ctor, args)) { - continue; - } - reader = createComponent(ctor, args, descr, false); - if (reader != null) { - spec.reader = reader; - break; - } - } - } - spec.toString = descr.toString(); - return spec; - } - - private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) { - TokenFilterSpec spec = new TokenFilterSpec(); - spec.stream = tokenizer; - StringBuilder descr = new StringBuilder(); - int numFilters = random.nextInt(5); - for (int i = 0; i < numFilters; i++) { - - // Insert ValidatingTF after each stage so we can - // catch problems right after the TF that "caused" - // them: - spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i); - - while (true) { - final Constructor ctor = - tokenfilters.get(random.nextInt(tokenfilters.size())); - if (random.nextBoolean() - && avoidConditionals.contains(ctor.getDeclaringClass()) == false) { - long seed = random.nextLong(); - spec.stream = - new ConditionalTokenFilter( - spec.stream, - in -> { - final Object[] args = newFilterArgs(random, in, ctor.getParameterTypes()); - if (broken(ctor, args)) { - return in; - } - TokenStream ts = createComponent(ctor, args, descr, true); - if (ts == null) { - return in; - } - return ts; - }) { - Random random = new Random(seed); - - @Override - public void reset() throws IOException { - super.reset(); - random = new Random(seed); - } - - @Override - protected boolean shouldFilter() throws IOException { - return random.nextBoolean(); - } - }; - break; - } else { - final Object[] args = newFilterArgs(random, spec.stream, ctor.getParameterTypes()); - if (broken(ctor, args)) { - continue; - } - final TokenFilter flt = createComponent(ctor, args, descr, false); - if (flt != null) { - spec.stream = flt; - break; - } - } - } - } - - // Insert ValidatingTF after each stage so we can - // catch problems right after the TF that "caused" - // them: - spec.stream = new ValidatingTokenFilter(spec.stream, "last stage"); - - spec.toString = descr.toString(); - return spec; - } - } - - static class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter { - boolean readSomething; - - CheckThatYouDidntReadAnythingReaderWrapper(Reader in) { - super(in); - } - - @Override - public int correct(int 
currentOff) { - return currentOff; // we don't change any offsets - } - - @Override - public int read(char[] cbuf, int off, int len) throws IOException { - readSomething = true; - return input.read(cbuf, off, len); - } - - @Override - public int read() throws IOException { - readSomething = true; - return input.read(); - } - - @Override - public int read(CharBuffer target) throws IOException { - readSomething = true; - return input.read(target); - } - - @Override - public int read(char[] cbuf) throws IOException { - readSomething = true; - return input.read(cbuf); - } - - @Override - public long skip(long n) throws IOException { - readSomething = true; - return input.skip(n); - } - - @Override - public void mark(int readAheadLimit) throws IOException { - input.mark(readAheadLimit); - } - - @Override - public boolean markSupported() { - return input.markSupported(); - } - - @Override - public boolean ready() throws IOException { - return input.ready(); - } - - @Override - public void reset() throws IOException { - input.reset(); - } - } - - static class TokenizerSpec { - Tokenizer tokenizer; - String toString; - } - - static class TokenFilterSpec { - TokenStream stream; - String toString; - } - - static class CharFilterSpec { - Reader reader; - String toString; - } - - public void testRandomChains() throws Throwable { - int numIterations = TEST_NIGHTLY ? atLeast(20) : 3; - Random random = random(); - for (int i = 0; i < numIterations; i++) { - try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) { - if (VERBOSE) { - System.out.println("Creating random analyzer:" + a); - } - try { - checkNormalize(a); - checkRandomData( - random, - a, - 500 * RANDOM_MULTIPLIER, - 20, - false, - false /* We already validate our own offsets... */); - } catch (Throwable e) { - System.err.println("Exception from random analyzer: " + a); - throw e; - } - } - } - } - - public void checkNormalize(Analyzer a) { - // normalization should not modify characters that may be used for wildcards - // or regular expressions - String s = "([0-9]+)?*"; - assertEquals(s, a.normalize("dummy", s).utf8ToString()); - } - - // we might regret this decision... - public void testRandomChainsWithLargeStrings() throws Throwable { - int numIterations = TEST_NIGHTLY ? atLeast(20) : 3; - Random random = random(); - for (int i = 0; i < numIterations; i++) { - try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) { - if (VERBOSE) { - System.out.println("Creating random analyzer:" + a); - } - try { - checkRandomData( - random, - a, - 50 * RANDOM_MULTIPLIER, - 80, - false, - false /* We already validate our own offsets... 
*/); - } catch (Throwable e) { - System.err.println("Exception from random analyzer: " + a); - throw e; - } - } - } - } -} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java index 52854280c12..166b4b7b1ef 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java @@ -19,8 +19,8 @@ package org.apache.lucene.analysis.miscellaneous; import java.io.Reader; import java.io.StringReader; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.StringMockResourceLoader; import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; import org.apache.lucene.util.Version; /** Simple tests to ensure the keyword marker filter factory is working. */ diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java index 9e366bc4930..c581ab22f36 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java @@ -19,8 +19,8 @@ package org.apache.lucene.analysis.miscellaneous; import java.io.Reader; import java.io.StringReader; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.StringMockResourceLoader; import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; import org.apache.lucene.util.Version; /** Simple tests to ensure the stemmer override filter factory is working. 
*/ diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilterFactory.java index 37006580507..543600155f0 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilterFactory.java @@ -19,10 +19,10 @@ package org.apache.lucene.analysis.pattern; import org.apache.lucene.analysis.TokenFilterFactory; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.StringMockResourceLoader; import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase; import org.apache.lucene.tests.analysis.CannedTokenStream; import org.apache.lucene.tests.analysis.Token; +import org.apache.lucene.tests.util.StringMockResourceLoader; import org.apache.lucene.util.Version; /** This test just ensures the factory works */ diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java index b55542a83d4..1340714fbee 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java @@ -19,8 +19,8 @@ package org.apache.lucene.analysis.snowball; import java.io.Reader; import java.io.StringReader; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.StringMockResourceLoader; import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.EnglishStemmer; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestMultiWordSynonyms.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestMultiWordSynonyms.java index 7cd538c750b..c1024c34104 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestMultiWordSynonyms.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestMultiWordSynonyms.java @@ -19,8 +19,8 @@ package org.apache.lucene.analysis.synonym; import java.io.Reader; import java.io.StringReader; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.StringMockResourceLoader; import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; import org.apache.lucene.util.Version; /** @since solr 1.4 */ diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java index 8df8e4bfeb4..b42c7725182 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java @@ -22,8 +22,8 @@ import org.apache.lucene.analysis.TokenFilterFactory; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cjk.CJKAnalyzer; import org.apache.lucene.analysis.pattern.PatternTokenizerFactory; -import 
org.apache.lucene.analysis.util.StringMockResourceLoader; import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; import org.apache.lucene.util.Version; @Deprecated diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestFilesystemResourceLoader.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestFilesystemResourceLoader.java index b82bde51b8a..8f0ab4c5344 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestFilesystemResourceLoader.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestFilesystemResourceLoader.java @@ -26,6 +26,7 @@ import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenFilterFactory; import org.apache.lucene.analysis.WordlistLoader; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.ResourceLoader; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.java index 5fc33753501..4a22cce3f12 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.IgnoreRandomChains; /** * A {@link org.apache.lucene.analysis.TokenFilter} that adds Japanese romanized tokens to the term @@ -54,6 +55,7 @@ import org.apache.lucene.util.CharsRefBuilder; * WIDTH NORMALIZATION IS NOT PERFORMED, THIS DOES NOT WORK AS EXPECTED. See also: {@link * JapaneseCompletionAnalyzer}. */ +@IgnoreRandomChains(reason = "LUCENE-10363: fails with incorrect offsets") public final class JapaneseCompletionFilter extends TokenFilter { public static final Mode DEFAULT_MODE = Mode.INDEX; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java index c2350c7c94a..2fdaffea38a 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.Reader; import org.apache.lucene.analysis.CharFilter; import org.apache.lucene.analysis.util.RollingCharBuffer; +import org.apache.lucene.util.IgnoreRandomChains; /** * Normalizes Japanese horizontal iteration marks (odoriji) to their expanded form. @@ -36,6 +37,8 @@ import org.apache.lucene.analysis.util.RollingCharBuffer; * reached in order to not keep a copy of the character stream in memory. Vertical iteration marks, * which are even rarer than horizontal iteration marks in contemporary Japanese, are unsupported. 
*/ +@IgnoreRandomChains( + reason = "LUCENE-10358: fails with incorrect offsets or causes IndexOutOfBounds") public class JapaneseIterationMarkCharFilter extends CharFilter { /** Normalize kanji iteration marks by default */ diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java index 18b5ee1c930..9198a17388e 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java @@ -45,6 +45,9 @@ public final class JapaneseKatakanaStemFilter extends TokenFilter { public JapaneseKatakanaStemFilter(TokenStream input, int minimumLength) { super(input); + if (minimumLength < 1) { + throw new IllegalArgumentException("minimumLength must be >=1"); + } this.minimumKatakanaLength = minimumLength; } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java index 70438022012..7b01751a3ae 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.IgnoreRandomChains; /** * A {@link TokenFilter} that normalizes Japanese numbers (kansūji) to regular Arabic decimal @@ -82,6 +83,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; *
<p>
Japanese formal numbers (daiji), accounting numbers and decimal fractions are currently not * supported. */ +@IgnoreRandomChains(reason = "LUCENE-10362: fails with incorrect offsets") public class JapaneseNumberFilter extends TokenFilter { private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class); diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java index bbc5ccf7335..47cb8d19297 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java @@ -41,6 +41,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.util.RollingCharBuffer; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeFactory; +import org.apache.lucene.util.IgnoreRandomChains; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.fst.FST; @@ -275,6 +276,7 @@ public final class JapaneseTokenizer extends Tokenizer { * @param mode tokenization mode. * @lucene.experimental */ + @IgnoreRandomChains(reason = "Parameters are too complex to be tested") public JapaneseTokenizer( AttributeFactory factory, TokenInfoDictionary systemDictionary, diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java deleted file mode 100644 index d38acabea74..00000000000 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.analysis.ja; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.nio.charset.StandardCharsets; -import org.apache.lucene.util.ResourceLoader; - -/** Fake resource loader for tests: works if you want to fake reading a single file */ -class StringMockResourceLoader implements ResourceLoader { - String text; - - public StringMockResourceLoader(String text) { - this.text = text; - } - - @Override - public <T> Class<? extends T> findClass(String cname, Class<T> expectedType) { - try { - return Class.forName(cname).asSubclass(expectedType); - } catch (Exception e) { - throw new RuntimeException("Cannot load class: " + cname, e); - } - } - - @Override - public InputStream openResource(String resource) throws IOException { - return new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)); - } -} diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java index ee6232fa411..b05d5edadc3 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java @@ -36,6 +36,7 @@ import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilte import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; import org.apache.lucene.tests.analysis.MockTokenizer; import org.apache.lucene.tests.util.LuceneTestCase.Nightly; +import org.apache.lucene.tests.util.StringMockResourceLoader; import org.apache.lucene.util.AttributeFactory; import org.apache.lucene.util.ResourceLoaderAware; import org.apache.lucene.util.Version; diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java index 470ce0c8f2d..80264951fab 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java @@ -22,6 +22,7 @@ import java.util.HashMap; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; /** Simple tests for {@link JapaneseBaseFormFilterFactory} */ public class TestJapaneseBaseFormFilterFactory extends BaseTokenStreamTestCase { diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java index 6ac6dcc6981..9e456a5aa7d 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; import org.apache.lucene.tests.analysis.MockTokenizer; +import org.apache.lucene.tests.util.StringMockResourceLoader; /** Simple tests for {@link JapaneseIterationMarkCharFilterFactory} */ public class 
TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamTestCase { diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java index 53d8e9c8d1b..aee5f75f03e 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java @@ -22,6 +22,7 @@ import java.util.HashMap; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; /** Simple tests for {@link JapaneseKatakanaStemFilterFactory} */ public class TestJapaneseKatakanaStemFilterFactory extends BaseTokenStreamTestCase { diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilterFactory.java index 5b260d87e0d..ef91bab1a4c 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilterFactory.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilterFactory.java @@ -23,6 +23,7 @@ import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; /** Simple tests for {@link org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory} */ public class TestJapaneseNumberFilterFactory extends BaseTokenStreamTestCase { diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java index c3d403811db..02006ab8e82 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java @@ -23,6 +23,7 @@ import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; import org.apache.lucene.util.ClasspathResourceLoader; import org.apache.lucene.util.Version; diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java index e5ed23c904f..cd4eb045c44 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java @@ -22,6 +22,7 @@ import java.util.HashMap; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; /** Simple tests for {@link 
JapaneseReadingFormFilterFactory} */ public class TestJapaneseReadingFormFilterFactory extends BaseTokenStreamTestCase { diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java index 3c11270f5fc..fb11c4cd6d6 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java @@ -23,6 +23,7 @@ import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; /** Simple tests for {@link JapaneseTokenizerFactory} */ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase { diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java index 648bb403ceb..18a82ba184a 100644 --- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java +++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java @@ -32,6 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.IgnoreRandomChains; /** * {@link TokenFilter} using Morfologik library to transform input tokens into lemma and @@ -73,6 +74,7 @@ public class MorfologikFilter extends TokenFilter { * @param in input token stream. * @param dict Dictionary to use for stemming. 
*/ + @IgnoreRandomChains(reason = "No dictionary support yet") public MorfologikFilter(final TokenStream in, final Dictionary dict) { super(in); this.input = in; diff --git a/lucene/analysis/nori/src/java/module-info.java b/lucene/analysis/nori/src/java/module-info.java index 9dd085b5a5d..77e67801524 100644 --- a/lucene/analysis/nori/src/java/module-info.java +++ b/lucene/analysis/nori/src/java/module-info.java @@ -28,6 +28,7 @@ module org.apache.lucene.analysis.nori { provides org.apache.lucene.analysis.TokenizerFactory with org.apache.lucene.analysis.ko.KoreanTokenizerFactory; provides org.apache.lucene.analysis.TokenFilterFactory with + org.apache.lucene.analysis.ko.KoreanNumberFilterFactory, org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilterFactory, org.apache.lucene.analysis.ko.KoreanReadingFormFilterFactory; } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanNumberFilter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanNumberFilter.java index 61ef959f27d..bc435aa4661 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanNumberFilter.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanNumberFilter.java @@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.IgnoreRandomChains; /** * A {@link TokenFilter} that normalizes Korean numbers to regular Arabic decimal numbers in @@ -72,6 +73,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; * * @lucene.experimental */ +@IgnoreRandomChains(reason = "LUCENE-10361: KoreanNumberFilter messes up offsets") public class KoreanNumberFilter extends TokenFilter { private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class); diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java index 0765b801c4d..325fae710b9 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java @@ -40,6 +40,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.util.RollingCharBuffer; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeFactory; +import org.apache.lucene.util.IgnoreRandomChains; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.fst.FST; @@ -59,6 +60,7 @@ import org.apache.lucene.util.fst.FST; * * @lucene.experimental */ +@IgnoreRandomChains(reason = "LUCENE-10359: fails with incorrect offsets") public final class KoreanTokenizer extends Tokenizer { /** Token type reflecting the original source of this token */ @@ -205,6 +207,7 @@ public final class KoreanTokenizer extends Tokenizer { * @param discardPunctuation true if punctuation tokens should be dropped from the output. 
* @lucene.experimental */ + @IgnoreRandomChains(reason = "Parameters are too complex to be tested") public KoreanTokenizer( AttributeFactory factory, TokenInfoDictionary systemDictionary, diff --git a/lucene/analysis/nori/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/lucene/analysis/nori/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory index 4fff75330d2..cf903c1e04c 100644 --- a/lucene/analysis/nori/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory +++ b/lucene/analysis/nori/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory @@ -13,5 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. +org.apache.lucene.analysis.ko.KoreanNumberFilterFactory org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilterFactory org.apache.lucene.analysis.ko.KoreanReadingFormFilterFactory \ No newline at end of file diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/StringMockResourceLoader.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/StringMockResourceLoader.java deleted file mode 100644 index e29bfbef1cf..00000000000 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/StringMockResourceLoader.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.analysis.ko; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.nio.charset.StandardCharsets; -import org.apache.lucene.util.ResourceLoader; - -/** Fake resource loader for tests: works if you want to fake reading a single file */ -class StringMockResourceLoader implements ResourceLoader { - private String text; - - public StringMockResourceLoader(String text) { - this.text = text; - } - - @Override - public <T> Class<? extends T> findClass(String cname, Class<T> expectedType) { - try { - return Class.forName(cname).asSubclass(expectedType); - } catch (Exception e) { - throw new RuntimeException("Cannot load class: " + cname, e); - } - } - - @Override - public InputStream openResource(String resource) throws IOException { - return new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)); - } -} diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanNumberFilterFactory.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanNumberFilterFactory.java index 2a519e8c723..9dc244a12a0 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanNumberFilterFactory.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanNumberFilterFactory.java @@ -23,6 +23,7 @@ import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; /** Simple tests for {@link org.apache.lucene.analysis.ko.KoreanNumberFilterFactory} */ public class TestKoreanNumberFilterFactory extends BaseTokenStreamTestCase { diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java index 68fd7faa312..5a6c31dca32 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java @@ -23,6 +23,7 @@ import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; import org.apache.lucene.util.Version; /** Simple tests for {@link KoreanPartOfSpeechStopFilterFactory} */ diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanReadingFormFilterFactory.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanReadingFormFilterFactory.java index 46b910c92a3..a92aab1ef2a 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanReadingFormFilterFactory.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanReadingFormFilterFactory.java @@ -21,6 +21,7 @@ import java.io.StringReader; import java.util.HashMap; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; /** Simple tests for {@link KoreanReadingFormFilterFactory} */ public class TestKoreanReadingFormFilterFactory extends BaseTokenStreamTestCase { diff --git 
a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizerFactory.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizerFactory.java index 93bd20d663e..63847cbebe8 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizerFactory.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizerFactory.java @@ -23,6 +23,7 @@ import java.util.HashMap; import java.util.Map; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; /** Simple tests for {@link KoreanTokenizerFactory} */ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase { diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java index 93d0c11b608..00932278337 100644 --- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java +++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java @@ -27,12 +27,14 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.IgnoreRandomChains; /** * Run OpenNLP chunker. Prerequisite: the OpenNLPTokenizer and OpenNLPPOSFilter must precede this * filter. Tags terms in the TypeAttribute, replacing the POS tags previously put there by * OpenNLPPOSFilter. */ +@IgnoreRandomChains(reason = "other filters must precede this one (see docs)") public final class OpenNLPChunkerFilter extends TokenFilter { private List<AttributeSource.State> sentenceTokenAttrs = new ArrayList<>(); diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java index 1e8e1d13938..af14f03cf21 100644 --- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java +++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java @@ -29,6 +29,7 @@ import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.IgnoreRandomChains; /** * Runs OpenNLP dictionary-based and/or MaxEnt lemmatizers. @@ -41,6 +42,7 @@ import org.apache.lucene.util.AttributeSource; *
<p>
The dictionary file must be encoded as UTF-8, with one entry per line, in the form * word[tab]lemma[tab]part-of-speech */ +@IgnoreRandomChains(reason = "LUCENE-10352: no dictionary support yet") public class OpenNLPLemmatizerFilter extends TokenFilter { private final NLPLemmatizerOp lemmatizerOp; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java index f9c7bdd73a1..2cb3ab595fc 100644 --- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java +++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java @@ -27,8 +27,10 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.IgnoreRandomChains; /** Run OpenNLP POS tagger. Tags all terms in the TypeAttribute. */ +@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one") public final class OpenNLPPOSFilter extends TokenFilter { private List<AttributeSource.State> sentenceTokenAttrs = new ArrayList<>(); diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java index 134fa25d855..c31f5c11ea0 100644 --- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java +++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java @@ -26,12 +26,14 @@ import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.util.SegmentingTokenizerBase; import org.apache.lucene.util.AttributeFactory; +import org.apache.lucene.util.IgnoreRandomChains; /** * Run OpenNLP SentenceDetector and Tokenizer. The last token in each sentence is marked by setting * the {@link #EOS_FLAG_BIT} in the FlagsAttribute; following filters can use this information to * apply operations to tokens one sentence at a time. 
*/ +@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one") public final class OpenNLPTokenizer extends SegmentingTokenizerBase { public static int EOS_FLAG_BIT = 1; diff --git a/lucene/analysis/phonetic/build.gradle b/lucene/analysis/phonetic/build.gradle index e5595cb2d76..2297af53c85 100644 --- a/lucene/analysis/phonetic/build.gradle +++ b/lucene/analysis/phonetic/build.gradle @@ -23,7 +23,7 @@ dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:analysis:common') - moduleImplementation 'commons-codec:commons-codec' + moduleApi 'commons-codec:commons-codec' testImplementation project(':lucene:test-framework') } diff --git a/lucene/analysis/phonetic/src/java/module-info.java b/lucene/analysis/phonetic/src/java/module-info.java index 706251af4ca..9bf5e641b51 100644 --- a/lucene/analysis/phonetic/src/java/module-info.java +++ b/lucene/analysis/phonetic/src/java/module-info.java @@ -26,6 +26,7 @@ module org.apache.lucene.analysis.phonetic { provides org.apache.lucene.analysis.TokenFilterFactory with org.apache.lucene.analysis.phonetic.BeiderMorseFilterFactory, + org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilterFactory, org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilterFactory, org.apache.lucene.analysis.phonetic.PhoneticFilterFactory; } diff --git a/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java index 5e16e47298d..aa0dc1a8caf 100644 --- a/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java +++ b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java @@ -26,6 +26,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.util.IgnoreRandomChains; /** * TokenFilter for Beider-Morse phonetic encoding. @@ -33,6 +34,8 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; * @see BeiderMorseEncoder * @lucene.experimental */ +@IgnoreRandomChains( + reason = "LUCENE-10360: cannot handle empty tokens (or those only dashes and whitespace)") public final class BeiderMorseFilter extends TokenFilter { private final PhoneticEngine engine; private final LanguageSet languages; @@ -72,6 +75,7 @@ public final class BeiderMorseFilter extends TokenFilter { * @param languages optional Set of original languages. Can be null (which means it will be * guessed). 
*/ + @IgnoreRandomChains(reason = "LUCENE-10352: Add support for LanguageSet randomization") + public BeiderMorseFilter(TokenStream input, PhoneticEngine engine, LanguageSet languages) { super(input); this.engine = engine; diff --git a/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java index e1f267a5d66..6a950d84cb4 100644 --- a/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java +++ b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java @@ -39,6 +39,9 @@ public final class DoubleMetaphoneFilter extends TokenFilter { */ public DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) { super(input); + if (maxCodeLength < 1) { + throw new IllegalArgumentException("maxCodeLength must be >=1"); + } this.encoder.setMaxCodeLen(maxCodeLength); this.inject = inject; } diff --git a/lucene/analysis/phonetic/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/lucene/analysis/phonetic/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory index fe78873ce4f..677ae4829bf 100644 --- a/lucene/analysis/phonetic/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory +++ b/lucene/analysis/phonetic/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory @@ -14,5 +14,6 @@ # limitations under the License. org.apache.lucene.analysis.phonetic.BeiderMorseFilterFactory +org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilterFactory org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilterFactory org.apache.lucene.analysis.phonetic.PhoneticFilterFactory diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CachingTokenFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/CachingTokenFilter.java index f87ee8816ac..d3a6c24565a 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/CachingTokenFilter.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/CachingTokenFilter.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.IgnoreRandomChains; /** * This class can be used if the token attributes of a TokenStream are intended to be consumed more @@ -31,6 +32,9 @@ import org.apache.lucene.util.AttributeSource; * although only before {@link #incrementToken()} is called the first time. Prior to Lucene 5, it * was never propagated. */ +@IgnoreRandomChains( + reason = + "doesn't actually reset itself! TODO: this statement is probably obsolete as of LUCENE-6121") public final class CachingTokenFilter extends TokenFilter { private List<AttributeSource.State> cache = null; private Iterator<AttributeSource.State> iterator = null; diff --git a/lucene/core/src/java/org/apache/lucene/util/IgnoreRandomChains.java b/lucene/core/src/java/org/apache/lucene/util/IgnoreRandomChains.java new file mode 100644 index 00000000000..f6f4c2a4860 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/IgnoreRandomChains.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +/** + * Annotation to not test a class or constructor with {@code TestRandomChains} integration test. + * + * @lucene.internal + */ +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.CONSTRUCTOR, ElementType.TYPE}) +public @interface IgnoreRandomChains { + /** A reason for ignoring should always be given. */ + String reason(); +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/StringMockResourceLoader.java similarity index 97% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java rename to lucene/test-framework/src/java/org/apache/lucene/tests/util/StringMockResourceLoader.java index 87764d6f526..d708ac75764 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/StringMockResourceLoader.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.analysis.util; +package org.apache.lucene.tests.util; import java.io.ByteArrayInputStream; import java.io.IOException; diff --git a/settings.gradle b/settings.gradle index ed641bf1a6b..0923e9db874 100644 --- a/settings.gradle +++ b/settings.gradle @@ -36,6 +36,7 @@ include "lucene:analysis:opennlp" include "lucene:analysis:phonetic" include "lucene:analysis:smartcn" include "lucene:analysis:stempel" +include "lucene:analysis.tests" include "lucene:backward-codecs" include "lucene:benchmark" include "lucene:classification"
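Notes on the changes above, with illustrative sketches. All class names in the sketches that do not appear in the patch are hypothetical.

The new @IgnoreRandomChains annotation is runtime-retained and targets types and constructors, so the TestRandomChains integration test can skip annotated components reflectively. A minimal sketch of such a check, assuming a helper like the following (the helper itself is not part of the patch):

import java.lang.reflect.Constructor;
import org.apache.lucene.util.IgnoreRandomChains;

final class IgnoreRandomChainsSupport {
  private IgnoreRandomChainsSupport() {}

  /** Returns the opt-out reason for a candidate constructor, or null if it should be tested. */
  static String ignoreReason(Constructor<?> ctor) {
    // A class-level annotation opts out every constructor of the component...
    IgnoreRandomChains ann = ctor.getDeclaringClass().getAnnotation(IgnoreRandomChains.class);
    if (ann == null) {
      // ...otherwise a single constructor may opt out on its own.
      ann = ctor.getAnnotation(IgnoreRandomChains.class);
    }
    return ann == null ? null : ann.reason();
  }
}

Targeting both ElementType.TYPE and ElementType.CONSTRUCTOR matches how the patch uses it: whole components (KoreanTokenizer, JapaneseNumberFilter) opt out at class level, while single hard-to-randomize constructors (JapaneseTokenizer, BeiderMorseFilter) opt out individually.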
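The new constructor checks in JapaneseKatakanaStemFilter and DoubleMetaphoneFilter surface bad arguments eagerly instead of at analysis time. A hedged test sketch, assuming JUnit 4.13's assertThrows and KeywordTokenizer from analysis-common as a throwaway input (the test class is illustrative, not part of the patch):

import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
import org.junit.Assert;
import org.junit.Test;

public class TestCtorArgumentChecks {
  @Test
  public void testInvalidArgumentsAreRejectedEagerly() {
    // Both filters now validate their int argument in the constructor.
    Assert.assertThrows(
        IllegalArgumentException.class,
        () -> new JapaneseKatakanaStemFilter(new KeywordTokenizer(), 0));
    Assert.assertThrows(
        IllegalArgumentException.class,
        () -> new DoubleMetaphoneFilter(new KeywordTokenizer(), 0, true));
  }
}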
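The service entries added for KoreanNumberFilterFactory and DaitchMokotoffSoundexFilterFactory (in both META-INF/services and module-info) make the factories discoverable through Lucene's analysis SPI. A sketch of how registration can be verified; the SPI names are assumed to follow the factories' usual NAME constants:

import java.util.HashMap;
import org.apache.lucene.analysis.TokenFilterFactory;

public class CheckServiceEntries {
  public static void main(String[] args) {
    // Before this patch, both lookups failed with IllegalArgumentException
    // ("A SPI class ... does not exist") even though the factory classes shipped.
    for (String name : new String[] {"koreanNumber", "daitchMokotoffSoundex"}) {
      TokenFilterFactory factory = TokenFilterFactory.forName(name, new HashMap<>());
      System.out.println(name + " -> " + factory.getClass().getName());
    }
  }
}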
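With the per-module duplicates deleted, all tests use the single org.apache.lucene.tests.util.StringMockResourceLoader migrated to the test framework. Its typical use is feeding an in-memory "file" to a ResourceLoaderAware factory; an illustrative sketch (factory name and arguments are examples, not taken from the patch):

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.tests.util.StringMockResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;

public class LoaderExample {
  public static void main(String[] args) throws Exception {
    Map<String, String> factoryArgs = new HashMap<>();
    factoryArgs.put("protected", "protwords.txt"); // name is arbitrary; the mock ignores it
    TokenFilterFactory factory = TokenFilterFactory.forName("keywordMarker", factoryArgs);
    // Every openResource(...) call returns the same UTF-8 string, so a one-file
    // word list can be faked without touching the filesystem.
    ((ResourceLoaderAware) factory).inform(new StringMockResourceLoader("dogs\ncats"));
  }
}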
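The new ModuleClassDiscovery helper (its body is not included in this excerpt) is what lets TestAllAnalyzersHaveFactories and TestRandomChains "discover classes to check from module system", per the CHANGES entry. The general JPMS mechanism such discovery can use, sketched with a stand-in module name:

import java.io.IOException;
import java.lang.module.ModuleReference;

public class ListModuleClasses {
  public static void main(String[] args) throws IOException {
    // Resolve a module from the boot layer and enumerate its .class entries.
    ModuleReference ref =
        ModuleLayer.boot().configuration()
            .findModule("java.logging") // any resolved module name works here
            .orElseThrow()
            .reference();
    try (var reader = ref.open()) {
      reader.list()
          .filter(entry -> entry.endsWith(".class") && !entry.endsWith("module-info.class"))
          .limit(5)
          .forEach(System.out::println);
    }
  }
}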
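Finally, the phonetic build change from moduleImplementation to moduleApi for commons-codec reflects that commons-codec types appear in the module's public signatures, so consumers must compile against them directly. A sketch of that leak, using the BeiderMorseFilter constructor shown in the patch (the PhoneticEngine configuration is an arbitrary example):

import org.apache.commons.codec.language.bm.NameType;
import org.apache.commons.codec.language.bm.PhoneticEngine;
import org.apache.commons.codec.language.bm.RuleType;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;

public class ApiLeakDemo {
  public static void main(String[] args) {
    Tokenizer tok = new WhitespaceTokenizer();
    // PhoneticEngine is a commons-codec type in BeiderMorseFilter's constructor,
    // so the dependency must be visible on the consumer's compile class/module path;
    // an implementation-only dependency would not compile here.
    PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);
    new BeiderMorseFilter(tok, engine);
  }
}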