From c0d2975970d3de8f5056a20504dec1431d455ab1 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Sat, 15 Sep 2018 16:56:27 +0100 Subject: [PATCH] LUCENE-8498: Remove LowerCaseTokenizer --- lucene/CHANGES.txt | 3 + lucene/MIGRATE.txt | 11 ++ .../analysis/core/LowerCaseTokenizer.java | 72 ----------- .../core/LowerCaseTokenizerFactory.java | 75 ----------- .../lucene/analysis/core/SimpleAnalyzer.java | 4 +- .../lucene/analysis/core/StopAnalyzer.java | 6 +- .../lucene/analysis/util/CharTokenizer.java | 86 +----------- ...ache.lucene.analysis.util.TokenizerFactory | 1 - .../analysis/br/TestBrazilianAnalyzer.java | 7 +- .../lucene/analysis/core/TestAnalyzers.java | 8 -- .../analysis/custom/TestCustomAnalyzer.java | 13 +- .../analysis/de/TestGermanAnalyzer.java | 8 +- .../standard/TestStandardFactories.java | 16 --- .../analysis/util/TestCharTokenizers.java | 122 +++--------------- .../solr/collection1/conf/schema.xml | 15 --- .../conf/schema-copyfield-test.xml | 14 -- .../solr/collection1/conf/schema-folding.xml | 3 +- .../solr/collection1/conf/schema-hash.xml | 13 -- .../conf/schema-required-fields.xml | 14 -- .../solr/collection1/conf/schema-rest.xml | 9 +- .../solr/collection1/conf/schema-sql.xml | 13 -- .../conf/schema-tokenizer-test.xml | 11 +- .../solr/collection1/conf/schema.xml | 15 +-- .../solr/collection1/conf/schema12.xml | 9 +- .../solr/collection1/conf/schema15.xml | 14 -- .../solr/collection1/conf/schemasurround.xml | 14 -- .../schema/TestFieldCollectionResource.java | 10 +- .../rest/schema/TestFieldTypeResource.java | 3 +- .../solr/util/TestMaxTokenLenTokenizer.java | 20 +-- .../solr/collection1/conf/schema-sql.xml | 13 -- .../solrj/solr/collection1/conf/schema.xml | 13 -- .../solr/configsets/streaming/conf/schema.xml | 11 -- 32 files changed, 78 insertions(+), 568 deletions(-) delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index bd8c616c3f1..70badd8e56e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -81,6 +81,9 @@ API Changes * LUCENE-8352: TokenStreamComponents is now final, and can take a Consumer in its constructor (Mark Harwood, Alan Woodward, Adrien Grand) +* LUCENE-8498: LowerCaseTokenizer has been removed, and CharTokenizer no longer + takes a normalizer function. (Alan Woodward) + Changes in Runtime Behavior * LUCENE-8333: Switch MoreLikeThis.setMaxDocFreqPct to use maxDoc instead of diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index 60089566e27..1b56b6465d6 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -129,3 +129,14 @@ Most code should just require recompilation, though possibly requiring some adde Instead of overriding TokenStreamComponents#setReader() to customise analyzer initialisation, you should now pass a Consumer<Reader> instance to the TokenStreamComponents constructor. + +## LowerCaseTokenizer and LowerCaseTokenizerFactory have been removed ## + +LowerCaseTokenizer combined tokenization and filtering in a way that broke token +normalization, so they have been removed. Instead, use a LetterTokenizer followed by +a LowerCaseFilter + +## CharTokenizer no longer takes a normalizer function ## + +CharTokenizer now only performs tokenization. To perform any type of filtering +use a TokenFilter chain as you would with any other Tokenizer. 
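Illustration only, not part of the patch: a minimal sketch of the migration described in the two MIGRATE.txt entries above, using only APIs the patch itself relies on (LetterTokenizer, LowerCaseFilter, CharTokenizer.fromTokenCharPredicate). The class and method names are hypothetical.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;

public class LowerCaseTokenizerMigrationExample {

  // Where an analyzer used to build a LowerCaseTokenizer, tokenize with
  // LetterTokenizer and lowercase in the filter chain instead.
  static Analyzer letterThenLowerCase() {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new LetterTokenizer();
        return new TokenStreamComponents(source, new LowerCaseFilter(source));
      }
    };
  }

  // CharTokenizer factory methods no longer accept a normalizer function; the old
  // fromTokenCharPredicate(Character::isLetter, Character::toLowerCase) becomes a
  // predicate-only tokenizer followed by a TokenFilter.
  static TokenStream predicateThenFilter() {
    Tokenizer letters = CharTokenizer.fromTokenCharPredicate(Character::isLetter);
    return new LowerCaseFilter(letters);
  }
}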
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java deleted file mode 100644 index 26b8747962b..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.analysis.core; - - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.util.AttributeFactory; - -/** - * LowerCaseTokenizer performs the function of LetterTokenizer - * and LowerCaseFilter together. It divides text at non-letters and converts - * them to lower case. While it is functionally equivalent to the combination - * of LetterTokenizer and LowerCaseFilter, there is a performance advantage - * to doing the two tasks at once, hence this (redundant) implementation. - *

- * Note: this does a decent job for most European languages, but does a terrible - * job for some Asian languages, where words are not separated by spaces. - *

- */ -public final class LowerCaseTokenizer extends LetterTokenizer { - - /** - * Construct a new LowerCaseTokenizer. - */ - public LowerCaseTokenizer() { - } - - /** - * Construct a new LowerCaseTokenizer using a given - * {@link org.apache.lucene.util.AttributeFactory}. - * - * @param factory - * the attribute factory to use for this {@link Tokenizer} - */ - public LowerCaseTokenizer(AttributeFactory factory) { - super(factory); - } - - /** - * Construct a new LowerCaseTokenizer using a given - * {@link org.apache.lucene.util.AttributeFactory}. - * - * @param factory the attribute factory to use for this {@link Tokenizer} - * @param maxTokenLen maximum token length the tokenizer will emit. - * Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024) - * @throws IllegalArgumentException if maxTokenLen is invalid. - */ - public LowerCaseTokenizer(AttributeFactory factory, int maxTokenLen) { - super(factory, maxTokenLen); - } - - /** Converts char to lower case - * {@link Character#toLowerCase(int)}.*/ - @Override - protected int normalize(int c) { - return Character.toLowerCase(c); - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java deleted file mode 100644 index 44e27429b63..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.analysis.core; - - -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.CharTokenizer; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenizerFactory; -import org.apache.lucene.util.AttributeFactory; - -import java.util.HashMap; -import java.util.Map; - -import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT; - -/** - * Factory for {@link LowerCaseTokenizer}. - *
- * <fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100">
- * <analyzer>
- * <tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="256"/>
- * </analyzer>
- * </fieldType>
- *

- * Options: - *

- */ -public class LowerCaseTokenizerFactory extends TokenizerFactory implements MultiTermAwareComponent { - private final int maxTokenLen; - - /** - * Creates a new LowerCaseTokenizerFactory - */ - public LowerCaseTokenizerFactory(Map args) { - super(args); - maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN); - if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) { - throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen); - } - if (!args.isEmpty()) { - throw new IllegalArgumentException("Unknown parameters: " + args); - } - } - - @Override - public LowerCaseTokenizer create(AttributeFactory factory) { - return new LowerCaseTokenizer(factory, maxTokenLen); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - Map map = new HashMap<>(getOriginalArgs()); - map.remove("maxTokenLen"); //removing "maxTokenLen" argument for LowerCaseFilterFactory init - return new LowerCaseFilterFactory(map); - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java index 6e0f2f0b67e..3fcb92c2703 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java @@ -20,6 +20,7 @@ package org.apache.lucene.analysis.core; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; /** An {@link Analyzer} that filters {@link LetterTokenizer} * with {@link LowerCaseFilter} @@ -34,7 +35,8 @@ public final class SimpleAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(final String fieldName) { - return new TokenStreamComponents(new LowerCaseTokenizer()); + Tokenizer tokenizer = new LetterTokenizer(); + return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer)); } @Override diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java index cf7ecdd6214..dde74c0df48 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java @@ -60,13 +60,13 @@ public final class StopAnalyzer extends StopwordAnalyzerBase { * used to tokenize all the text in the provided {@link Reader}. 
* * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} - * built from a {@link LowerCaseTokenizer} filtered with + * built from a {@link LetterTokenizer} filtered with * {@link StopFilter} */ @Override protected TokenStreamComponents createComponents(String fieldName) { - final Tokenizer source = new LowerCaseTokenizer(); - return new TokenStreamComponents(source, new StopFilter(source, stopwords)); + final Tokenizer source = new LetterTokenizer(); + return new TokenStreamComponents(source, new StopFilter(new LowerCaseFilter(source), stopwords)); } @Override diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java index ff9d6ff93c1..092d25d7cb4 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java @@ -20,14 +20,11 @@ package org.apache.lucene.analysis.util; import java.io.IOException; import java.util.Objects; import java.util.function.IntPredicate; -import java.util.function.IntUnaryOperator; -import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer; import org.apache.lucene.analysis.CharacterUtils; -import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.LetterTokenizer; -import org.apache.lucene.analysis.core.LowerCaseTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -107,48 +104,12 @@ public abstract class CharTokenizer extends Tokenizer { * */ public static CharTokenizer fromTokenCharPredicate(AttributeFactory factory, final IntPredicate tokenCharPredicate) { - return fromTokenCharPredicate(factory, tokenCharPredicate, IntUnaryOperator.identity()); - } - - /** - * Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression. - * The predicate should return {@code true} for all valid token characters. - * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression. - *

- * This factory is intended to be used with lambdas or method references. E.g., an elegant way - * to create an instance which behaves exactly as {@link LowerCaseTokenizer} is: - *

-   * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(Character::isLetter, Character::toLowerCase);
-   * 
- */ - public static CharTokenizer fromTokenCharPredicate(final IntPredicate tokenCharPredicate, final IntUnaryOperator normalizer) { - return fromTokenCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, tokenCharPredicate, normalizer); - } - - /** - * Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate, supplied as method reference or lambda expression. - * The predicate should return {@code true} for all valid token characters. - * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression. - *

- * This factory is intended to be used with lambdas or method references. E.g., an elegant way - * to create an instance which behaves exactly as {@link LowerCaseTokenizer} is: - *

-   * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(factory, Character::isLetter, Character::toLowerCase);
-   * 
- */ - public static CharTokenizer fromTokenCharPredicate(AttributeFactory factory, final IntPredicate tokenCharPredicate, final IntUnaryOperator normalizer) { Objects.requireNonNull(tokenCharPredicate, "predicate must not be null."); - Objects.requireNonNull(normalizer, "normalizer must not be null"); return new CharTokenizer(factory) { @Override protected boolean isTokenChar(int c) { return tokenCharPredicate.test(c); } - - @Override - protected int normalize(int c) { - return normalizer.applyAsInt(c); - } }; } @@ -167,7 +128,7 @@ public abstract class CharTokenizer extends Tokenizer { public static CharTokenizer fromSeparatorCharPredicate(final IntPredicate separatorCharPredicate) { return fromSeparatorCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, separatorCharPredicate); } - + /** * Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate, supplied as method reference or lambda expression. * The predicate should return {@code true} for all valid token separator characters. @@ -179,37 +140,7 @@ public abstract class CharTokenizer extends Tokenizer { * */ public static CharTokenizer fromSeparatorCharPredicate(AttributeFactory factory, final IntPredicate separatorCharPredicate) { - return fromSeparatorCharPredicate(factory, separatorCharPredicate, IntUnaryOperator.identity()); - } - - /** - * Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression. - * The predicate should return {@code true} for all valid token separator characters. - * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression. - *

- * This factory is intended to be used with lambdas or method references. E.g., an elegant way - * to create an instance which behaves exactly as the combination {@link WhitespaceTokenizer} and {@link LowerCaseFilter} is: - *

-   * Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(Character::isWhitespace, Character::toLowerCase);
-   * 
- */ - public static CharTokenizer fromSeparatorCharPredicate(final IntPredicate separatorCharPredicate, final IntUnaryOperator normalizer) { - return fromSeparatorCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, separatorCharPredicate, normalizer); - } - - /** - * Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate. - * The predicate should return {@code true} for all valid token separator characters. - * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression. - *

- * This factory is intended to be used with lambdas or method references. E.g., an elegant way - * to create an instance which behaves exactly as {@link WhitespaceTokenizer} and {@link LowerCaseFilter} is: - *

-   * Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(factory, Character::isWhitespace, Character::toLowerCase);
-   * 
- */ - public static CharTokenizer fromSeparatorCharPredicate(AttributeFactory factory, final IntPredicate separatorCharPredicate, final IntUnaryOperator normalizer) { - return fromTokenCharPredicate(factory, separatorCharPredicate.negate(), normalizer); + return fromTokenCharPredicate(factory, separatorCharPredicate.negate()); } private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0; @@ -230,15 +161,6 @@ public abstract class CharTokenizer extends Tokenizer { */ protected abstract boolean isTokenChar(int c); - /** - * Called on each token character to normalize it before it is added to the - * token. The default implementation does nothing. Subclasses may use this to, - * e.g., lowercase tokens. - */ - protected int normalize(int c) { - return c; - } - @Override public final boolean incrementToken() throws IOException { clearAttributes(); @@ -276,7 +198,7 @@ public abstract class CharTokenizer extends Tokenizer { buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer } end += charCount; - length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized + length += Character.toChars(c, buffer, length); // buffer it, normalized if (length >= maxTokenLen) { // buffer overflow! make sure to check for >= surrogate pair could break == test break; } diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory index 4b37eb868ea..e8bceff4cc2 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory @@ -15,7 +15,6 @@ org.apache.lucene.analysis.core.KeywordTokenizerFactory org.apache.lucene.analysis.core.LetterTokenizerFactory -org.apache.lucene.analysis.core.LowerCaseTokenizerFactory org.apache.lucene.analysis.core.WhitespaceTokenizerFactory org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory org.apache.lucene.analysis.ngram.NGramTokenizerFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java index 550a62ac040..5096ee8b7d2 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java @@ -25,7 +25,8 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.core.LowerCaseTokenizer; +import org.apache.lucene.analysis.core.LetterTokenizer; +import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; /** @@ -147,9 +148,9 @@ public class TestBrazilianAnalyzer extends BaseTokenStreamTestCase { public void testWithKeywordAttribute() throws IOException { CharArraySet set = new CharArraySet(1, true); set.add("Brasília"); - Tokenizer tokenizer = new LowerCaseTokenizer(); + Tokenizer tokenizer = new LetterTokenizer(); tokenizer.setReader(new StringReader("Brasília Brasilia")); - BrazilianStemFilter filter = new BrazilianStemFilter(new SetKeywordMarkerFilter(tokenizer, set)); + 
BrazilianStemFilter filter = new BrazilianStemFilter(new SetKeywordMarkerFilter(new LowerCaseFilter(tokenizer), set)); assertTokenStreamContents(filter, new String[] { "brasília", "brasil" }); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java index b7fc18b1ac2..8133b7afdd2 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java @@ -216,14 +216,6 @@ public class TestAnalyzers extends BaseTokenStreamTestCase { int length = highSurEndingLower.length(); assertEquals('\ud801', termBuffer[length - 1]); } - - public void testLowerCaseTokenizer() throws IOException { - StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); - LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(); - tokenizer.setReader(reader); - assertTokenStreamContents(tokenizer, new String[] { "tokenizer", - "\ud801\udc44test" }); - } public void testWhitespaceTokenizer() throws IOException { StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java index 1fa59d189de..a4e1ac5345f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java @@ -31,9 +31,8 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory; import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory; +import org.apache.lucene.analysis.core.LetterTokenizer; import org.apache.lucene.analysis.core.LowerCaseFilterFactory; -import org.apache.lucene.analysis.core.LowerCaseTokenizer; -import org.apache.lucene.analysis.core.LowerCaseTokenizerFactory; import org.apache.lucene.analysis.core.StopFilterFactory; import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory; @@ -419,7 +418,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase { @Override public Tokenizer create(AttributeFactory factory) { - return new LowerCaseTokenizer(factory); + return new LetterTokenizer(factory); } } @@ -500,14 +499,6 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase { .build(); assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c")); } - - /** test normalize where the TokenizerFactory returns a filter to normalize the text */ - public void testNormalizationWithLowerCaseTokenizer() throws IOException { - CustomAnalyzer analyzer1 = CustomAnalyzer.builder() - .withTokenizer(LowerCaseTokenizerFactory.class, Collections.emptyMap()) - .build(); - assertEquals(new BytesRef("abc"), analyzer1.normalize("dummy", "ABC")); - } public void testConditions() throws IOException { CustomAnalyzer analyzer = CustomAnalyzer.builder() diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java index 4c52c0e0713..3d8be315914 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java 
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java @@ -23,7 +23,9 @@ import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.CharArraySet; -import org.apache.lucene.analysis.core.LowerCaseTokenizer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LetterTokenizer; import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; public class TestGermanAnalyzer extends BaseTokenStreamTestCase { @@ -38,10 +40,10 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase { public void testWithKeywordAttribute() throws IOException { CharArraySet set = new CharArraySet( 1, true); set.add("fischen"); - final LowerCaseTokenizer in = new LowerCaseTokenizer(); + final Tokenizer in = new LetterTokenizer(); in.setReader(new StringReader("Fischen Trinken")); GermanStemFilter filter = new GermanStemFilter( - new SetKeywordMarkerFilter(in, set)); + new SetKeywordMarkerFilter(new LowerCaseFilter(in), set)); assertTokenStreamContents(filter, new String[] { "fischen", "trink" }); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java index 00bc7c64ad0..3f3d5c22d04 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java @@ -125,17 +125,6 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase { new String[] { "What", "s", "this", "thing", "do" }); } - /** - * Test LowerCaseTokenizerFactory - */ - public void testLowerCaseTokenizer() throws Exception { - Reader reader = new StringReader("What's this thing do?"); - Tokenizer stream = tokenizerFactory("LowerCase").create(newAttributeFactory()); - stream.setReader(reader); - assertTokenStreamContents(stream, - new String[] { "what", "s", "this", "thing", "do" }); - } - /** * Ensure the ASCIIFoldingFilterFactory works */ @@ -168,11 +157,6 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase { }); assertTrue(expected.getMessage().contains("Unknown parameters")); - expected = expectThrows(IllegalArgumentException.class, () -> { - tokenizerFactory("LowerCase", "bogusArg", "bogusValue"); - }); - assertTrue(expected.getMessage().contains("Unknown parameters")); - expected = expectThrows(IllegalArgumentException.class, () -> { tokenFilterFactory("ASCIIFolding", "bogusArg", "bogusValue"); }); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java index 4596608b747..2fcda4f1b8d 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java @@ -21,16 +21,12 @@ import java.io.IOException; import java.io.StringReader; import java.util.Locale; -import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.Tokenizer; import 
org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.LetterTokenizer; -import org.apache.lucene.analysis.core.LowerCaseTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.util.TestUtil; /** @@ -54,9 +50,9 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase { } // internal buffer size is 1024 make sure we have a surrogate pair right at the border builder.insert(1023, "\ud801\udc1c"); - Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory()); + Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()); tokenizer.setReader(new StringReader(builder.toString())); - assertTokenStreamContents(tokenizer, builder.toString().toLowerCase(Locale.ROOT).split(" ")); + assertTokenStreamContents(new LowerCaseFilter(tokenizer), builder.toString().toLowerCase(Locale.ROOT).split(" ")); } /* @@ -72,9 +68,9 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase { builder.append("a"); } builder.append("\ud801\udc1cabc"); - Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory()); + Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()); tokenizer.setReader(new StringReader(builder.toString())); - assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT)}); + assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT)}); } } @@ -87,9 +83,9 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase { for (int i = 0; i < 255; i++) { builder.append("A"); } - Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory()); + Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()); tokenizer.setReader(new StringReader(builder.toString() + builder.toString())); - assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)}); + assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)}); } /* @@ -101,14 +97,14 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase { for (int i = 0; i < 100; i++) { builder.append("A"); } - Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 100); + Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory(), 100); // Tricky, passing two copies of the string to the reader.... 
tokenizer.setReader(new StringReader(builder.toString() + builder.toString())); - assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT), + assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[]{builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT) }); Exception e = expectThrows(IllegalArgumentException.class, () -> - new LowerCaseTokenizer(newAttributeFactory(), -1)); + new LetterTokenizer(newAttributeFactory(), -1)); assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage()); tokenizer = new LetterTokenizer(newAttributeFactory(), 100); @@ -134,16 +130,16 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase { } e = expectThrows(IllegalArgumentException.class, () -> - new LowerCaseTokenizer(newAttributeFactory(), 0)); + new LetterTokenizer(newAttributeFactory(), 0)); assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage()); e = expectThrows(IllegalArgumentException.class, () -> - new LowerCaseTokenizer(newAttributeFactory(), 10_000_000)); + new LetterTokenizer(newAttributeFactory(), 10_000_000)); assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage()); - tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 4800); + tokenizer = new LetterTokenizer(newAttributeFactory(), 4800); tokenizer.setReader(new StringReader(builder.toString())); - assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT)}); + assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[]{builder.toString().toLowerCase(Locale.ROOT)}); e = expectThrows(IllegalArgumentException.class, () -> @@ -195,87 +191,9 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase { builder.append("A"); } builder.append("\ud801\udc1c"); - Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory()); + Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()); tokenizer.setReader(new StringReader(builder.toString() + builder.toString())); - assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)}); - } - - // LUCENE-3642: normalize SMP->BMP and check that offsets are correct - public void testCrossPlaneNormalization() throws IOException { - Analyzer analyzer = new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) { - @Override - protected int normalize(int c) { - if (c > 0xffff) { - return 'δ'; - } else { - return c; - } - } - }; - return new TokenStreamComponents(tokenizer, tokenizer); - } - }; - int num = 1000 * RANDOM_MULTIPLIER; - for (int i = 0; i < num; i++) { - String s = TestUtil.randomUnicodeString(random()); - try (TokenStream ts = analyzer.tokenStream("foo", s)) { - ts.reset(); - OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); - while (ts.incrementToken()) { - String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset()); - for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) { - cp = highlightedText.codePointAt(j); - assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp)); - } - } - ts.end(); - } - } - // just for fun - checkRandomData(random(), analyzer, num); - analyzer.close(); - } - - // LUCENE-3642: normalize BMP->SMP and check that 
offsets are correct - public void testCrossPlaneNormalization2() throws IOException { - Analyzer analyzer = new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) { - @Override - protected int normalize(int c) { - if (c <= 0xffff) { - return 0x1043C; - } else { - return c; - } - } - }; - return new TokenStreamComponents(tokenizer, tokenizer); - } - }; - int num = 1000 * RANDOM_MULTIPLIER; - for (int i = 0; i < num; i++) { - String s = TestUtil.randomUnicodeString(random()); - try (TokenStream ts = analyzer.tokenStream("foo", s)) { - ts.reset(); - OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); - while (ts.incrementToken()) { - String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset()); - for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) { - cp = highlightedText.codePointAt(j); - assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp)); - } - } - ts.end(); - } - } - // just for fun - checkRandomData(random(), analyzer, num); - analyzer.close(); + assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)}); } public void testDefinitionUsingMethodReference1() throws Exception { @@ -287,16 +205,16 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase { public void testDefinitionUsingMethodReference2() throws Exception { final StringReader reader = new StringReader("Tokenizer(Test)"); - final Tokenizer tokenizer = CharTokenizer.fromTokenCharPredicate(Character::isLetter, Character::toUpperCase); + final Tokenizer tokenizer = CharTokenizer.fromTokenCharPredicate(Character::isLetter); tokenizer.setReader(reader); - assertTokenStreamContents(tokenizer, new String[] { "TOKENIZER", "TEST" }); + assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", "Test" }); } public void testDefinitionUsingLambda() throws Exception { final StringReader reader = new StringReader("Tokenizer\u00A0Test Foo"); - final Tokenizer tokenizer = CharTokenizer.fromSeparatorCharPredicate(c -> c == '\u00A0' || Character.isWhitespace(c), Character::toLowerCase); + final Tokenizer tokenizer = CharTokenizer.fromSeparatorCharPredicate(c -> c == '\u00A0' || Character.isWhitespace(c)); tokenizer.setReader(reader); - assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test", "foo" }); + assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", "Test", "Foo" }); } } diff --git a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml index 3dbd6aa7886..475c333451e 100644 --- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml +++ b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml @@ -126,20 +126,7 @@ - - - - - - - - - - - - - @@ -386,8 +373,6 @@ - - diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-copyfield-test.xml b/solr/core/src/test-files/solr/collection1/conf/schema-copyfield-test.xml index f36751e7dbf..20dc97adb19 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema-copyfield-test.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-copyfield-test.xml @@ -90,19 +90,7 @@ - - - - - - - - - - - - @@ -347,8 +335,6 @@ - - diff --git 
a/solr/core/src/test-files/solr/collection1/conf/schema-folding.xml b/solr/core/src/test-files/solr/collection1/conf/schema-folding.xml index 1d20b808cb4..0b13a570c10 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema-folding.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-folding.xml @@ -81,7 +81,8 @@ - + + diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-hash.xml b/solr/core/src/test-files/solr/collection1/conf/schema-hash.xml index 3e8aa151963..c2d6b39b787 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema-hash.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-hash.xml @@ -139,18 +139,7 @@ - - - - - - - - - - - @@ -484,8 +473,6 @@ termPositions="true" termOffsets="true"/> - - diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-required-fields.xml b/solr/core/src/test-files/solr/collection1/conf/schema-required-fields.xml index 4210d5b73ac..0ac0c0463ab 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema-required-fields.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-required-fields.xml @@ -73,19 +73,7 @@ - - - - - - - - - - - - @@ -331,8 +319,6 @@ - - diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-rest.xml b/solr/core/src/test-files/solr/collection1/conf/schema-rest.xml index 2a043564dcd..46b735ce782 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema-rest.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-rest.xml @@ -199,17 +199,13 @@ - + + - - - - - @@ -604,7 +600,6 @@ - diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-sql.xml b/solr/core/src/test-files/solr/collection1/conf/schema-sql.xml index 40bbe5adb95..03d9d7eef85 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema-sql.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-sql.xml @@ -149,18 +149,7 @@ - - - - - - - - - - - @@ -501,8 +490,6 @@ termPositions="true" termOffsets="true"/> - - diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml b/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml index 6c33504427f..5613c66be2f 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml @@ -43,7 +43,8 @@ more concise example. - + + @@ -85,12 +86,6 @@ more concise example. - - - - - - @@ -112,13 +107,11 @@ more concise example. 
- - diff --git a/solr/core/src/test-files/solr/collection1/conf/schema.xml b/solr/core/src/test-files/solr/collection1/conf/schema.xml index b1a261b6c73..b61bbb16cad 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema.xml @@ -142,20 +142,17 @@ - + + - + + - - - - - @@ -574,7 +571,7 @@ - + @@ -636,7 +633,7 @@ - + diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml index 894767607c1..e4c3ad2e2ca 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml @@ -234,17 +234,13 @@ - + + - - - - - @@ -591,7 +587,6 @@ - diff --git a/solr/core/src/test-files/solr/collection1/conf/schema15.xml b/solr/core/src/test-files/solr/collection1/conf/schema15.xml index 80d19e9b2a8..361344fbeef 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema15.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema15.xml @@ -163,19 +163,7 @@ - - - - - - - - - - - - @@ -505,8 +493,6 @@ - - diff --git a/solr/core/src/test-files/solr/collection1/conf/schemasurround.xml b/solr/core/src/test-files/solr/collection1/conf/schemasurround.xml index 213acc7b032..93b11ed78e4 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schemasurround.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schemasurround.xml @@ -164,19 +164,7 @@ - - - - - - - - - - - - @@ -517,8 +505,6 @@ - - diff --git a/solr/core/src/test/org/apache/solr/rest/schema/TestFieldCollectionResource.java b/solr/core/src/test/org/apache/solr/rest/schema/TestFieldCollectionResource.java index 31fa9f5e50e..bdd3cd23294 100644 --- a/solr/core/src/test/org/apache/solr/rest/schema/TestFieldCollectionResource.java +++ b/solr/core/src/test/org/apache/solr/rest/schema/TestFieldCollectionResource.java @@ -77,11 +77,11 @@ public class TestFieldCollectionResource extends SolrRestletTestBase { "/fields/[0]/name=='HTMLstandardtok'", "/fields/[1]/name=='HTMLwhitetok'", "/fields/[2]/name=='_version_'", - "/fields/[108]/name=='*_d'", - "/fields/[107]/name=='*_f'", - "/fields/[106]/name=='*_b'", - "/fields/[105]/name=='*_t'", - "/fields/[104]/name=='*_l'" + "/fields/[107]/name=='*_d'", + "/fields/[106]/name=='*_f'", + "/fields/[105]/name=='*_b'", + "/fields/[104]/name=='*_t'", + "/fields/[103]/name=='*_l'" ); } diff --git a/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java b/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java index ea19af0d2ed..08a3f1b9fa3 100644 --- a/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java +++ b/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java @@ -81,7 +81,8 @@ public class TestFieldTypeResource extends SolrRestletTestBase { "count(/response/lst[@name='fieldType']/*) = 3", "/response/lst[@name='fieldType']/str[@name='name'] = 'teststop'", "/response/lst[@name='fieldType']/str[@name='class'] = 'solr.TextField'", - "/response/lst[@name='fieldType']/lst[@name='analyzer']/lst[@name='tokenizer']/str[@name='class'] = 'solr.LowerCaseTokenizerFactory'", + "/response/lst[@name='fieldType']/lst[@name='analyzer']/lst[@name='tokenizer']/str[@name='class'] = 'solr.LetterTokenizerFactory'", + "/response/lst[@name='fieldType']/lst[@name='analyzer']/arr[@name='filters']/lst/str[@name='class'][.='solr.LowerCaseFilterFactory']", 
"/response/lst[@name='fieldType']/lst[@name='analyzer']/arr[@name='filters']/lst/str[@name='class'][.='solr.StopFilterFactory']", "/response/lst[@name='fieldType']/lst[@name='analyzer']/arr[@name='filters']/lst/str[@name='words'][.='stopwords.txt']" ); diff --git a/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java b/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java index c7e0dc3c8c6..f66c03e6091 100644 --- a/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java +++ b/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java @@ -22,7 +22,6 @@ import org.junit.BeforeClass; /** * Tests for: - * {@link org.apache.lucene.analysis.core.LowerCaseTokenizerFactory} * {@link org.apache.lucene.analysis.core.LetterTokenizerFactory} * {@link org.apache.lucene.analysis.core.KeywordTokenizerFactory} * {@link org.apache.lucene.analysis.core.WhitespaceTokenizerFactory} @@ -44,25 +43,18 @@ public class TestMaxTokenLenTokenizer extends SolrTestCaseJ4 { // using fields with definitions, different tokenizer factories respectively at index time and standard tokenizer at query time. updateJ("{\"add\":{\"doc\": {\"id\":1,\"letter\":\"letter\"}},\"commit\":{}}",null); - updateJ("{\"add\":{\"doc\": {\"id\":2,\"lowerCase\":\"lowerCase\"}},\"commit\":{}}",null); updateJ("{\"add\":{\"doc\": {\"id\":3,\"whiteSpace\":\"whiteSpace in\"}},\"commit\":{}}",null); updateJ("{\"add\":{\"doc\": {\"id\":4,\"unicodeWhiteSpace\":\"unicode in\"}},\"commit\":{}}",null); updateJ("{\"add\":{\"doc\": {\"id\":5,\"keyword\":\"keyword\"}},\"commit\":{}}",null); assertU(commit()); - assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=5]"); + assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=4]"); //Tokens generated for "letter": "let" "ter" "letter" , maxTokenLen=3 assertQ("Check the total number of docs", req("q","letter:let"), "//result[@numFound=1]"); assertQ("Check the total number of docs", req("q","letter:lett"), "//result[@numFound=0]"); - //Tokens generated for "lowerCase": "low" "erC" "ase" "lowerCase" , maxTokenLen=3 - assertQ("Check the total number of docs", req("q","lowerCase:low"), "//result[@numFound=1]"); - assertQ("Check the total number of docs", req("q","lowerCase:l"), "//result[@numFound=0]"); - assertQ("Check the total number of docs", req("q","lowerCase:lo"), "//result[@numFound=0]"); - assertQ("Check the total number of docs", req("q","lowerCase:lower"), "//result[@numFound=0]"); - //Tokens generated for "whiteSpace in": "whi" "teS" "pac" "e" "in" "whiteSpace" , maxTokenLen=3 assertQ("Check the total number of docs", req("q","whiteSpace:whi"), "//result[@numFound=1]"); assertQ("Check the total number of docs", req("q","whiteSpace:teS"), "//result[@numFound=1]"); @@ -88,14 +80,13 @@ public class TestMaxTokenLenTokenizer extends SolrTestCaseJ4 { // using fields with definitions, same tokenizers both at index and query time. 
updateJ("{\"add\":{\"doc\": {\"id\":1,\"letter0\":\"letter\"}},\"commit\":{}}",null); - updateJ("{\"add\":{\"doc\": {\"id\":2,\"lowerCase0\":\"lowerCase\"}},\"commit\":{}}",null); updateJ("{\"add\":{\"doc\": {\"id\":3,\"whiteSpace0\":\"whiteSpace in\"}},\"commit\":{}}",null); updateJ("{\"add\":{\"doc\": {\"id\":4,\"unicodeWhiteSpace0\":\"unicode in\"}},\"commit\":{}}",null); updateJ("{\"add\":{\"doc\": {\"id\":5,\"keyword0\":\"keyword\"}},\"commit\":{}}",null); assertU(commit()); - assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=5]"); + assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=4]"); //Tokens generated for "letter": "let" "ter" "letter" , maxTokenLen=3 // Anything that matches the first three letters should be found when maxLen=3 @@ -104,13 +95,6 @@ public class TestMaxTokenLenTokenizer extends SolrTestCaseJ4 { assertQ("Check the total number of docs", req("q","letter0:lett"), "//result[@numFound=1]"); assertQ("Check the total number of docs", req("q","letter0:letXYZ"), "//result[@numFound=1]"); - //Tokens generated for "lowerCase": "low" "erC" "ase" "lowerCase" , maxTokenLen=3 - // Anything that matches the first three letters should be found when maxLen=3 - assertQ("Check the total number of docs", req("q","lowerCase0:low"), "//result[@numFound=1]"); - assertQ("Check the total number of docs", req("q","lowerCase0:l"), "//result[@numFound=0]"); - assertQ("Check the total number of docs", req("q","lowerCase0:lo"), "//result[@numFound=0]"); - assertQ("Check the total number of docs", req("q","lowerCase0:lowerXYZ"), "//result[@numFound=1]"); - //Tokens generated for "whiteSpace in": "whi" "teS" "pac" "e" "in" "whiteSpace" , maxTokenLen=3 // Anything that matches the first three letters should be found when maxLen=3 assertQ("Check the total number of docs", req("q","whiteSpace0:h"), "//result[@numFound=0]"); diff --git a/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml b/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml index 3a1f32815c8..974893ca848 100644 --- a/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml +++ b/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml @@ -141,18 +141,7 @@ - - - - - - - - - - - @@ -493,8 +482,6 @@ termPositions="true" termOffsets="true"/> - - diff --git a/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema.xml b/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema.xml index 02b505378e9..079a35fb040 100644 --- a/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema.xml +++ b/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema.xml @@ -116,18 +116,7 @@ - - - - - - - - - - - @@ -461,8 +450,6 @@ termPositions="true" termOffsets="true"/> - - diff --git a/solr/solrj/src/test-files/solrj/solr/configsets/streaming/conf/schema.xml b/solr/solrj/src/test-files/solrj/solr/configsets/streaming/conf/schema.xml index aa96296b580..6cd4f91eda2 100644 --- a/solr/solrj/src/test-files/solrj/solr/configsets/streaming/conf/schema.xml +++ b/solr/solrj/src/test-files/solrj/solr/configsets/streaming/conf/schema.xml @@ -137,16 +137,7 @@ - - - - - - - - - @@ -479,8 +470,6 @@ termPositions="true" termOffsets="true"/> - -
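Illustration only, not part of the patch: the maxTokenLen behaviour exercised by TestMaxTokenLenTokenizer above ("letter" indexed as "let" and "ter" at maxTokenLen=3) can be reproduced with the LetterTokenizer(AttributeFactory, maxTokenLen) constructor plus a LowerCaseFilter. A minimal sketch follows; the test class and method names are hypothetical.

import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.util.AttributeFactory;

public class TestMaxTokenLenSketch extends BaseTokenStreamTestCase {
  public void testLetterTokenizerMaxLen() throws Exception {
    // maxTokenLen=3: the single word "Letter" is emitted as two tokens,
    // then lowercased by the filter stage.
    Tokenizer tokenizer = new LetterTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, 3);
    tokenizer.setReader(new StringReader("Letter"));
    assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] { "let", "ter" });
  }
}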