LUCENE-8498: Remove LowerCaseTokenizer

Alan Woodward 2018-09-15 16:56:27 +01:00
parent 52bdcf6bb0
commit c0d2975970
32 changed files with 78 additions and 568 deletions

View File

@ -81,6 +81,9 @@ API Changes
* LUCENE-8352: TokenStreamComponents is now final, and can take a Consumer<Reader>
in its constructor (Mark Harwood, Alan Woodward, Adrien Grand)
* LUCENE-8498: LowerCaseTokenizer has been removed, and CharTokenizer no longer
takes a normalizer function. (Alan Woodward)
Changes in Runtime Behavior
* LUCENE-8333: Switch MoreLikeThis.setMaxDocFreqPct to use maxDoc instead of

View File

@ -129,3 +129,14 @@ Most code should just require recompilation, though possibly requiring some adde
Instead of overriding TokenStreamComponents#setReader() to customise analyzer
initialisation, you should now pass a Consumer<Reader> instance to the
TokenStreamComponents constructor.
## LowerCaseTokenizer and LowerCaseTokenizerFactory have been removed ##
LowerCaseTokenizer combined tokenization and filtering in a way that broke token
normalization, so both classes have been removed. Instead, use a LetterTokenizer followed by
a LowerCaseFilter.
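For example, an analyzer that previously wrapped a LowerCaseTokenizer can be rebuilt as a
tokenizer/filter chain. The sketch below mirrors the change made to SimpleAnalyzer in this
commit; the class name is illustrative:

```java
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;

// Illustrative replacement for an analyzer that used LowerCaseTokenizer
public final class LowercasedLetterAnalyzer extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    // Tokenize at non-letter characters...
    Tokenizer source = new LetterTokenizer();
    // ...then lowercase in a separate filter stage
    return new TokenStreamComponents(source, new LowerCaseFilter(source));
  }
}
```

The Solr schema changes in this commit follow the same pattern: a
`<tokenizer class="solr.LowerCaseTokenizerFactory"/>` entry becomes a LetterTokenizerFactory
tokenizer followed by a LowerCaseFilterFactory filter.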
## CharTokenizer no longer takes a normalizer function ##
CharTokenizer now only performs tokenization. To perform any type of filtering,
use a TokenFilter chain as you would with any other Tokenizer.
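For example, code that previously passed Character::toLowerCase as a normalizer to
CharTokenizer.fromTokenCharPredicate can instead wrap the tokenizer in a LowerCaseFilter.
A minimal sketch (the class and method names here are illustrative):

```java
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;

class LetterLowerCaseChain {
  // Before: CharTokenizer.fromTokenCharPredicate(Character::isLetter, Character::toLowerCase)
  static TokenStream build() {
    // The predicate now only decides which characters belong to a token
    Tokenizer tokenizer = CharTokenizer.fromTokenCharPredicate(Character::isLetter);
    // Lowercasing happens in a downstream TokenFilter
    return new LowerCaseFilter(tokenizer);
  }
}
```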

View File

@ -1,72 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.AttributeFactory;
/**
* LowerCaseTokenizer performs the function of LetterTokenizer
* and LowerCaseFilter together. It divides text at non-letters and converts
* them to lower case. While it is functionally equivalent to the combination
* of LetterTokenizer and LowerCaseFilter, there is a performance advantage
* to doing the two tasks at once, hence this (redundant) implementation.
* <P>
* Note: this does a decent job for most European languages, but does a terrible
* job for some Asian languages, where words are not separated by spaces.
* </p>
*/
public final class LowerCaseTokenizer extends LetterTokenizer {
/**
* Construct a new LowerCaseTokenizer.
*/
public LowerCaseTokenizer() {
}
/**
* Construct a new LowerCaseTokenizer using a given
* {@link org.apache.lucene.util.AttributeFactory}.
*
* @param factory
* the attribute factory to use for this {@link Tokenizer}
*/
public LowerCaseTokenizer(AttributeFactory factory) {
super(factory);
}
/**
* Construct a new LowerCaseTokenizer using a given
* {@link org.apache.lucene.util.AttributeFactory}.
*
* @param factory the attribute factory to use for this {@link Tokenizer}
* @param maxTokenLen maximum token length the tokenizer will emit.
* Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
* @throws IllegalArgumentException if maxTokenLen is invalid.
*/
public LowerCaseTokenizer(AttributeFactory factory, int maxTokenLen) {
super(factory, maxTokenLen);
}
/** Converts char to lower case
* {@link Character#toLowerCase(int)}.*/
@Override
protected int normalize(int c) {
return Character.toLowerCase(c);
}
}

View File

@ -1,75 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import java.util.HashMap;
import java.util.Map;
import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
/**
* Factory for {@link LowerCaseTokenizer}.
* <pre class="prettyprint">
* &lt;fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="256"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* <p>
* Options:
* <ul>
* <li>maxTokenLen: max token length, should be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
* It is rare to need to change this
* else {@link CharTokenizer}::DEFAULT_MAX_WORD_LEN</li>
* </ul>
*/
public class LowerCaseTokenizerFactory extends TokenizerFactory implements MultiTermAwareComponent {
private final int maxTokenLen;
/**
* Creates a new LowerCaseTokenizerFactory
*/
public LowerCaseTokenizerFactory(Map<String, String> args) {
super(args);
maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public LowerCaseTokenizer create(AttributeFactory factory) {
return new LowerCaseTokenizer(factory, maxTokenLen);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
Map<String,String> map = new HashMap<>(getOriginalArgs());
map.remove("maxTokenLen"); //removing "maxTokenLen" argument for LowerCaseFilterFactory init
return new LowerCaseFilterFactory(map);
}
}

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/** An {@link Analyzer} that filters {@link LetterTokenizer}
* with {@link LowerCaseFilter}
@ -34,7 +35,8 @@ public final class SimpleAnalyzer extends Analyzer {
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
return new TokenStreamComponents(new LowerCaseTokenizer());
Tokenizer tokenizer = new LetterTokenizer();
return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
@Override

View File

@ -60,13 +60,13 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link LowerCaseTokenizer} filtered with
* built from a {@link LetterTokenizer} filtered with
* {@link StopFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new LowerCaseTokenizer();
return new TokenStreamComponents(source, new StopFilter(source, stopwords));
final Tokenizer source = new LetterTokenizer();
return new TokenStreamComponents(source, new StopFilter(new LowerCaseFilter(source), stopwords));
}
@Override

View File

@ -20,14 +20,11 @@ package org.apache.lucene.analysis.util;
import java.io.IOException;
import java.util.Objects;
import java.util.function.IntPredicate;
import java.util.function.IntUnaryOperator;
import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@ -107,48 +104,12 @@ public abstract class CharTokenizer extends Tokenizer {
* </pre>
*/
public static CharTokenizer fromTokenCharPredicate(AttributeFactory factory, final IntPredicate tokenCharPredicate) {
return fromTokenCharPredicate(factory, tokenCharPredicate, IntUnaryOperator.identity());
}
/**
* Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression.
* The predicate should return {@code true} for all valid token characters.
* This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
* <p>
* This factory is intended to be used with lambdas or method references. E.g., an elegant way
* to create an instance which behaves exactly as {@link LowerCaseTokenizer} is:
* <pre class="prettyprint lang-java">
* Tokenizer tok = CharTokenizer.fromTokenCharPredicate(Character::isLetter, Character::toLowerCase);
* </pre>
*/
public static CharTokenizer fromTokenCharPredicate(final IntPredicate tokenCharPredicate, final IntUnaryOperator normalizer) {
return fromTokenCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, tokenCharPredicate, normalizer);
}
/**
* Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate, supplied as method reference or lambda expression.
* The predicate should return {@code true} for all valid token characters.
* This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
* <p>
* This factory is intended to be used with lambdas or method references. E.g., an elegant way
* to create an instance which behaves exactly as {@link LowerCaseTokenizer} is:
* <pre class="prettyprint lang-java">
* Tokenizer tok = CharTokenizer.fromTokenCharPredicate(factory, Character::isLetter, Character::toLowerCase);
* </pre>
*/
public static CharTokenizer fromTokenCharPredicate(AttributeFactory factory, final IntPredicate tokenCharPredicate, final IntUnaryOperator normalizer) {
Objects.requireNonNull(tokenCharPredicate, "predicate must not be null.");
Objects.requireNonNull(normalizer, "normalizer must not be null");
return new CharTokenizer(factory) {
@Override
protected boolean isTokenChar(int c) {
return tokenCharPredicate.test(c);
}
@Override
protected int normalize(int c) {
return normalizer.applyAsInt(c);
}
};
}
@ -179,37 +140,7 @@ public abstract class CharTokenizer extends Tokenizer {
* </pre>
*/
public static CharTokenizer fromSeparatorCharPredicate(AttributeFactory factory, final IntPredicate separatorCharPredicate) {
return fromSeparatorCharPredicate(factory, separatorCharPredicate, IntUnaryOperator.identity());
}
/**
* Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression.
* The predicate should return {@code true} for all valid token separator characters.
* This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
* <p>
* This factory is intended to be used with lambdas or method references. E.g., an elegant way
* to create an instance which behaves exactly as the combination {@link WhitespaceTokenizer} and {@link LowerCaseFilter} is:
* <pre class="prettyprint lang-java">
* Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(Character::isWhitespace, Character::toLowerCase);
* </pre>
*/
public static CharTokenizer fromSeparatorCharPredicate(final IntPredicate separatorCharPredicate, final IntUnaryOperator normalizer) {
return fromSeparatorCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, separatorCharPredicate, normalizer);
}
/**
* Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate.
* The predicate should return {@code true} for all valid token separator characters.
* This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
* <p>
* This factory is intended to be used with lambdas or method references. E.g., an elegant way
* to create an instance which behaves exactly as {@link WhitespaceTokenizer} and {@link LowerCaseFilter} is:
* <pre class="prettyprint lang-java">
* Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(factory, Character::isWhitespace, Character::toLowerCase);
* </pre>
*/
public static CharTokenizer fromSeparatorCharPredicate(AttributeFactory factory, final IntPredicate separatorCharPredicate, final IntUnaryOperator normalizer) {
return fromTokenCharPredicate(factory, separatorCharPredicate.negate(), normalizer);
return fromTokenCharPredicate(factory, separatorCharPredicate.negate());
}
private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
@ -230,15 +161,6 @@ public abstract class CharTokenizer extends Tokenizer {
*/
protected abstract boolean isTokenChar(int c);
/**
* Called on each token character to normalize it before it is added to the
* token. The default implementation does nothing. Subclasses may use this to,
* e.g., lowercase tokens.
*/
protected int normalize(int c) {
return c;
}
@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
@ -276,7 +198,7 @@ public abstract class CharTokenizer extends Tokenizer {
buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
}
end += charCount;
length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
length += Character.toChars(c, buffer, length); // buffer it, normalized
if (length >= maxTokenLen) { // buffer overflow! make sure to check for >= surrogate pair could break == test
break;
}

View File

@ -15,7 +15,6 @@
org.apache.lucene.analysis.core.KeywordTokenizerFactory
org.apache.lucene.analysis.core.LetterTokenizerFactory
org.apache.lucene.analysis.core.LowerCaseTokenizerFactory
org.apache.lucene.analysis.core.WhitespaceTokenizerFactory
org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory
org.apache.lucene.analysis.ngram.NGramTokenizerFactory

View File

@ -25,7 +25,8 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
/**
@ -147,9 +148,9 @@ public class TestBrazilianAnalyzer extends BaseTokenStreamTestCase {
public void testWithKeywordAttribute() throws IOException {
CharArraySet set = new CharArraySet(1, true);
set.add("Brasília");
Tokenizer tokenizer = new LowerCaseTokenizer();
Tokenizer tokenizer = new LetterTokenizer();
tokenizer.setReader(new StringReader("Brasília Brasilia"));
BrazilianStemFilter filter = new BrazilianStemFilter(new SetKeywordMarkerFilter(tokenizer, set));
BrazilianStemFilter filter = new BrazilianStemFilter(new SetKeywordMarkerFilter(new LowerCaseFilter(tokenizer), set));
assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}

View File

@ -217,14 +217,6 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
assertEquals('\ud801', termBuffer[length - 1]);
}
public void testLowerCaseTokenizer() throws IOException {
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
LowerCaseTokenizer tokenizer = new LowerCaseTokenizer();
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer, new String[] { "tokenizer",
"\ud801\udc44test" });
}
public void testWhitespaceTokenizer() throws IOException {
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();

View File

@ -31,9 +31,8 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizerFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
@ -419,7 +418,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
@Override
public Tokenizer create(AttributeFactory factory) {
return new LowerCaseTokenizer(factory);
return new LetterTokenizer(factory);
}
}
@ -501,14 +500,6 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c"));
}
/** test normalize where the TokenizerFactory returns a filter to normalize the text */
public void testNormalizationWithLowerCaseTokenizer() throws IOException {
CustomAnalyzer analyzer1 = CustomAnalyzer.builder()
.withTokenizer(LowerCaseTokenizerFactory.class, Collections.emptyMap())
.build();
assertEquals(new BytesRef("abc"), analyzer1.normalize("dummy", "ABC"));
}
public void testConditions() throws IOException {
CustomAnalyzer analyzer = CustomAnalyzer.builder()
.withTokenizer("whitespace")

View File

@ -23,7 +23,9 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
@ -38,10 +40,10 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
public void testWithKeywordAttribute() throws IOException {
CharArraySet set = new CharArraySet( 1, true);
set.add("fischen");
final LowerCaseTokenizer in = new LowerCaseTokenizer();
final Tokenizer in = new LetterTokenizer();
in.setReader(new StringReader("Fischen Trinken"));
GermanStemFilter filter = new GermanStemFilter(
new SetKeywordMarkerFilter(in, set));
new SetKeywordMarkerFilter(new LowerCaseFilter(in), set));
assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}

View File

@ -125,17 +125,6 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
new String[] { "What", "s", "this", "thing", "do" });
}
/**
* Test LowerCaseTokenizerFactory
*/
public void testLowerCaseTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
Tokenizer stream = tokenizerFactory("LowerCase").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "what", "s", "this", "thing", "do" });
}
/**
* Ensure the ASCIIFoldingFilterFactory works
*/
@ -168,11 +157,6 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
expected = expectThrows(IllegalArgumentException.class, () -> {
tokenizerFactory("LowerCase", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
expected = expectThrows(IllegalArgumentException.class, () -> {
tokenFilterFactory("ASCIIFolding", "bogusArg", "bogusValue");
});

View File

@ -21,16 +21,12 @@ import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.TestUtil;
/**
@ -54,9 +50,9 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
}
// internal buffer size is 1024 make sure we have a surrogate pair right at the border
builder.insert(1023, "\ud801\udc1c");
Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString()));
assertTokenStreamContents(tokenizer, builder.toString().toLowerCase(Locale.ROOT).split(" "));
assertTokenStreamContents(new LowerCaseFilter(tokenizer), builder.toString().toLowerCase(Locale.ROOT).split(" "));
}
/*
@ -72,9 +68,9 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
builder.append("a");
}
builder.append("\ud801\udc1cabc");
Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString()));
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT)});
assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT)});
}
}
@ -87,9 +83,9 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
for (int i = 0; i < 255; i++) {
builder.append("A");
}
Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
/*
@ -101,14 +97,14 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
for (int i = 0; i < 100; i++) {
builder.append("A");
}
Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 100);
Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
// Tricky, passing two copies of the string to the reader....
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT),
assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[]{builder.toString().toLowerCase(Locale.ROOT),
builder.toString().toLowerCase(Locale.ROOT) });
Exception e = expectThrows(IllegalArgumentException.class, () ->
new LowerCaseTokenizer(newAttributeFactory(), -1));
new LetterTokenizer(newAttributeFactory(), -1));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
@ -134,16 +130,16 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
}
e = expectThrows(IllegalArgumentException.class, () ->
new LowerCaseTokenizer(newAttributeFactory(), 0));
new LetterTokenizer(newAttributeFactory(), 0));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
e = expectThrows(IllegalArgumentException.class, () ->
new LowerCaseTokenizer(newAttributeFactory(), 10_000_000));
new LetterTokenizer(newAttributeFactory(), 10_000_000));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 4800);
tokenizer = new LetterTokenizer(newAttributeFactory(), 4800);
tokenizer.setReader(new StringReader(builder.toString()));
assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT)});
assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[]{builder.toString().toLowerCase(Locale.ROOT)});
e = expectThrows(IllegalArgumentException.class, () ->
@ -195,87 +191,9 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
builder.append("A");
}
builder.append("\ud801\udc1c");
Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
// LUCENE-3642: normalize SMP->BMP and check that offsets are correct
public void testCrossPlaneNormalization() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) {
@Override
protected int normalize(int c) {
if (c > 0xffff) {
return 'δ';
} else {
return c;
}
}
};
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
int num = 1000 * RANDOM_MULTIPLIER;
for (int i = 0; i < num; i++) {
String s = TestUtil.randomUnicodeString(random());
try (TokenStream ts = analyzer.tokenStream("foo", s)) {
ts.reset();
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
while (ts.incrementToken()) {
String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
cp = highlightedText.codePointAt(j);
assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
}
}
ts.end();
}
}
// just for fun
checkRandomData(random(), analyzer, num);
analyzer.close();
}
// LUCENE-3642: normalize BMP->SMP and check that offsets are correct
public void testCrossPlaneNormalization2() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) {
@Override
protected int normalize(int c) {
if (c <= 0xffff) {
return 0x1043C;
} else {
return c;
}
}
};
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
int num = 1000 * RANDOM_MULTIPLIER;
for (int i = 0; i < num; i++) {
String s = TestUtil.randomUnicodeString(random());
try (TokenStream ts = analyzer.tokenStream("foo", s)) {
ts.reset();
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
while (ts.incrementToken()) {
String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
cp = highlightedText.codePointAt(j);
assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
}
}
ts.end();
}
}
// just for fun
checkRandomData(random(), analyzer, num);
analyzer.close();
assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
public void testDefinitionUsingMethodReference1() throws Exception {
@ -287,16 +205,16 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
public void testDefinitionUsingMethodReference2() throws Exception {
final StringReader reader = new StringReader("Tokenizer(Test)");
final Tokenizer tokenizer = CharTokenizer.fromTokenCharPredicate(Character::isLetter, Character::toUpperCase);
final Tokenizer tokenizer = CharTokenizer.fromTokenCharPredicate(Character::isLetter);
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer, new String[] { "TOKENIZER", "TEST" });
assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", "Test" });
}
public void testDefinitionUsingLambda() throws Exception {
final StringReader reader = new StringReader("Tokenizer\u00A0Test Foo");
final Tokenizer tokenizer = CharTokenizer.fromSeparatorCharPredicate(c -> c == '\u00A0' || Character.isWhitespace(c), Character::toLowerCase);
final Tokenizer tokenizer = CharTokenizer.fromSeparatorCharPredicate(c -> c == '\u00A0' || Character.isWhitespace(c));
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test", "foo" });
assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", "Test", "Foo" });
}
}

View File

@ -126,20 +126,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldType>
<fieldType name="teststop" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
<filter class="solr.ClassicFilterFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt"/>
</analyzer>
</fieldType>
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
<fieldType name="lowertok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@ -386,8 +373,6 @@
<field name="test_hlt_off" type="highlittext" indexed="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>

View File

@ -90,19 +90,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldType>
<fieldType name="teststop" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt"/>
</analyzer>
</fieldType>
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
<fieldType name="lowertok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@ -347,8 +335,6 @@
<field name="test_hlt_off" type="highlittext" indexed="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>

View File

@ -81,7 +81,8 @@
<fieldType name="text_lower_token" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
<tokenizer class="solr.LetterTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>

View File

@ -139,18 +139,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldtype>
<fieldtype name="teststop" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldtype>
<!-- fieldtypes in this section isolate tokenizers and tokenfilters for testing -->
<fieldtype name="lowertok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@ -484,8 +473,6 @@
termPositions="true" termOffsets="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>

View File

@ -73,19 +73,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldType>
<fieldType name="teststop" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt"/>
</analyzer>
</fieldType>
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
<fieldType name="lowertok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@ -331,8 +319,6 @@
<field name="test_hlt_off" type="highlittext" indexed="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>

View File

@ -199,17 +199,13 @@
<fieldType name="teststop" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
<tokenizer class="solr.LetterTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt"/>
</analyzer>
</fieldType>
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
<fieldType name="lowertok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@ -604,7 +600,6 @@
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>

View File

@ -149,18 +149,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldtype>
<fieldtype name="teststop" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldtype>
<!-- fieldtypes in this section isolate tokenizers and tokenfilters for testing -->
<fieldtype name="lowertok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@ -501,8 +490,6 @@
termPositions="true" termOffsets="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>

View File

@ -43,7 +43,8 @@ more concise example.
<fieldType name="lowerCasefieldType" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
<analyzer type="index">
<tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="3" />
<tokenizer class="solr.LetterTokenizerFactory" maxTokenLen="3" />
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
@ -85,12 +86,6 @@ more concise example.
</analyzer>
</fieldType>
<fieldType name="lowerCase0fieldType" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="3" />
</analyzer>
</fieldType>
<fieldType name="whiteSp0fieldType" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory" maxTokenLen="3" />
@ -112,13 +107,11 @@ more concise example.
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
<field name="letter" type="letterfieldType" indexed="true" stored="true"/>
<field name="lowerCase" type="lowerCasefieldType" indexed="true" stored="true"/>
<field name="whiteSpace" type="whiteSpfieldType" indexed="true" stored="true"/>
<field name="unicodeWhiteSpace" type="uniWhiteSpfieldType" indexed="true" stored="true"/>
<field name="keyword" type="keywordfieldType" indexed="true" stored="true"/>
<field name="letter0" type="letter0fieldType" indexed="true" stored="true"/>
<field name="lowerCase0" type="lowerCase0fieldType" indexed="true" stored="true"/>
<field name="whiteSpace0" type="whiteSp0fieldType" indexed="true" stored="true"/>
<field name="unicodeWhiteSpace0" type="uniWhiteSp0fieldType" indexed="true" stored="true"/>
<field name="keyword0" type="keyword0fieldType" indexed="true" stored="true"/>

View File

@ -142,20 +142,17 @@
<fieldType name="teststop" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
<tokenizer class="solr.LetterTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
<tokenizer class="solr.LetterTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
<fieldType name="lowertok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@ -574,7 +571,7 @@
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="lowertok" type="lowerfilt" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
@ -636,7 +633,7 @@
<field name="store" type="location" indexed="true" stored="true" omitNorms="false"/>
<field name="lower" type="lowertok" indexed="false" stored="true" multiValued="true"/>
<field name="lower" type="lowerfilt" indexed="false" stored="true" multiValued="true"/>
<field name="_route_" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="payloadDelimited" type="payloadDelimited"/>

View File

@ -234,17 +234,13 @@
<fieldType name="teststop" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
<tokenizer class="solr.LetterTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt"/>
</analyzer>
</fieldType>
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
<fieldType name="lowertok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@ -591,7 +587,6 @@
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>

View File

@ -163,19 +163,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldType>
<fieldType name="teststop" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt"/>
</analyzer>
</fieldType>
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
<fieldType name="lowertok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@ -505,8 +493,6 @@
<field name="test_hlt_off" type="highlittext" indexed="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>

View File

@ -164,19 +164,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldType>
<fieldType name="teststop" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt"/>
</analyzer>
</fieldType>
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
<fieldType name="lowertok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@ -517,8 +505,6 @@
<field name="test_hlt_off" type="highlittext" indexed="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>

View File

@ -77,11 +77,11 @@ public class TestFieldCollectionResource extends SolrRestletTestBase {
"/fields/[0]/name=='HTMLstandardtok'",
"/fields/[1]/name=='HTMLwhitetok'",
"/fields/[2]/name=='_version_'",
"/fields/[108]/name=='*_d'",
"/fields/[107]/name=='*_f'",
"/fields/[106]/name=='*_b'",
"/fields/[105]/name=='*_t'",
"/fields/[104]/name=='*_l'"
"/fields/[107]/name=='*_d'",
"/fields/[106]/name=='*_f'",
"/fields/[105]/name=='*_b'",
"/fields/[104]/name=='*_t'",
"/fields/[103]/name=='*_l'"
);
}

View File

@ -81,7 +81,8 @@ public class TestFieldTypeResource extends SolrRestletTestBase {
"count(/response/lst[@name='fieldType']/*) = 3",
"/response/lst[@name='fieldType']/str[@name='name'] = 'teststop'",
"/response/lst[@name='fieldType']/str[@name='class'] = 'solr.TextField'",
"/response/lst[@name='fieldType']/lst[@name='analyzer']/lst[@name='tokenizer']/str[@name='class'] = 'solr.LowerCaseTokenizerFactory'",
"/response/lst[@name='fieldType']/lst[@name='analyzer']/lst[@name='tokenizer']/str[@name='class'] = 'solr.LetterTokenizerFactory'",
"/response/lst[@name='fieldType']/lst[@name='analyzer']/arr[@name='filters']/lst/str[@name='class'][.='solr.LowerCaseFilterFactory']",
"/response/lst[@name='fieldType']/lst[@name='analyzer']/arr[@name='filters']/lst/str[@name='class'][.='solr.StopFilterFactory']",
"/response/lst[@name='fieldType']/lst[@name='analyzer']/arr[@name='filters']/lst/str[@name='words'][.='stopwords.txt']"
);

View File

@ -22,7 +22,6 @@ import org.junit.BeforeClass;
/**
* Tests for:
* {@link org.apache.lucene.analysis.core.LowerCaseTokenizerFactory}
* {@link org.apache.lucene.analysis.core.LetterTokenizerFactory}
* {@link org.apache.lucene.analysis.core.KeywordTokenizerFactory}
* {@link org.apache.lucene.analysis.core.WhitespaceTokenizerFactory}
@ -44,25 +43,18 @@ public class TestMaxTokenLenTokenizer extends SolrTestCaseJ4 {
// using fields with definitions, different tokenizer factories respectively at index time and standard tokenizer at query time.
updateJ("{\"add\":{\"doc\": {\"id\":1,\"letter\":\"letter\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":2,\"lowerCase\":\"lowerCase\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":3,\"whiteSpace\":\"whiteSpace in\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":4,\"unicodeWhiteSpace\":\"unicode in\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":5,\"keyword\":\"keyword\"}},\"commit\":{}}",null);
assertU(commit());
assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=5]");
assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=4]");
//Tokens generated for "letter": "let" "ter" "letter" , maxTokenLen=3
assertQ("Check the total number of docs", req("q","letter:let"), "//result[@numFound=1]");
assertQ("Check the total number of docs", req("q","letter:lett"), "//result[@numFound=0]");
//Tokens generated for "lowerCase": "low" "erC" "ase" "lowerCase" , maxTokenLen=3
assertQ("Check the total number of docs", req("q","lowerCase:low"), "//result[@numFound=1]");
assertQ("Check the total number of docs", req("q","lowerCase:l"), "//result[@numFound=0]");
assertQ("Check the total number of docs", req("q","lowerCase:lo"), "//result[@numFound=0]");
assertQ("Check the total number of docs", req("q","lowerCase:lower"), "//result[@numFound=0]");
//Tokens generated for "whiteSpace in": "whi" "teS" "pac" "e" "in" "whiteSpace" , maxTokenLen=3
assertQ("Check the total number of docs", req("q","whiteSpace:whi"), "//result[@numFound=1]");
assertQ("Check the total number of docs", req("q","whiteSpace:teS"), "//result[@numFound=1]");
@ -88,14 +80,13 @@ public class TestMaxTokenLenTokenizer extends SolrTestCaseJ4 {
// using fields with definitions, same tokenizers both at index and query time.
updateJ("{\"add\":{\"doc\": {\"id\":1,\"letter0\":\"letter\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":2,\"lowerCase0\":\"lowerCase\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":3,\"whiteSpace0\":\"whiteSpace in\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":4,\"unicodeWhiteSpace0\":\"unicode in\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":5,\"keyword0\":\"keyword\"}},\"commit\":{}}",null);
assertU(commit());
assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=5]");
assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=4]");
//Tokens generated for "letter": "let" "ter" "letter" , maxTokenLen=3
// Anything that matches the first three letters should be found when maxLen=3
@ -104,13 +95,6 @@ public class TestMaxTokenLenTokenizer extends SolrTestCaseJ4 {
assertQ("Check the total number of docs", req("q","letter0:lett"), "//result[@numFound=1]");
assertQ("Check the total number of docs", req("q","letter0:letXYZ"), "//result[@numFound=1]");
//Tokens generated for "lowerCase": "low" "erC" "ase" "lowerCase" , maxTokenLen=3
// Anything that matches the first three letters should be found when maxLen=3
assertQ("Check the total number of docs", req("q","lowerCase0:low"), "//result[@numFound=1]");
assertQ("Check the total number of docs", req("q","lowerCase0:l"), "//result[@numFound=0]");
assertQ("Check the total number of docs", req("q","lowerCase0:lo"), "//result[@numFound=0]");
assertQ("Check the total number of docs", req("q","lowerCase0:lowerXYZ"), "//result[@numFound=1]");
//Tokens generated for "whiteSpace in": "whi" "teS" "pac" "e" "in" "whiteSpace" , maxTokenLen=3
// Anything that matches the first three letters should be found when maxLen=3
assertQ("Check the total number of docs", req("q","whiteSpace0:h"), "//result[@numFound=0]");

View File

@ -141,18 +141,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldtype>
<fieldtype name="teststop" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldtype>
<!-- fieldtypes in this section isolate tokenizers and tokenfilters for testing -->
<fieldtype name="lowertok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@ -493,8 +482,6 @@
termPositions="true" termOffsets="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>

View File

@ -116,18 +116,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldType>
<fieldType name="teststop" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldType>
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
<fieldType name="lowertok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@ -461,8 +450,6 @@
termPositions="true" termOffsets="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>

View File

@ -137,16 +137,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldtype>
<fieldtype name="teststop" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldtype>
<!-- fieldtypes in this section isolate tokenizers and tokenfilters for testing -->
<fieldtype name="lowertok" class="solr.TextField">
<analyzer><tokenizer class="solr.LowerCaseTokenizerFactory"/></analyzer>
</fieldtype>
<fieldtype name="keywordtok" class="solr.TextField">
<analyzer><tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/></analyzer>
</fieldtype>
@ -479,8 +470,6 @@
termPositions="true" termOffsets="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>