mirror of https://github.com/apache/lucene.git
LUCENE-8186: LowerCaseTokenizerFactory now lowercases text in multi-term queries.
This commit is contained in:
parent
78ca82e63a
commit
1971ef3109
|
@ -248,6 +248,9 @@ Bug Fixes
|
||||||
* LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.
|
* LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.
|
||||||
(chengpohi via Jim Ferenczi)
|
(chengpohi via Jim Ferenczi)
|
||||||
|
|
||||||
|
* LUCENE-8186: LowerCaseTokenizerFactory now lowercases text in multi-term
|
||||||
|
queries. (Tim Allison via Adrien Grand)
|
||||||
|
|
||||||
Other
|
Other
|
||||||
|
|
||||||
* LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)
|
* LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)
|
||||||
|
|
|
@ -162,6 +162,12 @@ public final class CustomAnalyzer extends Analyzer {
|
||||||
@Override
|
@Override
|
||||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||||
TokenStream result = in;
|
TokenStream result = in;
|
||||||
|
// tokenizers can return a tokenfilter if the tokenizer does normalization,
|
||||||
|
// although this is really bogus/abstraction violation...
|
||||||
|
if (tokenizer instanceof MultiTermAwareComponent) {
|
||||||
|
TokenFilterFactory filter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenizer).getMultiTermComponent();
|
||||||
|
result = filter.create(result);
|
||||||
|
}
|
||||||
for (TokenFilterFactory filter : tokenFilters) {
|
for (TokenFilterFactory filter : tokenFilters) {
|
||||||
if (filter instanceof MultiTermAwareComponent) {
|
if (filter instanceof MultiTermAwareComponent) {
|
||||||
filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
|
filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
|
||||||
|
|
|
@ -31,9 +31,9 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
|
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
|
||||||
import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
|
import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
|
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
|
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
|
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.LowerCaseTokenizerFactory;
|
||||||
import org.apache.lucene.analysis.core.StopFilterFactory;
|
import org.apache.lucene.analysis.core.StopFilterFactory;
|
||||||
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
|
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
|
||||||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
|
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
|
||||||
|
@ -432,7 +432,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||||
return new KeywordTokenizerFactory(getOriginalArgs());
|
return new DummyTokenFilterFactory(Collections.emptyMap());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -501,6 +501,14 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
|
||||||
assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c"));
|
assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** test normalize where the TokenizerFactory returns a filter to normalize the text */
|
||||||
|
public void testNormalizationWithLowerCaseTokenizer() throws IOException {
|
||||||
|
CustomAnalyzer analyzer1 = CustomAnalyzer.builder()
|
||||||
|
.withTokenizer(LowerCaseTokenizerFactory.class, Collections.emptyMap())
|
||||||
|
.build();
|
||||||
|
assertEquals(new BytesRef("abc"), analyzer1.normalize("dummy", "ABC"));
|
||||||
|
}
|
||||||
|
|
||||||
public void testConditions() throws IOException {
|
public void testConditions() throws IOException {
|
||||||
CustomAnalyzer analyzer = CustomAnalyzer.builder()
|
CustomAnalyzer analyzer = CustomAnalyzer.builder()
|
||||||
.withTokenizer("whitespace")
|
.withTokenizer("whitespace")
|
||||||
|
|
Loading…
Reference in New Issue