From 26ee8e9bea70e857aa61764020337ce675066bd1 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 30 Dec 2016 10:30:40 +0100 Subject: [PATCH] LUCENE-7606: Normalization with CustomAnalyzer would only apply the last token filter. --- lucene/CHANGES.txt | 3 +++ .../analysis/custom/CustomAnalyzer.java | 2 +- .../analysis/custom/TestCustomAnalyzer.java | 21 +++++++++++++++++++ .../lucene/analysis/custom/mapping1.txt | 1 + .../lucene/analysis/custom/mapping2.txt | 1 + 5 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/mapping1.txt create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/mapping2.txt diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 12b615d6ac6..4c49560620a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -129,6 +129,9 @@ Bug Fixes using helpers for exclusive bounds that are consistent with Double.compare. (Adrien Grand, Dawid Weiss) +* LUCENE-7606: Normalization with CustomAnalyzer would only apply the last + token filter. 
(Adrien Grand) + Improvements * LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery, diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java index 466642c9f37..1cfdfe37979 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java @@ -145,7 +145,7 @@ public final class CustomAnalyzer extends Analyzer { for (TokenFilterFactory filter : tokenFilters) { if (filter instanceof MultiTermAwareComponent) { filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent(); - result = filter.create(in); + result = filter.create(result); } } return result; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java index aa69b709ec9..d929bfd099e 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.CharFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory; +import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory; import org.apache.lucene.analysis.core.KeywordTokenizerFactory; import org.apache.lucene.analysis.core.LowerCaseFilterFactory; import org.apache.lucene.analysis.core.LowerCaseTokenizer; @@ -479,4 +480,24 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase { assertEquals(new BytesRef("2A"), analyzer2.normalize("dummy", "0À")); } + public void testNormalizationWithMultipleTokenFilters() throws IOException { + 
CustomAnalyzer analyzer = CustomAnalyzer.builder() + // all of these token filters are multi-term aware so they should be applied at normalization time + .withTokenizer(WhitespaceTokenizerFactory.class, Collections.emptyMap()) + .addTokenFilter(LowerCaseFilterFactory.class, Collections.emptyMap()) + .addTokenFilter(ASCIIFoldingFilterFactory.class, Collections.emptyMap()) + .build(); + assertEquals(new BytesRef("a b e"), analyzer.normalize("dummy", "À B é")); + } + + public void testNormalizationWithMultipleCharFilters() throws IOException { + CustomAnalyzer analyzer = CustomAnalyzer.builder() + // all of these char filters are multi-term aware so they should be applied at normalization time + .withTokenizer(WhitespaceTokenizerFactory.class, Collections.emptyMap()) + .addCharFilter(MappingCharFilterFactory.class, new HashMap<>(Collections.singletonMap("mapping", "org/apache/lucene/analysis/custom/mapping1.txt"))) + .addCharFilter(MappingCharFilterFactory.class, new HashMap<>(Collections.singletonMap("mapping", "org/apache/lucene/analysis/custom/mapping2.txt"))) + .build(); + assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c")); + } + } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/mapping1.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/mapping1.txt new file mode 100644 index 00000000000..40aaf5a27d3 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/mapping1.txt @@ -0,0 +1 @@ +"a" => "e" diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/mapping2.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/mapping2.txt new file mode 100644 index 00000000000..cac0bea0694 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/mapping2.txt @@ -0,0 +1 @@ +"b" => "f"