diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index cd689839875..bc34e2f446b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -225,6 +225,9 @@ Improvements * LUCENE-9850: Switch to PFOR encoding for doc IDs (instead of FOR). (Greg Miller) +* LUCENE-9929: Add NorwegianNormalizationFilter, which does the same as ScandinavianNormalizationFilter except + it does not fold oo->ø and ao->å. (janhoy, Robert Muir, Adrien Grand) + Bug fixes diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.java index 82d38627429..4c5366f11eb 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.java @@ -16,11 +16,12 @@ */ package org.apache.lucene.analysis.miscellaneous; +import static org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.ALL_FOLDINGS; + import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.util.StemmerUtil; /** * This filter normalize use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded @@ -33,98 +34,29 @@ import org.apache.lucene.analysis.util.StemmerUtil; *

blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej but not blabarsyltetoj räksmörgås == * ræksmørgås == ræksmörgaos == raeksmoergaas but not raksmorgas * + *

There are also separate filters for Norwegian, Danish and Swedish with slightly differing + * settings + * * @see ScandinavianFoldingFilter */ public final class ScandinavianNormalizationFilter extends TokenFilter { + private final ScandinavianNormalizer normalizer; + public ScandinavianNormalizationFilter(TokenStream input) { super(input); + this.normalizer = new ScandinavianNormalizer(ALL_FOLDINGS); } private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class); - private static final char AA = '\u00C5'; // Å - private static final char aa = '\u00E5'; // å - private static final char AE = '\u00C6'; // Æ - private static final char ae = '\u00E6'; // æ - private static final char AE_se = '\u00C4'; // Ä - private static final char ae_se = '\u00E4'; // ä - private static final char OE = '\u00D8'; // Ø - private static final char oe = '\u00F8'; // ø - private static final char OE_se = '\u00D6'; // Ö - private static final char oe_se = '\u00F6'; // ö - @Override public boolean incrementToken() throws IOException { if (!input.incrementToken()) { return false; } - - char[] buffer = charTermAttribute.buffer(); - int length = charTermAttribute.length(); - - int i; - for (i = 0; i < length; i++) { - - if (buffer[i] == ae_se) { - buffer[i] = ae; - - } else if (buffer[i] == AE_se) { - buffer[i] = AE; - - } else if (buffer[i] == oe_se) { - buffer[i] = oe; - - } else if (buffer[i] == OE_se) { - buffer[i] = OE; - - } else if (length - 1 > i) { - - if (buffer[i] == 'a' - && (buffer[i + 1] == 'a' - || buffer[i + 1] == 'o' - || buffer[i + 1] == 'A' - || buffer[i + 1] == 'O')) { - length = StemmerUtil.delete(buffer, i + 1, length); - buffer[i] = aa; - - } else if (buffer[i] == 'A' - && (buffer[i + 1] == 'a' - || buffer[i + 1] == 'A' - || buffer[i + 1] == 'o' - || buffer[i + 1] == 'O')) { - length = StemmerUtil.delete(buffer, i + 1, length); - buffer[i] = AA; - - } else if (buffer[i] == 'a' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) { - length = 
StemmerUtil.delete(buffer, i + 1, length); - buffer[i] = ae; - - } else if (buffer[i] == 'A' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) { - length = StemmerUtil.delete(buffer, i + 1, length); - buffer[i] = AE; - - } else if (buffer[i] == 'o' - && (buffer[i + 1] == 'e' - || buffer[i + 1] == 'E' - || buffer[i + 1] == 'o' - || buffer[i + 1] == 'O')) { - length = StemmerUtil.delete(buffer, i + 1, length); - buffer[i] = oe; - - } else if (buffer[i] == 'O' - && (buffer[i + 1] == 'e' - || buffer[i + 1] == 'E' - || buffer[i + 1] == 'o' - || buffer[i + 1] == 'O')) { - length = StemmerUtil.delete(buffer, i + 1, length); - buffer[i] = OE; - } - } - } - - charTermAttribute.setLength(length); - + charTermAttribute.setLength( + normalizer.processToken(charTermAttribute.buffer(), charTermAttribute.length())); return true; } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizer.java new file mode 100644 index 00000000000..50187264a78 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizer.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.miscellaneous; + +import java.util.EnumSet; +import java.util.Set; +import org.apache.lucene.analysis.util.StemmerUtil; + +/** + * This Normalizer does the heavy lifting for a set of Scandinavian normalization filters, + * normalizing use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded variants (aa, + * ao, ae, oe and oo) by transforming them to åÅæÆøØ. + * + * @since 9.0 + * @lucene.internal + */ +public final class ScandinavianNormalizer { + + /** + * Create the instance, while choosing which foldings to apply. This may differ between Norwegian, + * Danish and Swedish. + * + * @param foldings a Set of Foldings to apply (i.e. AE, OE, AA, AO, OO) + */ + public ScandinavianNormalizer(Set<Foldings> foldings) { + this.foldings = foldings; + } + + /** List of possible foldings that can be used when configuring the filter */ + public enum Foldings { + AA, + AO, + AE, + OE, + OO + } + + private final Set<Foldings> foldings; + + public static final Set<Foldings> ALL_FOLDINGS = EnumSet.allOf(Foldings.class); + + static final char AA = '\u00C5'; // Å + static final char aa = '\u00E5'; // å + static final char AE = '\u00C6'; // Æ + static final char ae = '\u00E6'; // æ + static final char AE_se = '\u00C4'; // Ä + static final char ae_se = '\u00E4'; // ä + static final char OE = '\u00D8'; // Ø + static final char oe = '\u00F8'; // ø + static final char OE_se = '\u00D6'; // Ö + static final char oe_se = '\u00F6'; // ö + + /** + * Takes the original buffer and length as input.
Modifies the buffer in-place and returns new + * length + * + * @return new length + */ + public int processToken(char[] buffer, int length) { + int i; + for (i = 0; i < length; i++) { + + if (buffer[i] == ae_se) { + buffer[i] = ae; + + } else if (buffer[i] == AE_se) { + buffer[i] = AE; + + } else if (buffer[i] == oe_se) { + buffer[i] = oe; + + } else if (buffer[i] == OE_se) { + buffer[i] = OE; + + } else if (length - 1 > i) { + + if (buffer[i] == 'a' + && (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A') + || foldings.contains(Foldings.AO) + && (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) { + length = StemmerUtil.delete(buffer, i + 1, length); + buffer[i] = aa; + + } else if (buffer[i] == 'A' + && (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A') + || foldings.contains(Foldings.AO) + && (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) { + length = StemmerUtil.delete(buffer, i + 1, length); + buffer[i] = AA; + + } else if (buffer[i] == 'a' + && foldings.contains(Foldings.AE) + && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) { + length = StemmerUtil.delete(buffer, i + 1, length); + buffer[i] = ae; + + } else if (buffer[i] == 'A' + && foldings.contains(Foldings.AE) + && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) { + length = StemmerUtil.delete(buffer, i + 1, length); + buffer[i] = AE; + + } else if (buffer[i] == 'o' + && (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E') + || foldings.contains(Foldings.OO) + && (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) { + length = StemmerUtil.delete(buffer, i + 1, length); + buffer[i] = oe; + + } else if (buffer[i] == 'O' + && (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E') + || foldings.contains(Foldings.OO) + && (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) { + length = StemmerUtil.delete(buffer, i + 1, length); + buffer[i] = OE; + } + } + } + return length; + } +} diff --git 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilter.java new file mode 100644 index 00000000000..420a0662491 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilter.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.no; + +import java.io.IOException; +import java.util.EnumSet; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter; +import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer; +import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +/** + * This filter normalizes use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded + * variants (ae, oe, aa) by transforming them to åÅæÆøØ. This is similar to + * ScandinavianNormalizationFilter, except for the folding rules customized for Norwegian. + * + *

blåbærsyltetøj == blåbärsyltetöj == blaabaersyltetoej + * + * @see ScandinavianNormalizationFilter + */ +public final class NorwegianNormalizationFilter extends TokenFilter { + private final ScandinavianNormalizer normalizer; + + public NorwegianNormalizationFilter(TokenStream input) { + super(input); + this.normalizer = new ScandinavianNormalizer(EnumSet.of(Foldings.AE, Foldings.OE, Foldings.AA)); + } + + private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class); + + @Override + public final boolean incrementToken() throws IOException { + if (!input.incrementToken()) { + return false; + } + charTermAttribute.setLength( + normalizer.processToken(charTermAttribute.buffer(), charTermAttribute.length())); + return true; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilterFactory.java new file mode 100644 index 00000000000..448b2e7c68e --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilterFactory.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.no; + +import java.util.Map; +import org.apache.lucene.analysis.TokenFilterFactory; +import org.apache.lucene.analysis.TokenStream; + +/** + * Factory for {@link NorwegianNormalizationFilter}. + * + * @lucene.spi {@value #NAME} + */ +public class NorwegianNormalizationFilterFactory extends TokenFilterFactory { + + /** SPI name */ + public static final String NAME = "norwegianNormalization"; + + public NorwegianNormalizationFilterFactory(Map<String, String> args) { + super(args); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + /** Default ctor for compatibility with SPI */ + public NorwegianNormalizationFilterFactory() { + throw defaultCtorException(); + } + + @Override + public NorwegianNormalizationFilter create(TokenStream input) { + return new NorwegianNormalizationFilter(input); + } + + @Override + public TokenStream normalize(TokenStream input) { + return create(input); + } +} diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory index ce2fd6434ea..e08399d7cc4 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory @@ -95,6 +95,7 @@ org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory org.apache.lucene.analysis.ngram.NGramFilterFactory org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory +org.apache.lucene.analysis.no.NorwegianNormalizationFilterFactory org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory org.apache.lucene.analysis.pattern.PatternTypingFilterFactory diff --git
a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilter.java index 0dd551fa419..d26f7a02c8f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilter.java @@ -16,37 +16,12 @@ */ package org.apache.lucene.analysis.miscellaneous; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.core.KeywordTokenizer; public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase { - private Analyzer analyzer; - - @Override - public void setUp() throws Exception { - super.setUp(); - analyzer = - new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String field) { - final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer); - return new TokenStreamComponents(tokenizer, stream); - } - }; - } - - @Override - public void tearDown() throws Exception { - analyzer.close(); - super.tearDown(); - } - - public void test() throws Exception { + public void testDefault() throws Exception { + Analyzer analyzer = createAnalyzer(); checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException @@ -107,6 +82,7 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase checkOneTerm(analyzer, "Oe", "Ø"); checkOneTerm(analyzer, "OO", "Ø"); checkOneTerm(analyzer, "OE", "Ø"); + 
analyzer.close(); } /** check that the empty string doesn't cause issues */ @@ -126,6 +102,19 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase /** blast some random strings through the analyzer */ public void testRandomData() throws Exception { + Analyzer analyzer = createAnalyzer(); checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER); + analyzer.close(); + } + + private Analyzer createAnalyzer() { + return new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String field) { + final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer); + return new TokenStreamComponents(tokenizer, stream); + } + }; } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilterFactory.java index 97187a71c16..2966fd47846 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilterFactory.java @@ -21,10 +21,10 @@ import org.apache.lucene.analysis.TokenStream; public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase { - public void testStemming() throws Exception { - TokenStream stream = whitespaceMockTokenizer("räksmörgås"); + public void testDefault() throws Exception { + TokenStream stream = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO"); stream = tokenFilterFactory("ScandinavianNormalization").create(stream); - assertTokenStreamContents(stream, new String[] {"ræksmørgås"}); + assertTokenStreamContents(stream, new String[] {"ræksmørgås_æ_ø_å_ø_å_Æ_Ø_Å_Ø_Å"}); } /** Test that bogus arguments 
result in exception */ @@ -35,6 +35,7 @@ public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamF () -> { tokenFilterFactory("ScandinavianNormalization", "bogusArg", "bogusValue"); }); - assertTrue(expected.getMessage().contains("Unknown parameters")); + assertTrue( + "Got " + expected.getMessage(), expected.getMessage().contains("Unknown parameters")); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizer.java new file mode 100644 index 00000000000..2bea5346318 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizer.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.miscellaneous; + +import java.io.IOException; +import java.util.Collections; +import java.util.Set; +import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +/** Tests low level the normalizer functionality */ +public class TestScandinavianNormalizer extends BaseTokenStreamTestCase { + public void testNoFoldings() throws Exception { + Analyzer analyzer = createAnalyzer(Collections.emptySet()); + checkOneTerm(analyzer, "aa", "aa"); + checkOneTerm(analyzer, "ao", "ao"); + checkOneTerm(analyzer, "ae", "ae"); + checkOneTerm(analyzer, "oo", "oo"); + checkOneTerm(analyzer, "oe", "oe"); + analyzer.close(); + } + + public void testAeFolding() throws Exception { + Analyzer analyzer = createAnalyzer(Set.of(Foldings.AE)); + checkOneTerm(analyzer, "aa", "aa"); + checkOneTerm(analyzer, "ao", "ao"); + checkOneTerm(analyzer, "ae", "æ"); + checkOneTerm(analyzer, "aE", "æ"); + checkOneTerm(analyzer, "Ae", "Æ"); + checkOneTerm(analyzer, "AE", "Æ"); + checkOneTerm(analyzer, "oo", "oo"); + checkOneTerm(analyzer, "oe", "oe"); + analyzer.close(); + } + + public void testAaFolding() throws Exception { + Analyzer analyzer = createAnalyzer(Set.of(Foldings.AA)); + checkOneTerm(analyzer, "aa", "å"); + checkOneTerm(analyzer, "aA", "å"); + checkOneTerm(analyzer, "Aa", "Å"); + checkOneTerm(analyzer, "AA", "Å"); + checkOneTerm(analyzer, "ao", "ao"); + checkOneTerm(analyzer, "ae", "ae"); + checkOneTerm(analyzer, "oo", "oo"); + checkOneTerm(analyzer, "oe", "oe"); + analyzer.close(); + } + + public void testOeFolding() throws Exception { + Analyzer analyzer = createAnalyzer(Set.of(Foldings.OE)); + checkOneTerm(analyzer, "aa", "aa"); + checkOneTerm(analyzer, "ao", "ao"); + checkOneTerm(analyzer, "ae", "ae"); + checkOneTerm(analyzer, "oo", "oo"); + checkOneTerm(analyzer, "oe", "ø"); + checkOneTerm(analyzer, "oE", "ø"); + 
checkOneTerm(analyzer, "Oe", "Ø"); + checkOneTerm(analyzer, "OE", "Ø"); + analyzer.close(); + } + + public void testOoFolding() throws Exception { + Analyzer analyzer = createAnalyzer(Set.of(Foldings.OO)); + checkOneTerm(analyzer, "aa", "aa"); + checkOneTerm(analyzer, "ao", "ao"); + checkOneTerm(analyzer, "ae", "ae"); + checkOneTerm(analyzer, "oo", "ø"); + checkOneTerm(analyzer, "oO", "ø"); + checkOneTerm(analyzer, "Oo", "Ø"); + checkOneTerm(analyzer, "OO", "Ø"); + checkOneTerm(analyzer, "oe", "oe"); + analyzer.close(); + } + + public void testAoFolding() throws Exception { + Analyzer analyzer = createAnalyzer(Set.of(Foldings.AO)); + checkOneTerm(analyzer, "aa", "aa"); + checkOneTerm(analyzer, "ao", "å"); + checkOneTerm(analyzer, "aO", "å"); + checkOneTerm(analyzer, "Ao", "Å"); + checkOneTerm(analyzer, "AO", "Å"); + checkOneTerm(analyzer, "ae", "ae"); + checkOneTerm(analyzer, "oo", "oo"); + checkOneTerm(analyzer, "oe", "oe"); + analyzer.close(); + } + + private Analyzer createAnalyzer(Set foldings) { + return new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String field) { + final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + final TokenStream stream = + new TokenFilter(tokenizer) { + private final CharTermAttribute charTermAttribute = + addAttribute(CharTermAttribute.class); + private final ScandinavianNormalizer normalizer = + new ScandinavianNormalizer(foldings); + + @Override + public boolean incrementToken() throws IOException { + if (!input.incrementToken()) { + return false; + } + charTermAttribute.setLength( + normalizer.processToken( + charTermAttribute.buffer(), charTermAttribute.length())); + return true; + } + }; + return new TokenStreamComponents(tokenizer, stream); + } + }; + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilter.java 
new file mode 100644 index 00000000000..c897fcb57a6 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilter.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.no; + +import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter; + +public class TestNorwegianNormalizationFilter extends BaseTokenStreamTestCase { + public void testDefault() throws Exception { + Analyzer analyzer = createAnalyzer(); + + checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException + + checkOneTerm(analyzer, "aeäaeeeae", "æææeeæ"); + checkOneTerm(analyzer, "aeaeeeae", "ææeeæ"); + + checkOneTerm(analyzer, "bøen", "bøen"); + checkOneTerm(analyzer, "bOEen", "bØen"); + checkOneTerm(analyzer, "åene", "åene"); + + checkOneTerm(analyzer, "blåbærsyltetøj", "blåbærsyltetøj"); + checkOneTerm(analyzer, "blaabaersyltetöj", "blåbærsyltetøj"); + checkOneTerm(analyzer, "räksmörgås", "ræksmørgås"); + checkOneTerm(analyzer, "raeksmörgaas", "ræksmørgås"); + checkOneTerm(analyzer, "raeksmoergås", 
"ræksmørgås"); + + checkOneTerm(analyzer, "ab", "ab"); + checkOneTerm(analyzer, "ob", "ob"); + checkOneTerm(analyzer, "Ab", "Ab"); + checkOneTerm(analyzer, "Ob", "Ob"); + + checkOneTerm(analyzer, "å", "å"); + + checkOneTerm(analyzer, "aa", "å"); + checkOneTerm(analyzer, "aA", "å"); + checkOneTerm(analyzer, "ao", "ao"); + checkOneTerm(analyzer, "aO", "aO"); + + checkOneTerm(analyzer, "AA", "Å"); + checkOneTerm(analyzer, "Aa", "Å"); + checkOneTerm(analyzer, "Ao", "Ao"); + checkOneTerm(analyzer, "AO", "AO"); + + checkOneTerm(analyzer, "æ", "æ"); + checkOneTerm(analyzer, "ä", "æ"); + + checkOneTerm(analyzer, "Æ", "Æ"); + checkOneTerm(analyzer, "Ä", "Æ"); + + checkOneTerm(analyzer, "ae", "æ"); + checkOneTerm(analyzer, "aE", "æ"); + + checkOneTerm(analyzer, "Ae", "Æ"); + checkOneTerm(analyzer, "AE", "Æ"); + + checkOneTerm(analyzer, "ö", "ø"); + checkOneTerm(analyzer, "ø", "ø"); + checkOneTerm(analyzer, "Ö", "Ø"); + checkOneTerm(analyzer, "Ø", "Ø"); + + checkOneTerm(analyzer, "oo", "oo"); + checkOneTerm(analyzer, "oe", "ø"); + checkOneTerm(analyzer, "oO", "oO"); + checkOneTerm(analyzer, "oE", "ø"); + + checkOneTerm(analyzer, "Oo", "Oo"); + checkOneTerm(analyzer, "Oe", "Ø"); + checkOneTerm(analyzer, "OO", "OO"); + checkOneTerm(analyzer, "OE", "Ø"); + analyzer.close(); + } + + /** check that the empty string doesn't cause issues */ + public void testEmptyTerm() throws Exception { + Analyzer a = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new KeywordTokenizer(); + return new TokenStreamComponents( + tokenizer, new NorwegianNormalizationFilter(tokenizer)); + } + }; + checkOneTerm(a, "", ""); + a.close(); + } + + /** blast some random strings through the analyzer */ + public void testRandomData() throws Exception { + Analyzer analyzer = createAnalyzer(); + checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER); + analyzer.close(); + } + + private Analyzer createAnalyzer() { + return new
Analyzer() { + @Override + protected TokenStreamComponents createComponents(String field) { + final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + final TokenStream stream = new NorwegianNormalizationFilter(tokenizer); + return new TokenStreamComponents(tokenizer, stream); + } + }; + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilterFactory.java new file mode 100644 index 00000000000..88ed19af6e0 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilterFactory.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.no; + +import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase; +import org.apache.lucene.analysis.TokenStream; + +public class TestNorwegianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase { + + public void testDefault() throws Exception { + TokenStream stream = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO"); + stream = tokenFilterFactory("NorwegianNormalization").create(stream); + assertTokenStreamContents(stream, new String[] {"ræksmørgås_æ_ø_å_oo_ao_Æ_Ø_Å_OO_AO"}); + } + + /** Test that bogus arguments result in exception */ + public void testBogusArguments() throws Exception { + IllegalArgumentException expected = + expectThrows( + IllegalArgumentException.class, + () -> { + tokenFilterFactory("NorwegianNormalization", "bogusArg", "bogusValue"); + }); + assertTrue( + "Got " + expected.getMessage(), expected.getMessage().contains("Unknown parameters")); + } +}