diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index cd689839875..bc34e2f446b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -225,6 +225,9 @@ Improvements
* LUCENE-9850: Switch to PFOR encoding for doc IDs (instead of FOR). (Greg Miller)
+* LUCENE-9929: Add NorwegianNormalizationFilter, which does the same as ScandinavianNormalizationFilter except
+ it does not fold oo->ø and ao->å. (janhoy, Robert Muir, Adrien Grand)
+
Bug fixes
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.java
index 82d38627429..4c5366f11eb 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.java
@@ -16,11 +16,12 @@
*/
package org.apache.lucene.analysis.miscellaneous;
+import static org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.ALL_FOLDINGS;
+
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.StemmerUtil;
/**
* This filter normalize use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded
@@ -33,98 +34,29 @@ import org.apache.lucene.analysis.util.StemmerUtil;
*
blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej but not blabarsyltetoj räksmörgås ==
* ræksmørgås == ræksmörgaos == raeksmoergaas but not raksmorgas
*
+ *
There are also separate filters for Norwegian, Danish and Swedish with slightly differing
+ * settings
+ *
* @see ScandinavianFoldingFilter
*/
public final class ScandinavianNormalizationFilter extends TokenFilter {
+ private final ScandinavianNormalizer normalizer;
+
public ScandinavianNormalizationFilter(TokenStream input) {
super(input);
+ this.normalizer = new ScandinavianNormalizer(ALL_FOLDINGS);
}
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
- private static final char AA = '\u00C5'; // Å
- private static final char aa = '\u00E5'; // å
- private static final char AE = '\u00C6'; // Æ
- private static final char ae = '\u00E6'; // æ
- private static final char AE_se = '\u00C4'; // Ä
- private static final char ae_se = '\u00E4'; // ä
- private static final char OE = '\u00D8'; // Ø
- private static final char oe = '\u00F8'; // ø
- private static final char OE_se = '\u00D6'; // Ö
- private static final char oe_se = '\u00F6'; // ö
-
@Override
public boolean incrementToken() throws IOException {
if (!input.incrementToken()) {
return false;
}
-
- char[] buffer = charTermAttribute.buffer();
- int length = charTermAttribute.length();
-
- int i;
- for (i = 0; i < length; i++) {
-
- if (buffer[i] == ae_se) {
- buffer[i] = ae;
-
- } else if (buffer[i] == AE_se) {
- buffer[i] = AE;
-
- } else if (buffer[i] == oe_se) {
- buffer[i] = oe;
-
- } else if (buffer[i] == OE_se) {
- buffer[i] = OE;
-
- } else if (length - 1 > i) {
-
- if (buffer[i] == 'a'
- && (buffer[i + 1] == 'a'
- || buffer[i + 1] == 'o'
- || buffer[i + 1] == 'A'
- || buffer[i + 1] == 'O')) {
- length = StemmerUtil.delete(buffer, i + 1, length);
- buffer[i] = aa;
-
- } else if (buffer[i] == 'A'
- && (buffer[i + 1] == 'a'
- || buffer[i + 1] == 'A'
- || buffer[i + 1] == 'o'
- || buffer[i + 1] == 'O')) {
- length = StemmerUtil.delete(buffer, i + 1, length);
- buffer[i] = AA;
-
- } else if (buffer[i] == 'a' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
- length = StemmerUtil.delete(buffer, i + 1, length);
- buffer[i] = ae;
-
- } else if (buffer[i] == 'A' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
- length = StemmerUtil.delete(buffer, i + 1, length);
- buffer[i] = AE;
-
- } else if (buffer[i] == 'o'
- && (buffer[i + 1] == 'e'
- || buffer[i + 1] == 'E'
- || buffer[i + 1] == 'o'
- || buffer[i + 1] == 'O')) {
- length = StemmerUtil.delete(buffer, i + 1, length);
- buffer[i] = oe;
-
- } else if (buffer[i] == 'O'
- && (buffer[i + 1] == 'e'
- || buffer[i + 1] == 'E'
- || buffer[i + 1] == 'o'
- || buffer[i + 1] == 'O')) {
- length = StemmerUtil.delete(buffer, i + 1, length);
- buffer[i] = OE;
- }
- }
- }
-
- charTermAttribute.setLength(length);
-
+ charTermAttribute.setLength(
+ normalizer.processToken(charTermAttribute.buffer(), charTermAttribute.length()));
return true;
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizer.java
new file mode 100644
index 00000000000..50187264a78
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizer.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.util.EnumSet;
+import java.util.Set;
+import org.apache.lucene.analysis.util.StemmerUtil;
+
+/**
+ * This Normalizer does the heavy lifting for a set of Scandinavian normalization filters,
+ * normalizing use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded variants (aa,
+ * ao, ae, oe and oo) by transforming them to åÅæÆøØ.
+ *
+ * @since 9.0
+ * @lucene.internal
+ */
+public final class ScandinavianNormalizer {
+
+ /**
+ * Create the instance, while choosing which foldings to apply. This may differ between Norwegian,
+ * Danish and Swedish.
+ *
+ * @param foldings a Set of Foldings to apply (i.e. AE, OE, AA, AO, OO)
+ */
+ public ScandinavianNormalizer(Set<Foldings> foldings) {
+ this.foldings = foldings;
+ }
+
+ /** List of possible foldings that can be used when configuring the filter */
+ public enum Foldings {
+ AA,
+ AO,
+ AE,
+ OE,
+ OO
+ }
+
+ private final Set<Foldings> foldings;
+
+ public static final Set<Foldings> ALL_FOLDINGS = EnumSet.allOf(Foldings.class);
+
+ static final char AA = '\u00C5'; // Å
+ static final char aa = '\u00E5'; // å
+ static final char AE = '\u00C6'; // Æ
+ static final char ae = '\u00E6'; // æ
+ static final char AE_se = '\u00C4'; // Ä
+ static final char ae_se = '\u00E4'; // ä
+ static final char OE = '\u00D8'; // Ø
+ static final char oe = '\u00F8'; // ø
+ static final char OE_se = '\u00D6'; // Ö
+ static final char oe_se = '\u00F6'; // ö
+
+ /**
+ * Takes the original buffer and length as input. Modifies the buffer in-place and returns new
+ * length
+ *
+ * @return new length
+ */
+ public int processToken(char[] buffer, int length) {
+ int i;
+ for (i = 0; i < length; i++) {
+
+ if (buffer[i] == ae_se) {
+ buffer[i] = ae;
+
+ } else if (buffer[i] == AE_se) {
+ buffer[i] = AE;
+
+ } else if (buffer[i] == oe_se) {
+ buffer[i] = oe;
+
+ } else if (buffer[i] == OE_se) {
+ buffer[i] = OE;
+
+ } else if (length - 1 > i) {
+
+ if (buffer[i] == 'a'
+ && (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A')
+ || foldings.contains(Foldings.AO)
+ && (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
+ length = StemmerUtil.delete(buffer, i + 1, length);
+ buffer[i] = aa;
+
+ } else if (buffer[i] == 'A'
+ && (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A')
+ || foldings.contains(Foldings.AO)
+ && (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
+ length = StemmerUtil.delete(buffer, i + 1, length);
+ buffer[i] = AA;
+
+ } else if (buffer[i] == 'a'
+ && foldings.contains(Foldings.AE)
+ && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
+ length = StemmerUtil.delete(buffer, i + 1, length);
+ buffer[i] = ae;
+
+ } else if (buffer[i] == 'A'
+ && foldings.contains(Foldings.AE)
+ && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
+ length = StemmerUtil.delete(buffer, i + 1, length);
+ buffer[i] = AE;
+
+ } else if (buffer[i] == 'o'
+ && (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')
+ || foldings.contains(Foldings.OO)
+ && (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
+ length = StemmerUtil.delete(buffer, i + 1, length);
+ buffer[i] = oe;
+
+ } else if (buffer[i] == 'O'
+ && (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')
+ || foldings.contains(Foldings.OO)
+ && (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
+ length = StemmerUtil.delete(buffer, i + 1, length);
+ buffer[i] = OE;
+ }
+ }
+ }
+ return length;
+ }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilter.java
new file mode 100644
index 00000000000..420a0662491
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilter.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.no;
+
+import java.io.IOException;
+import java.util.EnumSet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
+import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer;
+import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * This filter normalizes the use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded
+ * variants (ae, oe, aa) by transforming them to åÅæÆøØ. This is similar to
+ * ScandinavianNormalizationFilter, except for the folding rules customized for Norwegian.
+ *
+ * blåbærsyltetøj == blåbärsyltetöj == blaabaersyltetoej
+ *
+ * @see ScandinavianNormalizationFilter
+ */
+public final class NorwegianNormalizationFilter extends TokenFilter {
+ private final ScandinavianNormalizer normalizer;
+
+ public NorwegianNormalizationFilter(TokenStream input) {
+ super(input);
+ this.normalizer = new ScandinavianNormalizer(EnumSet.of(Foldings.AE, Foldings.OE, Foldings.AA));
+ }
+
+ private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (!input.incrementToken()) {
+ return false;
+ }
+ charTermAttribute.setLength(
+ normalizer.processToken(charTermAttribute.buffer(), charTermAttribute.length()));
+ return true;
+ }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilterFactory.java
new file mode 100644
index 00000000000..448b2e7c68e
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilterFactory.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.no;
+
+import java.util.Map;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory for {@link NorwegianNormalizationFilter}.
+ *
+ * @lucene.spi {@value #NAME}
+ */
+public class NorwegianNormalizationFilterFactory extends TokenFilterFactory {
+
+ /** SPI name */
+ public static final String NAME = "norwegianNormalization";
+
+ public NorwegianNormalizationFilterFactory(Map<String, String> args) {
+ super(args);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ /** Default ctor for compatibility with SPI */
+ public NorwegianNormalizationFilterFactory() {
+ throw defaultCtorException();
+ }
+
+ @Override
+ public NorwegianNormalizationFilter create(TokenStream input) {
+ return new NorwegianNormalizationFilter(input);
+ }
+
+ @Override
+ public TokenStream normalize(TokenStream input) {
+ return create(input);
+ }
+}
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
index ce2fd6434ea..e08399d7cc4 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@@ -95,6 +95,7 @@ org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory
org.apache.lucene.analysis.ngram.NGramFilterFactory
org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
+org.apache.lucene.analysis.no.NorwegianNormalizationFilterFactory
org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory
org.apache.lucene.analysis.pattern.PatternTypingFilterFactory
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilter.java
index 0dd551fa419..d26f7a02c8f 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilter.java
@@ -16,37 +16,12 @@
*/
package org.apache.lucene.analysis.miscellaneous;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase {
- private Analyzer analyzer;
-
- @Override
- public void setUp() throws Exception {
- super.setUp();
- analyzer =
- new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String field) {
- final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
- return new TokenStreamComponents(tokenizer, stream);
- }
- };
- }
-
- @Override
- public void tearDown() throws Exception {
- analyzer.close();
- super.tearDown();
- }
-
- public void test() throws Exception {
+ public void testDefault() throws Exception {
+ Analyzer analyzer = createAnalyzer();
checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException
@@ -107,6 +82,7 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase
checkOneTerm(analyzer, "Oe", "Ø");
checkOneTerm(analyzer, "OO", "Ø");
checkOneTerm(analyzer, "OE", "Ø");
+ analyzer.close();
}
/** check that the empty string doesn't cause issues */
@@ -126,6 +102,19 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase
/** blast some random strings through the analyzer */
public void testRandomData() throws Exception {
+ Analyzer analyzer = createAnalyzer();
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
+ analyzer.close();
+ }
+
+ private Analyzer createAnalyzer() {
+ return new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String field) {
+ final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
}
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilterFactory.java
index 97187a71c16..2966fd47846 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilterFactory.java
@@ -21,10 +21,10 @@ import org.apache.lucene.analysis.TokenStream;
public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
- public void testStemming() throws Exception {
- TokenStream stream = whitespaceMockTokenizer("räksmörgås");
+ public void testDefault() throws Exception {
+ TokenStream stream = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO");
stream = tokenFilterFactory("ScandinavianNormalization").create(stream);
- assertTokenStreamContents(stream, new String[] {"ræksmørgås"});
+ assertTokenStreamContents(stream, new String[] {"ræksmørgås_æ_ø_å_ø_å_Æ_Ø_Å_Ø_Å"});
}
/** Test that bogus arguments result in exception */
@@ -35,6 +35,7 @@ public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamF
() -> {
tokenFilterFactory("ScandinavianNormalization", "bogusArg", "bogusValue");
});
- assertTrue(expected.getMessage().contains("Unknown parameters"));
+ assertTrue(
+ "Got " + expected.getMessage(), expected.getMessage().contains("Unknown parameters"));
}
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizer.java
new file mode 100644
index 00000000000..2bea5346318
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizer.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/** Tests low level the normalizer functionality */
+public class TestScandinavianNormalizer extends BaseTokenStreamTestCase {
+ public void testNoFoldings() throws Exception {
+ Analyzer analyzer = createAnalyzer(Collections.emptySet());
+ checkOneTerm(analyzer, "aa", "aa");
+ checkOneTerm(analyzer, "ao", "ao");
+ checkOneTerm(analyzer, "ae", "ae");
+ checkOneTerm(analyzer, "oo", "oo");
+ checkOneTerm(analyzer, "oe", "oe");
+ analyzer.close();
+ }
+
+ public void testAeFolding() throws Exception {
+ Analyzer analyzer = createAnalyzer(Set.of(Foldings.AE));
+ checkOneTerm(analyzer, "aa", "aa");
+ checkOneTerm(analyzer, "ao", "ao");
+ checkOneTerm(analyzer, "ae", "æ");
+ checkOneTerm(analyzer, "aE", "æ");
+ checkOneTerm(analyzer, "Ae", "Æ");
+ checkOneTerm(analyzer, "AE", "Æ");
+ checkOneTerm(analyzer, "oo", "oo");
+ checkOneTerm(analyzer, "oe", "oe");
+ analyzer.close();
+ }
+
+ public void testAaFolding() throws Exception {
+ Analyzer analyzer = createAnalyzer(Set.of(Foldings.AA));
+ checkOneTerm(analyzer, "aa", "å");
+ checkOneTerm(analyzer, "aA", "å");
+ checkOneTerm(analyzer, "Aa", "Å");
+ checkOneTerm(analyzer, "AA", "Å");
+ checkOneTerm(analyzer, "ao", "ao");
+ checkOneTerm(analyzer, "ae", "ae");
+ checkOneTerm(analyzer, "oo", "oo");
+ checkOneTerm(analyzer, "oe", "oe");
+ analyzer.close();
+ }
+
+ public void testOeFolding() throws Exception {
+ Analyzer analyzer = createAnalyzer(Set.of(Foldings.OE));
+ checkOneTerm(analyzer, "aa", "aa");
+ checkOneTerm(analyzer, "ao", "ao");
+ checkOneTerm(analyzer, "ae", "ae");
+ checkOneTerm(analyzer, "oo", "oo");
+ checkOneTerm(analyzer, "oe", "ø");
+ checkOneTerm(analyzer, "oE", "ø");
+ checkOneTerm(analyzer, "Oe", "Ø");
+ checkOneTerm(analyzer, "OE", "Ø");
+ analyzer.close();
+ }
+
+ public void testOoFolding() throws Exception {
+ Analyzer analyzer = createAnalyzer(Set.of(Foldings.OO));
+ checkOneTerm(analyzer, "aa", "aa");
+ checkOneTerm(analyzer, "ao", "ao");
+ checkOneTerm(analyzer, "ae", "ae");
+ checkOneTerm(analyzer, "oo", "ø");
+ checkOneTerm(analyzer, "oO", "ø");
+ checkOneTerm(analyzer, "Oo", "Ø");
+ checkOneTerm(analyzer, "OO", "Ø");
+ checkOneTerm(analyzer, "oe", "oe");
+ analyzer.close();
+ }
+
+ public void testAoFolding() throws Exception {
+ Analyzer analyzer = createAnalyzer(Set.of(Foldings.AO));
+ checkOneTerm(analyzer, "aa", "aa");
+ checkOneTerm(analyzer, "ao", "å");
+ checkOneTerm(analyzer, "aO", "å");
+ checkOneTerm(analyzer, "Ao", "Å");
+ checkOneTerm(analyzer, "AO", "Å");
+ checkOneTerm(analyzer, "ae", "ae");
+ checkOneTerm(analyzer, "oo", "oo");
+ checkOneTerm(analyzer, "oe", "oe");
+ analyzer.close();
+ }
+
+ private Analyzer createAnalyzer(Set<Foldings> foldings) {
+ return new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String field) {
+ final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ final TokenStream stream =
+ new TokenFilter(tokenizer) {
+ private final CharTermAttribute charTermAttribute =
+ addAttribute(CharTermAttribute.class);
+ private final ScandinavianNormalizer normalizer =
+ new ScandinavianNormalizer(foldings);
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (!input.incrementToken()) {
+ return false;
+ }
+ charTermAttribute.setLength(
+ normalizer.processToken(
+ charTermAttribute.buffer(), charTermAttribute.length()));
+ return true;
+ }
+ };
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
+ }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilter.java
new file mode 100644
index 00000000000..c897fcb57a6
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilter.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.no;
+
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
+
+public class TestNorwegianNormalizationFilter extends BaseTokenStreamTestCase {
+ public void testDefault() throws Exception {
+ Analyzer analyzer = createAnalyzer();
+
+ checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException
+
+ checkOneTerm(analyzer, "aeäaeeeae", "æææeeæ");
+ checkOneTerm(analyzer, "aeaeeeae", "ææeeæ");
+
+ checkOneTerm(analyzer, "bøen", "bøen");
+ checkOneTerm(analyzer, "bOEen", "bØen");
+ checkOneTerm(analyzer, "åene", "åene");
+
+ checkOneTerm(analyzer, "blåbærsyltetøj", "blåbærsyltetøj");
+ checkOneTerm(analyzer, "blaabaersyltetöj", "blåbærsyltetøj");
+ checkOneTerm(analyzer, "räksmörgås", "ræksmørgås");
+ checkOneTerm(analyzer, "raeksmörgaas", "ræksmørgås");
+ checkOneTerm(analyzer, "raeksmoergås", "ræksmørgås");
+
+ checkOneTerm(analyzer, "ab", "ab");
+ checkOneTerm(analyzer, "ob", "ob");
+ checkOneTerm(analyzer, "Ab", "Ab");
+ checkOneTerm(analyzer, "Ob", "Ob");
+
+ checkOneTerm(analyzer, "å", "å");
+
+ checkOneTerm(analyzer, "aa", "å");
+ checkOneTerm(analyzer, "aA", "å");
+ checkOneTerm(analyzer, "ao", "ao");
+ checkOneTerm(analyzer, "aO", "aO");
+
+ checkOneTerm(analyzer, "AA", "Å");
+ checkOneTerm(analyzer, "Aa", "Å");
+ checkOneTerm(analyzer, "Ao", "Ao");
+ checkOneTerm(analyzer, "AO", "AO");
+
+ checkOneTerm(analyzer, "æ", "æ");
+ checkOneTerm(analyzer, "ä", "æ");
+
+ checkOneTerm(analyzer, "Æ", "Æ");
+ checkOneTerm(analyzer, "Ä", "Æ");
+
+ checkOneTerm(analyzer, "ae", "æ");
+ checkOneTerm(analyzer, "aE", "æ");
+
+ checkOneTerm(analyzer, "Ae", "Æ");
+ checkOneTerm(analyzer, "AE", "Æ");
+
+ checkOneTerm(analyzer, "ö", "ø");
+ checkOneTerm(analyzer, "ø", "ø");
+ checkOneTerm(analyzer, "Ö", "Ø");
+ checkOneTerm(analyzer, "Ø", "Ø");
+
+ checkOneTerm(analyzer, "oo", "oo");
+ checkOneTerm(analyzer, "oe", "ø");
+ checkOneTerm(analyzer, "oO", "oO");
+ checkOneTerm(analyzer, "oE", "ø");
+
+ checkOneTerm(analyzer, "Oo", "Oo");
+ checkOneTerm(analyzer, "Oe", "Ø");
+ checkOneTerm(analyzer, "OO", "OO");
+ checkOneTerm(analyzer, "OE", "Ø");
+ analyzer.close();
+ }
+
+ /** check that the empty string doesn't cause issues */
+ public void testEmptyTerm() throws Exception {
+ Analyzer a =
+ new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new KeywordTokenizer();
+ return new TokenStreamComponents(
+ tokenizer, new NorwegianNormalizationFilter(tokenizer));
+ }
+ };
+ checkOneTerm(a, "", "");
+ a.close();
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomData() throws Exception {
+ Analyzer analyzer = createAnalyzer();
+ checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
+ analyzer.close();
+ }
+
+ private Analyzer createAnalyzer() {
+ return new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String field) {
+ final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ final TokenStream stream = new NorwegianNormalizationFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
+ }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilterFactory.java
new file mode 100644
index 00000000000..88ed19af6e0
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilterFactory.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.no;
+
+import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.analysis.TokenStream;
+
+public class TestNorwegianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+ public void testDefault() throws Exception {
+ TokenStream stream = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO");
+ stream = tokenFilterFactory("NorwegianNormalization").create(stream);
+ assertTokenStreamContents(stream, new String[] {"ræksmørgås_æ_ø_å_oo_ao_Æ_Ø_Å_OO_AO"});
+ }
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ IllegalArgumentException expected =
+ expectThrows(
+ IllegalArgumentException.class,
+ () -> {
+ tokenFilterFactory("NorwegianNormalization", "bogusArg", "bogusValue");
+ });
+ assertTrue(
+ "Got " + expected.getMessage(), expected.getMessage().contains("Unknown parameters"));
+ }
+}