LUCENE-9929 NorwegianNormalizationFilter (#84)

This commit is contained in:
Jan Høydahl 2021-05-12 14:31:26 +02:00 committed by GitHub
parent 6ebf959502
commit 7dd7077609
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 572 additions and 110 deletions

View File

@ -225,6 +225,9 @@ Improvements
* LUCENE-9850: Switch to PFOR encoding for doc IDs (instead of FOR). (Greg Miller)
* LUCENE-9929: Add NorwegianNormalizationFilter, which does the same as ScandinavianNormalizationFilter except
it does not fold oo->ø and ao->å. (janhoy, Robert Muir, Adrien Grand)
Bug fixes

View File

@ -16,11 +16,12 @@
*/
package org.apache.lucene.analysis.miscellaneous;
import static org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.ALL_FOLDINGS;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.StemmerUtil;
/**
* This filter normalizes use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded
@ -33,98 +34,29 @@ import org.apache.lucene.analysis.util.StemmerUtil;
* <p>blåbærsyltetøj == blåbärsyltetöj == blaabaersyltetoej but not blabarsyltetoj räksmörgås ==
* ræksmørgås == ræksmörgaos == raeksmoergaas but not raksmorgas
*
* <p>There are also separate filters for Norwegian, Danish and Swedish with slightly differing
* settings
*
* @see ScandinavianFoldingFilter
*/
// NOTE(review): this span is a rendered unified diff with the +/- markers stripped — it
// interleaves the pre-LUCENE-9929 implementation (the char constants and the manual folding
// loop) with the post-change delegation to ScandinavianNormalizer. As shown it is NOT one
// compilable class; the comments below label which side of the diff each part belongs to.
public final class ScandinavianNormalizationFilter extends TokenFilter {
// New (added by this commit): all folding logic now lives in the shared ScandinavianNormalizer.
private final ScandinavianNormalizer normalizer;
public ScandinavianNormalizationFilter(TokenStream input) {
super(input);
// ALL_FOLDINGS enables every rule (AA, AO, AE, OE, OO) — the original filter's behavior.
this.normalizer = new ScandinavianNormalizer(ALL_FOLDINGS);
}
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
// Old (removed by this commit): single-character mappings, moved into ScandinavianNormalizer.
private static final char AA = '\u00C5'; // Å
private static final char aa = '\u00E5'; // å
private static final char AE = '\u00C6'; // Æ
private static final char ae = '\u00E6'; // æ
private static final char AE_se = '\u00C4'; // Ä
private static final char ae_se = '\u00E4'; // ä
private static final char OE = '\u00D8'; // Ø
private static final char oe = '\u00F8'; // ø
private static final char OE_se = '\u00D6'; // Ö
private static final char oe_se = '\u00F6'; // ö
@Override
public boolean incrementToken() throws IOException {
if (!input.incrementToken()) {
return false;
}
// Old (removed): in-place folding loop — maps ä/Ä→æ/Æ, ö/Ö→ø/Ø, and collapses the
// two-character sequences aa/ao→å, ae→æ, oe/oo→ø (plus uppercase variants).
char[] buffer = charTermAttribute.buffer();
int length = charTermAttribute.length();
int i;
for (i = 0; i < length; i++) {
if (buffer[i] == ae_se) {
buffer[i] = ae;
} else if (buffer[i] == AE_se) {
buffer[i] = AE;
} else if (buffer[i] == oe_se) {
buffer[i] = oe;
} else if (buffer[i] == OE_se) {
buffer[i] = OE;
} else if (length - 1 > i) {
// Two-character foldings only apply when a following character exists.
if (buffer[i] == 'a'
&& (buffer[i + 1] == 'a'
|| buffer[i + 1] == 'o'
|| buffer[i + 1] == 'A'
|| buffer[i + 1] == 'O')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = aa;
} else if (buffer[i] == 'A'
&& (buffer[i + 1] == 'a'
|| buffer[i + 1] == 'A'
|| buffer[i + 1] == 'o'
|| buffer[i + 1] == 'O')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = AA;
} else if (buffer[i] == 'a' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = ae;
} else if (buffer[i] == 'A' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = AE;
} else if (buffer[i] == 'o'
&& (buffer[i + 1] == 'e'
|| buffer[i + 1] == 'E'
|| buffer[i + 1] == 'o'
|| buffer[i + 1] == 'O')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = oe;
} else if (buffer[i] == 'O'
&& (buffer[i + 1] == 'e'
|| buffer[i + 1] == 'E'
|| buffer[i + 1] == 'o'
|| buffer[i + 1] == 'O')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = OE;
}
}
}
charTermAttribute.setLength(length);
// New (added): delegate the same folding work to the shared normalizer and store the
// (possibly shorter) resulting length back on the term attribute.
charTermAttribute.setLength(
normalizer.processToken(charTermAttribute.buffer(), charTermAttribute.length()));
return true;
}
}

View File

@ -0,0 +1,135 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.util.EnumSet;
import java.util.Set;
import org.apache.lucene.analysis.util.StemmerUtil;
/**
* This Normalizer does the heavy lifting for a set of Scandinavian normalization filters,
* normalizing use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded variants (aa,
* ao, ae, oe and oo) by transforming them to åÅæÆøØ.
*
* @since 9.0
* @lucene.internal
*/
public final class ScandinavianNormalizer {
/**
* Create the instance, while choosing which foldings to apply. This may differ between Norwegian,
* Danish and Swedish.
*
* @param foldings a Set of Foldings to apply (i.e. AE, OE, AA, AO, OO)
*/
public ScandinavianNormalizer(Set<Foldings> foldings) {
this.foldings = foldings;
}
/** List of possible foldings that can be used when configuring the filter */
public enum Foldings {
AA,
AO,
AE,
OE,
OO
}
private final Set<Foldings> foldings;
public static final Set<Foldings> ALL_FOLDINGS = EnumSet.allOf(Foldings.class);
static final char AA = '\u00C5'; // Å
static final char aa = '\u00E5'; // å
static final char AE = '\u00C6'; // Æ
static final char ae = '\u00E6'; // æ
static final char AE_se = '\u00C4'; // Ä
static final char ae_se = '\u00E4'; // ä
static final char OE = '\u00D8'; // Ø
static final char oe = '\u00F8'; // ø
static final char OE_se = '\u00D6'; // Ö
static final char oe_se = '\u00F6'; // ö
/**
* Takes the original buffer and length as input. Modifies the buffer in-place and returns new
* length
*
* @return new length
*/
public int processToken(char[] buffer, int length) {
int i;
for (i = 0; i < length; i++) {
if (buffer[i] == ae_se) {
buffer[i] = ae;
} else if (buffer[i] == AE_se) {
buffer[i] = AE;
} else if (buffer[i] == oe_se) {
buffer[i] = oe;
} else if (buffer[i] == OE_se) {
buffer[i] = OE;
} else if (length - 1 > i) {
if (buffer[i] == 'a'
&& (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A')
|| foldings.contains(Foldings.AO)
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = aa;
} else if (buffer[i] == 'A'
&& (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A')
|| foldings.contains(Foldings.AO)
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = AA;
} else if (buffer[i] == 'a'
&& foldings.contains(Foldings.AE)
&& (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = ae;
} else if (buffer[i] == 'A'
&& foldings.contains(Foldings.AE)
&& (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = AE;
} else if (buffer[i] == 'o'
&& (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')
|| foldings.contains(Foldings.OO)
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = oe;
} else if (buffer[i] == 'O'
&& (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')
|| foldings.contains(Foldings.OO)
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = OE;
}
}
}
return length;
}
}

View File

@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.no;
import java.io.IOException;
import java.util.EnumSet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * This filter normalizes use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded
 * variants (ae, oe, aa) by transforming them to åÅæÆøØ. This is similar to
 * ScandinavianNormalizationFilter, except the folding rules are customized for Norwegian: the
 * oo and ao digraphs are NOT folded.
 *
 * <p>blåbærsyltetøj == blåbärsyltetöj == blaabaersyltetoej
 *
 * @see ScandinavianNormalizationFilter
 */
public final class NorwegianNormalizationFilter extends TokenFilter {

  // Shared folding engine configured with the Norwegian subset of foldings (AE, OE, AA only).
  private final ScandinavianNormalizer normalizer;
  private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);

  /**
   * Creates the filter.
   *
   * @param input the token stream to normalize
   */
  public NorwegianNormalizationFilter(TokenStream input) {
    super(input);
    this.normalizer = new ScandinavianNormalizer(EnumSet.of(Foldings.AE, Foldings.OE, Foldings.AA));
  }

  // Fix: dropped the redundant "final" modifier — the enclosing class is already final.
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // Fold the current term in place; foldings may shrink the term, so reset the length.
    charTermAttribute.setLength(
        normalizer.processToken(charTermAttribute.buffer(), charTermAttribute.length()));
    return true;
  }
}

View File

@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.no;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
/**
 * Factory for {@link NorwegianNormalizationFilter}. Accepts no arguments.
 *
 * @lucene.spi {@value #NAME}
 */
public class NorwegianNormalizationFilterFactory extends TokenFilterFactory {

  /** SPI name */
  public static final String NAME = "norwegianNormalization";

  /**
   * Creates the factory.
   *
   * @param args factory arguments; must be empty, since this filter takes no options
   */
  public NorwegianNormalizationFilterFactory(Map<String, String> args) {
    super(args);
    if (args.isEmpty() == false) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  /** Default ctor for compatibility with SPI */
  public NorwegianNormalizationFilterFactory() {
    throw defaultCtorException();
  }

  @Override
  public NorwegianNormalizationFilter create(TokenStream input) {
    return new NorwegianNormalizationFilter(input);
  }

  @Override
  public TokenStream normalize(TokenStream input) {
    // Normalization (e.g. at query time) applies the exact same filtering.
    return create(input);
  }
}

View File

@ -95,6 +95,7 @@ org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory
org.apache.lucene.analysis.ngram.NGramFilterFactory
org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
org.apache.lucene.analysis.no.NorwegianNormalizationFilterFactory
org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory
org.apache.lucene.analysis.pattern.PatternTypingFilterFactory

View File

@ -16,37 +16,12 @@
*/
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
// NOTE(review): this span is a rendered unified diff with the +/- markers stripped — the
// removed per-test-class analyzer field with setUp/tearDown is interleaved with the added
// createAnalyzer() helper style, and hunk headers appear inline. It is not compilable as shown.
public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase {
// Old (removed): analyzer kept as a field, managed by setUp/tearDown.
private Analyzer analyzer;
@Override
public void setUp() throws Exception {
super.setUp();
analyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String field) {
final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
return new TokenStreamComponents(tokenizer, stream);
}
};
}
@Override
public void tearDown() throws Exception {
analyzer.close();
super.tearDown();
}
// Old signature (removed) immediately followed by the new one (added) — a diff artifact.
public void test() throws Exception {
public void testDefault() throws Exception {
// New (added): each test builds and closes its own analyzer.
Analyzer analyzer = createAnalyzer();
checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException
@ -107,6 +82,7 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase
checkOneTerm(analyzer, "Oe", "Ø");
checkOneTerm(analyzer, "OO", "Ø");
checkOneTerm(analyzer, "OE", "Ø");
analyzer.close();
}
/** check that the empty string doesn't cause issues */
@ -126,6 +102,19 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase
/** blast some random strings through the analyzer */
public void testRandomData() throws Exception {
Analyzer analyzer = createAnalyzer();
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
analyzer.close();
}
// New (added): helper replacing the old setUp/tearDown-managed field.
private Analyzer createAnalyzer() {
return new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String field) {
final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
return new TokenStreamComponents(tokenizer, stream);
}
};
}
}

View File

@ -21,10 +21,10 @@ import org.apache.lucene.analysis.TokenStream;
// NOTE(review): rendered diff without +/- markers — the old testStemming method header and
// asserts are interleaved with the renamed/extended testDefault versions; not compilable as shown.
public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
// Old method name (removed) followed by its replacement (added) — a diff artifact.
public void testStemming() throws Exception {
TokenStream stream = whitespaceMockTokenizer("räksmörgås");
public void testDefault() throws Exception {
// New (added): input exercising every digraph fold, lower- and upper-case.
TokenStream stream = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO");
stream = tokenFilterFactory("ScandinavianNormalization").create(stream);
// Old expectation (removed) followed by the new one (added).
assertTokenStreamContents(stream, new String[] {"ræksmørgås"});
assertTokenStreamContents(stream, new String[] {"ræksmørgås_æ_ø_å_ø_å_Æ_Ø_Å_Ø_Å"});
}
/** Test that bogus arguments result in exception */
@ -35,6 +35,7 @@ public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamF
() -> {
tokenFilterFactory("ScandinavianNormalization", "bogusArg", "bogusValue");
});
// Old assert (removed) followed by the new message-carrying form (added).
assertTrue(expected.getMessage().contains("Unknown parameters"));
assertTrue(
"Got " + expected.getMessage(), expected.getMessage().contains("Unknown parameters"));
}
}

View File

@ -0,0 +1,130 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/** Tests low level the normalizer functionality */
public class TestScandinavianNormalizer extends BaseTokenStreamTestCase {

  /** With no foldings enabled, every digraph must pass through unchanged. */
  public void testNoFoldings() throws Exception {
    Analyzer a = createAnalyzer(Collections.emptySet());
    checkOneTerm(a, "aa", "aa");
    checkOneTerm(a, "ao", "ao");
    checkOneTerm(a, "ae", "ae");
    checkOneTerm(a, "oo", "oo");
    checkOneTerm(a, "oe", "oe");
    a.close();
  }

  /** AE alone folds ae→æ (case-aware) and leaves all other digraphs untouched. */
  public void testAeFolding() throws Exception {
    Analyzer a = createAnalyzer(Set.of(Foldings.AE));
    checkOneTerm(a, "aa", "aa");
    checkOneTerm(a, "ao", "ao");
    checkOneTerm(a, "ae", "æ");
    checkOneTerm(a, "aE", "æ");
    checkOneTerm(a, "Ae", "Æ");
    checkOneTerm(a, "AE", "Æ");
    checkOneTerm(a, "oo", "oo");
    checkOneTerm(a, "oe", "oe");
    a.close();
  }

  /** AA alone folds aa→å (case-aware) and leaves all other digraphs untouched. */
  public void testAaFolding() throws Exception {
    Analyzer a = createAnalyzer(Set.of(Foldings.AA));
    checkOneTerm(a, "aa", "å");
    checkOneTerm(a, "aA", "å");
    checkOneTerm(a, "Aa", "Å");
    checkOneTerm(a, "AA", "Å");
    checkOneTerm(a, "ao", "ao");
    checkOneTerm(a, "ae", "ae");
    checkOneTerm(a, "oo", "oo");
    checkOneTerm(a, "oe", "oe");
    a.close();
  }

  /** OE alone folds oe→ø (case-aware) and leaves all other digraphs untouched. */
  public void testOeFolding() throws Exception {
    Analyzer a = createAnalyzer(Set.of(Foldings.OE));
    checkOneTerm(a, "aa", "aa");
    checkOneTerm(a, "ao", "ao");
    checkOneTerm(a, "ae", "ae");
    checkOneTerm(a, "oo", "oo");
    checkOneTerm(a, "oe", "ø");
    checkOneTerm(a, "oE", "ø");
    checkOneTerm(a, "Oe", "Ø");
    checkOneTerm(a, "OE", "Ø");
    a.close();
  }

  /** OO alone folds oo→ø (case-aware) and leaves all other digraphs untouched. */
  public void testOoFolding() throws Exception {
    Analyzer a = createAnalyzer(Set.of(Foldings.OO));
    checkOneTerm(a, "aa", "aa");
    checkOneTerm(a, "ao", "ao");
    checkOneTerm(a, "ae", "ae");
    checkOneTerm(a, "oo", "ø");
    checkOneTerm(a, "oO", "ø");
    checkOneTerm(a, "Oo", "Ø");
    checkOneTerm(a, "OO", "Ø");
    checkOneTerm(a, "oe", "oe");
    a.close();
  }

  /** AO alone folds ao→å (case-aware) and leaves all other digraphs untouched. */
  public void testAoFolding() throws Exception {
    Analyzer a = createAnalyzer(Set.of(Foldings.AO));
    checkOneTerm(a, "aa", "aa");
    checkOneTerm(a, "ao", "å");
    checkOneTerm(a, "aO", "å");
    checkOneTerm(a, "Ao", "Å");
    checkOneTerm(a, "AO", "Å");
    checkOneTerm(a, "ae", "ae");
    checkOneTerm(a, "oo", "oo");
    checkOneTerm(a, "oe", "oe");
    a.close();
  }

  /** Builds a whitespace analyzer whose filter runs the normalizer with the given foldings. */
  private Analyzer createAnalyzer(Set<Foldings> foldings) {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String field) {
        Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        TokenFilter sink =
            new TokenFilter(source) {
              private final ScandinavianNormalizer normalizer =
                  new ScandinavianNormalizer(foldings);
              private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

              @Override
              public boolean incrementToken() throws IOException {
                if (input.incrementToken() == false) {
                  return false;
                }
                termAtt.setLength(normalizer.processToken(termAtt.buffer(), termAtt.length()));
                return true;
              }
            };
        return new TokenStreamComponents(source, sink);
      }
    };
  }
}

View File

@ -0,0 +1,120 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.no;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
/** Tests for {@link NorwegianNormalizationFilter} (folds ae/oe/aa but NOT oo/ao). */
public class TestNorwegianNormalizationFilter extends BaseTokenStreamTestCase {

  public void testDefault() throws Exception {
    Analyzer analyzer = createAnalyzer();
    checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException
    checkOneTerm(analyzer, "aeäaeeeae", "æææeeæ");
    checkOneTerm(analyzer, "aeaeeeae", "ææeeæ");
    checkOneTerm(analyzer, "bøen", "bøen");
    checkOneTerm(analyzer, "bOEen", "bØen");
    checkOneTerm(analyzer, "åene", "åene");
    checkOneTerm(analyzer, "blåbærsyltetøj", "blåbærsyltetøj");
    checkOneTerm(analyzer, "blaabaersyltetöj", "blåbærsyltetøj");
    checkOneTerm(analyzer, "räksmörgås", "ræksmørgås");
    checkOneTerm(analyzer, "raeksmörgaas", "ræksmørgås");
    checkOneTerm(analyzer, "raeksmoergås", "ræksmørgås");
    checkOneTerm(analyzer, "ab", "ab");
    checkOneTerm(analyzer, "ob", "ob");
    checkOneTerm(analyzer, "Ab", "Ab");
    checkOneTerm(analyzer, "Ob", "Ob");
    checkOneTerm(analyzer, "å", "å");
    checkOneTerm(analyzer, "aa", "å");
    checkOneTerm(analyzer, "aA", "å");
    // ao/AO are NOT folded for Norwegian — this is the difference to the Scandinavian filter.
    checkOneTerm(analyzer, "ao", "ao");
    checkOneTerm(analyzer, "aO", "aO");
    checkOneTerm(analyzer, "AA", "Å");
    checkOneTerm(analyzer, "Aa", "Å");
    checkOneTerm(analyzer, "Ao", "Ao");
    checkOneTerm(analyzer, "AO", "AO");
    checkOneTerm(analyzer, "æ", "æ");
    checkOneTerm(analyzer, "ä", "æ");
    checkOneTerm(analyzer, "Æ", "Æ");
    checkOneTerm(analyzer, "Ä", "Æ");
    checkOneTerm(analyzer, "ae", "æ");
    checkOneTerm(analyzer, "aE", "æ");
    checkOneTerm(analyzer, "Ae", "Æ");
    checkOneTerm(analyzer, "AE", "Æ");
    checkOneTerm(analyzer, "ö", "ø");
    checkOneTerm(analyzer, "ø", "ø");
    checkOneTerm(analyzer, "Ö", "Ø");
    checkOneTerm(analyzer, "Ø", "Ø");
    // oo/OO are NOT folded for Norwegian either.
    checkOneTerm(analyzer, "oo", "oo");
    checkOneTerm(analyzer, "oe", "ø");
    checkOneTerm(analyzer, "oO", "oO");
    checkOneTerm(analyzer, "oE", "ø");
    checkOneTerm(analyzer, "Oo", "Oo");
    checkOneTerm(analyzer, "Oe", "Ø");
    checkOneTerm(analyzer, "OO", "OO");
    checkOneTerm(analyzer, "OE", "Ø");
    analyzer.close();
  }

  /** check that the empty string doesn't cause issues */
  public void testEmptyTerm() throws Exception {
    Analyzer a =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            // Fix: this previously wrapped the tokenizer in ScandinavianNormalizationFilter
            // (copy/paste from the Scandinavian test) instead of the filter under test.
            return new TokenStreamComponents(
                tokenizer, new NorwegianNormalizationFilter(tokenizer));
          }
        };
    checkOneTerm(a, "", "");
    a.close();
  }

  /** blast some random strings through the analyzer */
  public void testRandomData() throws Exception {
    Analyzer analyzer = createAnalyzer();
    checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
    analyzer.close();
  }

  /** Builds a whitespace analyzer wrapping {@link NorwegianNormalizationFilter}. */
  private Analyzer createAnalyzer() {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String field) {
        final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        final TokenStream stream = new NorwegianNormalizationFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
  }
}

View File

@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.no;
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.TokenStream;
/** Tests for the {@code NorwegianNormalization} token filter factory. */
public class TestNorwegianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {

  /** The factory-built filter folds ae/oe/aa but leaves oo/ao untouched. */
  public void testDefault() throws Exception {
    TokenStream ts = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO");
    ts = tokenFilterFactory("NorwegianNormalization").create(ts);
    assertTokenStreamContents(ts, new String[] {"ræksmørgås_æ_ø_å_oo_ao_Æ_Ø_Å_OO_AO"});
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    IllegalArgumentException e =
        expectThrows(
            IllegalArgumentException.class,
            () -> tokenFilterFactory("NorwegianNormalization", "bogusArg", "bogusValue"));
    assertTrue("Got " + e.getMessage(), e.getMessage().contains("Unknown parameters"));
  }
}