LUCENE-9929 NorwegianNormalizationFilter (#84)

2021-05-12 14:31:26 +02:00 · 2021-05-12 14:31:26 +02:00 · 7dd7077609
parent 6ebf959502
commit 7dd7077609
11 changed files with 572 additions and 110 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -225,6 +225,9 @@ Improvements
 * LUCENE-9850: Switch to PFOR encoding for doc IDs (instead of FOR). (Greg Miller)
 * LUCENE-9929: Add NorwegianNormalizationFilter, which does the same as ScandinavianNormalizationFilter except
  it does not fold oo->ø and ao->å. (janhoy, Robert Muir, Adrien Grand)
 Bug fixes
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.java
@ -16,11 +16,12 @@
 */
 package org.apache.lucene.analysis.miscellaneous;
 import static org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.ALL_FOLDINGS;
 import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.StemmerUtil;
 /**
 * This filter normalize use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded
@ -33,98 +34,29 @@ import org.apache.lucene.analysis.util.StemmerUtil;
 * <p>blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej but not blabarsyltetoj räksmörgås ==
 * ræksmørgås == ræksmörgaos == raeksmoergaas but not raksmorgas
 *
 * <p>There are also separate filters for Norwegian, Danish and Swedish with slightly differing
 * settings
 *
 * @see ScandinavianFoldingFilter
 */
 public final class ScandinavianNormalizationFilter extends TokenFilter {
  private final ScandinavianNormalizer normalizer;
  public ScandinavianNormalizationFilter(TokenStream input) {
    super(input);
    this.normalizer = new ScandinavianNormalizer(ALL_FOLDINGS);
  }
  private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
  private static final char AA = '\u00C5'; // Å
  private static final char aa = '\u00E5'; // å
  private static final char AE = '\u00C6'; // Æ
  private static final char ae = '\u00E6'; // æ
  private static final char AE_se = '\u00C4'; // Ä
  private static final char ae_se = '\u00E4'; // ä
  private static final char OE = '\u00D8'; // Ø
  private static final char oe = '\u00F8'; // ø
  private static final char OE_se = '\u00D6'; // Ö
  private static final char oe_se = '\u00F6'; // ö
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
-
+    charTermAttribute.setLength(
-    char[] buffer = charTermAttribute.buffer();
+        normalizer.processToken(charTermAttribute.buffer(), charTermAttribute.length()));
    int length = charTermAttribute.length();
    int i;
    for (i = 0; i < length; i++) {
      if (buffer[i] == ae_se) {
        buffer[i] = ae;
      } else if (buffer[i] == AE_se) {
        buffer[i] = AE;
      } else if (buffer[i] == oe_se) {
        buffer[i] = oe;
      } else if (buffer[i] == OE_se) {
        buffer[i] = OE;
      } else if (length - 1 > i) {
        if (buffer[i] == 'a'
            && (buffer[i + 1] == 'a'
                || buffer[i + 1] == 'o'
                || buffer[i + 1] == 'A'
                || buffer[i + 1] == 'O')) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          buffer[i] = aa;
        } else if (buffer[i] == 'A'
            && (buffer[i + 1] == 'a'
                || buffer[i + 1] == 'A'
                || buffer[i + 1] == 'o'
                || buffer[i + 1] == 'O')) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          buffer[i] = AA;
        } else if (buffer[i] == 'a' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          buffer[i] = ae;
        } else if (buffer[i] == 'A' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          buffer[i] = AE;
        } else if (buffer[i] == 'o'
            && (buffer[i + 1] == 'e'
                || buffer[i + 1] == 'E'
                || buffer[i + 1] == 'o'
                || buffer[i + 1] == 'O')) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          buffer[i] = oe;
        } else if (buffer[i] == 'O'
            && (buffer[i + 1] == 'e'
                || buffer[i + 1] == 'E'
                || buffer[i + 1] == 'o'
                || buffer[i + 1] == 'O')) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          buffer[i] = OE;
        }
      }
    }
    charTermAttribute.setLength(length);
    return true;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizer.java
@ -0,0 +1,135 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.miscellaneous;
 import java.util.EnumSet;
 import java.util.Set;
 import org.apache.lucene.analysis.util.StemmerUtil;
 /**
 * This Normalizer does the heavy lifting for a set of Scandinavian normalization filters,
 * normalizing use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded variants (aa,
 * ao, ae, oe and oo) by transforming them to åÅæÆøØ.
 *
 * @since 9.0
 * @lucene.internal
 */
 public final class ScandinavianNormalizer {
  /**
   * Create the instance, while choosing which foldings to apply. This may differ between Norwegian,
   * Danish and Swedish.
   *
   * @param foldings a Set of Foldings to apply (i.e. AE, OE, AA, AO, OO)
   */
  public ScandinavianNormalizer(Set<Foldings> foldings) {
    this.foldings = foldings;
  }
  /** List of possible foldings that can be used when configuring the filter */
  public enum Foldings {
    AA,
    AO,
    AE,
    OE,
    OO
  }
  private final Set<Foldings> foldings;
  public static final Set<Foldings> ALL_FOLDINGS = EnumSet.allOf(Foldings.class);
  static final char AA = '\u00C5'; // Å
  static final char aa = '\u00E5'; // å
  static final char AE = '\u00C6'; // Æ
  static final char ae = '\u00E6'; // æ
  static final char AE_se = '\u00C4'; // Ä
  static final char ae_se = '\u00E4'; // ä
  static final char OE = '\u00D8'; // Ø
  static final char oe = '\u00F8'; // ø
  static final char OE_se = '\u00D6'; // Ö
  static final char oe_se = '\u00F6'; // ö
  /**
   * Takes the original buffer and length as input. Modifies the buffer in-place and returns new
   * length
   *
   * @return new length
   */
  public int processToken(char[] buffer, int length) {
    int i;
    for (i = 0; i < length; i++) {
      if (buffer[i] == ae_se) {
        buffer[i] = ae;
      } else if (buffer[i] == AE_se) {
        buffer[i] = AE;
      } else if (buffer[i] == oe_se) {
        buffer[i] = oe;
      } else if (buffer[i] == OE_se) {
        buffer[i] = OE;
      } else if (length - 1 > i) {
        if (buffer[i] == 'a'
            && (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A')
                || foldings.contains(Foldings.AO)
                    && (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          buffer[i] = aa;
        } else if (buffer[i] == 'A'
            && (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A')
                || foldings.contains(Foldings.AO)
                    && (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          buffer[i] = AA;
        } else if (buffer[i] == 'a'
            && foldings.contains(Foldings.AE)
            && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          buffer[i] = ae;
        } else if (buffer[i] == 'A'
            && foldings.contains(Foldings.AE)
            && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          buffer[i] = AE;
        } else if (buffer[i] == 'o'
            && (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')
                || foldings.contains(Foldings.OO)
                    && (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          buffer[i] = oe;
        } else if (buffer[i] == 'O'
            && (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')
                || foldings.contains(Foldings.OO)
                    && (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          buffer[i] = OE;
        }
      }
    }
    return length;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilter.java
@ -0,0 +1,56 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.no;
 import java.io.IOException;
 import java.util.EnumSet;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
 import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer;
 import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
 * Token filter that normalizes the interchangeable Scandinavian characters æÆäÄöÖøØ and the folded
 * variants ae, oe and aa by rewriting them to åÅæÆøØ. It works like
 * ScandinavianNormalizationFilter, but with the set of foldings restricted to AE, OE and AA for
 * Norwegian.
 *
 * <p>blåbærsyltetøj == blåbärsyltetöj == blaabaersyltetoej
 *
 * @see ScandinavianNormalizationFilter
 */
 public final class NorwegianNormalizationFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  // Only the AE, OE and AA foldings are enabled for this filter.
  private final ScandinavianNormalizer norwegianNormalizer =
      new ScandinavianNormalizer(EnumSet.of(Foldings.AE, Foldings.OE, Foldings.AA));
  /**
   * Wraps the given stream.
   *
   * @param input the token stream whose terms are normalized
   */
  public NorwegianNormalizationFilter(TokenStream input) {
    super(input);
  }
  /**
   * Advances the wrapped stream and normalizes the current term in place, shrinking the term
   * length to whatever the normalizer reports.
   *
   * @return {@code false} once the wrapped stream is exhausted, {@code true} otherwise
   */
  @Override
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      final char[] termBuffer = termAtt.buffer();
      final int normalizedLength = norwegianNormalizer.processToken(termBuffer, termAtt.length());
      termAtt.setLength(normalizedLength);
      return true;
    }
    return false;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilterFactory.java
@ -0,0 +1,54 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.no;
 import java.util.Map;
 import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
 /**
 * Factory for {@link NorwegianNormalizationFilter}.
 *
 * @lucene.spi {@value #NAME}
 */
 public class NorwegianNormalizationFilterFactory extends TokenFilterFactory {
  /** SPI name */
  public static final String NAME = "norwegianNormalization";
  /**
   * Creates the factory from its configuration arguments.
   *
   * @param args configuration arguments
   * @throws IllegalArgumentException if {@code args} is not empty after the superclass constructor
   *     has run
   */
  public NorwegianNormalizationFilterFactory(Map<String, String> args) {
    super(args);
    final boolean hasLeftoverArgs = !args.isEmpty();
    if (hasLeftoverArgs) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }
  /** Default ctor for compatibility with SPI */
  public NorwegianNormalizationFilterFactory() {
    throw defaultCtorException();
  }
  /** Wraps {@code input} in a new {@link NorwegianNormalizationFilter}. */
  @Override
  public NorwegianNormalizationFilter create(TokenStream input) {
    return new NorwegianNormalizationFilter(input);
  }
  /** Applies the same filter as {@link #create(TokenStream)}. */
  @Override
  public TokenStream normalize(TokenStream input) {
    final TokenStream normalized = create(input);
    return normalized;
  }
 }
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@ -95,6 +95,7 @@ org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory
 org.apache.lucene.analysis.ngram.NGramFilterFactory
 org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
 org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
 org.apache.lucene.analysis.no.NorwegianNormalizationFilterFactory
 org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
 org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory
 org.apache.lucene.analysis.pattern.PatternTypingFilterFactory
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilter.java
@ -16,37 +16,12 @@
 */
 package org.apache.lucene.analysis.miscellaneous;
-import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase {
-  private Analyzer analyzer;
+  public void testDefault() throws Exception {
-
+    Analyzer analyzer = createAnalyzer();
  @Override
  public void setUp() throws Exception {
    super.setUp();
    analyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String field) {
            final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, stream);
          }
        };
  }
  @Override
  public void tearDown() throws Exception {
    analyzer.close();
    super.tearDown();
  }
  public void test() throws Exception {
    checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException
@ -107,6 +82,7 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase
    checkOneTerm(analyzer, "Oe", "Ø");
    checkOneTerm(analyzer, "OO", "Ø");
    checkOneTerm(analyzer, "OE", "Ø");
    analyzer.close();
  }
  /** check that the empty string doesn't cause issues */
@ -126,6 +102,19 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase
  /** blast some random strings through the analyzer */
  public void testRandomData() throws Exception {
    Analyzer analyzer = createAnalyzer();
    checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
    analyzer.close();
  }
  private Analyzer createAnalyzer() {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String field) {
        final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilterFactory.java
@ -21,10 +21,10 @@ import org.apache.lucene.analysis.TokenStream;
 public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
-  public void testStemming() throws Exception {
+  public void testDefault() throws Exception {
-    TokenStream stream = whitespaceMockTokenizer("räksmörgås");
+    TokenStream stream = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO");
    stream = tokenFilterFactory("ScandinavianNormalization").create(stream);
-    assertTokenStreamContents(stream, new String[] {"ræksmørgås"});
+    assertTokenStreamContents(stream, new String[] {"ræksmørgås_æ_ø_å_ø_å_Æ_Ø_Å_Ø_Å"});
  }
  /** Test that bogus arguments result in exception */
@ -35,6 +35,7 @@ public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamF
            () -> {
              tokenFilterFactory("ScandinavianNormalization", "bogusArg", "bogusValue");
            });
-    assertTrue(expected.getMessage().contains("Unknown parameters"));
+    assertTrue(
        "Got " + expected.getMessage(), expected.getMessage().contains("Unknown parameters"));
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizer.java
@ -0,0 +1,130 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.miscellaneous;
 import java.io.IOException;
 import java.util.Collections;
 import java.util.Set;
 import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
 * Low-level tests of {@link ScandinavianNormalizer}: each test enables at most one folding and
 * checks that only the matching character pair is rewritten.
 *
 * <p>Analyzers are opened with try-with-resources so they are closed even when a check fails.
 */
 public class TestScandinavianNormalizer extends BaseTokenStreamTestCase {
  /** With no folding enabled, every two-letter sequence is left untouched. */
  public void testNoFoldings() throws Exception {
    try (Analyzer analyzer = createAnalyzer(Collections.emptySet())) {
      checkOneTerm(analyzer, "aa", "aa");
      checkOneTerm(analyzer, "ao", "ao");
      checkOneTerm(analyzer, "ae", "ae");
      checkOneTerm(analyzer, "oo", "oo");
      checkOneTerm(analyzer, "oe", "oe");
    }
  }
  /** Only ae (in any letter case) is folded, to æ/Æ. */
  public void testAeFolding() throws Exception {
    try (Analyzer analyzer = createAnalyzer(Set.of(Foldings.AE))) {
      checkOneTerm(analyzer, "aa", "aa");
      checkOneTerm(analyzer, "ao", "ao");
      checkOneTerm(analyzer, "ae", "æ");
      checkOneTerm(analyzer, "aE", "æ");
      checkOneTerm(analyzer, "Ae", "Æ");
      checkOneTerm(analyzer, "AE", "Æ");
      checkOneTerm(analyzer, "oo", "oo");
      checkOneTerm(analyzer, "oe", "oe");
    }
  }
  /** Only aa (in any letter case) is folded, to å/Å. */
  public void testAaFolding() throws Exception {
    try (Analyzer analyzer = createAnalyzer(Set.of(Foldings.AA))) {
      checkOneTerm(analyzer, "aa", "å");
      checkOneTerm(analyzer, "aA", "å");
      checkOneTerm(analyzer, "Aa", "Å");
      checkOneTerm(analyzer, "AA", "Å");
      checkOneTerm(analyzer, "ao", "ao");
      checkOneTerm(analyzer, "ae", "ae");
      checkOneTerm(analyzer, "oo", "oo");
      checkOneTerm(analyzer, "oe", "oe");
    }
  }
  /** Only oe (in any letter case) is folded, to ø/Ø. */
  public void testOeFolding() throws Exception {
    try (Analyzer analyzer = createAnalyzer(Set.of(Foldings.OE))) {
      checkOneTerm(analyzer, "aa", "aa");
      checkOneTerm(analyzer, "ao", "ao");
      checkOneTerm(analyzer, "ae", "ae");
      checkOneTerm(analyzer, "oo", "oo");
      checkOneTerm(analyzer, "oe", "ø");
      checkOneTerm(analyzer, "oE", "ø");
      checkOneTerm(analyzer, "Oe", "Ø");
      checkOneTerm(analyzer, "OE", "Ø");
    }
  }
  /** Only oo (in any letter case) is folded, to ø/Ø. */
  public void testOoFolding() throws Exception {
    try (Analyzer analyzer = createAnalyzer(Set.of(Foldings.OO))) {
      checkOneTerm(analyzer, "aa", "aa");
      checkOneTerm(analyzer, "ao", "ao");
      checkOneTerm(analyzer, "ae", "ae");
      checkOneTerm(analyzer, "oo", "ø");
      checkOneTerm(analyzer, "oO", "ø");
      checkOneTerm(analyzer, "Oo", "Ø");
      checkOneTerm(analyzer, "OO", "Ø");
      checkOneTerm(analyzer, "oe", "oe");
    }
  }
  /** Only ao (in any letter case) is folded, to å/Å. */
  public void testAoFolding() throws Exception {
    try (Analyzer analyzer = createAnalyzer(Set.of(Foldings.AO))) {
      checkOneTerm(analyzer, "aa", "aa");
      checkOneTerm(analyzer, "ao", "å");
      checkOneTerm(analyzer, "aO", "å");
      checkOneTerm(analyzer, "Ao", "Å");
      checkOneTerm(analyzer, "AO", "Å");
      checkOneTerm(analyzer, "ae", "ae");
      checkOneTerm(analyzer, "oo", "oo");
      checkOneTerm(analyzer, "oe", "oe");
    }
  }
  /**
   * Builds an analyzer that whitespace-tokenizes its input and passes every term through a
   * {@link ScandinavianNormalizer} configured with the given foldings.
   */
  private Analyzer createAnalyzer(Set<Foldings> foldings) {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String field) {
        final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        // Minimal filter around the normalizer, so the normalizer is tested without a real filter.
        final TokenStream stream =
            new TokenFilter(tokenizer) {
              private final CharTermAttribute charTermAttribute =
                  addAttribute(CharTermAttribute.class);
              private final ScandinavianNormalizer normalizer =
                  new ScandinavianNormalizer(foldings);
              @Override
              public boolean incrementToken() throws IOException {
                if (!input.incrementToken()) {
                  return false;
                }
                charTermAttribute.setLength(
                    normalizer.processToken(
                        charTermAttribute.buffer(), charTermAttribute.length()));
                return true;
              }
            };
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilter.java
@ -0,0 +1,120 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.no;
 import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
 /**
 * Tests {@link NorwegianNormalizationFilter}. Unlike {@link ScandinavianNormalizationFilter}, the
 * expectations below leave "oo" and "ao" unfolded.
 *
 * <p>Analyzers are opened with try-with-resources so they are closed even when a check fails.
 */
 public class TestNorwegianNormalizationFilter extends BaseTokenStreamTestCase {
  public void testDefault() throws Exception {
    try (Analyzer analyzer = createAnalyzer()) {
      // should not cause ArrayIndexOutOfBoundsException
      checkOneTerm(analyzer, "aeäaeeea", "æææeea");
      checkOneTerm(analyzer, "aeäaeeeae", "æææeeæ");
      checkOneTerm(analyzer, "aeaeeeae", "ææeeæ");
      checkOneTerm(analyzer, "bøen", "bøen");
      checkOneTerm(analyzer, "bOEen", "bØen");
      checkOneTerm(analyzer, "åene", "åene");
      checkOneTerm(analyzer, "blåbærsyltetøj", "blåbærsyltetøj");
      checkOneTerm(analyzer, "blaabaersyltetöj", "blåbærsyltetøj");
      checkOneTerm(analyzer, "räksmörgås", "ræksmørgås");
      checkOneTerm(analyzer, "raeksmörgaas", "ræksmørgås");
      checkOneTerm(analyzer, "raeksmoergås", "ræksmørgås");
      checkOneTerm(analyzer, "ab", "ab");
      checkOneTerm(analyzer, "ob", "ob");
      checkOneTerm(analyzer, "Ab", "Ab");
      checkOneTerm(analyzer, "Ob", "Ob");
      checkOneTerm(analyzer, "å", "å");
      checkOneTerm(analyzer, "aa", "å");
      checkOneTerm(analyzer, "aA", "å");
      checkOneTerm(analyzer, "ao", "ao");
      checkOneTerm(analyzer, "aO", "aO");
      checkOneTerm(analyzer, "AA", "Å");
      checkOneTerm(analyzer, "Aa", "Å");
      checkOneTerm(analyzer, "Ao", "Ao");
      checkOneTerm(analyzer, "AO", "AO");
      checkOneTerm(analyzer, "æ", "æ");
      checkOneTerm(analyzer, "ä", "æ");
      checkOneTerm(analyzer, "Æ", "Æ");
      checkOneTerm(analyzer, "Ä", "Æ");
      checkOneTerm(analyzer, "ae", "æ");
      checkOneTerm(analyzer, "aE", "æ");
      checkOneTerm(analyzer, "Ae", "Æ");
      checkOneTerm(analyzer, "AE", "Æ");
      checkOneTerm(analyzer, "ö", "ø");
      checkOneTerm(analyzer, "ø", "ø");
      checkOneTerm(analyzer, "Ö", "Ø");
      checkOneTerm(analyzer, "Ø", "Ø");
      checkOneTerm(analyzer, "oo", "oo");
      checkOneTerm(analyzer, "oe", "ø");
      checkOneTerm(analyzer, "oO", "oO");
      checkOneTerm(analyzer, "oE", "ø");
      checkOneTerm(analyzer, "Oo", "Oo");
      checkOneTerm(analyzer, "Oe", "Ø");
      checkOneTerm(analyzer, "OO", "OO");
      checkOneTerm(analyzer, "OE", "Ø");
    }
  }
  /** check that the empty string doesn't cause issues */
  public void testEmptyTerm() throws Exception {
    try (Analyzer a =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            // Must be the Norwegian filter: this class tests NorwegianNormalizationFilter.
            return new TokenStreamComponents(
                tokenizer, new NorwegianNormalizationFilter(tokenizer));
          }
        }) {
      checkOneTerm(a, "", "");
    }
  }
  /** blast some random strings through the analyzer */
  public void testRandomData() throws Exception {
    try (Analyzer analyzer = createAnalyzer()) {
      checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
    }
  }
  /** Builds a whitespace-tokenizing analyzer that applies {@link NorwegianNormalizationFilter}. */
  private Analyzer createAnalyzer() {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String field) {
        final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        final TokenStream stream = new NorwegianNormalizationFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianNormalizationFilterFactory.java
@ -0,0 +1,41 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.no;
 import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
 import org.apache.lucene.analysis.TokenStream;
 public class TestNorwegianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
  /** The factory-built filter folds ae, oe and aa, and leaves oo and ao unchanged. */
  public void testDefault() throws Exception {
    final TokenStream source = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO");
    final TokenStream normalized = tokenFilterFactory("NorwegianNormalization").create(source);
    final String[] expectedTerms = {"ræksmørgås_æ_ø_å_oo_ao_Æ_Ø_Å_OO_AO"};
    assertTokenStreamContents(normalized, expectedTerms);
  }
  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    final IllegalArgumentException thrown =
        expectThrows(
            IllegalArgumentException.class,
            () -> tokenFilterFactory("NorwegianNormalization", "bogusArg", "bogusValue"));
    final String message = thrown.getMessage();
    assertTrue("Got " + message, message.contains("Unknown parameters"));
  }
 }