Add RomanianNormalizationFilter (#13233)

2025-02-08 19:15:06 +00:00 · 2024-03-28 18:35:39 -04:00 · 2024-03-28 18:35:39 -04:00 · 8f4e449669
commit 8f4e449669
parent a7e916223c
9 changed files with 252 additions and 2 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -107,6 +107,8 @@ New Features
  maintained alongside their parent documents during sort and merge. IndexWriterConfig now requires a parent field to be
  specified if index sorting is used together with document blocks. (Simon Willnauer)

+* GITHUB#13233: Add RomanianNormalizationFilter (Trey Jones, Robert Muir)
+
 Improvements
 ---------------------

--- a/lucene/MIGRATE.md
+++ b/lucene/MIGRATE.md
@ -27,6 +27,10 @@

 Snowball has folded the "German2" stemmer into their "German" stemmer, so there's no "German2" anymore.  For Lucene APIs (TokenFilter, TokenFilterFactory) that accept String, "German2" will be mapped to "German" to avoid breaking users. If you were previously creating German2Stemmer instances, you'll need to change your code to create GermanStemmer instances instead.  For more information see https://snowballstem.org/algorithms/german2/stemmer.html

+### Romanian analysis
+
+RomanianAnalyzer now works with Romanian in its modern Unicode form, and normalizes the legacy cedilla forms (ş, ţ) to the forms with comma below (ș, ț). Both forms are still in use in "the wild": you should reindex Romanian documents.
+
 ### IndexWriter requires a parent document field in order to use index sorting with document blocks (GITHUB#12829)

 For indices newly created as of 10.0.0 onwards, IndexWriter preserves document blocks indexed via
--- a/lucene/analysis/common/src/java/module-info.java
+++ b/lucene/analysis/common/src/java/module-info.java
@ -247,6 +247,7 @@ module org.apache.lucene.analysis.common {
      org.apache.lucene.analysis.pt.PortugueseMinimalStemFilterFactory,
      org.apache.lucene.analysis.pt.PortugueseStemFilterFactory,
      org.apache.lucene.analysis.reverse.ReverseStringFilterFactory,
+      org.apache.lucene.analysis.ro.RomanianNormalizationFilterFactory,
      org.apache.lucene.analysis.ru.RussianLightStemFilterFactory,
      org.apache.lucene.analysis.shingle.ShingleFilterFactory,
      org.apache.lucene.analysis.shingle.FixedShingleFilterFactory,
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
@ -111,13 +111,15 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
   *
   * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from an
   *     {@link StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link StopFilter}, {@link
-   *     SetKeywordMarkerFilter} if a stem exclusion set is provided and {@link SnowballFilter}.
+   *     RomanianNormalizationFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is
+   *     provided and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new LowerCaseFilter(source);
    result = new StopFilter(result, stopwords);
+    result = new RomanianNormalizationFilter(result);
    if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new RomanianStemmer());
    return new TokenStreamComponents(source, result);
@ -125,6 +127,8 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {

  @Override
  protected TokenStream normalize(String fieldName, TokenStream in) {
-    return new LowerCaseFilter(in);
+    TokenStream result = new LowerCaseFilter(in);
+    result = new RomanianNormalizationFilter(result);
+    return result;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianNormalizationFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianNormalizationFilter.java
@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ro;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/** TokenFilter that rewrites each term buffer in place, mapping cedilla forms to comma forms. */
+public final class RomanianNormalizationFilter extends TokenFilter {
+  private final RomanianNormalizer normalizer = new RomanianNormalizer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  public RomanianNormalizationFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      int newlen = normalizer.normalize(termAtt.buffer(), termAtt.length());
+      termAtt.setLength(newlen);
+      return true;
+    }
+    return false;
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianNormalizationFilterFactory.java
@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ro;
+
+import java.util.Map;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory for {@link RomanianNormalizationFilter}; leftover (unknown) parameters are rejected.
+ *
+ * @lucene.spi {@value #NAME}
+ */
+public class RomanianNormalizationFilterFactory extends TokenFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "romanianNormalization";
+
+  public RomanianNormalizationFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Default ctor for compatibility with SPI; always throws defaultCtorException(). */
+  public RomanianNormalizationFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  @Override
+  public RomanianNormalizationFilter create(TokenStream input) {
+    return new RomanianNormalizationFilter(input);
+  }
+
+  @Override
+  public TokenStream normalize(TokenStream input) {
+    return create(input);
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianNormalizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianNormalizer.java
@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ro;
+
+/**
+ * Normalizer for Romanian.
+ *
+ * <p>S and T with cedilla (upper and lower case) are replaced by their comma-below forms.
+ */
+class RomanianNormalizer {
+  static final char CAPITAL_S_WITH_COMMA_BELOW = '\u0218';
+  static final char SMALL_S_WITH_COMMA_BELOW = '\u0219';
+  static final char CAPITAL_T_WITH_COMMA_BELOW = '\u021A';
+  static final char SMALL_T_WITH_COMMA_BELOW = '\u021B';
+
+  static final char CAPITAL_S_WITH_CEDILLA = '\u015E';
+  static final char SMALL_S_WITH_CEDILLA = '\u015F';
+  static final char CAPITAL_T_WITH_CEDILLA = '\u0162';
+  static final char SMALL_T_WITH_CEDILLA = '\u0163';
+
+  /**
+   * Normalize an input buffer of Romanian text; the buffer is modified in place
+   *
+   * @param s input buffer
+   * @param len number of leading chars of the buffer to examine
+   * @return length after normalization; always {@code len}, as replacements are one-for-one
+   */
+  int normalize(char[] s, int len) {
+
+    for (int i = 0; i < len; i++) {
+      switch (s[i]) {
+        case CAPITAL_S_WITH_CEDILLA:
+          s[i] = CAPITAL_S_WITH_COMMA_BELOW;
+          break;
+        case SMALL_S_WITH_CEDILLA:
+          s[i] = SMALL_S_WITH_COMMA_BELOW;
+          break;
+        case CAPITAL_T_WITH_CEDILLA:
+          s[i] = CAPITAL_T_WITH_COMMA_BELOW;
+          break;
+        case SMALL_T_WITH_CEDILLA:
+          s[i] = SMALL_T_WITH_COMMA_BELOW;
+          break;
+        default:
+          break;
+      }
+    }
+
+    return len;
+  }
+}
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@ -109,6 +109,7 @@ org.apache.lucene.analysis.pt.PortugueseLightStemFilterFactory
 org.apache.lucene.analysis.pt.PortugueseMinimalStemFilterFactory
 org.apache.lucene.analysis.pt.PortugueseStemFilterFactory
 org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
+org.apache.lucene.analysis.ro.RomanianNormalizationFilterFactory
 org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
 org.apache.lucene.analysis.shingle.ShingleFilterFactory
 org.apache.lucene.analysis.shingle.FixedShingleFilterFactory
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianNormalizationFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianNormalizationFilter.java
@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ro;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.analysis.MockTokenizer;
+
+/** Tests {@link RomanianNormalizationFilter}: cedilla-to-comma mapping and the empty term. */
+public class TestRomanianNormalizationFilter extends BaseTokenStreamTestCase {
+  private Analyzer a;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    a =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            return new TokenStreamComponents(tokenizer, new RomanianNormalizationFilter(tokenizer));
+          }
+        };
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    a.close();
+    super.tearDown();
+  }
+
+  public void testSmallSCedilla() throws Exception {
+    checkOneTerm(a, "aceşti", "acești");
+  }
+
+  public void testCapitalSCedilla() throws Exception {
+    checkOneTerm(a, "ACEŞTI", "ACEȘTI");
+  }
+
+  public void testSmallTCedilla() throws Exception {
+    checkOneTerm(a, "câţi", "câți");
+  }
+
+  public void testCapitalTCedilla() throws Exception {
+    checkOneTerm(a, "CÂŢI", "CÂȚI");
+  }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer = new KeywordTokenizer();
+            return new TokenStreamComponents(tokenizer, new RomanianNormalizationFilter(tokenizer));
+          }
+        };
+    checkOneTerm(a, "", "");
+    a.close();
+  }
+}