LUCENE-9413: Add CJKWidthCharFilter and its factory. (#2081)

Tomoko Uchida 2020-11-17 17:32:10 +09:00 committed by GitHub
parent 2d583eaba7
commit 8503efdcff
6 changed files with 369 additions and 0 deletions

lucene/CHANGES.txt

@@ -230,6 +230,8 @@ New Features
* LUCENE-9378: Doc values now allow configuring how to trade compression for
  retrieval speed. (Adrien Grand)
* LUCENE-9413: Add CJKWidthCharFilter and its factory (Tomoko Uchida)
Improvements
---------------------

org/apache/lucene/analysis/cjk/CJKWidthCharFilter.java (new file)

@@ -0,0 +1,148 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cjk;
import org.apache.lucene.analysis.charfilter.BaseCharFilter;
import java.io.IOException;
import java.io.Reader;
/**
* A {@link org.apache.lucene.analysis.CharFilter} that normalizes CJK width differences:
* <ul>
* <li>Folds fullwidth ASCII variants into the equivalent basic latin
* <li>Folds halfwidth Katakana variants into the equivalent kana
* </ul>
* <p>
* NOTE: this char filter is the exact counterpart of {@link CJKWidthFilter}.
*/
public class CJKWidthCharFilter extends BaseCharFilter {
/* halfwidth kana mappings: 0xFF65-0xFF9F
*
* note: 0xFF9E and 0xFF9F are only mapped to 0x3099 and 0x309A
* as a fallback when they cannot properly combine with a preceding
* character into a composed form.
*/
private static final char KANA_NORM[] = new char[] {
0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A
};
/* kana combining diffs: 0x30A6-0x30FD */
private static final byte KANA_COMBINE_VOICED[] = new byte[] {
78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
};
private static final byte KANA_COMBINE_SEMI_VOICED[] = new byte[] {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
private static final int HW_KATAKANA_VOICED_MARK = 0xFF9E;
private static final int HW_KATAKANA_SEMI_VOICED_MARK = 0xFF9F;
private int prevChar = -1;
private int inputOff = 0;
/** Creates a new CJKWidthCharFilter wrapping the given {@link Reader}. */
public CJKWidthCharFilter(Reader in) {
super(in);
}
@Override
public int read() throws IOException {
while(true) {
final int ch = input.read();
if (ch == -1) {
// reached end of the input
int ret = prevChar;
prevChar = ch;
return ret;
}
inputOff++;
int ret = -1;
// if the current char is a voice mark, then try to combine it with the previous char.
if (ch == HW_KATAKANA_SEMI_VOICED_MARK || ch == HW_KATAKANA_VOICED_MARK) {
final int combinedChar = combineVoiceMark(prevChar, ch);
if (prevChar != combinedChar) {
// successfully combined; return the combined char immediately
prevChar = -1;
// offset needs to be corrected
final int prevCumulativeDiff = getLastCumulativeDiff();
addOffCorrectMap(inputOff - 1 - prevCumulativeDiff, prevCumulativeDiff + 1);
return combinedChar;
}
}
if (prevChar != -1) {
ret = prevChar;
}
if (ch >= 0xFF01 && ch <= 0xFF5E) {
// Fullwidth ASCII variants
prevChar = ch - 0xFEE0;
} else if (ch >= 0xFF65 && ch <= 0xFF9F) {
// Halfwidth Katakana variants
prevChar = KANA_NORM[ch - 0xFF65];
} else {
// no need to normalize
prevChar = ch;
}
if (ret != -1) {
return ret;
}
}
}
/** Returns the combined char if the voice mark was successfully combined, otherwise the original char. */
private int combineVoiceMark(int ch, int voiceMark) {
assert voiceMark == HW_KATAKANA_SEMI_VOICED_MARK || voiceMark == HW_KATAKANA_VOICED_MARK;
if (ch >= 0x30A6 && ch <= 0x30FD) {
ch += (voiceMark == HW_KATAKANA_SEMI_VOICED_MARK)
? KANA_COMBINE_SEMI_VOICED[ch - 0x30A6]
: KANA_COMBINE_VOICED[ch - 0x30A6];
}
return ch;
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
int numRead = 0;
for(int i = off; i < off + len; i++) {
int c = read();
if (c == -1) break;
cbuf[i] = (char) c;
numRead++;
}
return numRead == 0 ? -1 : numRead;
}
}

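Not part of the commit, for illustration only: a minimal usage sketch of the char filter above (the demo class name is made up), showing both the fullwidth-ASCII folding and the offset correction recorded when a half-width voice mark is combined with the preceding kana.

import java.io.StringReader;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;

public class CJKWidthCharFilterDemo {
  public static void main(String[] args) throws Exception {
    // "ｶﾞ" is two input chars (halfwidth KA plus the halfwidth voiced sound mark);
    // the filter emits the single composed char "ガ" and records an offset correction.
    CharFilter filter = new CJKWidthCharFilter(new StringReader("Ｔｅｓｔ ｶﾞ"));
    StringBuilder out = new StringBuilder();
    for (int c = filter.read(); c != -1; c = filter.read()) {
      out.append((char) c);
    }
    System.out.println(out);                                 // Test ガ
    // The end of the normalized text (offset 6) maps back to offset 7 in the
    // original input, because two source chars were collapsed into one.
    System.out.println(filter.correctOffset(out.length()));  // 7
  }
}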
org/apache/lucene/analysis/cjk/CJKWidthCharFilterFactory.java (new file)

@@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cjk;
import org.apache.lucene.analysis.CharFilterFactory;
import java.io.Reader;
import java.util.Map;
/**
* Factory for {@link CJKWidthCharFilter}.
* @lucene.spi {@value #NAME}
*/
public class CJKWidthCharFilterFactory extends CharFilterFactory {
/** SPI name */
public static final String NAME = "cjkWidth";
/** Creates a new CJKWidthCharFilterFactory */
public CJKWidthCharFilterFactory(Map<String,String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
/** Default ctor for compatibility with SPI */
public CJKWidthCharFilterFactory() {
throw defaultCtorException();
}
@Override
public Reader create(Reader input) {
return new CJKWidthCharFilter(input);
}
@Override
public Reader normalize(Reader input) {
return create(input);
}
}

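Also not part of the commit: a hedged sketch of wiring the factory into an analysis chain by its SPI name "cjkWidth" using Lucene's CustomAnalyzer. The demo class name is hypothetical, and "standard" refers to the standard tokenizer factory.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

public class CJKWidthCustomAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    // "cjkWidth" is resolved through the SPI registration added in this commit.
    Analyzer analyzer = CustomAnalyzer.builder()
        .withTokenizer("standard")
        .addCharFilter("cjkWidth")
        .build();
    analyzer.close();
  }
}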
META-INF/services/org.apache.lucene.analysis.CharFilterFactory

@@ -15,5 +15,6 @@
org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory
org.apache.lucene.analysis.charfilter.MappingCharFilterFactory
org.apache.lucene.analysis.cjk.CJKWidthCharFilterFactory
org.apache.lucene.analysis.fa.PersianCharFilterFactory
org.apache.lucene.analysis.pattern.PatternReplaceCharFilterFactory

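For context (illustrative, not in the commit): with the service entry above on the classpath, the factory can be looked up by name at runtime. The demo class name below is made up.

import java.util.HashMap;
import org.apache.lucene.analysis.CharFilterFactory;

public class CJKWidthSpiLookupDemo {
  public static void main(String[] args) {
    // Both calls rely on the META-INF/services registration above.
    System.out.println(CharFilterFactory.availableCharFilters()); // includes "cjkWidth"
    CharFilterFactory factory = CharFilterFactory.forName("cjkWidth", new HashMap<>());
    System.out.println(factory.getClass().getSimpleName());       // CJKWidthCharFilterFactory
  }
}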
org/apache/lucene/analysis/cjk/TestCJKWidthCharFilter.java (new file)

@@ -0,0 +1,118 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cjk;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
public class TestCJKWidthCharFilter extends BaseTokenStreamTestCase {
/**
* Full-width ASCII forms normalized to half-width (basic latin)
*/
public void testFullWidthASCII() throws IOException {
CharFilter reader = new CJKWidthCharFilter(new StringReader("Ｔｅｓｔ １２３４"));
TokenStream ts = whitespaceMockTokenizer(reader);
assertTokenStreamContents(ts, new String[]{"Test", "1234"}, new int[]{0, 5}, new int[]{4, 9}, 9);
}
/**
* Half-width katakana forms normalized to standard katakana.
* A bit trickier in some cases, since half-width forms are decomposed
* and voice marks need to be recombined with a preceding base form.
*/
public void testHalfWidthKana() throws IOException {
CharFilter reader = new CJKWidthCharFilter(new StringReader("ｶﾀｶﾅ"));
TokenStream ts = whitespaceMockTokenizer(reader);
assertTokenStreamContents(ts, new String[]{"カタカナ"}, new int[]{0}, new int[]{4}, 4);
reader = new CJKWidthCharFilter(new StringReader("ｳﾞｨｯﾂ"));
ts = whitespaceMockTokenizer(reader);
assertTokenStreamContents(ts, new String[]{"ヴィッツ"}, new int[]{0}, new int[]{5}, 5);
reader = new CJKWidthCharFilter(new StringReader("ﾊﾟﾅｿﾆｯｸ"));
ts = whitespaceMockTokenizer(reader);
assertTokenStreamContents(ts, new String[]{"パナソニック"}, new int[]{0}, new int[]{7}, 7);
reader = new CJKWidthCharFilter(new StringReader("ｳﾞｨｯﾂ ﾊﾟﾅｿﾆｯｸ"));
ts = whitespaceMockTokenizer(reader);
assertTokenStreamContents(ts, new String[]{"ヴィッツ", "パナソニック"}, new int[]{0, 6}, new int[]{5, 13}, 13);
}
/**
* Input may contain orphan voiced marks that cannot be combined with the previous character.
*/
public void testOrphanVoiceMark() throws Exception {
CharFilter reader = new CJKWidthCharFilter(new StringReader("ｱﾞｨｯﾂ"));
TokenStream ts = whitespaceMockTokenizer(reader);
assertTokenStreamContents(ts, new String[]{"ア\u3099ィッツ"}, new int[]{0}, new int[]{5}, 5);
reader = new CJKWidthCharFilter(new StringReader("ﾞｨｯﾂ"));
ts = whitespaceMockTokenizer(reader);
assertTokenStreamContents(ts, new String[]{"\u3099ィッツ"}, new int[]{0}, new int[]{4}, 4);
reader = new CJKWidthCharFilter(new StringReader("ｱﾟﾅｿﾆｯｸ"));
ts = whitespaceMockTokenizer(reader);
assertTokenStreamContents(ts, new String[]{"ア\u309Aナソニック"}, new int[]{0}, new int[]{7}, 7);
reader = new CJKWidthCharFilter(new StringReader("ﾟﾅｿﾆｯｸ"));
ts = whitespaceMockTokenizer(reader);
assertTokenStreamContents(ts, new String[]{"\u309Aナソニック"}, new int[]{0}, new int[]{6}, 6);
}
public void testComplexInput() throws Exception {
CharFilter reader = new CJKWidthCharFilter(new StringReader("Ｔｅst １２34"));
TokenStream ts = whitespaceMockTokenizer(reader);
assertTokenStreamContents(ts, new String[]{"Test", "1234"}, new int[]{0, 5}, new int[]{4, 9}, 9);
reader = new CJKWidthCharFilter(new StringReader("カタカナ ｳﾞｨｯﾂ ﾊﾟﾅｿﾆｯｸ"));
ts = whitespaceMockTokenizer(reader);
assertTokenStreamContents(ts, new String[]{"カタカナ", "ヴィッツ", "パナソニック"}, new int[]{0, 5, 11}, new int[]{4, 10, 18}, 18);
}
public void testEmptyInput() throws Exception {
CharFilter reader = new CJKWidthCharFilter(new StringReader(""));
TokenStream ts = whitespaceMockTokenizer(reader);
assertTokenStreamContents(ts, new String[]{});
}
public void testRandom() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new CJKWidthCharFilter(reader);
}
};
int numRounds = RANDOM_MULTIPLIER * 1000;
checkRandomData(random(), analyzer, numRounds);
analyzer.close();
}
}

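A usage note, not part of the commit: the initReader hook exercised by testRandom above is also how the char filter would typically be attached in a hand-written Analyzer, for example in front of a StandardTokenizer. The CJKWidthAnalyzer class name is made up.

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class CJKWidthAnalyzer extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new StandardTokenizer();
    return new TokenStreamComponents(tokenizer, tokenizer);
  }

  @Override
  protected Reader initReader(String fieldName, Reader reader) {
    // Width differences are normalized before the tokenizer sees the text.
    return new CJKWidthCharFilter(reader);
  }
}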
org/apache/lucene/analysis/cjk/TestCJKWidthCharFilterFactory.java (new file)

@@ -0,0 +1,43 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cjk;
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.TokenStream;
import java.io.Reader;
import java.io.StringReader;
/**
* Simple tests to ensure {@link CJKWidthCharFilterFactory} is working
*/
public class TestCJKWidthCharFilterFactory extends BaseTokenStreamFactoryTestCase {
public void test() throws Exception {
Reader reader = charFilterFactory("cjkWidth").create(new StringReader("Ｔｅｓｔ １２３４"));
TokenStream stream = whitespaceMockTokenizer(reader);
assertTokenStreamContents(stream, new String[] { "Test", "1234" });
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
charFilterFactory("cjkWidth", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}