From 8503efdcff91461114a26f6aaae180a90570da2b Mon Sep 17 00:00:00 2001 From: Tomoko Uchida Date: Tue, 17 Nov 2020 17:32:10 +0900 Subject: [PATCH] LUCENE-9413: Add CJKWidthCharFilter and its factory. (#2081) --- lucene/CHANGES.txt | 2 + .../analysis/cjk/CJKWidthCharFilter.java | 148 ++++++++++++++++++ .../cjk/CJKWidthCharFilterFactory.java | 57 +++++++ ...g.apache.lucene.analysis.CharFilterFactory | 1 + .../analysis/cjk/TestCJKWidthCharFilter.java | 118 ++++++++++++++ .../cjk/TestCJKWidthCharFilterFactory.java | 43 +++++ 6 files changed, 369 insertions(+) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilter.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilterFactory.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilter.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilterFactory.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0fc90b4dcf9..c114bc90d32 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -230,6 +230,8 @@ New Features * LUCENE-9378: Doc values now allow configuring how to trade compression for retrieval speed. (Adrien Grand) +* LUCENE-9413: Add CJKWidthCharFilter and its factory (Tomoko Uchida) + Improvements --------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilter.java new file mode 100644 index 00000000000..4fb7ced92a8 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilter.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cjk;
+
+import org.apache.lucene.analysis.charfilter.BaseCharFilter;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * A {@link org.apache.lucene.analysis.CharFilter} that normalizes CJK width differences:
+ * <ul>
+ *   <li>Folds fullwidth ASCII variants into the equivalent basic latin
+ *   <li>Folds halfwidth Katakana variants into the equivalent kana
+ * </ul>
+ * <p>
+ * NOTE: this char filter is the exact counterpart of {@link CJKWidthFilter}.
+ */
+public class CJKWidthCharFilter extends BaseCharFilter {
+
+  /* halfwidth kana mappings: 0xFF65-0xFF9F
+   *
+   * note: 0xFF9E and 0xFF9F are only mapped to 0x3099 and 0x309A
+   * as a fallback when they cannot properly combine with a preceding
+   * character into a composed form.
+   */
+  private static final char[] KANA_NORM = new char[] {
+    0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
+    0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
+    0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
+    0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
+    0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
+    0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
+    0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A
+  };
+
+  /* kana combining diffs: 0x30A6-0x30FD
+   *
+   * each entry is the value added to the katakana base char at that position when it is
+   * followed by a halfwidth voiced mark; 0 means the base char is left unchanged.
+   */
+  private static final byte[] KANA_COMBINE_VOICED = new byte[] {
+    78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+    0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
+    0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+  };
+
+  /* same layout as KANA_COMBINE_VOICED, used for the halfwidth semi-voiced mark */
+  private static final byte[] KANA_COMBINE_SEMI_VOICED = new byte[] {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
+    0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+  };
+
+  private static final int HW_KATAKANA_VOICED_MARK = 0xFF9E;
+  private static final int HW_KATAKANA_SEMI_VOICED_MARK = 0xFF9F;
+
+  /* normalized char that has been read but not returned yet; -1 when nothing is pending */
+  private int prevChar = -1;
+  /* number of chars consumed from the wrapped input so far */
+  private int inputOff = 0;
+
+  /** Default constructor that takes a {@link Reader}. */
+  public CJKWidthCharFilter(Reader in) {
+    super(in);
+  }
+
+  /**
+   * Returns the next width-normalized char, or -1 at the end of the input.
+   *
+   * <p>The filter runs one char behind the wrapped input: each char is normalized and held in
+   * {@code prevChar} until the following char is known, so that a halfwidth (semi-)voiced mark
+   * can be merged into the char it follows.
+   */
+  @Override
+  public int read() throws IOException {
+    while (true) {
+      final int ch = input.read();
+      if (ch == -1) {
+        // reached end of the input: hand out the pending char (or -1 if there is none)
+        int ret = prevChar;
+        prevChar = ch;
+        return ret;
+      }
+
+      inputOff++;
+      int ret = -1;
+      // if the current char is a voice mark, then try to combine it with the previous char.
+      if (ch == HW_KATAKANA_SEMI_VOICED_MARK || ch == HW_KATAKANA_VOICED_MARK) {
+        final int combinedChar = combineVoiceMark(prevChar, ch);
+        if (prevChar != combinedChar) {
+          // successfully combined. returns the combined char immediately
+          prevChar = -1;
+          // offset needs to be corrected: two input chars became one output char
+          final int prevCumulativeDiff = getLastCumulativeDiff();
+          addOffCorrectMap(inputOff - 1 - prevCumulativeDiff, prevCumulativeDiff + 1);
+          return combinedChar;
+        }
+      }
+
+      if (prevChar != -1) {
+        ret = prevChar;
+      }
+
+      if (ch >= 0xFF01 && ch <= 0xFF5E) {
+        // Fullwidth ASCII variants
+        prevChar = ch - 0xFEE0;
+      } else if (ch >= 0xFF65 && ch <= 0xFF9F) {
+        // Halfwidth Katakana variants
+        prevChar = KANA_NORM[ch - 0xFF65];
+      } else {
+        // no need to normalize
+        prevChar = ch;
+      }
+
+      // nothing was pending (first char, or right after a combined char): keep reading
+      if (ret != -1) {
+        return ret;
+      }
+    }
+  }
+
+  /** returns combined char if we successfully combined the voice mark, otherwise original char */
+  private int combineVoiceMark(int ch, int voiceMark) {
+    assert voiceMark == HW_KATAKANA_SEMI_VOICED_MARK || voiceMark == HW_KATAKANA_VOICED_MARK;
+    if (ch >= 0x30A6 && ch <= 0x30FD) {
+      // index with the argument that was just range-checked, not with the prevChar field
+      ch += (voiceMark == HW_KATAKANA_SEMI_VOICED_MARK)
+          ? KANA_COMBINE_SEMI_VOICED[ch - 0x30A6]
+          : KANA_COMBINE_VOICED[ch - 0x30A6];
+    }
+    return ch;
+  }
+
+  /**
+   * Fills {@code cbuf} by calling {@link #read()} up to {@code len} times.
+   *
+   * @return the number of chars stored, 0 if {@code len} is 0, or -1 if the end of the input
+   *     was reached before any char could be stored
+   */
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    if (len == 0) {
+      // java.io.Reader contract: a zero-length request reads nothing and returns 0
+      return 0;
+    }
+    int numRead = 0;
+    for (int i = off; i < off + len; i++) {
+      int c = read();
+      if (c == -1) {
+        break;
+      }
+      cbuf[i] = (char) c;
+      numRead++;
+    }
+    return numRead == 0 ? -1 : numRead;
+  }
+
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilterFactory.java
new file mode 100644
index 00000000000..4f8bf096633
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilterFactory.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.cjk;
+
+
+import org.apache.lucene.analysis.CharFilterFactory;
+
+import java.io.Reader;
+import java.util.Map;
+
+/**
+ * Factory for {@link CJKWidthCharFilter}.
+ * @lucene.spi {@value #NAME}
+ */
+public class CJKWidthCharFilterFactory extends CharFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "cjkWidth";
+
+  /**
+   * Creates a new CJKWidthCharFilterFactory
+   *
+   * @param args factory arguments; this factory defines none of its own, so anything still
+   *     present after the superclass constructor has run is rejected
+   * @throws IllegalArgumentException if {@code args} is not empty after the superclass
+   *     constructor has run
+   */
+  public CJKWidthCharFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Default ctor for compatibility with SPI */
+  public CJKWidthCharFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  /** Wraps {@code input} in a new {@link CJKWidthCharFilter}. */
+  @Override
+  public Reader create(Reader input) {
+    return new CJKWidthCharFilter(input);
+  }
+
+  /** Applies the same filtering as {@link #create(Reader)}. */
+  @Override
+  public Reader normalize(Reader input) {
+    return create(input);
+  }
+
+}
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.CharFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.CharFilterFactory
index b53db41238a..c9f43ec2df7 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.CharFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.CharFilterFactory
@@ -15,5 +15,6 @@
 
 org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory
 org.apache.lucene.analysis.charfilter.MappingCharFilterFactory
+org.apache.lucene.analysis.cjk.CJKWidthCharFilterFactory
 org.apache.lucene.analysis.fa.PersianCharFilterFactory
 org.apache.lucene.analysis.pattern.PatternReplaceCharFilterFactory
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilter.java
new file mode 100644
index 00000000000..92f9851e6c6
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilter.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cjk;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+/**
+ * Tests for {@link CJKWidthCharFilter}.
+ *
+ * <p>All fullwidth / halfwidth inputs are spelled with unicode escapes on purpose: literal
+ * characters are easily folded by width-normalizing tools, which would leave the tests
+ * feeding already-normalized text to the filter.
+ */
+public class TestCJKWidthCharFilter extends BaseTokenStreamTestCase {
+
+  /* fullwidth "Test", ASCII space, fullwidth "1234" (9 chars) */
+  private static final String FW_TEST_1234 = "\uFF34\uFF45\uFF53\uFF54 \uFF11\uFF12\uFF13\uFF14";
+  /* halfwidth KA TA KA NA (4 chars) */
+  private static final String HW_KATAKANA = "\uFF76\uFF80\uFF76\uFF85";
+  /* halfwidth U + voiced mark, small I, small TU, TU (5 chars) */
+  private static final String HW_VITZ = "\uFF73\uFF9E\uFF68\uFF6F\uFF82";
+  /* halfwidth HA + semi-voiced mark, NA, SO, NI, small TU, KU (7 chars) */
+  private static final String HW_PANASONIC = "\uFF8A\uFF9F\uFF85\uFF7F\uFF86\uFF6F\uFF78";
+
+  /**
+   * Full-width ASCII forms normalized to half-width (basic latin)
+   */
+  public void testFullWidthASCII() throws IOException {
+    CharFilter reader = new CJKWidthCharFilter(new StringReader(FW_TEST_1234));
+    TokenStream ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"Test", "1234"}, new int[]{0, 5}, new int[]{4, 9}, 9);
+  }
+
+  /**
+   * Half-width katakana forms normalized to standard katakana.
+   * A bit trickier in some cases, since half-width forms are decomposed
+   * and voice marks need to be recombined with a preceding base form.
+   */
+  public void testHalfWidthKana() throws IOException {
+    CharFilter reader = new CJKWidthCharFilter(new StringReader(HW_KATAKANA));
+    TokenStream ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"カタカナ"}, new int[]{0}, new int[]{4}, 4);
+
+    reader = new CJKWidthCharFilter(new StringReader(HW_VITZ));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"ヴィッツ"}, new int[]{0}, new int[]{5}, 5);
+
+    reader = new CJKWidthCharFilter(new StringReader(HW_PANASONIC));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"パナソニック"}, new int[]{0}, new int[]{7}, 7);
+
+    reader = new CJKWidthCharFilter(new StringReader(HW_VITZ + " " + HW_PANASONIC));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"ヴィッツ", "パナソニック"}, new int[]{0, 6}, new int[]{5, 13}, 13);
+  }
+
+  /**
+   * Input may contain orphan voiced marks that cannot be combined with the previous character.
+   */
+  public void testOrphanVoiceMark() throws Exception {
+    // halfwidth A + voiced mark: no composed form exists for A
+    CharFilter reader = new CJKWidthCharFilter(new StringReader("\uFF71\uFF9E\uFF68\uFF6F\uFF82"));
+    TokenStream ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"ア\u3099ィッツ"}, new int[]{0}, new int[]{5}, 5);
+
+    // voiced mark at the very beginning of the input
+    reader = new CJKWidthCharFilter(new StringReader("\uFF9E\uFF68\uFF6F\uFF82"));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"\u3099ィッツ"}, new int[]{0}, new int[]{4}, 4);
+
+    // halfwidth A + semi-voiced mark: no composed form exists for A
+    reader = new CJKWidthCharFilter(new StringReader("\uFF71\uFF9F\uFF85\uFF7F\uFF86\uFF6F\uFF78"));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"ア\u309Aナソニック"}, new int[]{0}, new int[]{7}, 7);
+
+    // semi-voiced mark at the very beginning of the input
+    reader = new CJKWidthCharFilter(new StringReader("\uFF9F\uFF85\uFF7F\uFF86\uFF6F\uFF78"));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"\u309Aナソニック"}, new int[]{0}, new int[]{6}, 6);
+  }
+
+  public void testComplexInput() throws Exception {
+    CharFilter reader = new CJKWidthCharFilter(new StringReader(FW_TEST_1234));
+    TokenStream ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"Test", "1234"}, new int[]{0, 5}, new int[]{4, 9}, 9);
+
+    reader = new CJKWidthCharFilter(new StringReader(HW_KATAKANA + " " + HW_VITZ + " " + HW_PANASONIC));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"カタカナ", "ヴィッツ", "パナソニック"}, new int[]{0, 5, 11}, new int[]{4, 10, 18}, 18);
+  }
+
+  public void testEmptyInput() throws Exception {
+    CharFilter reader = new CJKWidthCharFilter(new StringReader(""));
+    TokenStream ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{});
+  }
+
+  public void testRandom() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+      @Override
+      protected Reader initReader(String fieldName, Reader reader) {
+        return new CJKWidthCharFilter(reader);
+      }
+    };
+    int numRounds = RANDOM_MULTIPLIER * 1000;
+    checkRandomData(random(), analyzer, numRounds);
+    analyzer.close();
+  }
+
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilterFactory.java
new file mode 100644
index 00000000000..777043466ce
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilterFactory.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cjk;
+
+import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.Reader;
+import java.io.StringReader;
+
+/**
+ * Simple tests to ensure {@link CJKWidthCharFilter} is working
+ */
+public class TestCJKWidthCharFilterFactory extends BaseTokenStreamFactoryTestCase {
+  /** A filter created through the "cjkWidth" SPI name folds fullwidth ASCII to basic latin. */
+  public void test() throws Exception {
+    // fullwidth "Test", ASCII space, fullwidth "1234"; written with escapes so that
+    // width-normalizing tools cannot fold the input into plain ASCII
+    Reader reader = charFilterFactory("cjkWidth").create(
+        new StringReader("\uFF34\uFF45\uFF53\uFF54 \uFF11\uFF12\uFF13\uFF14"));
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(stream, new String[] { "Test", "1234" });
+  }
+
+  /** Test that bogus arguments result in exception */
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
+      charFilterFactory("cjkWidth", "bogusArg", "bogusValue");
+    });
+    assertTrue(expected.getMessage().contains("Unknown parameters"));
+  }
+}