LUCENE-8812: Add new KoreanNumberFilter that can change Hangul character to number and process decimal point

Signed-off-by: Namgyu Kim <namgyu@apache.org>
This commit is contained in:
Namgyu Kim 2019-06-09 23:00:14 +09:00
parent 3d57a323a9
commit 5a75b8a080
5 changed files with 1072 additions and 0 deletions

View File

@ -43,6 +43,10 @@ New Features
to false (defaults to true).
(Namgyu Kim via Jim Ferenczi)
* LUCENE-8812: Add new KoreanNumberFilter that can change Hangul character to number
and process decimal point. It is similar to the JapaneseNumberFilter.
(Namgyu Kim)
Bug Fixes
* LUCENE-8831: Fixed LatLonShapeBoundingBoxQuery .hashCode methods. (Ignacio Vera)

View File

@ -0,0 +1,618 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
/**
* A {@link TokenFilter} that normalizes Korean numbers to regular Arabic
* decimal numbers in half-width characters.
* <p>
 * Korean numbers are often written using a combination of Hangul and Arabic numbers with
 * various kinds of punctuation. For example, 3.2천 means 3200. This filter does this kind
 * of normalization and allows a search for 3200 to match 3.2천 in text, but can also be
 * used to make range facets based on the normalized numbers and so on.
* used to make range facets based on the normalized numbers and so on.
* <p>
* Notice that this analyzer uses a token composition scheme and relies on punctuation
* tokens being found in the token stream. Please make sure your {@link KoreanTokenizer}
 * has {@code discardPunctuation} set to false. In case punctuation characters, such as
 * ． (U+FF0E FULLWIDTH FULL STOP), are removed from the token stream, this filter would find
 * input tokens 3 and 2천 and give outputs 3 and 2000 instead of 3200, which is
* likely not the intended result. If you want to remove punctuation characters from your
* index that are not part of normalized numbers, add a
* {@link org.apache.lucene.analysis.StopFilter} with the punctuation you wish to
* remove after {@link KoreanNumberFilter} in your analyzer chain.
* <p>
* Below are some examples of normalizations this filter supports. The input is untokenized
* text and the result is the single term attribute emitted for the input.
* <ul>
* <li>영영칠 becomes 7</li>
* <li>일영영영 becomes 1000</li>
* <li>삼천2백2십삼 becomes 3223</li>
* <li>조육백만오천일 becomes 1000006005001</li>
 * <li>3.2천 becomes 3200</li>
 * <li>1.2만345.67 becomes 12345.67</li>
* <li>4,647.100 becomes 4647.1</li>
* <li>15,7 becomes 157 (be aware of this weakness)</li>
* </ul>
* <p>
 * Tokens preceded by a token with {@link PositionIncrementAttribute} of zero are left
 * untouched and emitted as-is.
* <p>
* This filter does not use any part-of-speech information for its normalization and
* the motivation for this is to also support n-grammed token streams in the future.
* <p>
 * This filter may in some cases normalize tokens that are not numbers in their context.
 * For example, 전중경일 is a name and means Tanaka Kyōichi, but 경일 (Kyōichi) out of
* context can strictly speaking also represent the number 10000000000000001. This filter
* respects the {@link KeywordAttribute}, which can be used to prevent specific
* normalizations from happening.
*
* @lucene.experimental
*/
public class KoreanNumberFilter extends TokenFilter {
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAttr = addAttribute(OffsetAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
private final PositionIncrementAttribute posIncrAttr = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAttr = addAttribute(PositionLengthAttribute.class);
private static char NO_NUMERAL = Character.MAX_VALUE;
private static char[] numerals;
private static char[] exponents;
private State state;
private StringBuilder numeral;
private int fallThroughTokens;
private boolean exhausted = false;
static {
numerals = new char[0x10000];
for (int i = 0; i < numerals.length; i++) {
numerals[i] = NO_NUMERAL;
}
numerals['영'] = 0; // U+C601 0
numerals['일'] = 1; // U+C77C 1
numerals['이'] = 2; // U+C774 2
numerals['삼'] = 3; // U+C0BC 3
numerals['사'] = 4; // U+C0AC 4
numerals['오'] = 5; // U+C624 5
numerals['육'] = 6; // U+C721 6
numerals['칠'] = 7; // U+CE60 7
numerals['팔'] = 8; // U+D314 8
numerals['구'] = 9; // U+AD6C 9
exponents = new char[0x10000];
for (int i = 0; i < exponents.length; i++) {
exponents[i] = 0;
}
exponents['십'] = 1; // U+C2ED 10
exponents['백'] = 2; // U+BC31 100
exponents['천'] = 3; // U+CC9C 1,000
exponents['만'] = 4; // U+B9CC 10,000
exponents['억'] = 8; // U+C5B5 100,000,000
exponents['조'] = 12; // U+C870 1,000,000,000,000
exponents['경'] = 16; // U+ACBD 10,000,000,000,000,000
exponents['해'] = 20; // U+D574 100,000,000,000,000,000,000
}
public KoreanNumberFilter(TokenStream input) {
super(input);
}
@Override
public final boolean incrementToken() throws IOException {
// Emit previously captured token we read past earlier
if (state != null) {
restoreState(state);
state = null;
return true;
}
if (exhausted) {
return false;
}
if (!input.incrementToken()) {
exhausted = true;
return false;
}
if (keywordAttr.isKeyword()) {
return true;
}
if (fallThroughTokens > 0) {
fallThroughTokens--;
return true;
}
if (posIncrAttr.getPositionIncrement() == 0) {
fallThroughTokens = posLengthAttr.getPositionLength() - 1;
return true;
}
boolean moreTokens = true;
boolean composedNumberToken = false;
int startOffset = 0;
int endOffset = 0;
State preCompositionState = captureState();
String term = termAttr.toString();
boolean numeralTerm = isNumeral(term);
while (moreTokens && numeralTerm) {
if (!composedNumberToken) {
startOffset = offsetAttr.startOffset();
composedNumberToken = true;
}
endOffset = offsetAttr.endOffset();
moreTokens = input.incrementToken();
if (moreTokens == false) {
exhausted = true;
}
if (posIncrAttr.getPositionIncrement() == 0) {
// This token is a stacked/synonym token, capture number of tokens "under" this token,
// except the first token, which we will emit below after restoring state
fallThroughTokens = posLengthAttr.getPositionLength() - 1;
state = captureState();
restoreState(preCompositionState);
return moreTokens;
}
numeral.append(term);
if (moreTokens) {
term = termAttr.toString();
numeralTerm = isNumeral(term) || isNumeralPunctuation(term);
}
}
if (composedNumberToken) {
if (moreTokens) {
// We have read past all numerals and there are still tokens left, so
// capture the state of this token and emit it on our next incrementToken()
state = captureState();
}
String normalizedNumber = normalizeNumber(numeral.toString());
termAttr.setEmpty();
termAttr.append(normalizedNumber);
offsetAttr.setOffset(startOffset, endOffset);
numeral = new StringBuilder();
return true;
}
return moreTokens;
}
@Override
public void reset() throws IOException {
super.reset();
fallThroughTokens = 0;
numeral = new StringBuilder();
state = null;
exhausted = false;
}
/**
* Normalizes a Korean number
*
* @param number number or normalize
* @return normalized number, or number to normalize on error (no op)
*/
public String normalizeNumber(String number) {
try {
BigDecimal normalizedNumber = parseNumber(new NumberBuffer(number));
if (normalizedNumber == null) {
return number;
}
return normalizedNumber.stripTrailingZeros().toPlainString();
} catch (NumberFormatException | ArithmeticException e) {
// Return the source number in case of error, i.e. malformed input
return number;
}
}
/**
* Parses a Korean number
*
* @param buffer buffer to parse
* @return parsed number, or null on error or end of input
*/
private BigDecimal parseNumber(NumberBuffer buffer) {
BigDecimal sum = BigDecimal.ZERO;
BigDecimal result = parseLargePair(buffer);
if (result == null) {
return null;
}
while (result != null) {
sum = sum.add(result);
result = parseLargePair(buffer);
}
return sum;
}
/**
* Parses a pair of large numbers, i.e. large Hangul factor is 10,000or larger
*
* @param buffer buffer to parse
* @return parsed pair, or null on error or end of input
*/
private BigDecimal parseLargePair(NumberBuffer buffer) {
BigDecimal first = parseMediumNumber(buffer);
BigDecimal second = parseLargeHangulNumeral(buffer);
if (first == null && second == null) {
return null;
}
if (second == null) {
// If there's no second factor, we return the first one
// This can happen if we our number is smaller than 10,000 ()
return first;
}
if (first == null) {
// If there's no first factor, just return the second one,
// which is the same as multiplying by 1, i.e. with
return second;
}
return first.multiply(second);
}
/**
* Parses a "medium sized" number, typically less than 10,000, but might be larger
* due to a larger factor from {link parseBasicNumber}.
*
* @param buffer buffer to parse
* @return parsed number, or null on error or end of input
*/
private BigDecimal parseMediumNumber(NumberBuffer buffer) {
BigDecimal sum = BigDecimal.ZERO;
BigDecimal result = parseMediumPair(buffer);
if (result == null) {
return null;
}
while (result != null) {
sum = sum.add(result);
result = parseMediumPair(buffer);
}
return sum;
}
/**
* Parses a pair of "medium sized" numbers, i.e. large Hangul factor is at most 1,000
*
* @param buffer buffer to parse
* @return parsed pair, or null on error or end of input
*/
private BigDecimal parseMediumPair(NumberBuffer buffer) {
BigDecimal first = parseBasicNumber(buffer);
BigDecimal second = parseMediumHangulNumeral(buffer);
if (first == null && second == null) {
return null;
}
if (second == null) {
// If there's no second factor, we return the first one
// This can happen if we just have a plain number such as
return first;
}
if (first == null) {
// If there's no first factor, just return the second one,
// which is the same as multiplying by 1, i.e. with
return second;
}
// Return factors multiplied
return first.multiply(second);
}
/**
* Parse a basic number, which is a sequence of Arabic numbers or a sequence or 0-9 Hangul numerals ( to ).
*
* @param buffer buffer to parse
* @return parsed number, or null on error or end of input
*/
private BigDecimal parseBasicNumber(NumberBuffer buffer) {
StringBuilder builder = new StringBuilder();
int i = buffer.position();
while (i < buffer.length()) {
char c = buffer.charAt(i);
if (isArabicNumeral(c)) {
// Arabic numerals; 0 to 9 or to (full-width)
builder.append(arabicNumeralValue(c));
} else if (isHangulNumeral(c)) {
// Hangul numerals; , , , , , , , , , or
builder.append(HangulNumeralValue(c));
} else if (isDecimalPoint(c)) {
builder.append(".");
} else if (isThousandSeparator(c)) {
// Just skip and move to the next character
} else {
// We don't have an Arabic nor Hangul numeral, nor separation or punctuation, so we'll stop.
break;
}
i++;
buffer.advance();
}
if (builder.length() == 0) {
// We didn't build anything, so we don't have a number
return null;
}
return new BigDecimal(builder.toString());
}
/**
* Parse large Hangul numerals (ten thousands or larger)
*
* @param buffer buffer to parse
* @return parsed number, or null on error or end of input
*/
public BigDecimal parseLargeHangulNumeral(NumberBuffer buffer) {
int i = buffer.position();
if (i >= buffer.length()) {
return null;
}
char c = buffer.charAt(i);
int power = exponents[c];
if (power > 3) {
buffer.advance();
return BigDecimal.TEN.pow(power);
}
return null;
}
/**
* Parse medium Hangul numerals (tens, hundreds or thousands)
*
* @param buffer buffer to parse
* @return parsed number or null on error
*/
public BigDecimal parseMediumHangulNumeral(NumberBuffer buffer) {
int i = buffer.position();
if (i >= buffer.length()) {
return null;
}
char c = buffer.charAt(i);
int power = exponents[c];
if (1 <= power && power <= 3) {
buffer.advance();
return BigDecimal.TEN.pow(power);
}
return null;
}
/**
* Numeral predicate
*
* @param input string to test
* @return true if and only if input is a numeral
*/
public boolean isNumeral(String input) {
for (int i = 0; i < input.length(); i++) {
if (!isNumeral(input.charAt(i))) {
return false;
}
}
return true;
}
/**
* Numeral predicate
*
* @param c character to test
* @return true if and only if c is a numeral
*/
public boolean isNumeral(char c) {
return isArabicNumeral(c) || isHangulNumeral(c) || exponents[c] > 0;
}
/**
* Numeral punctuation predicate
*
* @param input string to test
* @return true if and only if c is a numeral punctuation string
*/
public boolean isNumeralPunctuation(String input) {
for (int i = 0; i < input.length(); i++) {
if (!isNumeralPunctuation(input.charAt(i))) {
return false;
}
}
return true;
}
/**
* Numeral punctuation predicate
*
* @param c character to test
* @return true if and only if c is a numeral punctuation character
*/
public boolean isNumeralPunctuation(char c) {
return isDecimalPoint(c) || isThousandSeparator(c);
}
/**
* Arabic numeral predicate. Both half-width and full-width characters are supported
*
* @param c character to test
* @return true if and only if c is an Arabic numeral
*/
public boolean isArabicNumeral(char c) {
return isHalfWidthArabicNumeral(c) || isFullWidthArabicNumeral(c);
}
/**
* Arabic half-width numeral predicate
*
* @param c character to test
* @return true if and only if c is a half-width Arabic numeral
*/
private boolean isHalfWidthArabicNumeral(char c) {
// 0 U+0030 - 9 U+0039
return '0' <= c && c <= '9';
}
/**
* Arabic full-width numeral predicate
*
* @param c character to test
* @return true if and only if c is a full-width Arabic numeral
*/
private boolean isFullWidthArabicNumeral(char c) {
// U+FF10 - U+FF19
return '' <= c && c <= '';
}
/**
* Returns the numeric value for the specified character Arabic numeral.
* Behavior is undefined if a non-Arabic numeral is provided
*
* @param c arabic numeral character
* @return numeral value
*/
private int arabicNumeralValue(char c) {
int offset;
if (isHalfWidthArabicNumeral(c)) {
offset = '0';
} else {
offset = '';
}
return c - offset;
}
/**
* Hangul numeral predicate that tests if the provided character is one of , , , , , , , , , or .
* Larger number Hangul gives a false value.
*
* @param c character to test
* @return true if and only is character is one of , , , , , , , , , or (0 to 9)
*/
private boolean isHangulNumeral(char c) {
return numerals[c] != NO_NUMERAL;
}
/**
* Returns the value for the provided Hangul numeral. Only numeric values for the characters where
* {link isHangulNumeral} return true are supported - behavior is undefined for other characters.
*
* @param c Hangul numeral character
* @return numeral value
* @see #isHangulNumeral(char)
*/
private int HangulNumeralValue(char c) {
return numerals[c];
}
/**
* Decimal point predicate
*
* @param c character to test
* @return true if and only if c is a decimal point
*/
private boolean isDecimalPoint(char c) {
return c == '.' // U+002E FULL STOP
|| c == ''; // U+FF0E FULLWIDTH FULL STOP
}
/**
* Thousand separator predicate
*
* @param c character to test
* @return true if and only if c is a thousand separator predicate
*/
private boolean isThousandSeparator(char c) {
return c == ',' // U+002C COMMA
|| c == ''; // U+FF0C FULLWIDTH COMMA
}
/**
* Buffer that holds a Korean number string and a position index used as a parsed-to marker
*/
public static class NumberBuffer {
private int position;
private String string;
public NumberBuffer(String string) {
this.string = string;
this.position = 0;
}
public char charAt(int index) {
return string.charAt(index);
}
public int length() {
return string.length();
}
public void advance() {
position++;
}
public int position() {
return position;
}
}
}

View File

@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link KoreanNumberFilter}.
* <br>
* <pre class="prettyprint">
* &lt;fieldType name="text_ko" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.KoreanTokenizerFactory" discardPunctuation="false"/&gt;
* &lt;filter class="solr.KoreanNumberFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
* </pre>
* <p>
* It is important that punctuation is not discarded by the tokenizer so use
* {@code discardPunctuation="false"} in your {@link KoreanTokenizerFactory}.
* @since 8.2.0
*/
public class KoreanNumberFilterFactory extends TokenFilterFactory {

  /**
   * Creates a new {@link KoreanNumberFilterFactory}. This factory takes no
   * arguments of its own, so anything left over in {@code args} is rejected.
   *
   * @param args factory arguments; must be empty after the superclass consumes
   *             the common ones
   * @throws IllegalArgumentException if unknown parameters remain
   */
  public KoreanNumberFilterFactory(Map<String, String> args) {
    super(args);
    if (args.isEmpty() == false) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  /** Wraps the given stream in a {@link KoreanNumberFilter}. */
  @Override
  public TokenStream create(TokenStream input) {
    return new KoreanNumberFilter(input);
  }
}

View File

@ -0,0 +1,335 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.junit.Ignore;
import org.junit.Test;
public class TestKoreanNumberFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer;
public static UserDictionary readDict() {
InputStream is = TestKoreanTokenizer.class.getResourceAsStream("userdict.txt");
if (is == null) {
throw new RuntimeException("Cannot find userdict.txt in test classpath!");
}
Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
try (reader) {
return UserDictionary.open(reader);
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
}
@Override
public void setUp() throws Exception {
super.setUp();
UserDictionary userDictionary = readDict();
Set<POS.Tag> stopTags = new HashSet<>();
stopTags.add(POS.Tag.SP);
analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KoreanTokenizer(newAttributeFactory(), userDictionary,
KoreanTokenizer.DEFAULT_DECOMPOUND, false, false);
TokenStream stream = new KoreanPartOfSpeechStopFilter(tokenizer, stopTags);
return new TokenStreamComponents(tokenizer, new KoreanNumberFilter(stream));
}
};
}
@Override
public void tearDown() throws Exception {
analyzer.close();
super.tearDown();
}
@Test
public void testBasics() throws IOException {
assertAnalyzesTo(analyzer, "오늘 십만이천오백원의 와인 구입",
new String[]{"오늘", "102500", "", "", "와인", "구입"},
new int[]{0, 3, 9, 10, 12, 15},
new int[]{2, 9, 10, 11, 14, 17}
);
// Wrong analysis
// "초밥" => "초밥" O, ""+"" X
assertAnalyzesTo(analyzer, "어제 초밥 가격은 10만 원",
new String[]{"어제", "", "", "가격", "", "100000", ""},
new int[]{0, 3, 4, 6, 8, 10, 14, 15, 13},
new int[]{2, 4, 5, 8, 9, 13, 15, 13, 14}
);
assertAnalyzesTo(analyzer, "자본금 600만 원",
new String[]{"자본", "", "6000000", ""},
new int[]{0, 2, 4, 9, 10},
new int[]{2, 3, 8, 10, 11}
);
}
@Test
public void testVariants() throws IOException {
// Test variants of three
assertAnalyzesTo(analyzer, "3", new String[]{"3"});
assertAnalyzesTo(analyzer, "", new String[]{"3"});
assertAnalyzesTo(analyzer, "", new String[]{"3"});
// Test three variations with trailing zero
assertAnalyzesTo(analyzer, "03", new String[]{"3"});
assertAnalyzesTo(analyzer, "", new String[]{"3"});
assertAnalyzesTo(analyzer, "영삼", new String[]{"3"});
assertAnalyzesTo(analyzer, "003", new String[]{"3"});
assertAnalyzesTo(analyzer, "", new String[]{"3"});
assertAnalyzesTo(analyzer, "영영삼", new String[]{"3"});
// Test thousand variants
assertAnalyzesTo(analyzer, "", new String[]{"1000"});
assertAnalyzesTo(analyzer, "1천", new String[]{"1000"});
assertAnalyzesTo(analyzer, "1천", new String[]{"1000"});
assertAnalyzesTo(analyzer, "일천", new String[]{"1000"});
assertAnalyzesTo(analyzer, "일영영영", new String[]{"1000"});
assertAnalyzesTo(analyzer, "10백", new String[]{"1000"}); // Strange, but supported
}
@Test
public void testLargeVariants() throws IOException {
// Test large numbers
assertAnalyzesTo(analyzer, "삼오칠팔구", new String[]{"35789"});
assertAnalyzesTo(analyzer, "육백이만오천일", new String[]{"6025001"});
assertAnalyzesTo(analyzer, "조육백만오천일", new String[]{"1000006005001"});
assertAnalyzesTo(analyzer, "십조육백만오천일", new String[]{"10000006005001"});
assertAnalyzesTo(analyzer, "일경일", new String[]{"10000000000000001"});
assertAnalyzesTo(analyzer, "십경십", new String[]{"100000000000000010"});
assertAnalyzesTo(analyzer, "해경조억만천백십일", new String[]{"100010001000100011111"});
}
@Test
public void testNegative() throws IOException {
assertAnalyzesTo(analyzer, "-백만", new String[]{"-", "1000000"});
}
@Test
public void testMixed() throws IOException {
// Test mixed numbers
assertAnalyzesTo(analyzer, "삼천2백십삼", new String[]{"3223"});
assertAnalyzesTo(analyzer, "32이삼", new String[]{"3223"});
}
@Test
public void testFunny() throws IOException {
// Test some oddities for inconsistent input
assertAnalyzesTo(analyzer, "십십", new String[]{"20"}); // 100?
assertAnalyzesTo(analyzer, "백백백", new String[]{"300"}); // 10,000?
assertAnalyzesTo(analyzer, "천천천천", new String[]{"4000"}); // 1,000,000,000,000?
}
@Test
public void testHangulArabic() throws IOException {
// Test kanji numerals used as Arabic numbers (with head zero)
assertAnalyzesTo(analyzer, "영일이삼사오육칠팔구구팔칠육오사삼이일영",
new String[]{"1234567899876543210"}
);
// I'm Bond, James "normalized" Bond...
assertAnalyzesTo(analyzer, "영영칠", new String[]{"7"});
}
@Test
public void testDoubleZero() throws IOException {
assertAnalyzesTo(analyzer, "영영",
new String[]{"0"},
new int[]{0},
new int[]{2},
new int[]{1}
);
}
@Test
public void testName() throws IOException {
// Test name that normalises to number
assertAnalyzesTo(analyzer, "전중경일",
new String[]{"전중", "10000000000000001"}, // 경일 is normalized to a number
new int[]{0, 2},
new int[]{2, 4},
new int[]{1, 1}
);
// An analyzer that marks 경일 as a keyword
Analyzer keywordMarkingAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
CharArraySet set = new CharArraySet(1, false);
set.add("경일");
UserDictionary userDictionary = readDict();
Set<POS.Tag> stopTags = new HashSet<>();
stopTags.add(POS.Tag.SP);
Tokenizer tokenizer = new KoreanTokenizer(newAttributeFactory(), userDictionary,
KoreanTokenizer.DEFAULT_DECOMPOUND, false, false);
TokenStream stream = new KoreanPartOfSpeechStopFilter(tokenizer, stopTags);
return new TokenStreamComponents(tokenizer, new KoreanNumberFilter(new SetKeywordMarkerFilter(stream, set)));
}
};
assertAnalyzesTo(keywordMarkingAnalyzer, "전중경일",
new String[]{"전중", "경일"}, // 경일 is not normalized
new int[]{0, 2},
new int[]{2, 4},
new int[]{1, 1}
);
keywordMarkingAnalyzer.close();
}
@Test
public void testDecimal() throws IOException {
// Test Arabic numbers with punctuation, i.e. 3.2 thousands
assertAnalyzesTo(analyzer, "1.2만345.67",
new String[]{"12345.67"}
);
}
@Test
public void testDecimalPunctuation() throws IOException {
// Test Arabic numbers with punctuation, i.e. 3.2 thousands won
assertAnalyzesTo(analyzer, "3.2천 원",
new String[]{"3200", ""}
);
}
@Test
public void testThousandSeparator() throws IOException {
assertAnalyzesTo(analyzer, "4,647",
new String[]{"4647"}
);
}
@Test
public void testDecimalThousandSeparator() throws IOException {
assertAnalyzesTo(analyzer, "4,647.0010",
new String[]{"4647.001"}
);
}
@Test
public void testCommaDecimalSeparator() throws IOException {
assertAnalyzesTo(analyzer, "15,7",
new String[]{"157"}
);
}
@Test
public void testTrailingZeroStripping() throws IOException {
assertAnalyzesTo(analyzer, "1000.1000",
new String[]{"1000.1"}
);
assertAnalyzesTo(analyzer, "1000.0000",
new String[]{"1000"}
);
}
@Test
public void testEmpty() throws IOException {
assertAnalyzesTo(analyzer, "", new String[]{});
}
@Test
public void testRandomHugeStrings() throws Exception {
checkRandomData(random(), analyzer, 50 * RANDOM_MULTIPLIER, 8192);
}
@Test
public void testRandomSmallStrings() throws Exception {
checkRandomData(random(), analyzer, 500 * RANDOM_MULTIPLIER, 128);
}
@Test
public void testFunnyIssue() throws Exception {
BaseTokenStreamTestCase.checkAnalysisConsistency(
random(), analyzer, true, "영영\u302f\u3029\u3039\u3023\u3033\u302bB", true
);
}
@Ignore("This test is used during development when analyze normalizations in large amounts of text")
@Test
public void testLargeData() throws IOException {
Path input = Paths.get("/tmp/test.txt");
Path tokenizedOutput = Paths.get("/tmp/test.tok.txt");
Path normalizedOutput = Paths.get("/tmp/test.norm.txt");
Analyzer plainAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
UserDictionary userDictionary = readDict();
Set<POS.Tag> stopTags = new HashSet<>();
stopTags.add(POS.Tag.SP);
Tokenizer tokenizer = new KoreanTokenizer(newAttributeFactory(), userDictionary,
KoreanTokenizer.DEFAULT_DECOMPOUND, false, false);
return new TokenStreamComponents(tokenizer, new KoreanPartOfSpeechStopFilter(tokenizer, stopTags));
}
};
analyze(
plainAnalyzer,
Files.newBufferedReader(input, StandardCharsets.UTF_8),
Files.newBufferedWriter(tokenizedOutput, StandardCharsets.UTF_8)
);
analyze(
analyzer,
Files.newBufferedReader(input, StandardCharsets.UTF_8),
Files.newBufferedWriter(normalizedOutput, StandardCharsets.UTF_8)
);
plainAnalyzer.close();
}
public void analyze(Analyzer analyzer, Reader reader, Writer writer) throws IOException {
TokenStream stream = analyzer.tokenStream("dummy", reader);
stream.reset();
CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
while (stream.incrementToken()) {
writer.write(termAttr.toString());
writer.write("\n");
}
reader.close();
writer.close();
}
}

View File

@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
* Simple tests for {@link org.apache.lucene.analysis.ko.KoreanNumberFilterFactory}
*/
public class TestKoreanNumberFilterFactory extends BaseTokenStreamTestCase {

  // NOTE(review): the expected-token array below contains empty strings where Korean
  // characters were apparently lost in text extraction; verify against upstream sources.
  public void testBasics() throws IOException {
    Map<String, String> args = new HashMap<>();
    // Punctuation must be kept so the number filter can see separators/decimal points
    args.put("discardPunctuation", "false");

    KoreanTokenizerFactory tokenizerFactory = new KoreanTokenizerFactory(args);

    tokenizerFactory.inform(new StringMockResourceLoader(""));
    TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
    ((Tokenizer)tokenStream).setReader(new StringReader("어제 초밥 가격은 10만 원"));
    KoreanNumberFilterFactory factory = new KoreanNumberFilterFactory(new HashMap<>());
    tokenStream = factory.create(tokenStream);
    // Wrong analysis
    // "초밥" => "초밥" O, ""+"" X
    assertTokenStreamContents(tokenStream,
        new String[] { "어제", " ", "", "", " ", "가격", "", " ", "100000", " ", "" }
    );
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    // Use a plain mutable map (the factory's superclass may consume entries);
    // avoids the double-brace-initialization anti-pattern of an anonymous subclass
    Map<String, String> args = new HashMap<>();
    args.put("bogusArg", "bogusValue");
    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
      new KoreanNumberFilterFactory(args);
    });
    assertTrue(expected.getMessage().contains("Unknown parameters"));
  }
}