mirror of https://github.com/apache/lucene.git
LUCENE-8784: The KoreanTokenizer now preserves punctuation if discardPunctuation is set to false (defaults to true).
Signed-off-by: Namgyu Kim <kng0828@gmail.com>
Signed-off-by: jimczi <jimczi@apache.org>

parent 97e7d8a3d7
commit a556925eb8
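In short, this commit threads a new discardPunctuation flag through KoreanTokenizer (Nori), KoreanAnalyzer, and KoreanTokenizerFactory; it defaults to true, so existing behavior is unchanged. A minimal usage sketch, assuming the nori analysis module is on the classpath (the class name DiscardPunctuationDemo is ours; the expected output is taken from the tests added in this commit):

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ko.KoreanTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class DiscardPunctuationDemo {
  public static void main(String[] args) throws Exception {
    // The last argument is the new discardPunctuation flag; passing false
    // keeps punctuation and whitespace tokens in the output.
    Tokenizer tokenizer = new KoreanTokenizer(
        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
        null,                                 // no user dictionary
        KoreanTokenizer.DEFAULT_DECOMPOUND,   // default decompound mode
        false,                                // outputUnknownUnigrams
        false);                               // discardPunctuation
    tokenizer.setReader(new StringReader("10.1 인치 모니터"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println("[" + term + "]");
    }
    tokenizer.end();
    tokenizer.close();
    // Expected, per the factory test below: [10] [.] [1] [ ] [인치] [ ] [모니터]
  }
}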
lucene/CHANGES.txt
@@ -45,6 +45,10 @@ New Features
   feature. This is exposed via the factory method FeatureField#newFeatureSort.
   (Colin Goodheart-Smithe via Adrien Grand)
 
+* LUCENE-8784: The KoreanTokenizer now preserves punctuation if discardPunctuation is set
+  to false (defaults to true).
+  (Namgyu Kim via Jim Ferenczi)
+
 Bug Fixes
 
 * LUCENE-8785: Ensure new threadstates are locked before retrieving the number of active threadstates.
KoreanAnalyzer.java
@@ -62,7 +62,7 @@ public class KoreanAnalyzer extends Analyzer {
     this.stopTags = stopTags;
     this.outputUnknownUnigrams = outputUnknownUnigrams;
   }
 
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    Tokenizer tokenizer = new KoreanTokenizer(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDict, mode, outputUnknownUnigrams);
+    Tokenizer tokenizer = new KoreanTokenizer(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDict, mode, outputUnknownUnigrams, true);
KoreanTokenizer.java
@@ -126,6 +126,7 @@ public final class KoreanTokenizer extends Tokenizer {
   private final FST.BytesReader userFSTReader;
   private final TokenInfoFST userFST;
 
+  private final boolean discardPunctuation;
   private final DecompoundMode mode;
   private final boolean outputUnknownUnigrams;
 
@@ -159,7 +160,7 @@ public final class KoreanTokenizer extends Tokenizer {
    * Uses the default AttributeFactory.
    */
   public KoreanTokenizer() {
-    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, null, DEFAULT_DECOMPOUND, false);
+    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, null, DEFAULT_DECOMPOUND, false, true);
   }
 
   /**
@@ -171,8 +172,22 @@ public final class KoreanTokenizer extends Tokenizer {
    * @param outputUnknownUnigrams If true outputs unigrams for unknown words.
    */
   public KoreanTokenizer(AttributeFactory factory, UserDictionary userDictionary, DecompoundMode mode, boolean outputUnknownUnigrams) {
+    this(factory, userDictionary, mode, outputUnknownUnigrams, true);
+  }
+
+  /**
+   * Create a new KoreanTokenizer.
+   *
+   * @param factory the AttributeFactory to use
+   * @param userDictionary Optional: if non-null, user dictionary.
+   * @param mode Decompound mode.
+   * @param outputUnknownUnigrams If true outputs unigrams for unknown words.
+   * @param discardPunctuation true if punctuation tokens should be dropped from the output.
+   */
+  public KoreanTokenizer(AttributeFactory factory, UserDictionary userDictionary, DecompoundMode mode, boolean outputUnknownUnigrams, boolean discardPunctuation) {
     super(factory);
     this.mode = mode;
+    this.discardPunctuation = discardPunctuation;
     this.outputUnknownUnigrams = outputUnknownUnigrams;
     dictionary = TokenInfoDictionary.getInstance();
     fst = dictionary.getFST();
@@ -874,11 +889,9 @@ public final class KoreanTokenizer extends Tokenizer {
                 backWordPos+i,
                 backWordPos+i+charLen
             );
-            if (shouldFilterToken(token) == false) {
-              pending.add(token);
-              if (VERBOSE) {
-                System.out.println("    add token=" + pending.get(pending.size() - 1));
-              }
+            pending.add(token);
+            if (VERBOSE) {
+              System.out.println("    add token=" + pending.get(pending.size() - 1));
             }
           }
         } else {
@@ -939,6 +952,16 @@ public final class KoreanTokenizer extends Tokenizer {
           }
         }
       }
+      if (discardPunctuation == false && backWordPos != backPos) {
+        // Add a token for whitespaces between terms
+        int offset = backPos - lastBackTracePos;
+        int len = backWordPos - backPos;
+        //System.out.println(offset + " " + fragmentOffset + " " + len + " " + backWordPos + " " + backPos);
+        unkDictionary.lookupWordIds(characterDefinition.getCharacterClass(' '), wordIdRef);
+        DictionaryToken spaceToken = new DictionaryToken(Type.UNKNOWN, unkDictionary,
+            wordIdRef.ints[wordIdRef.offset], fragment, offset, len, backPos, backPos+len);
+        pending.add(spaceToken);
+      }
 
       pos = backPos;
       bestIDX = nextBestIDX;
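A worked example for the whitespace hunk above, using the testFloatingPointNumber expectations added below: in "10.1 인치 모니터", the token "1" ends at offset 4 and "인치" starts at offset 5, so with discardPunctuation == false the backtrace emits a one-character space token spanning offsets 4-5 (and another spanning 7-8 before "모니터"). That token is typed UNKNOWN and carries the word id looked up for the SPACE character class.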
@@ -960,7 +983,7 @@ public final class KoreanTokenizer extends Tokenizer {
   }
 
   private boolean shouldFilterToken(Token token) {
-    return isPunctuation(token.getSurfaceForm()[token.getOffset()]);
+    return discardPunctuation && isPunctuation(token.getSurfaceForm()[token.getOffset()]);
   }
 
   private static boolean isPunctuation(char ch) {
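The hunk cuts off at the isPunctuation signature. For reference, a sketch of a predicate of this shape, assuming it classifies characters by Unicode general category (the exact body in KoreanTokenizer may differ); note that SPACE_SEPARATOR counting as punctuation is what makes plain spaces disappear when discardPunctuation is true:

// Sketch only: the exact category set in KoreanTokenizer#isPunctuation may differ.
private static boolean isPunctuation(char ch) {
  switch (Character.getType(ch)) {
    case Character.SPACE_SEPARATOR:
    case Character.LINE_SEPARATOR:
    case Character.PARAGRAPH_SEPARATOR:
    case Character.CONTROL:
    case Character.FORMAT:
    case Character.DASH_PUNCTUATION:
    case Character.START_PUNCTUATION:
    case Character.END_PUNCTUATION:
    case Character.CONNECTOR_PUNCTUATION:
    case Character.OTHER_PUNCTUATION:
    case Character.MATH_SYMBOL:
    case Character.CURRENCY_SYMBOL:
    case Character.MODIFIER_SYMBOL:
    case Character.OTHER_SYMBOL:
    case Character.INITIAL_QUOTE_PUNCTUATION:
    case Character.FINAL_QUOTE_PUNCTUATION:
      return true;
    default:
      return false;
  }
}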
KoreanTokenizerFactory.java
@@ -44,6 +44,7 @@ import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
  *     userDictionary="user.txt"
  *     userDictionaryEncoding="UTF-8"
  *     outputUnknownUnigrams="false"
+ *     discardPunctuation="true"
  *   />
  * </analyzer>
  * </fieldType>
@@ -56,6 +57,7 @@ import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
  *   <li>userDictionaryEncoding: User dictionary encoding.</li>
  *   <li>decompoundMode: Decompound mode. Either 'none', 'discard', 'mixed'. Default is discard. See {@link DecompoundMode}</li>
  *   <li>outputUnknownUnigrams: If true outputs unigrams for unknown words.</li>
+ *   <li>discardPunctuation: true if punctuation tokens should be dropped from the output.</li>
  * </ul>
  * @lucene.experimental
  *
@@ -66,6 +68,7 @@ public class KoreanTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
   private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
   private static final String DECOMPOUND_MODE = "decompoundMode";
   private static final String OUTPUT_UNKNOWN_UNIGRAMS = "outputUnknownUnigrams";
+  private static final String DISCARD_PUNCTUATION = "discardPunctuation";
 
   private final String userDictionaryPath;
   private final String userDictionaryEncoding;
@@ -73,6 +76,7 @@ public class KoreanTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
 
   private final KoreanTokenizer.DecompoundMode mode;
   private final boolean outputUnknownUnigrams;
+  private final boolean discardPunctuation;
 
   /** Creates a new KoreanTokenizerFactory */
   public KoreanTokenizerFactory(Map<String, String> args) {
@@ -81,6 +85,7 @@ public class KoreanTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
     userDictionaryEncoding = args.remove(USER_DICT_ENCODING);
     mode = KoreanTokenizer.DecompoundMode.valueOf(get(args, DECOMPOUND_MODE, KoreanTokenizer.DEFAULT_DECOMPOUND.toString()).toUpperCase(Locale.ROOT));
     outputUnknownUnigrams = getBoolean(args, OUTPUT_UNKNOWN_UNIGRAMS, false);
+    discardPunctuation = getBoolean(args, DISCARD_PUNCTUATION, true);
 
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
@@ -108,6 +113,6 @@ public class KoreanTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
 
   @Override
   public KoreanTokenizer create(AttributeFactory factory) {
-    return new KoreanTokenizer(factory, userDictionary, mode, outputUnknownUnigrams);
+    return new KoreanTokenizer(factory, userDictionary, mode, outputUnknownUnigrams, discardPunctuation);
   }
 }
CharacterDefinition.java
@@ -37,7 +37,7 @@ public final class CharacterDefinition {
   public static final int CLASS_COUNT = CharacterClass.values().length;
 
   // only used internally for lookup:
-  private enum CharacterClass {
+  enum CharacterClass {
     NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, HANGUL, HANJA, HANJANUMERIC;
   }
 
TestKoreanTokenizer.java
@@ -34,7 +34,7 @@ import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
 import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
 
 public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
-  private Analyzer analyzer, analyzerUnigram, analyzerDecompound, analyzerDecompoundKeep, analyzerReading;
+  private Analyzer analyzer, analyzerWithPunctuation, analyzerUnigram, analyzerDecompound, analyzerDecompoundKeep, analyzerReading;
 
   public static UserDictionary readDict() {
     InputStream is = TestKoreanTokenizer.class.getResourceAsStream("userdict.txt");
@@ -65,6 +65,14 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
+    analyzerWithPunctuation = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new KoreanTokenizer(newAttributeFactory(), userDictionary,
+            DecompoundMode.NONE, false, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+    };
     analyzerUnigram = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
@@ -128,6 +136,36 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
     );
   }
 
+  public void testPartOfSpeechsWithPunc() throws IOException {
+    assertAnalyzesTo(analyzerWithPunctuation, "화학 이외의 것!",
+        new String[]{"화학", " ", "이외", "의", " ", "것", "!"},
+        new int[]{0, 2, 3, 5, 6, 7, 8, 9},
+        new int[]{2, 3, 5, 6, 7, 8, 9, 11},
+        new int[]{1, 1, 1, 1, 1, 1, 1, 1}
+    );
+    assertPartsOfSpeech(analyzerWithPunctuation, "화학 이외의 것!",
+        new POS.Type[] { POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME },
+        new POS.Tag[] { POS.Tag.NNG, POS.Tag.SP, POS.Tag.NNG, POS.Tag.J, POS.Tag.SP, POS.Tag.NNB, POS.Tag.SF },
+        new POS.Tag[] { POS.Tag.NNG, POS.Tag.SP, POS.Tag.NNG, POS.Tag.J, POS.Tag.SP, POS.Tag.NNB, POS.Tag.SF }
+    );
+  }
+
+  public void testFloatingPointNumber() throws IOException {
+    assertAnalyzesTo(analyzerWithPunctuation, "10.1 인치 모니터",
+        new String[]{"10", ".", "1", " ", "인치", " ", "모니터"},
+        new int[]{0, 2, 3, 4, 5, 7, 8},
+        new int[]{2, 3, 4, 5, 7, 8, 11},
+        new int[]{1, 1, 1, 1, 1, 1, 1}
+    );
+
+    assertAnalyzesTo(analyzer, "10.1 인치 모니터",
+        new String[]{"10", "1", "인치", "모니터"},
+        new int[]{0, 3, 5, 8},
+        new int[]{2, 4, 7, 11},
+        new int[]{1, 1, 1, 1}
+    );
+  }
+
   public void testPartOfSpeechsWithCompound() throws IOException {
     assertAnalyzesTo(analyzer, "가락지나물은 한국, 중국, 일본",
         new String[]{"가락지나물", "은", "한국", "중국", "일본"},
TestKoreanTokenizerFactory.java
@@ -101,6 +101,36 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     );
   }
 
+  /**
+   * Test discardPunctuation True
+   */
+  public void testDiscardPunctuation_true() throws IOException {
+    Map<String,String> args = new HashMap<>();
+    args.put("discardPunctuation", "true");
+    KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
+    factory.inform(new StringMockResourceLoader(""));
+    TokenStream ts = factory.create(newAttributeFactory());
+    ((Tokenizer)ts).setReader(new StringReader("10.1 인치 모니터"));
+    assertTokenStreamContents(ts,
+        new String[] { "10", "1", "인치", "모니터" }
+    );
+  }
+
+  /**
+   * Test discardPunctuation False
+   */
+  public void testDiscardPunctuation_false() throws IOException {
+    Map<String,String> args = new HashMap<>();
+    args.put("discardPunctuation", "false");
+    KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
+    factory.inform(new StringMockResourceLoader(""));
+    TokenStream ts = factory.create(newAttributeFactory());
+    ((Tokenizer)ts).setReader(new StringReader("10.1 인치 모니터"));
+    assertTokenStreamContents(ts,
+        new String[] { "10", ".", "1", " ", "인치", " ", "모니터" }
+    );
+  }
+
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
UnknownDictionaryBuilder.java
@@ -32,7 +32,7 @@ import java.util.List;
 import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
 
 public class UnknownDictionaryBuilder {
-  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1798,3559,3677,SY,*,*,*,*,*,*,*";
+  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1798,3559,3677,UNKNOWN,*,*,*,*,*,*,*";
 
   private String encoding = "utf-8";