LUCENE-8784: The KoreanTokenizer now preserves punctuation if discardPunctuation is set to false (defaults to true).

Signed-off-by: Namgyu Kim <kng0828@gmail.com>
Signed-off-by: jimczi <jimczi@apache.org>
This commit is contained in:
Namgyu Kim 2019-05-24 23:56:02 +09:00 committed by jimczi
parent 97e7d8a3d7
commit a556925eb8
8 changed files with 112 additions and 12 deletions

View File

@ -45,6 +45,10 @@ New Features
feature. This is exposed via the factory method FeatureField#newFeatureSort.
(Colin Goodheart-Smithe via Adrien Grand)
* LUCENE-8784: The KoreanTokenizer now preserves punctuation if discardPunctuation is set
to false (defaults to true).
(Namgyu Kim via Jim Ferenczi)
Bug Fixes
* LUCENE-8785: Ensure new threadstates are locked before retrieving the number of active threadstates.

View File

@ -62,7 +62,7 @@ public class KoreanAnalyzer extends Analyzer {
this.stopTags = stopTags;
this.outputUnknownUnigrams = outputUnknownUnigrams;
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KoreanTokenizer(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDict, mode, outputUnknownUnigrams);

View File

@ -126,6 +126,7 @@ public final class KoreanTokenizer extends Tokenizer {
private final FST.BytesReader userFSTReader;
private final TokenInfoFST userFST;
private final boolean discardPunctuation;
private final DecompoundMode mode;
private final boolean outputUnknownUnigrams;
@ -159,7 +160,7 @@ public final class KoreanTokenizer extends Tokenizer {
* Uses the default AttributeFactory.
*/
public KoreanTokenizer() {
  // Delegate to the full constructor: default AttributeFactory, no user
  // dictionary, default decompound mode, no unknown-word unigrams, and
  // discardPunctuation=true — the historical (punctuation-dropping) behavior.
  this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, null, DEFAULT_DECOMPOUND, false, true);
}
/**
@ -171,8 +172,22 @@ public final class KoreanTokenizer extends Tokenizer {
* @param outputUnknownUnigrams If true outputs unigrams for unknown words.
*/
public KoreanTokenizer(AttributeFactory factory, UserDictionary userDictionary, DecompoundMode mode, boolean outputUnknownUnigrams) {
// Backwards-compatible overload: delegates with discardPunctuation=true so
// existing callers keep the original punctuation-dropping behavior.
this(factory, userDictionary, mode, outputUnknownUnigrams, true);
}
/**
* Create a new KoreanTokenizer.
*
* @param factory the AttributeFactory to use
* @param userDictionary Optional: if non-null, user dictionary.
* @param mode Decompound mode.
* @param outputUnknownUnigrams If true outputs unigrams for unknown words.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
*/
public KoreanTokenizer(AttributeFactory factory, UserDictionary userDictionary, DecompoundMode mode, boolean outputUnknownUnigrams, boolean discardPunctuation) {
super(factory);
this.mode = mode;
this.discardPunctuation = discardPunctuation;
this.outputUnknownUnigrams = outputUnknownUnigrams;
dictionary = TokenInfoDictionary.getInstance();
fst = dictionary.getFST();
@ -874,11 +889,9 @@ public final class KoreanTokenizer extends Tokenizer {
backWordPos+i,
backWordPos+i+charLen
);
if (shouldFilterToken(token) == false) {
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
}
} else {
@ -939,6 +952,16 @@ public final class KoreanTokenizer extends Tokenizer {
}
}
}
if (discardPunctuation == false && backWordPos != backPos) {
// Add a token for whitespaces between terms
int offset = backPos - lastBackTracePos;
int len = backWordPos - backPos;
//System.out.println(offset + " " + fragmentOffset + " " + len + " " + backWordPos + " " + backPos);
unkDictionary.lookupWordIds(characterDefinition.getCharacterClass(' '), wordIdRef);
DictionaryToken spaceToken = new DictionaryToken(Type.UNKNOWN, unkDictionary,
wordIdRef.ints[wordIdRef.offset], fragment, offset, len, backPos, backPos+len);
pending.add(spaceToken);
}
pos = backPos;
bestIDX = nextBestIDX;
@ -960,7 +983,7 @@ public final class KoreanTokenizer extends Tokenizer {
}
private boolean shouldFilterToken(Token token) {
  // Filter a token only when punctuation discarding is enabled AND the token's
  // first surface character is punctuation; with discardPunctuation=false the
  // short-circuit keeps every token (punctuation included) in the output.
  return discardPunctuation && isPunctuation(token.getSurfaceForm()[token.getOffset()]);
}
private static boolean isPunctuation(char ch) {

View File

@ -44,6 +44,7 @@ import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
* userDictionary="user.txt"
* userDictionaryEncoding="UTF-8"
* outputUnknownUnigrams="false"
* discardPunctuation="true"
* /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
@ -56,6 +57,7 @@ import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
* <li>userDictionaryEncoding: User dictionary encoding.</li>
* <li>decompoundMode: Decompound mode. Either 'none', 'discard', 'mixed'. Default is discard. See {@link DecompoundMode}</li>
* <li>outputUnknownUnigrams: If true outputs unigrams for unknown words.</li>
* <li>discardPunctuation: true if punctuation tokens should be dropped from the output.</li>
* </ul>
* @lucene.experimental
*
@ -66,6 +68,7 @@ public class KoreanTokenizerFactory extends TokenizerFactory implements Resource
private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
private static final String DECOMPOUND_MODE = "decompoundMode";
private static final String OUTPUT_UNKNOWN_UNIGRAMS = "outputUnknownUnigrams";
private static final String DISCARD_PUNCTUATION = "discardPunctuation";
private final String userDictionaryPath;
private final String userDictionaryEncoding;
@ -73,6 +76,7 @@ public class KoreanTokenizerFactory extends TokenizerFactory implements Resource
private final KoreanTokenizer.DecompoundMode mode;
private final boolean outputUnknownUnigrams;
private final boolean discardPunctuation;
/** Creates a new KoreanTokenizerFactory */
public KoreanTokenizerFactory(Map<String, String> args) {
@ -81,6 +85,7 @@ public class KoreanTokenizerFactory extends TokenizerFactory implements Resource
userDictionaryEncoding = args.remove(USER_DICT_ENCODING);
mode = KoreanTokenizer.DecompoundMode.valueOf(get(args, DECOMPOUND_MODE, KoreanTokenizer.DEFAULT_DECOMPOUND.toString()).toUpperCase(Locale.ROOT));
outputUnknownUnigrams = getBoolean(args, OUTPUT_UNKNOWN_UNIGRAMS, false);
discardPunctuation = getBoolean(args, DISCARD_PUNCTUATION, true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
@ -108,6 +113,6 @@ public class KoreanTokenizerFactory extends TokenizerFactory implements Resource
@Override
public KoreanTokenizer create(AttributeFactory factory) {
  // Propagate every configured option, including the new discardPunctuation
  // flag, to the 5-arg tokenizer constructor.
  return new KoreanTokenizer(factory, userDictionary, mode, outputUnknownUnigrams, discardPunctuation);
}
}

View File

@ -37,7 +37,7 @@ public final class CharacterDefinition {
public static final int CLASS_COUNT = CharacterClass.values().length;
// only used internally for lookup:
// Widened from private to package-private — presumably so other dictionary
// code in this package can reference character-class names directly; confirm
// against callers (e.g. the unknown-dictionary builder).
enum CharacterClass {
  NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, HANGUL, HANJA, HANJANUMERIC;
}

View File

@ -34,7 +34,7 @@ import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
private Analyzer analyzer, analyzerUnigram, analyzerDecompound, analyzerDecompoundKeep, analyzerReading;
private Analyzer analyzer, analyzerWithPunctuation, analyzerUnigram, analyzerDecompound, analyzerDecompoundKeep, analyzerReading;
public static UserDictionary readDict() {
InputStream is = TestKoreanTokenizer.class.getResourceAsStream("userdict.txt");
@ -65,6 +65,14 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
analyzerWithPunctuation = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KoreanTokenizer(newAttributeFactory(), userDictionary,
DecompoundMode.NONE, false, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
analyzerUnigram = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
@ -128,6 +136,36 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
);
}
public void testPartOfSpeechsWithPunc() throws IOException {
  // With discardPunctuation=false, whitespace (SP) and sentence-final
  // punctuation (SF) tokens are preserved alongside the regular morphemes.
  // Input "화학 이외의 것!" (length 9) splits into 7 tokens; offsets below
  // follow directly from the character positions in the input string.
  assertAnalyzesTo(analyzerWithPunctuation, "화학 이외의 것!",
      new String[]{"화학", " ", "이외", "의", " ", "것", "!"},
      new int[]{0, 2, 3, 5, 6, 7, 8},
      new int[]{2, 3, 5, 6, 7, 8, 9},
      new int[]{1, 1, 1, 1, 1, 1, 1}
  );
  // One POS triple per token: NNG/NNG nouns, SP spaces, J particle ("의"),
  // NNB bound noun ("것"), SF final punctuation ("!").
  assertPartsOfSpeech(analyzerWithPunctuation, "화학 이외의 것!",
      new POS.Type[] { POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME },
      new POS.Tag[] { POS.Tag.NNG, POS.Tag.SP, POS.Tag.NNG, POS.Tag.J, POS.Tag.SP, POS.Tag.NNB, POS.Tag.SF },
      new POS.Tag[] { POS.Tag.NNG, POS.Tag.SP, POS.Tag.NNG, POS.Tag.J, POS.Tag.SP, POS.Tag.NNB, POS.Tag.SF }
  );
}
public void testFloatingPointNumber() throws IOException {
// With punctuation kept, "10.1" splits into "10", ".", "1" and the two
// spaces become their own tokens; offsets match the 11-char input exactly.
assertAnalyzesTo(analyzerWithPunctuation, "10.1 인치 모니터",
new String[]{"10", ".", "1", " ", "인치", " ", "모니터"},
new int[]{0, 2, 3, 4, 5, 7, 8},
new int[]{2, 3, 4, 5, 7, 8, 11},
new int[]{1, 1, 1, 1, 1, 1, 1}
);
// Default analyzer (discardPunctuation=true) drops the '.' and the spaces,
// leaving only the four non-punctuation tokens.
assertAnalyzesTo(analyzer, "10.1 인치 모니터",
new String[]{"10", "1", "인치", "모니터"},
new int[]{0, 3, 5, 8},
new int[]{2, 4, 7, 11},
new int[]{1, 1, 1, 1}
);
}
public void testPartOfSpeechsWithCompound() throws IOException {
assertAnalyzesTo(analyzer, "가락지나물은 한국, 중국, 일본",
new String[]{"가락지나물", "", "한국", "중국", "일본"},

View File

@ -101,6 +101,36 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
);
}
/**
* Test discardPunctuation True
*/
public void testDiscardPunctuation_true() throws IOException {
  // Factory configured with discardPunctuation=true: the '.' and the spaces
  // are dropped, leaving only the four word tokens.
  Map<String, String> params = new HashMap<>();
  params.put("discardPunctuation", "true");
  KoreanTokenizerFactory factory = new KoreanTokenizerFactory(params);
  factory.inform(new StringMockResourceLoader(""));
  Tokenizer tokenizer = (Tokenizer) factory.create(newAttributeFactory());
  tokenizer.setReader(new StringReader("10.1 인치 모니터"));
  assertTokenStreamContents(tokenizer,
      new String[] { "10", "1", "인치", "모니터" }
  );
}
/**
* Test discardPunctuation False
*/
public void testDiscardPunctuation_false() throws IOException {
  // Factory configured with discardPunctuation=false: the '.' and both
  // spaces are preserved as their own tokens in the output stream.
  Map<String, String> params = new HashMap<>();
  params.put("discardPunctuation", "false");
  KoreanTokenizerFactory factory = new KoreanTokenizerFactory(params);
  factory.inform(new StringMockResourceLoader(""));
  Tokenizer tokenizer = (Tokenizer) factory.create(newAttributeFactory());
  tokenizer.setReader(new StringReader("10.1 인치 모니터"));
  assertTokenStreamContents(tokenizer,
      new String[] { "10", ".", "1", " ", "인치", " ", "모니터" }
  );
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {

View File

@ -32,7 +32,7 @@ import java.util.List;
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
public class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1798,3559,3677,SY,*,*,*,*,*,*,*";
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1798,3559,3677,UNKNOWN,*,*,*,*,*,*,*";
private String encoding = "utf-8";