mirror of https://github.com/apache/lucene.git
GITHUB#11778: Add detailed part-of-speech tag for particle and ending on Nori (#11779)
This commit is contained in:
parent
155876a902
commit
451bab300e
|
@ -93,6 +93,11 @@ API Changes
|
||||||
* GITHUB#11772: Removed native subproject and WindowsDirectory implementation from lucene.misc. Recommendation:
|
* GITHUB#11772: Removed native subproject and WindowsDirectory implementation from lucene.misc. Recommendation:
|
||||||
use MMapDirectory implementation on Windows. (Robert Muir, Uwe Schindler, Dawid Weiss)
|
use MMapDirectory implementation on Windows. (Robert Muir, Uwe Schindler, Dawid Weiss)
|
||||||
|
|
||||||
|
Improvements
|
||||||
|
---------------------
|
||||||
|
* GITHUB#11778: Detailed part-of-speech information for particles (조사) and endings (어미) in Nori
|
||||||
|
is now tagged. (Namgyu Kim)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
---------------------
|
---------------------
|
||||||
* GITHUB#11726: Indexing term vectors on large documents could fail due to
|
* GITHUB#11726: Indexing term vectors on large documents could fail due to
|
||||||
|
|
|
@ -36,9 +36,21 @@ public final class KoreanPartOfSpeechStopFilter extends FilteringTokenFilter {
|
||||||
public static final Set<POS.Tag> DEFAULT_STOP_TAGS =
|
public static final Set<POS.Tag> DEFAULT_STOP_TAGS =
|
||||||
new HashSet<>(
|
new HashSet<>(
|
||||||
Arrays.asList(
|
Arrays.asList(
|
||||||
POS.Tag.E,
|
POS.Tag.EP,
|
||||||
|
POS.Tag.EF,
|
||||||
|
POS.Tag.EC,
|
||||||
|
POS.Tag.ETN,
|
||||||
|
POS.Tag.ETM,
|
||||||
POS.Tag.IC,
|
POS.Tag.IC,
|
||||||
POS.Tag.J,
|
POS.Tag.JKS,
|
||||||
|
POS.Tag.JKC,
|
||||||
|
POS.Tag.JKG,
|
||||||
|
POS.Tag.JKO,
|
||||||
|
POS.Tag.JKB,
|
||||||
|
POS.Tag.JKV,
|
||||||
|
POS.Tag.JKQ,
|
||||||
|
POS.Tag.JX,
|
||||||
|
POS.Tag.JC,
|
||||||
POS.Tag.MAG,
|
POS.Tag.MAG,
|
||||||
POS.Tag.MAJ,
|
POS.Tag.MAJ,
|
||||||
POS.Tag.MM,
|
POS.Tag.MM,
|
||||||
|
|
|
@ -42,14 +42,50 @@ public class POS {
|
||||||
|
|
||||||
/** Part of speech tag for Korean based on Sejong corpus classification. */
|
/** Part of speech tag for Korean based on Sejong corpus classification. */
|
||||||
public enum Tag {
|
public enum Tag {
|
||||||
/** Verbal endings */
|
/** Pre-final ending */
|
||||||
E(100, "Verbal endings"),
|
EP(100, "Pre-final ending"),
|
||||||
|
|
||||||
|
/** Sentence-closing ending */
|
||||||
|
EF(101, "Sentence-closing ending"),
|
||||||
|
|
||||||
|
/** Connective ending */
|
||||||
|
EC(102, "Connective ending"),
|
||||||
|
|
||||||
|
/** Nominal transformative ending */
|
||||||
|
ETN(103, "Nominal transformative ending"),
|
||||||
|
|
||||||
|
/** Adnominal form transformative ending */
|
||||||
|
ETM(104, "Adnominal form transformative ending"),
|
||||||
|
|
||||||
/** Interjection */
|
/** Interjection */
|
||||||
IC(110, "Interjection"),
|
IC(110, "Interjection"),
|
||||||
|
|
||||||
/** Ending Particle */
|
/** Subject case marker */
|
||||||
J(120, "Ending Particle"),
|
JKS(120, "Subject case marker"),
|
||||||
|
|
||||||
|
/** Complement case marker */
|
||||||
|
JKC(121, "Complement case marker"),
|
||||||
|
|
||||||
|
/** Adnominal case marker */
|
||||||
|
JKG(122, "Adnominal case marker"),
|
||||||
|
|
||||||
|
/** Object case marker */
|
||||||
|
JKO(123, "Object case marker"),
|
||||||
|
|
||||||
|
/** Adverbial case marker */
|
||||||
|
JKB(124, "Adverbial case marker"),
|
||||||
|
|
||||||
|
/** Vocative case marker */
|
||||||
|
JKV(125, "Vocative case marker"),
|
||||||
|
|
||||||
|
/** Quotative case marker */
|
||||||
|
JKQ(126, "Quotative case marker"),
|
||||||
|
|
||||||
|
/** Auxiliary postpositional particle */
|
||||||
|
JX(127, "Auxiliary postpositional particle"),
|
||||||
|
|
||||||
|
/** Conjunctive postpositional particle */
|
||||||
|
JC(128, "Conjunctive postpositional particle"),
|
||||||
|
|
||||||
/** General Adverb */
|
/** General Adverb */
|
||||||
MAG(130, "General Adverb"),
|
MAG(130, "General Adverb"),
|
||||||
|
@ -177,14 +213,7 @@ public class POS {
|
||||||
|
|
||||||
/** Returns the {@link Tag} of the provided <code>name</code>. */
|
/** Returns the {@link Tag} of the provided <code>name</code>. */
|
||||||
public static Tag resolveTag(String name) {
|
public static Tag resolveTag(String name) {
|
||||||
String tagUpper = name.toUpperCase(Locale.ENGLISH);
|
return Tag.valueOf(name.toUpperCase(Locale.ENGLISH));
|
||||||
if (tagUpper.startsWith("J")) {
|
|
||||||
return Tag.J;
|
|
||||||
} else if (tagUpper.startsWith("E")) {
|
|
||||||
return Tag.E;
|
|
||||||
} else {
|
|
||||||
return Tag.valueOf(tagUpper);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the {@link Tag} of the provided <code>tag</code>. */
|
/** Returns the {@link Tag} of the provided <code>tag</code>. */
|
||||||
|
|
|
@ -348,8 +348,20 @@ final class Viterbi
|
||||||
if (numSpaces > 0) {
|
if (numSpaces > 0) {
|
||||||
// TODO we should extract the penalty (left-space-penalty-factor) from the dicrc file.
|
// TODO we should extract the penalty (left-space-penalty-factor) from the dicrc file.
|
||||||
switch (leftPOS) {
|
switch (leftPOS) {
|
||||||
case E:
|
case EP:
|
||||||
case J:
|
case EF:
|
||||||
|
case EC:
|
||||||
|
case ETN:
|
||||||
|
case ETM:
|
||||||
|
case JKS:
|
||||||
|
case JKC:
|
||||||
|
case JKG:
|
||||||
|
case JKO:
|
||||||
|
case JKB:
|
||||||
|
case JKV:
|
||||||
|
case JKQ:
|
||||||
|
case JX:
|
||||||
|
case JC:
|
||||||
case VCP:
|
case VCP:
|
||||||
case XSA:
|
case XSA:
|
||||||
case XSN:
|
case XSN:
|
||||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -35,7 +35,7 @@ public class TestKoreanPartOfSpeechStopFilterFactory extends BaseTokenStreamTest
|
||||||
((Tokenizer) ts).setReader(new StringReader(" 한국은 대단한 나라입니다."));
|
((Tokenizer) ts).setReader(new StringReader(" 한국은 대단한 나라입니다."));
|
||||||
Map<String, String> args = new HashMap<>();
|
Map<String, String> args = new HashMap<>();
|
||||||
args.put("luceneMatchVersion", Version.LATEST.toString());
|
args.put("luceneMatchVersion", Version.LATEST.toString());
|
||||||
args.put("tags", "E, J");
|
args.put("tags", "EP, EF, EC, ETN, ETM, JKS, JKC, JKG, JKO, JKB, JKV, JKQ, JX, JC");
|
||||||
KoreanPartOfSpeechStopFilterFactory factory = new KoreanPartOfSpeechStopFilterFactory(args);
|
KoreanPartOfSpeechStopFilterFactory factory = new KoreanPartOfSpeechStopFilterFactory(args);
|
||||||
ts = factory.create(ts);
|
ts = factory.create(ts);
|
||||||
assertTokenStreamContents(ts, new String[] {"한국", "대단", "하", "나라", "이"});
|
assertTokenStreamContents(ts, new String[] {"한국", "대단", "하", "나라", "이"});
|
||||||
|
|
|
@ -154,8 +154,8 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
||||||
analyzer,
|
analyzer,
|
||||||
"화학 이외의 것",
|
"화학 이외의 것",
|
||||||
new POS.Type[] {POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME},
|
new POS.Type[] {POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME},
|
||||||
new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNB},
|
new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JKG, POS.Tag.NNB},
|
||||||
new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNB});
|
new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JKG, POS.Tag.NNB});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testPartOfSpeechs() throws IOException {
|
public void testPartOfSpeechs() throws IOException {
|
||||||
|
@ -170,8 +170,8 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
||||||
analyzer,
|
analyzer,
|
||||||
"화학 이외의 것",
|
"화학 이외의 것",
|
||||||
new POS.Type[] {POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME},
|
new POS.Type[] {POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME},
|
||||||
new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNB},
|
new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JKG, POS.Tag.NNB},
|
||||||
new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNB});
|
new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JKG, POS.Tag.NNB});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testPartOfSpeechsWithPunc() throws IOException {
|
public void testPartOfSpeechsWithPunc() throws IOException {
|
||||||
|
@ -195,10 +195,10 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
||||||
POS.Type.MORPHEME
|
POS.Type.MORPHEME
|
||||||
},
|
},
|
||||||
new POS.Tag[] {
|
new POS.Tag[] {
|
||||||
POS.Tag.NNG, POS.Tag.SP, POS.Tag.NNG, POS.Tag.J, POS.Tag.SP, POS.Tag.NNB, POS.Tag.SF
|
POS.Tag.NNG, POS.Tag.SP, POS.Tag.NNG, POS.Tag.JKG, POS.Tag.SP, POS.Tag.NNB, POS.Tag.SF
|
||||||
},
|
},
|
||||||
new POS.Tag[] {
|
new POS.Tag[] {
|
||||||
POS.Tag.NNG, POS.Tag.SP, POS.Tag.NNG, POS.Tag.J, POS.Tag.SP, POS.Tag.NNB, POS.Tag.SF
|
POS.Tag.NNG, POS.Tag.SP, POS.Tag.NNG, POS.Tag.JKG, POS.Tag.SP, POS.Tag.NNB, POS.Tag.SF
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -239,8 +239,8 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
||||||
POS.Type.MORPHEME,
|
POS.Type.MORPHEME,
|
||||||
POS.Type.MORPHEME
|
POS.Type.MORPHEME
|
||||||
},
|
},
|
||||||
new POS.Tag[] {POS.Tag.NNG, POS.Tag.J, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP},
|
new POS.Tag[] {POS.Tag.NNG, POS.Tag.JX, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP},
|
||||||
new POS.Tag[] {POS.Tag.NNG, POS.Tag.J, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP});
|
new POS.Tag[] {POS.Tag.NNG, POS.Tag.JX, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP});
|
||||||
|
|
||||||
assertAnalyzesTo(
|
assertAnalyzesTo(
|
||||||
analyzerDecompound,
|
analyzerDecompound,
|
||||||
|
@ -271,8 +271,10 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
||||||
POS.Type.MORPHEME,
|
POS.Type.MORPHEME,
|
||||||
POS.Type.MORPHEME
|
POS.Type.MORPHEME
|
||||||
},
|
},
|
||||||
new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP},
|
new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JX, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP},
|
||||||
new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP});
|
new POS.Tag[] {
|
||||||
|
POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JX, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP
|
||||||
|
});
|
||||||
|
|
||||||
assertPartsOfSpeech(
|
assertPartsOfSpeech(
|
||||||
analyzerDecompoundKeep,
|
analyzerDecompoundKeep,
|
||||||
|
@ -287,10 +289,10 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
||||||
POS.Type.MORPHEME
|
POS.Type.MORPHEME
|
||||||
},
|
},
|
||||||
new POS.Tag[] {
|
new POS.Tag[] {
|
||||||
POS.Tag.NNG, POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP
|
POS.Tag.NNG, POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JX, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP
|
||||||
},
|
},
|
||||||
new POS.Tag[] {
|
new POS.Tag[] {
|
||||||
POS.Tag.NNG, POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP
|
POS.Tag.NNG, POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JX, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -303,7 +305,7 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
||||||
"감싸여",
|
"감싸여",
|
||||||
new POS.Type[] {POS.Type.INFLECT},
|
new POS.Type[] {POS.Type.INFLECT},
|
||||||
new POS.Tag[] {POS.Tag.VV},
|
new POS.Tag[] {POS.Tag.VV},
|
||||||
new POS.Tag[] {POS.Tag.E});
|
new POS.Tag[] {POS.Tag.EC});
|
||||||
|
|
||||||
assertAnalyzesTo(
|
assertAnalyzesTo(
|
||||||
analyzerDecompound,
|
analyzerDecompound,
|
||||||
|
@ -327,15 +329,15 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
||||||
analyzerDecompound,
|
analyzerDecompound,
|
||||||
"감싸여",
|
"감싸여",
|
||||||
new POS.Type[] {POS.Type.MORPHEME, POS.Type.MORPHEME},
|
new POS.Type[] {POS.Type.MORPHEME, POS.Type.MORPHEME},
|
||||||
new POS.Tag[] {POS.Tag.VV, POS.Tag.E},
|
new POS.Tag[] {POS.Tag.VV, POS.Tag.EC},
|
||||||
new POS.Tag[] {POS.Tag.VV, POS.Tag.E});
|
new POS.Tag[] {POS.Tag.VV, POS.Tag.EC});
|
||||||
|
|
||||||
assertPartsOfSpeech(
|
assertPartsOfSpeech(
|
||||||
analyzerDecompoundKeep,
|
analyzerDecompoundKeep,
|
||||||
"감싸여",
|
"감싸여",
|
||||||
new POS.Type[] {POS.Type.INFLECT, POS.Type.MORPHEME, POS.Type.MORPHEME},
|
new POS.Type[] {POS.Type.INFLECT, POS.Type.MORPHEME, POS.Type.MORPHEME},
|
||||||
new POS.Tag[] {POS.Tag.VV, POS.Tag.VV, POS.Tag.E},
|
new POS.Tag[] {POS.Tag.VV, POS.Tag.VV, POS.Tag.EC},
|
||||||
new POS.Tag[] {POS.Tag.E, POS.Tag.VV, POS.Tag.E});
|
new POS.Tag[] {POS.Tag.EC, POS.Tag.VV, POS.Tag.EC});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testUnknownWord() throws IOException {
|
public void testUnknownWord() throws IOException {
|
||||||
|
|
Loading…
Reference in New Issue