mirror of https://github.com/apache/lucene.git

commit a9607b2a88
parent ff1e2fa658

LUCENE-8954: refactor Nori analyzer

Signed-off-by: Namgyu Kim <namgyu@apache.org>
GraphvizFormatter.java

@@ -157,17 +157,14 @@ public class GraphvizFormatter {
   }
 
   private String formatHeader() {
-    StringBuilder sb = new StringBuilder();
-    sb.append("digraph viterbi {\n");
-    sb.append(" graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
-    //sb.append(" // A2 paper size\n");
-    //sb.append(" size = \"34.4,16.5\";\n");
-    //sb.append(" // try to fill paper\n");
-    //sb.append(" ratio = fill;\n");
-    sb.append(" edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
-    sb.append(" node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
-
-    return sb.toString();
+    return "digraph viterbi {\n" +
+        " graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n" +
+        //sb.append(" // A2 paper size\n");
+        //sb.append(" size = \"34.4,16.5\";\n");
+        //sb.append(" // try to fill paper\n");
+        //sb.append(" ratio = fill;\n");
+        " edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n" +
+        " node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n";
   }
 
   private String formatTrailer() {
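The formatHeader() rewrite keeps the commented-out sizing lines between the concatenated literals, which is legal Java (comments may sit between operands). If FONT_NAME is a constant String, the whole return value can fold into a single string literal at compile time, so no builder runs at runtime. A minimal sketch, with a stand-in FONT_NAME:

class HeaderDemo {
  private static final String FONT_NAME = "Helvetica"; // stand-in for the class constant

  static String header() {
    // All operands are compile-time constants, so javac emits one
    // pre-concatenated literal; no StringBuilder is created at runtime.
    return "digraph viterbi {\n" +
        " edge [ fontname=\"" + FONT_NAME + "\" ]\n" +
        " node [ fontname=\"" + FONT_NAME + "\" ]\n";
  }
}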
KoreanAnalyzer.java

@@ -74,7 +74,6 @@ public class KoreanAnalyzer extends Analyzer {
 
   @Override
   protected TokenStream normalize(String fieldName, TokenStream in) {
-    TokenStream result = new LowerCaseFilter(in);
-    return result;
+    return new LowerCaseFilter(in);
   }
 }
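Analyzer.normalize(String, String) is the public entry point that runs this chain over a single query-time term. A minimal usage sketch ("body" is an arbitrary field name):

import org.apache.lucene.analysis.ko.KoreanAnalyzer;
import org.apache.lucene.util.BytesRef;

class NormalizeDemo {
  public static void main(String[] args) {
    try (KoreanAnalyzer analyzer = new KoreanAnalyzer()) {
      // Drives the protected normalize(fieldName, in) above, i.e. LowerCaseFilter only.
      BytesRef term = analyzer.normalize("body", "HELLO");
      System.out.println(term.utf8ToString()); // hello
    }
  }
}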
KoreanNumberFilter.java

@@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ko;
 
 import java.io.IOException;
 import java.math.BigDecimal;
+import java.util.Arrays;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;

@@ -98,9 +99,7 @@ public class KoreanNumberFilter extends TokenFilter {
 
   static {
     numerals = new char[0x10000];
-    for (int i = 0; i < numerals.length; i++) {
-      numerals[i] = NO_NUMERAL;
-    }
+    Arrays.fill(numerals, NO_NUMERAL);
     numerals['영'] = 0; // 영 U+C601 0
     numerals['일'] = 1; // 일 U+C77C 1
     numerals['이'] = 2; // 이 U+C774 2

@@ -113,9 +112,7 @@ public class KoreanNumberFilter extends TokenFilter {
     numerals['구'] = 9; // 구 U+AD6C 9
 
     exponents = new char[0x10000];
-    for (int i = 0; i < exponents.length; i++) {
-      exponents[i] = 0;
-    }
+    Arrays.fill(exponents, (char) 0);
     exponents['십'] = 1; // 십 U+C2ED 10
     exponents['백'] = 2; // 백 U+BC31 100
     exponents['천'] = 3; // 천 U+CC9C 1,000
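Both fill loops above collapse to java.util.Arrays.fill, which writes one value into every slot of the array. A self-contained equivalence check, with a placeholder value standing in for the filter's NO_NUMERAL constant:

import java.util.Arrays;

class FillDemo {
  public static void main(String[] args) {
    final char NO_NUMERAL = '\u0000'; // placeholder; the real constant lives in KoreanNumberFilter
    char[] viaLoop = new char[0x10000];
    char[] viaFill = new char[0x10000];
    for (int i = 0; i < viaLoop.length; i++) {
      viaLoop[i] = NO_NUMERAL;        // old style
    }
    Arrays.fill(viaFill, NO_NUMERAL); // new style
    System.out.println(Arrays.equals(viaLoop, viaFill)); // true
  }
}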
KoreanPartOfSpeechStopFilter.java

@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ko;
 
 
 import java.util.Arrays;
+import java.util.HashSet;
 import java.util.Set;
 import java.util.stream.Collectors;
 

@@ -36,7 +37,7 @@ public final class KoreanPartOfSpeechStopFilter extends FilteringTokenFilter {
   /**
    * Default list of tags to filter.
    */
-  public static final Set<POS.Tag> DEFAULT_STOP_TAGS = Arrays.asList(
+  public static final Set<POS.Tag> DEFAULT_STOP_TAGS = new HashSet<>(Arrays.asList(
       POS.Tag.E,
       POS.Tag.IC,
       POS.Tag.J,

@@ -55,7 +56,7 @@ public final class KoreanPartOfSpeechStopFilter extends FilteringTokenFilter {
       POS.Tag.UNA,
       POS.Tag.NA,
       POS.Tag.VSV
-  ).stream().collect(Collectors.toSet());
+  ));
 
   /**
    * Create a new {@link KoreanPartOfSpeechStopFilter} with the default
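new HashSet<>(Arrays.asList(...)) yields a set equal to the old stream-and-collect pipeline, with one fewer intermediate object and a concrete HashSet type; for enum constants EnumSet.of(...) would be another natural fit, but the commit stays with HashSet. A quick equivalence sketch with stand-in tags:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;

class StopTagsDemo {
  enum Tag { E, IC, J } // stand-ins for POS.Tag constants

  public static void main(String[] args) {
    Set<Tag> viaStream = Arrays.asList(Tag.E, Tag.IC, Tag.J)
        .stream().collect(Collectors.toSet());
    Set<Tag> viaCtor = new HashSet<>(Arrays.asList(Tag.E, Tag.IC, Tag.J));
    System.out.println(viaStream.equals(viaCtor)); // true: same membership
  }
}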
KoreanTokenizer.java

@@ -340,7 +340,7 @@ public final class KoreanTokenizer extends Tokenizer {
 
   }
 
-  private void add(Dictionary dict, Position fromPosData, int wordPos, int endPos, int wordID, Type type) throws IOException {
+  private void add(Dictionary dict, Position fromPosData, int wordPos, int endPos, int wordID, Type type) {
     final POS.Tag leftPOS = dict.getLeftPOS(wordID);
     final int wordCost = dict.getWordCost(wordID);
     final int leftID = dict.getLeftId(wordID);

@@ -533,15 +533,9 @@ public final class KoreanTokenizer extends Tokenizer {
     int userWordMaxPosAhead = -1;
 
     // Advances over each position (character):
-    while (true) {
-
-      if (buffer.get(pos) == -1) {
-        // End
-        break;
-      }
-
+    while (buffer.get(pos) != -1) {
       final Position posData = positions.get(pos);
-      final boolean isFrontier = positions.getNextPos() == pos+1;
+      final boolean isFrontier = positions.getNextPos() == pos + 1;
 
       if (posData.count == 0) {
         // No arcs arrive here; move to next position:

@@ -585,9 +579,9 @@ public final class KoreanTokenizer extends Tokenizer {
         int leastIDX = -1;
         int leastCost = Integer.MAX_VALUE;
         Position leastPosData = null;
-        for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
+        for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
           final Position posData2 = positions.get(pos2);
-          for(int idx=0;idx<posData2.count;idx++) {
+          for (int idx = 0; idx < posData2.count; idx++) {
            //System.out.println(" idx=" + idx + " cost=" + cost);
            final int cost = posData2.costs[idx];
            if (cost < leastCost) {

@@ -602,7 +596,7 @@ public final class KoreanTokenizer extends Tokenizer {
         assert leastIDX != -1;
 
         // Second pass: prune all but the best path:
-        for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
+        for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
           final Position posData2 = positions.get(pos2);
           if (posData2 != leastPosData) {
             posData2.reset();

@@ -655,7 +649,7 @@ public final class KoreanTokenizer extends Tokenizer {
       if (Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
         int nextChar = buffer.get(++pos);
         while (nextChar != -1 && Character.getType(nextChar) == Character.SPACE_SEPARATOR) {
-          pos ++;
+          pos++;
           nextChar = buffer.get(pos);
         }
       }

@@ -673,7 +667,7 @@ public final class KoreanTokenizer extends Tokenizer {
       int outputMaxPosAhead = 0;
       int arcFinalOutMaxPosAhead = 0;
 
-      for(int posAhead=pos;;posAhead++) {
+      for (int posAhead = pos; ; posAhead++) {
         final int ch = buffer.get(posAhead);
         if (ch == -1) {
           break;

@@ -695,9 +689,9 @@ public final class KoreanTokenizer extends Tokenizer {
         if (VERBOSE) {
           System.out.println(" USER word " + new String(buffer.get(pos, maxPosAhead + 1)) + " toPos=" + (maxPosAhead + 1));
         }
-        add(userDictionary, posData, pos, maxPosAhead+1, outputMaxPosAhead+arcFinalOutMaxPosAhead, Type.USER);
+        add(userDictionary, posData, pos, maxPosAhead + 1, outputMaxPosAhead + arcFinalOutMaxPosAhead, Type.USER);
         userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
       }
     }
 
     // TODO: we can be more aggressive about user

@@ -709,7 +703,7 @@ public final class KoreanTokenizer extends Tokenizer {
       fst.getFirstArc(arc);
       int output = 0;
 
-      for(int posAhead=pos;;posAhead++) {
+      for (int posAhead = pos; ; posAhead++) {
         final int ch = buffer.get(posAhead);
         if (ch == -1) {
           break;

@@ -734,7 +728,7 @@ public final class KoreanTokenizer extends Tokenizer {
           System.out.println(" KNOWN word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.length + " wordIDs");
         }
         for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
-          add(dictionary, posData, pos, posAhead+1, wordIdRef.ints[wordIdRef.offset + ofs], Type.KNOWN);
+          add(dictionary, posData, pos, posAhead + 1, wordIdRef.ints[wordIdRef.offset + ofs], Type.KNOWN);
           anyMatches = true;
         }
       }

@@ -758,7 +752,7 @@ public final class KoreanTokenizer extends Tokenizer {
       } else {
         // Extract unknown word. Characters with the same script are considered to be part of unknown word
         unknownWordLength = 1;
-        UnicodeScript scriptCode = UnicodeScript.of((int) firstCharacter);
+        UnicodeScript scriptCode = UnicodeScript.of(firstCharacter);
         final boolean isPunct = isPunctuation(firstCharacter);
         for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
           int next = buffer.get(posAhead);

@@ -774,8 +768,8 @@ public final class KoreanTokenizer extends Tokenizer {
               || chType == Character.NON_SPACING_MARK;
 
           if (sameScript
-            && isPunctuation(ch, chType) == isPunct
-            && characterDefinition.isGroup(ch)) {
+              && isPunctuation(ch, chType) == isPunct
+              && characterDefinition.isGroup(ch)) {
             unknownWordLength++;
           } else {
             break;
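Hoisting the end-of-input test into the loop header is behavior-preserving only because the old body checked buffer.get(pos) == -1 before touching any other state; nothing executed between while (true) and the break. A reduced sketch of the transformation, with a stand-in buffer:

import java.util.function.IntUnaryOperator;

class LoopShapeDemo {
  // buffer stands in for the tokenizer's character buffer; -1 marks end of input.
  static int walk(IntUnaryOperator buffer) {
    int pos = 0;
    // Old shape:                                      New shape (same exit point):
    //   while (true) {                                  while (buffer.applyAsInt(pos) != -1) {
    //     if (buffer.applyAsInt(pos) == -1) { break; }    ...
    //     ...                                           }
    //   }
    while (buffer.applyAsInt(pos) != -1) {
      pos++;
    }
    return pos;
  }

  public static void main(String[] args) {
    int[] chars = {0xD55C, 0xAD6D, -1};
    System.out.println(walk(i -> chars[i])); // 2
  }
}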
BinaryDictionary.java

@@ -81,9 +81,8 @@ public abstract class BinaryDictionary implements Dictionary {
       this.resourcePath = resourcePath;
     }
     InputStream mapIS = null, dictIS = null, posIS = null;
-    int[] targetMapOffsets = null, targetMap = null;
-    ByteBuffer buffer = null;
-    boolean success = false;
+    int[] targetMapOffsets, targetMap;
+    ByteBuffer buffer;
     try {
       mapIS = getResource(TARGETMAP_FILENAME_SUFFIX);
       mapIS = new BufferedInputStream(mapIS);

@@ -132,13 +131,8 @@ public abstract class BinaryDictionary implements Dictionary {
       }
       dictIS.close(); dictIS = null;
       buffer = tmpBuffer.asReadOnlyBuffer();
-      success = true;
     } finally {
-      if (success) {
-        IOUtils.close(mapIS, posIS, dictIS);
-      } else {
-        IOUtils.closeWhileHandlingException(mapIS, posIS, dictIS);
-      }
+      IOUtils.closeWhileHandlingException(mapIS, posIS, dictIS);
     }
 
     this.targetMap = targetMap;

@@ -158,7 +152,7 @@ public abstract class BinaryDictionary implements Dictionary {
   }
 
   // util, reused by ConnectionCosts and CharacterDefinition
-  public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
+  public static InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
     final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
     if (is == null) {
       throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.', '/') + suffix);

@@ -236,7 +230,7 @@ public abstract class BinaryDictionary implements Dictionary {
     int offset = wordId + 6;
     boolean hasSinglePos = hasSinglePOS(wordId);
     if (hasSinglePos == false) {
-      offset ++; // skip rightPOS
+      offset++; // skip rightPOS
     }
     int length = buffer.get(offset++);
     if (length == 0) {

@@ -264,7 +258,7 @@ public abstract class BinaryDictionary implements Dictionary {
   private String readString(int offset) {
     int strOffset = offset;
     int len = buffer.get(strOffset++);
-    char text[] = new char[len];
+    char[] text = new char[len];
     for (int i = 0; i < len; i++) {
       text[i] = buffer.getChar(strOffset + (i<<1));
     }
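Dropping the success flag simplifies the finally block but shifts semantics slightly: Lucene's IOUtils.close rethrows a failure from close(), while IOUtils.closeWhileHandlingException suppresses it, so a close() failure after a successful load is now silent. The old flag existed precisely so that close() errors propagated on success yet could not mask the original exception on failure. A sketch of the two idioms over arbitrary Closeables:

import java.io.Closeable;
import java.io.IOException;
import org.apache.lucene.util.IOUtils;

class CloseDemo {
  static void loadLenient(Closeable in) throws IOException {
    try {
      // ... read the resource ...
    } finally {
      IOUtils.closeWhileHandlingException(in); // never masks, but swallows close() errors
    }
  }

  static void loadStrict(Closeable in) throws IOException {
    boolean success = false;
    try {
      // ... read the resource ...
      success = true;
    } finally {
      if (success) {
        IOUtils.close(in); // rethrows close() failures (the old success-path behavior)
      } else {
        IOUtils.closeWhileHandlingException(in);
      }
    }
  }
}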
CharacterDefinition.java

@@ -38,7 +38,7 @@ public final class CharacterDefinition {
 
   // only used internally for lookup:
   enum CharacterClass {
-    NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, HANGUL, HANJA, HANJANUMERIC;
+    NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, HANGUL, HANJA, HANJANUMERIC
   }
 
   private final byte[] characterCategoryMap = new byte[0x10000];

@@ -108,11 +108,7 @@ public final class CharacterDefinition {
   }
 
   public boolean hasCoda(char ch){
-    if (((ch - 0xAC00) % 0x001C) == 0) {
-      return false;
-    } else {
-      return true;
-    }
+    return ((ch - 0xAC00) % 0x001C) != 0;
   }
 
   public static byte lookupCharacterClass(String characterClassName) {
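The one-line hasCoda relies on how precomposed Hangul syllables are laid out: code point = 0xAC00 + (initial * 21 + medial) * 28 + final, where final index 0 means no trailing consonant, hence the modulus 0x001C (28). The formula only makes sense for characters already known to be Hangul syllables (U+AC00..U+D7A3). A standalone check of the same arithmetic:

class CodaDemo {
  static boolean hasCoda(char ch) {
    return ((ch - 0xAC00) % 28) != 0; // 28 = 0x001C, the number of final-consonant slots
  }

  public static void main(String[] args) {
    System.out.println(hasCoda('한')); // true: 하 + coda ㄴ
    System.out.println(hasCoda('하')); // false: open syllable
  }
}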
ConnectionCosts.java

@@ -40,7 +40,7 @@ public final class ConnectionCosts {
 
   private ConnectionCosts() throws IOException {
     InputStream is = null;
-    ByteBuffer buffer = null;
+    ByteBuffer buffer;
     boolean success = false;
     try {
       is = BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX);
TokenInfoFST.java

@@ -28,7 +28,7 @@ public final class TokenInfoFST {
   private final FST<Long> fst;
 
   private final int cacheCeiling;
-  private final FST.Arc<Long> rootCache[];
+  private final FST.Arc<Long>[] rootCache;
 
   public final Long NO_OUTPUT;
 

@@ -41,7 +41,7 @@ public final class TokenInfoFST {
 
   @SuppressWarnings({"rawtypes","unchecked"})
   private FST.Arc<Long>[] cacheRootArcs() throws IOException {
-    FST.Arc<Long> rootCache[] = new FST.Arc[1+(cacheCeiling-0xAC00)];
+    FST.Arc<Long>[] rootCache = new FST.Arc[1+(cacheCeiling-0xAC00)];
     FST.Arc<Long> firstArc = new FST.Arc<>();
     fst.getFirstArc(firstArc);
     FST.Arc<Long> arc = new FST.Arc<>();
UserDictionary.java

@@ -37,26 +37,26 @@ public final class UserDictionary implements Dictionary {
   // text -> wordID
   private final TokenInfoFST fst;
 
-  public static final int WORD_COST = -100000;
+  private static final int WORD_COST = -100000;
 
   // NNG left
-  public static final short LEFT_ID = 1781;
+  private static final short LEFT_ID = 1781;
 
   // NNG right
-  public static final short RIGHT_ID = 3533;
+  private static final short RIGHT_ID = 3533;
   // NNG right with hangul and a coda on the last char
-  public static final short RIGHT_ID_T = 3535;
+  private static final short RIGHT_ID_T = 3535;
   // NNG right with hangul and no coda on the last char
-  public static final short RIGHT_ID_F = 3534;
+  private static final short RIGHT_ID_F = 3534;
 
   // length, length... indexed by compound ID or null for simple noun
-  private final int segmentations[][];
+  private final int[][] segmentations;
   private final short[] rightIds;
 
   public static UserDictionary open(Reader reader) throws IOException {
 
     BufferedReader br = new BufferedReader(reader);
-    String line = null;
+    String line;
     List<String> entries = new ArrayList<>();
 
     // text + optional segmentations

@@ -127,7 +127,7 @@ public final class UserDictionary implements Dictionary {
       scratch.grow(token.length());
       scratch.setLength(token.length());
       for (int i = 0; i < token.length(); i++) {
-        scratch.setIntAt(i, (int) token.charAt(i));
+        scratch.setIntAt(i, token.charAt(i));
       }
       fstBuilder.add(scratch.get(), ord);
       lastToken = token;
BinaryDictionaryWriter.java

@@ -117,8 +117,8 @@ abstract class BinaryDictionaryWriter {
     boolean hasSinglePOS = (leftPOS == rightPOS);
     if (posType != POS.Type.MORPHEME && expression.length() > 0) {
       String[] exprTokens = expression.split("\\+");
-      for (int i = 0; i < exprTokens.length; i++) {
-        String[] tokenSplit = exprTokens[i].split("/");
+      for (String exprToken : exprTokens) {
+        String[] tokenSplit = exprToken.split("/");
         assert tokenSplit.length == 3;
         String surfaceForm = tokenSplit[0].trim();
         if (surfaceForm.isEmpty() == false) {
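The enhanced for is a safe swap here because the index was only ever used to read exprTokens[i]. A compact illustration with a hypothetical expression string in the three-field surface/tag/class format the assert implies:

class SplitDemo {
  public static void main(String[] args) {
    String expression = "세종/NNG/*+시/NNG/*"; // hypothetical; three '/'-separated fields per token
    for (String exprToken : expression.split("\\+")) {
      String[] tokenSplit = exprToken.split("/");
      System.out.println(tokenSplit[0] + " tagged " + tokenSplit[1]);
    }
  }
}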
CSVUtil.java

@@ -70,7 +70,7 @@ public final class CSVUtil {
       return new String[0];
     }
 
-    return result.toArray(new String[result.size()]);
+    return result.toArray(new String[0]);
   }
 
   private static String unQuoteUnEscape(String original) {

@@ -84,7 +84,7 @@ public final class CSVUtil {
       }
 
       // Unescape
-      if (result.indexOf(ESCAPED_QUOTE) >= 0) {
+      if (result.contains(ESCAPED_QUOTE)) {
         result = result.replace(ESCAPED_QUOTE, "\"");
       }
     }
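toArray(new String[0]) is the generally recommended form: on modern JVMs the zero-sized-array call is at least as fast as presizing, since the collection allocates the correctly sized array itself, and it reads shorter. Sketch:

import java.util.ArrayList;
import java.util.List;

class ToArrayDemo {
  public static void main(String[] args) {
    List<String> result = new ArrayList<>(List.of("a", "b", "c"));
    String[] presized = result.toArray(new String[result.size()]); // old form
    String[] zeroSized = result.toArray(new String[0]);            // new form
    System.out.println(presized.length == zeroSized.length); // true
  }
}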
TokenInfoDictionaryBuilder.java

@@ -114,7 +114,7 @@ class TokenInfoDictionaryBuilder {
       scratch.grow(surfaceForm.length());
       scratch.setLength(surfaceForm.length());
       for (int i = 0; i < surfaceForm.length(); i++) {
-        scratch.setIntAt(i, (int) surfaceForm.charAt(i));
+        scratch.setIntAt(i, surfaceForm.charAt(i));
       }
       fstBuilder.add(scratch.get(), ord);
     }
StringMockResourceLoader.java

@@ -26,7 +26,7 @@ import org.apache.lucene.analysis.util.ResourceLoader;
 
 /** Fake resource loader for tests: works if you want to fake reading a single file */
 class StringMockResourceLoader implements ResourceLoader {
-  String text;
+  private String text;
 
   public StringMockResourceLoader(String text) {
     this.text = text;
TestKoreanAnalyzer.java

@@ -18,9 +18,9 @@ package org.apache.lucene.analysis.ko;
 
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.HashSet;
 import java.util.Random;
 import java.util.Set;
-import java.util.stream.Collectors;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;

@@ -41,7 +41,7 @@ public class TestKoreanAnalyzer extends BaseTokenStreamTestCase {
   }
 
   public void testStopTags() throws IOException {
-    Set<POS.Tag> stopTags = Arrays.asList(POS.Tag.NNP, POS.Tag.NNG).stream().collect(Collectors.toSet());
+    Set<POS.Tag> stopTags = new HashSet<>(Arrays.asList(POS.Tag.NNP, POS.Tag.NNG));
     Analyzer a = new KoreanAnalyzer(null, KoreanTokenizer.DecompoundMode.DISCARD, stopTags, false);
     assertAnalyzesTo(a, "한국은 대단한 나라입니다.",
         new String[]{"은", "대단", "하", "ᆫ", "이", "ᄇ니다"},
TestKoreanNumberFilterFactory.java

@@ -50,12 +50,12 @@ public class TestKoreanNumberFilterFactory extends BaseTokenStreamTestCase {
   }
 
   /** Test that bogus arguments result in exception */
-  public void testBogusArguments() throws Exception {
-    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
-      new KoreanNumberFilterFactory(new HashMap<String,String>() {{
+  public void testBogusArguments() {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
+        new KoreanNumberFilterFactory(new HashMap<>() {{
           put("bogusArg", "bogusValue");
-    }});
-    });
+        }})
+    );
     assertTrue(expected.getMessage().contains("Unknown parameters"));
   }
 }
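Once the lambda body is a single constructor call, it can be an expression lambda, which removes the extra }); pair; the now-unthrown checked exception also lets the throws Exception clause go. Note that new HashMap<>() {{ ... }} is double-brace initialization: it defines an anonymous HashMap subclass whose instance initializer does the puts. A self-contained sketch of the same shape, with a hypothetical factory method:

import java.util.HashMap;
import java.util.Map;

class ExpectThrowsShapeDemo {
  // Hypothetical stand-in for a token filter factory constructor that rejects leftover args.
  static void newFactory(Map<String, String> args) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }

  public static void main(String[] args) {
    try {
      newFactory(new HashMap<>() {{ put("bogusArg", "bogusValue"); }});
    } catch (IllegalArgumentException expected) {
      System.out.println(expected.getMessage().contains("Unknown parameters")); // true
    }
  }
}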
TestKoreanPartOfSpeechStopFilterFactory.java

@@ -32,7 +32,7 @@ import org.apache.lucene.util.Version;
  */
 public class TestKoreanPartOfSpeechStopFilterFactory extends BaseTokenStreamTestCase {
   public void testStopTags() throws IOException {
-    KoreanTokenizerFactory tokenizerFactory = new KoreanTokenizerFactory(new HashMap<String,String>());
+    KoreanTokenizerFactory tokenizerFactory = new KoreanTokenizerFactory(new HashMap<>());
     tokenizerFactory.inform(new StringMockResourceLoader(""));
     TokenStream ts = tokenizerFactory.create();
     ((Tokenizer)ts).setReader(new StringReader(" 한국은 대단한 나라입니다."));

@@ -47,13 +47,13 @@ public class TestKoreanPartOfSpeechStopFilterFactory extends BaseTokenStreamTest
   }
 
   /** Test that bogus arguments result in exception */
-  public void testBogusArguments() throws Exception {
-    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
-      new KoreanPartOfSpeechStopFilterFactory(new HashMap<String,String>() {{
+  public void testBogusArguments() {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
+        new KoreanPartOfSpeechStopFilterFactory(new HashMap<>() {{
           put("luceneMatchVersion", Version.LATEST.toString());
           put("bogusArg", "bogusValue");
-    }});
-    });
+        }})
+    );
     assertTrue(expected.getMessage().contains("Unknown parameters"));
   }
 }
TestKoreanReadingFormFilterFactory.java

@@ -31,8 +31,8 @@ public class TestKoreanReadingFormFilterFactory extends BaseTokenStreamTestCase
   public void testReadings() throws IOException {
     KoreanTokenizerFactory tokenizerFactory = new KoreanTokenizerFactory(new HashMap<>());
     tokenizerFactory.inform(new StringMockResourceLoader(""));
-    TokenStream tokenStream = tokenizerFactory.create();
-    ((Tokenizer)tokenStream).setReader(new StringReader("丞相"));
+    Tokenizer tokenStream = tokenizerFactory.create();
+    tokenStream.setReader(new StringReader("丞相"));
     KoreanReadingFormFilterFactory filterFactory = new KoreanReadingFormFilterFactory(new HashMap<>());
     assertTokenStreamContents(filterFactory.create(tokenStream),
         new String[] { "승상" }

@@ -40,12 +40,12 @@ public class TestKoreanReadingFormFilterFactory extends BaseTokenStreamTestCase
   }
 
   /** Test that bogus arguments result in exception */
-  public void testBogusArguments() throws Exception {
-    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
-      new KoreanReadingFormFilterFactory(new HashMap<String,String>() {{
+  public void testBogusArguments() {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
+        new KoreanReadingFormFilterFactory(new HashMap<>() {{
          put("bogusArg", "bogusValue");
-    }});
-    });
+        }})
+    );
     assertTrue(expected.getMessage().contains("Unknown parameters"));
   }
 }
TestKoreanTokenizerFactory.java

@@ -33,8 +33,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
   public void testSimple() throws IOException {
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(Collections.emptyMap());
     factory.inform(new StringMockResourceLoader(""));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("안녕하세요"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("안녕하세요"));
     assertTokenStreamContents(ts,
         new String[] { "안녕", "하", "시", "어요" },
         new int[] { 0, 2, 3, 3 },

@@ -50,8 +50,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("decompoundMode", "discard");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("갠지스강"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("갠지스강"));
     assertTokenStreamContents(ts,
         new String[] { "갠지스", "강" }
     );

@@ -62,8 +62,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("decompoundMode", "none");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("갠지스강"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("갠지스강"));
     assertTokenStreamContents(ts,
         new String[] { "갠지스강" }
     );

@@ -74,8 +74,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("decompoundMode", "mixed");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("갠지스강"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("갠지스강"));
     assertTokenStreamContents(ts,
         new String[] { "갠지스강", "갠지스", "강" }
     );

@@ -94,8 +94,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("userDictionary", "userdict.txt");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(userDict));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("세종시"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("세종시"));
     assertTokenStreamContents(ts,
         new String[] { "세종", "시" }
     );

@@ -109,8 +109,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("discardPunctuation", "true");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("10.1 인치 모니터"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("10.1 인치 모니터"));
     assertTokenStreamContents(ts,
         new String[] { "10", "1", "인치", "모니터" }
     );

@@ -124,20 +124,20 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("discardPunctuation", "false");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("10.1 인치 모니터"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("10.1 인치 모니터"));
     assertTokenStreamContents(ts,
         new String[] { "10", ".", "1", " ", "인치", " ", "모니터" }
     );
   }
 
   /** Test that bogus arguments result in exception */
-  public void testBogusArguments() throws Exception {
-    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
-      new KoreanTokenizerFactory(new HashMap<String,String>() {{
+  public void testBogusArguments() {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
+        new KoreanTokenizerFactory(new HashMap<>() {{
          put("bogusArg", "bogusValue");
-    }});
-    });
+        }})
+    );
     assertTrue(expected.getMessage().contains("Unknown parameters"));
   }
 }
TokenInfoDictionaryTest.java

@@ -135,11 +135,11 @@ public class TokenInfoDictionaryTest extends LuceneTestCase {
       POS.Tag rightPOS = tid.getRightPOS(wordId);
 
       if (type == POS.Type.MORPHEME) {
-        assertTrue(leftPOS == rightPOS);
+        assertSame(leftPOS, rightPOS);
         String reading = tid.getReading(wordId);
         boolean isHanja = charDef.isHanja(surfaceForm.charAt(0));
         if (isHanja) {
-          assertTrue(reading != null);
+          assertNotNull(reading);
           for (int j = 0; j < reading.length(); j++) {
             assertTrue(charDef.isHangul(reading.charAt(j)));
           }

@@ -149,7 +149,7 @@ public class TokenInfoDictionaryTest extends LuceneTestCase {
         }
       } else {
         if (type == POS.Type.COMPOUND) {
-          assertTrue(leftPOS == rightPOS);
+          assertSame(leftPOS, rightPOS);
           assertTrue(leftPOS == POS.Tag.NNG || rightPOS == POS.Tag.NNP);
         }
         Dictionary.Morpheme[] decompound = tid.getMorphemes(wordId, chars, 0, chars.length);
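assertSame and assertNotNull check the same conditions as the old assertTrue calls but fail with self-describing messages instead of a bare AssertionError. For instance:

import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;

class AssertStyleDemo {
  enum Tag { NNG, NNP } // stand-ins for POS.Tag

  public static void main(String[] args) {
    Tag left = Tag.NNG, right = Tag.NNG;
    assertTrue(left == right); // a failure here reports only: java.lang.AssertionError
    assertSame(left, right);   // a failure here reports: expected same:<NNG> was not:<NNP>
    String reading = "승상";
    assertNotNull(reading);    // a failure here reports an explicit "expected not null"
  }
}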
UserDictionaryTest.java

@@ -41,7 +41,8 @@ public class UserDictionaryTest extends LuceneTestCase {
     assertNull(dictionary.getMorphemes(wordIds.get(0), sArray, 0, s.length()));
 
     Dictionary.Morpheme[] decompound = dictionary.getMorphemes(wordIds.get(1), sArray, 0, s.length());
-    assertTrue(decompound.length == 2);
+    assertNotNull(decompound);
+    assertEquals(2, decompound.length);
     assertEquals(decompound[0].posTag, POS.Tag.NNG);
     assertEquals(decompound[0].surfaceForm, "세종");
     assertEquals(decompound[1].posTag, POS.Tag.NNG);

@@ -55,7 +56,7 @@ public class UserDictionaryTest extends LuceneTestCase {
   }
 
   @Test
-  public void testRead() throws IOException {
+  public void testRead() {
     UserDictionary dictionary = TestKoreanTokenizer.readDict();
     assertNotNull(dictionary);
   }