LUCENE-8954: refactor Nori analyzer

Signed-off-by: Namgyu Kim <namgyu@apache.org>
Namgyu Kim 2019-08-27 03:09:03 +09:00 committed by GitHub
parent ff1e2fa658
commit a9607b2a88
21 changed files with 105 additions and 126 deletions

GraphvizFormatter.java

@@ -157,17 +157,14 @@ public class GraphvizFormatter {
   }
 
   private String formatHeader() {
-    StringBuilder sb = new StringBuilder();
-    sb.append("digraph viterbi {\n");
-    sb.append(" graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
-    //sb.append(" // A2 paper size\n");
-    //sb.append(" size = \"34.4,16.5\";\n");
-    //sb.append(" // try to fill paper\n");
-    //sb.append(" ratio = fill;\n");
-    sb.append(" edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
-    sb.append(" node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
-    return sb.toString();
+    return "digraph viterbi {\n" +
+        " graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n" +
+        //sb.append(" // A2 paper size\n");
+        //sb.append(" size = \"34.4,16.5\";\n");
+        //sb.append(" // try to fill paper\n");
+        //sb.append(" ratio = fill;\n");
+        " edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n" +
+        " node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n";
   }
 
   private String formatTrailer() {
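A quick note on the formatHeader() change: for a one-shot build like this, an explicit StringBuilder and a chain of + concatenations are interchangeable, since the compiler lowers the + chain to an equivalent string-building sequence; the rewrite only removes boilerplate. A minimal sketch (the name variable is illustrative):

    String name = "viterbi";
    // Explicit builder...
    StringBuilder sb = new StringBuilder();
    sb.append("digraph ").append(name).append(" {\n").append("}\n");
    String viaBuilder = sb.toString();
    // ...and the + chain the refactoring prefers; both yield the same string.
    String viaConcat = "digraph " + name + " {\n" + "}\n";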

KoreanAnalyzer.java

@@ -74,7 +74,6 @@ public class KoreanAnalyzer extends Analyzer {
 
   @Override
   protected TokenStream normalize(String fieldName, TokenStream in) {
-    TokenStream result = new LowerCaseFilter(in);
-    return result;
+    return new LowerCaseFilter(in);
   }
 }

KoreanNumberFilter.java

@@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ko;
 
 import java.io.IOException;
 import java.math.BigDecimal;
+import java.util.Arrays;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -98,9 +99,7 @@ public class KoreanNumberFilter extends TokenFilter {
 
   static {
     numerals = new char[0x10000];
-    for (int i = 0; i < numerals.length; i++) {
-      numerals[i] = NO_NUMERAL;
-    }
+    Arrays.fill(numerals, NO_NUMERAL);
     numerals['영'] = 0; // U+C601 0
     numerals['일'] = 1; // U+C77C 1
     numerals['이'] = 2; // U+C774 2
@@ -113,9 +112,7 @@ public class KoreanNumberFilter extends TokenFilter {
     numerals['구'] = 9; // U+AD6C 9
 
     exponents = new char[0x10000];
-    for (int i = 0; i < exponents.length; i++) {
-      exponents[i] = 0;
-    }
+    Arrays.fill(exponents, (char) 0);
     exponents['십'] = 1; // U+C2ED 10
     exponents['백'] = 2; // U+BC31 100
     exponents['천'] = 3; // U+CC9C 1,000
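For reference, java.util.Arrays.fill is the library form of the removed loop; the two are equivalent when filling an entire array. A minimal sketch (the table name is illustrative):

    // assumes: import java.util.Arrays;
    char[] table = new char[0x10000];
    // Before: explicit loop
    for (int i = 0; i < table.length; i++) {
      table[i] = '\u0000';
    }
    // After: one call, same effect
    Arrays.fill(table, '\u0000');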

KoreanPartOfSpeechStopFilter.java

@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ko;
 
 import java.util.Arrays;
+import java.util.HashSet;
 import java.util.Set;
 import java.util.stream.Collectors;
 
@@ -36,7 +37,7 @@ public final class KoreanPartOfSpeechStopFilter extends FilteringTokenFilter {
   /**
    * Default list of tags to filter.
    */
-  public static final Set<POS.Tag> DEFAULT_STOP_TAGS = Arrays.asList(
+  public static final Set<POS.Tag> DEFAULT_STOP_TAGS = new HashSet<>(Arrays.asList(
       POS.Tag.E,
       POS.Tag.IC,
       POS.Tag.J,
@@ -55,7 +56,7 @@ public final class KoreanPartOfSpeechStopFilter extends FilteringTokenFilter {
       POS.Tag.UNA,
       POS.Tag.NA,
       POS.Tag.VSV
-  ).stream().collect(Collectors.toSet());
+  ));
 
   /**
    * Create a new {@link KoreanPartOfSpeechStopFilter} with the default
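The same simplification recurs in the test hunks further down: materializing a fixed set by streaming a list is indirect, while the HashSet copy constructor does it in one step. A minimal sketch (the tag strings are illustrative):

    // assumes: import java.util.*; import java.util.stream.Collectors;
    // Before: list -> stream -> collector -> set
    Set<String> viaStream = Arrays.asList("E", "IC", "J").stream().collect(Collectors.toSet());
    // After: direct copy constructor, same contents
    Set<String> viaCtor = new HashSet<>(Arrays.asList("E", "IC", "J"));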

KoreanTokenizer.java

@@ -340,7 +340,7 @@ public final class KoreanTokenizer extends Tokenizer {
   }
 
-  private void add(Dictionary dict, Position fromPosData, int wordPos, int endPos, int wordID, Type type) throws IOException {
+  private void add(Dictionary dict, Position fromPosData, int wordPos, int endPos, int wordID, Type type) {
     final POS.Tag leftPOS = dict.getLeftPOS(wordID);
     final int wordCost = dict.getWordCost(wordID);
     final int leftID = dict.getLeftId(wordID);
@@ -533,15 +533,9 @@ public final class KoreanTokenizer extends Tokenizer {
     int userWordMaxPosAhead = -1;
 
     // Advances over each position (character):
-    while (true) {
-      if (buffer.get(pos) == -1) {
-        // End
-        break;
-      }
+    while (buffer.get(pos) != -1) {
       final Position posData = positions.get(pos);
-      final boolean isFrontier = positions.getNextPos() == pos+1;
+      final boolean isFrontier = positions.getNextPos() == pos + 1;
 
       if (posData.count == 0) {
         // No arcs arrive here; move to next position:
@@ -585,9 +579,9 @@ public final class KoreanTokenizer extends Tokenizer {
         int leastIDX = -1;
         int leastCost = Integer.MAX_VALUE;
         Position leastPosData = null;
-        for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
+        for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
           final Position posData2 = positions.get(pos2);
-          for(int idx=0;idx<posData2.count;idx++) {
+          for (int idx = 0; idx < posData2.count; idx++) {
             //System.out.println("    idx=" + idx + " cost=" + cost);
             final int cost = posData2.costs[idx];
             if (cost < leastCost) {
@@ -602,7 +596,7 @@ public final class KoreanTokenizer extends Tokenizer {
         assert leastIDX != -1;
 
         // Second pass: prune all but the best path:
-        for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
+        for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
           final Position posData2 = positions.get(pos2);
           if (posData2 != leastPosData) {
             posData2.reset();
@@ -655,7 +649,7 @@ public final class KoreanTokenizer extends Tokenizer {
       if (Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
         int nextChar = buffer.get(++pos);
         while (nextChar != -1 && Character.getType(nextChar) == Character.SPACE_SEPARATOR) {
-          pos ++;
+          pos++;
           nextChar = buffer.get(pos);
         }
       }
@@ -673,7 +667,7 @@ public final class KoreanTokenizer extends Tokenizer {
         int outputMaxPosAhead = 0;
         int arcFinalOutMaxPosAhead = 0;
 
-        for(int posAhead=pos;;posAhead++) {
+        for (int posAhead = pos; ; posAhead++) {
           final int ch = buffer.get(posAhead);
           if (ch == -1) {
             break;
@@ -695,9 +689,9 @@ public final class KoreanTokenizer extends Tokenizer {
           if (VERBOSE) {
             System.out.println("    USER word " + new String(buffer.get(pos, maxPosAhead + 1)) + " toPos=" + (maxPosAhead + 1));
           }
-          add(userDictionary, posData, pos, maxPosAhead+1, outputMaxPosAhead+arcFinalOutMaxPosAhead, Type.USER);
+          add(userDictionary, posData, pos, maxPosAhead + 1, outputMaxPosAhead + arcFinalOutMaxPosAhead, Type.USER);
           userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
         }
       }
 
       // TODO: we can be more aggressive about user
@@ -709,7 +703,7 @@ public final class KoreanTokenizer extends Tokenizer {
         fst.getFirstArc(arc);
         int output = 0;
-        for(int posAhead=pos;;posAhead++) {
+        for (int posAhead = pos; ; posAhead++) {
           final int ch = buffer.get(posAhead);
           if (ch == -1) {
             break;
@@ -734,7 +728,7 @@ public final class KoreanTokenizer extends Tokenizer {
             System.out.println("    KNOWN word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.length + " wordIDs");
           }
           for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
-            add(dictionary, posData, pos, posAhead+1, wordIdRef.ints[wordIdRef.offset + ofs], Type.KNOWN);
+            add(dictionary, posData, pos, posAhead + 1, wordIdRef.ints[wordIdRef.offset + ofs], Type.KNOWN);
             anyMatches = true;
           }
         }
@@ -758,7 +752,7 @@ public final class KoreanTokenizer extends Tokenizer {
       } else {
         // Extract unknown word. Characters with the same script are considered to be part of unknown word
         unknownWordLength = 1;
-        UnicodeScript scriptCode = UnicodeScript.of((int) firstCharacter);
+        UnicodeScript scriptCode = UnicodeScript.of(firstCharacter);
         final boolean isPunct = isPunctuation(firstCharacter);
         for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
           int next = buffer.get(posAhead);
@@ -774,8 +768,8 @@ public final class KoreanTokenizer extends Tokenizer {
               || chType == Character.NON_SPACING_MARK;
           if (sameScript
               && isPunctuation(ch, chType) == isPunct
-            && characterDefinition.isGroup(ch)) {
+              && characterDefinition.isGroup(ch)) {
             unknownWordLength++;
           } else {
             break;
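One detail worth calling out from the hunk above: buffer.get(pos) yields an int code point, and Character.UnicodeScript.of(int) takes exactly that, so the dropped (int) cast was pure noise. A minimal sketch:

    int cp = "강".codePointAt(0);  // 0xAC15, a Hangul syllable
    // No cast needed; of(int) expects a code point and cp is already an int.
    Character.UnicodeScript script = Character.UnicodeScript.of(cp);
    assert script == Character.UnicodeScript.HANGUL;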

BinaryDictionary.java

@@ -81,9 +81,8 @@ public abstract class BinaryDictionary implements Dictionary {
       this.resourcePath = resourcePath;
     }
     InputStream mapIS = null, dictIS = null, posIS = null;
-    int[] targetMapOffsets = null, targetMap = null;
-    ByteBuffer buffer = null;
-    boolean success = false;
+    int[] targetMapOffsets, targetMap;
+    ByteBuffer buffer;
     try {
       mapIS = getResource(TARGETMAP_FILENAME_SUFFIX);
       mapIS = new BufferedInputStream(mapIS);
@@ -132,13 +131,8 @@ public abstract class BinaryDictionary implements Dictionary {
       }
       dictIS.close(); dictIS = null;
       buffer = tmpBuffer.asReadOnlyBuffer();
-      success = true;
     } finally {
-      if (success) {
-        IOUtils.close(mapIS, posIS, dictIS);
-      } else {
-        IOUtils.closeWhileHandlingException(mapIS, posIS, dictIS);
-      }
+      IOUtils.closeWhileHandlingException(mapIS, posIS, dictIS);
     }
 
     this.targetMap = targetMap;
@@ -158,7 +152,7 @@ public abstract class BinaryDictionary implements Dictionary {
   }
 
   // util, reused by ConnectionCosts and CharacterDefinition
-  public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
+  public static InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
     final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
     if (is == null) {
       throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.', '/') + suffix);
@@ -236,7 +230,7 @@ public abstract class BinaryDictionary implements Dictionary {
     int offset = wordId + 6;
     boolean hasSinglePos = hasSinglePOS(wordId);
     if (hasSinglePos == false) {
-      offset ++; // skip rightPOS
+      offset++; // skip rightPOS
     }
     int length = buffer.get(offset++);
     if (length == 0) {
@@ -264,7 +258,7 @@ public abstract class BinaryDictionary implements Dictionary {
   private String readString(int offset) {
     int strOffset = offset;
     int len = buffer.get(strOffset++);
-    char text[] = new char[len];
+    char[] text = new char[len];
     for (int i = 0; i < len; i++) {
       text[i] = buffer.getChar(strOffset + (i<<1));
     }
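On the finally-block change: Lucene's IOUtils offers two bulk-close helpers, close(...), which rethrows the first close-time exception, and closeWhileHandlingException(...), which suppresses them. Dropping the success flag means close failures are now always suppressed; that is harmless here because the streams have already been read in full by the time the finally block runs. A minimal sketch of the pattern (the file names are hypothetical):

    // assumes: import java.io.InputStream; import java.nio.file.Files;
    //          import java.nio.file.Paths; import org.apache.lucene.util.IOUtils;
    InputStream a = null, b = null;
    try {
      a = Files.newInputStream(Paths.get("map.dat"));   // hypothetical resource files
      b = Files.newInputStream(Paths.get("dict.dat"));
      // ... read both streams fully ...
    } finally {
      // Closes every argument even if one close() throws; close-time exceptions are suppressed.
      IOUtils.closeWhileHandlingException(a, b);
    }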

CharacterDefinition.java

@@ -38,7 +38,7 @@ public final class CharacterDefinition {
 
   // only used internally for lookup:
   enum CharacterClass {
-    NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, HANGUL, HANJA, HANJANUMERIC;
+    NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, HANGUL, HANJA, HANJANUMERIC
   }
 
   private final byte[] characterCategoryMap = new byte[0x10000];
@@ -108,11 +108,7 @@ public final class CharacterDefinition {
   }
 
   public boolean hasCoda(char ch){
-    if (((ch - 0xAC00) % 0x001C) == 0) {
-      return false;
-    } else {
-      return true;
-    }
+    return ((ch - 0xAC00) % 0x001C) != 0;
   }
 
   public static byte lookupCharacterClass(String characterClassName) {
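The simplified hasCoda works because precomposed Hangul syllables are laid out as codePoint = 0xAC00 + (initial*21 + medial)*28 + finalIndex, where finalIndex 0 means no final consonant; so (ch - 0xAC00) % 28 is nonzero exactly when a coda is present. A worked example:

    // '가' U+AC00: (0xAC00 - 0xAC00) % 28 == 0  -> no coda
    // '강' U+AC15: (0xAC15 - 0xAC00) % 28 == 21 -> coda present (final consonant ㅇ)
    assert (('가' - 0xAC00) % 0x001C) == 0;
    assert (('강' - 0xAC00) % 0x001C) != 0;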

ConnectionCosts.java

@@ -40,7 +40,7 @@ public final class ConnectionCosts {
 
   private ConnectionCosts() throws IOException {
     InputStream is = null;
-    ByteBuffer buffer = null;
+    ByteBuffer buffer;
     boolean success = false;
     try {
       is = BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX);

TokenInfoFST.java

@@ -28,7 +28,7 @@ public final class TokenInfoFST {
   private final FST<Long> fst;
 
   private final int cacheCeiling;
-  private final FST.Arc<Long> rootCache[];
+  private final FST.Arc<Long>[] rootCache;
 
   public final Long NO_OUTPUT;
 
@@ -41,7 +41,7 @@ public final class TokenInfoFST {
   @SuppressWarnings({"rawtypes","unchecked"})
   private FST.Arc<Long>[] cacheRootArcs() throws IOException {
-    FST.Arc<Long> rootCache[] = new FST.Arc[1+(cacheCeiling-0xAC00)];
+    FST.Arc<Long>[] rootCache = new FST.Arc[1+(cacheCeiling-0xAC00)];
     FST.Arc<Long> firstArc = new FST.Arc<>();
     fst.getFirstArc(firstArc);
     FST.Arc<Long> arc = new FST.Arc<>();

UserDictionary.java

@@ -37,26 +37,26 @@ public final class UserDictionary implements Dictionary {
   // text -> wordID
   private final TokenInfoFST fst;
 
-  public static final int WORD_COST = -100000;
+  private static final int WORD_COST = -100000;
 
   // NNG left
-  public static final short LEFT_ID = 1781;
+  private static final short LEFT_ID = 1781;
 
   // NNG right
-  public static final short RIGHT_ID = 3533;
+  private static final short RIGHT_ID = 3533;
 
   // NNG right with hangul and a coda on the last char
-  public static final short RIGHT_ID_T = 3535;
+  private static final short RIGHT_ID_T = 3535;
 
   // NNG right with hangul and no coda on the last char
-  public static final short RIGHT_ID_F = 3534;
+  private static final short RIGHT_ID_F = 3534;
 
   // length, length... indexed by compound ID or null for simple noun
-  private final int segmentations[][];
+  private final int[][] segmentations;
   private final short[] rightIds;
 
   public static UserDictionary open(Reader reader) throws IOException {
 
     BufferedReader br = new BufferedReader(reader);
-    String line = null;
+    String line;
     List<String> entries = new ArrayList<>();
 
     // text + optional segmentations
@@ -127,7 +127,7 @@ public final class UserDictionary implements Dictionary {
       scratch.grow(token.length());
       scratch.setLength(token.length());
       for (int i = 0; i < token.length(); i++) {
-        scratch.setIntAt(i, (int) token.charAt(i));
+        scratch.setIntAt(i, token.charAt(i));
       }
       fstBuilder.add(scratch.get(), ord);
       lastToken = token;
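For context, UserDictionary.open(Reader) parses one entry per line: a surface form optionally followed by its compound segments. Narrowing the ID constants to private is safe because callers only go through open(). A minimal usage sketch (the entry mirrors the userdict.txt used by the tests in this commit; treating it as the general format is an assumption):

    // assumes: import java.io.StringReader;
    //          import org.apache.lucene.analysis.ko.dict.UserDictionary;
    // A compound-noun entry: surface form, then its segments.
    UserDictionary dict = UserDictionary.open(new StringReader("세종시 세종 시\n"));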

BinaryDictionaryWriter.java

@@ -117,8 +117,8 @@ abstract class BinaryDictionaryWriter {
     boolean hasSinglePOS = (leftPOS == rightPOS);
     if (posType != POS.Type.MORPHEME && expression.length() > 0) {
       String[] exprTokens = expression.split("\\+");
-      for (int i = 0; i < exprTokens.length; i++) {
-        String[] tokenSplit = exprTokens[i].split("/");
+      for (String exprToken : exprTokens) {
+        String[] tokenSplit = exprToken.split("/");
         assert tokenSplit.length == 3;
         String surfaceForm = tokenSplit[0].trim();
         if (surfaceForm.isEmpty() == false) {

CSVUtil.java

@@ -70,7 +70,7 @@ public final class CSVUtil {
       return new String[0];
     }
 
-    return result.toArray(new String[result.size()]);
+    return result.toArray(new String[0]);
   }
 
   private static String unQuoteUnEscape(String original) {
@@ -84,7 +84,7 @@ public final class CSVUtil {
       }
 
       // Unescape
-      if (result.indexOf(ESCAPED_QUOTE) >= 0) {
+      if (result.contains(ESCAPED_QUOTE)) {
         result = result.replace(ESCAPED_QUOTE, "\"");
       }
     }

TokenInfoDictionaryBuilder.java

@@ -114,7 +114,7 @@ class TokenInfoDictionaryBuilder {
       scratch.grow(surfaceForm.length());
       scratch.setLength(surfaceForm.length());
       for (int i = 0; i < surfaceForm.length(); i++) {
-        scratch.setIntAt(i, (int) surfaceForm.charAt(i));
+        scratch.setIntAt(i, surfaceForm.charAt(i));
       }
       fstBuilder.add(scratch.get(), ord);
     }

StringMockResourceLoader.java

@@ -26,7 +26,7 @@ import org.apache.lucene.analysis.util.ResourceLoader;
 
 /** Fake resource loader for tests: works if you want to fake reading a single file */
 class StringMockResourceLoader implements ResourceLoader {
-  String text;
+  private String text;
 
   public StringMockResourceLoader(String text) {
     this.text = text;

TestKoreanAnalyzer.java

@@ -18,9 +18,9 @@ package org.apache.lucene.analysis.ko;
 
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.HashSet;
 import java.util.Random;
 import java.util.Set;
-import java.util.stream.Collectors;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -41,7 +41,7 @@ public class TestKoreanAnalyzer extends BaseTokenStreamTestCase {
   }
 
   public void testStopTags() throws IOException {
-    Set<POS.Tag> stopTags = Arrays.asList(POS.Tag.NNP, POS.Tag.NNG).stream().collect(Collectors.toSet());
+    Set<POS.Tag> stopTags = new HashSet<>(Arrays.asList(POS.Tag.NNP, POS.Tag.NNG));
     Analyzer a = new KoreanAnalyzer(null, KoreanTokenizer.DecompoundMode.DISCARD, stopTags, false);
     assertAnalyzesTo(a, "한국은 대단한 나라입니다.",
         new String[]{"은", "대단", "하", "ᆫ", "이", "ᄇ니다"},

TestKoreanNumberFilterFactory.java

@@ -50,12 +50,12 @@ public class TestKoreanNumberFilterFactory extends BaseTokenStreamTestCase {
   }
 
   /** Test that bogus arguments result in exception */
-  public void testBogusArguments() throws Exception {
-    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
-      new KoreanNumberFilterFactory(new HashMap<String,String>() {{
-        put("bogusArg", "bogusValue");
-      }});
-    });
+  public void testBogusArguments() {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
+        new KoreanNumberFilterFactory(new HashMap<>() {{
+          put("bogusArg", "bogusValue");
+        }})
+    );
     assertTrue(expected.getMessage().contains("Unknown parameters"));
   }
 }
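A note on the new HashMap<>() {{ ... }} shape these test hunks settle on: it is double-brace initialization, an anonymous HashMap subclass whose instance initializer calls put(), and the bare diamond on an anonymous class is legal from Java 9 onward, which is what lets the explicit <String,String> go away. A minimal sketch:

    // assumes: import java.util.HashMap; import java.util.Map;
    // Anonymous HashMap subclass; the inner braces are an instance initializer.
    Map<String, String> args = new HashMap<>() {{
      put("bogusArg", "bogusValue");
    }};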

TestKoreanPartOfSpeechStopFilterFactory.java

@@ -32,7 +32,7 @@ import org.apache.lucene.util.Version;
  */
 public class TestKoreanPartOfSpeechStopFilterFactory extends BaseTokenStreamTestCase {
   public void testStopTags() throws IOException {
-    KoreanTokenizerFactory tokenizerFactory = new KoreanTokenizerFactory(new HashMap<String,String>());
+    KoreanTokenizerFactory tokenizerFactory = new KoreanTokenizerFactory(new HashMap<>());
     tokenizerFactory.inform(new StringMockResourceLoader(""));
     TokenStream ts = tokenizerFactory.create();
     ((Tokenizer)ts).setReader(new StringReader(" 한국은 대단한 나라입니다."));
@@ -47,13 +47,13 @@ public class TestKoreanPartOfSpeechStopFilterFactory extends BaseTokenStreamTestCase {
   }
 
   /** Test that bogus arguments result in exception */
-  public void testBogusArguments() throws Exception {
-    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
-      new KoreanPartOfSpeechStopFilterFactory(new HashMap<String,String>() {{
-        put("luceneMatchVersion", Version.LATEST.toString());
-        put("bogusArg", "bogusValue");
-      }});
-    });
+  public void testBogusArguments() {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
+        new KoreanPartOfSpeechStopFilterFactory(new HashMap<>() {{
+          put("luceneMatchVersion", Version.LATEST.toString());
+          put("bogusArg", "bogusValue");
+        }})
+    );
     assertTrue(expected.getMessage().contains("Unknown parameters"));
   }
 }

TestKoreanReadingFormFilterFactory.java

@@ -31,8 +31,8 @@ public class TestKoreanReadingFormFilterFactory extends BaseTokenStreamTestCase
   public void testReadings() throws IOException {
     KoreanTokenizerFactory tokenizerFactory = new KoreanTokenizerFactory(new HashMap<>());
     tokenizerFactory.inform(new StringMockResourceLoader(""));
-    TokenStream tokenStream = tokenizerFactory.create();
-    ((Tokenizer)tokenStream).setReader(new StringReader("丞相"));
+    Tokenizer tokenStream = tokenizerFactory.create();
+    tokenStream.setReader(new StringReader("丞相"));
     KoreanReadingFormFilterFactory filterFactory = new KoreanReadingFormFilterFactory(new HashMap<>());
     assertTokenStreamContents(filterFactory.create(tokenStream),
         new String[] { "승상" }
@@ -40,12 +40,12 @@ public class TestKoreanReadingFormFilterFactory extends BaseTokenStreamTestCase
   }
 
   /** Test that bogus arguments result in exception */
-  public void testBogusArguments() throws Exception {
-    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
-      new KoreanReadingFormFilterFactory(new HashMap<String,String>() {{
-        put("bogusArg", "bogusValue");
-      }});
-    });
+  public void testBogusArguments() {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
+        new KoreanReadingFormFilterFactory(new HashMap<>() {{
+          put("bogusArg", "bogusValue");
+        }})
+    );
     assertTrue(expected.getMessage().contains("Unknown parameters"));
   }
 }

TestKoreanTokenizerFactory.java

@@ -33,8 +33,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
   public void testSimple() throws IOException {
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(Collections.emptyMap());
     factory.inform(new StringMockResourceLoader(""));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("안녕하세요"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("안녕하세요"));
     assertTokenStreamContents(ts,
         new String[] { "안녕", "하", "시", "어요" },
         new int[] { 0, 2, 3, 3 },
@@ -50,8 +50,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("decompoundMode", "discard");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("갠지스강"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("갠지스강"));
     assertTokenStreamContents(ts,
         new String[] { "갠지스", "강" }
     );
@@ -62,8 +62,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("decompoundMode", "none");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("갠지스강"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("갠지스강"));
     assertTokenStreamContents(ts,
         new String[] { "갠지스강" }
     );
@@ -74,8 +74,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("decompoundMode", "mixed");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("갠지스강"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("갠지스강"));
     assertTokenStreamContents(ts,
         new String[] { "갠지스강", "갠지스", "강" }
     );
@@ -94,8 +94,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("userDictionary", "userdict.txt");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(userDict));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("세종시"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("세종시"));
     assertTokenStreamContents(ts,
         new String[] { "세종", "시" }
     );
@@ -109,8 +109,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("discardPunctuation", "true");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("10.1 인치 모니터"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("10.1 인치 모니터"));
     assertTokenStreamContents(ts,
         new String[] { "10", "1", "인치", "모니터" }
     );
@@ -124,20 +124,20 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("discardPunctuation", "false");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    TokenStream ts = factory.create(newAttributeFactory());
-    ((Tokenizer)ts).setReader(new StringReader("10.1 인치 모니터"));
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader("10.1 인치 모니터"));
     assertTokenStreamContents(ts,
         new String[] { "10", ".", "1", " ", "인치", " ", "모니터" }
     );
   }
 
   /** Test that bogus arguments result in exception */
-  public void testBogusArguments() throws Exception {
-    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
-      new KoreanTokenizerFactory(new HashMap<String,String>() {{
-        put("bogusArg", "bogusValue");
-      }});
-    });
+  public void testBogusArguments() {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
+        new KoreanTokenizerFactory(new HashMap<>() {{
+          put("bogusArg", "bogusValue");
+        }})
+    );
     assertTrue(expected.getMessage().contains("Unknown parameters"));
   }
 }

TokenInfoDictionaryTest.java

@@ -135,11 +135,11 @@ public class TokenInfoDictionaryTest extends LuceneTestCase {
       POS.Tag rightPOS = tid.getRightPOS(wordId);
 
       if (type == POS.Type.MORPHEME) {
-        assertTrue(leftPOS == rightPOS);
+        assertSame(leftPOS, rightPOS);
         String reading = tid.getReading(wordId);
         boolean isHanja = charDef.isHanja(surfaceForm.charAt(0));
         if (isHanja) {
-          assertTrue(reading != null);
+          assertNotNull(reading);
           for (int j = 0; j < reading.length(); j++) {
             assertTrue(charDef.isHangul(reading.charAt(j)));
           }
@@ -149,7 +149,7 @@ public class TokenInfoDictionaryTest extends LuceneTestCase {
         }
       } else {
         if (type == POS.Type.COMPOUND) {
-          assertTrue(leftPOS == rightPOS);
+          assertSame(leftPOS, rightPOS);
           assertTrue(leftPOS == POS.Tag.NNG || rightPOS == POS.Tag.NNP);
         }
         Dictionary.Morpheme[] decompound = tid.getMorphemes(wordId, chars, 0, chars.length);

UserDictionaryTest.java

@@ -41,7 +41,8 @@ public class UserDictionaryTest extends LuceneTestCase {
     assertNull(dictionary.getMorphemes(wordIds.get(0), sArray, 0, s.length()));
 
     Dictionary.Morpheme[] decompound = dictionary.getMorphemes(wordIds.get(1), sArray, 0, s.length());
-    assertTrue(decompound.length == 2);
+    assertNotNull(decompound);
+    assertEquals(2, decompound.length);
     assertEquals(decompound[0].posTag, POS.Tag.NNG);
     assertEquals(decompound[0].surfaceForm, "세종");
     assertEquals(decompound[1].posTag, POS.Tag.NNG);
@@ -55,7 +56,7 @@ public class UserDictionaryTest extends LuceneTestCase {
   }
 
   @Test
-  public void testRead() throws IOException {
+  public void testRead() {
     UserDictionary dictionary = TestKoreanTokenizer.readDict();
     assertNotNull(dictionary);
   }