mirror of https://github.com/apache/lucene.git
LUCENE-8524: Add the Hangul Letter Araea (interpunct) as a separator in Nori's tokenizer.
This change also removes empty terms and trims surface forms in Nori's Korean dictionary.
This commit is contained in:
parent
f33be7a172
commit
6f291d402b
|
@ -212,6 +212,9 @@ Bug fixes:
|
|||
in the graph if the slop is greater than 0. Span queries cannot be used in this case because
|
||||
they don't handle slop the same way as phrase queries. (Steve Rowe, Uwe Schindler, Jim Ferenczi)
|
||||
|
||||
* LUCENE-8524: Add the Hangul Letter Araea (interpunct) as a separator in Nori's tokenizer.
|
||||
This change also removes empty terms and trims surface forms in Nori's Korean dictionary. (Trey Jones, Jim Ferenczi)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-8496: Selective indexing - modify BKDReader/BKDWriter to allow users
|
||||
|
|
|
@ -932,6 +932,10 @@ public final class KoreanTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
private static boolean isPunctuation(char ch) {
|
||||
// special case for Hangul Letter Araea (interpunct)
|
||||
if (ch == 0x318D) {
|
||||
return true;
|
||||
}
|
||||
switch(Character.getType(ch)) {
|
||||
case Character.SPACE_SEPARATOR:
|
||||
case Character.LINE_SEPARATOR:
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -289,6 +289,14 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
|||
);
|
||||
}
|
||||
|
||||
public void testInterpunct() throws IOException {
|
||||
assertAnalyzesTo(analyzer, "도로ㆍ지반ㆍ수자원ㆍ건설환경ㆍ건축ㆍ화재설비연구",
|
||||
new String[]{"도로", "지반", "수자원", "건설", "환경", "건축", "화재", "설비", "연구"},
|
||||
new int[]{0, 3, 6, 10, 12, 15, 18, 20, 22},
|
||||
new int[]{2, 5, 9, 12, 14, 17, 20, 22, 24},
|
||||
new int[]{1, 1, 1, 1, 1, 1, 1, 1, 1}
|
||||
);
|
||||
}
|
||||
|
||||
/** blast some random strings through the tokenizer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
|
|
|
@ -48,6 +48,8 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
|
|||
chars[i] = (char)input.ints[input.offset+i];
|
||||
}
|
||||
String surfaceForm = new String(chars);
|
||||
assertFalse(surfaceForm.isEmpty());
|
||||
assertEquals(surfaceForm.trim(), surfaceForm);
|
||||
assertTrue(UnicodeUtil.validUTF16String(surfaceForm));
|
||||
|
||||
Long output = mapping.output;
|
||||
|
@ -96,6 +98,8 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
|
|||
int offset = 0;
|
||||
for (Dictionary.Morpheme morph : decompound) {
|
||||
assertTrue(UnicodeUtil.validUTF16String(morph.surfaceForm));
|
||||
assertFalse(morph.surfaceForm.isEmpty());
|
||||
assertEquals(morph.surfaceForm.trim(), morph.surfaceForm);
|
||||
if (type != POS.Type.INFLECT) {
|
||||
assertEquals(morph.surfaceForm, surfaceForm.substring(offset, offset + morph.surfaceForm.length()));
|
||||
offset += morph.surfaceForm.length();
|
||||
|
|
|
@ -26,6 +26,7 @@ import java.nio.channels.Channels;
|
|||
import java.nio.channels.WritableByteChannel;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.ko.POS;
|
||||
import org.apache.lucene.analysis.ko.dict.Dictionary;
|
||||
|
@ -109,23 +110,23 @@ public abstract class BinaryDictionaryWriter {
|
|||
assert existing == null || existing.equals(fullPOSData);
|
||||
posDict.set(leftId, fullPOSData);
|
||||
|
||||
final Dictionary.Morpheme[] morphemes;
|
||||
final List<Dictionary.Morpheme> morphemes = new ArrayList<>();
|
||||
// true if the POS and decompounds of the token are all the same.
|
||||
boolean hasSinglePOS = (leftPOS == rightPOS);
|
||||
if (posType != POS.Type.MORPHEME && expression.length() > 0) {
|
||||
String[] exprTokens = expression.split("\\+");
|
||||
morphemes = new Dictionary.Morpheme[exprTokens.length];
|
||||
for (int i = 0; i < exprTokens.length; i++) {
|
||||
String[] tokenSplit = exprTokens[i].split("\\/");
|
||||
assert tokenSplit.length == 3;
|
||||
POS.Tag exprTag = POS.resolveTag(tokenSplit[1]);
|
||||
morphemes[i] = new Dictionary.Morpheme(exprTag, tokenSplit[0]);
|
||||
if (leftPOS != exprTag) {
|
||||
hasSinglePOS = false;
|
||||
String surfaceForm = tokenSplit[0].trim();
|
||||
if (surfaceForm.isEmpty() == false) {
|
||||
POS.Tag exprTag = POS.resolveTag(tokenSplit[1]);
|
||||
morphemes.add(new Dictionary.Morpheme(exprTag, tokenSplit[0]));
|
||||
if (leftPOS != exprTag) {
|
||||
hasSinglePOS = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
morphemes = new Dictionary.Morpheme[0];
|
||||
}
|
||||
|
||||
int flags = 0;
|
||||
|
@ -151,17 +152,17 @@ public abstract class BinaryDictionaryWriter {
|
|||
if (hasSinglePOS == false) {
|
||||
buffer.put((byte) rightPOS.ordinal());
|
||||
}
|
||||
buffer.put((byte) morphemes.length);
|
||||
buffer.put((byte) morphemes.size());
|
||||
int compoundOffset = 0;
|
||||
for (int i = 0; i < morphemes.length; i++) {
|
||||
for (Dictionary.Morpheme morpheme : morphemes) {
|
||||
if (hasSinglePOS == false) {
|
||||
buffer.put((byte) morphemes[i].posTag.ordinal());
|
||||
buffer.put((byte) morpheme.posTag.ordinal());
|
||||
}
|
||||
if (posType != POS.Type.INFLECT) {
|
||||
buffer.put((byte) morphemes[i].surfaceForm.length());
|
||||
compoundOffset += morphemes[i].surfaceForm.length();
|
||||
buffer.put((byte) morpheme.surfaceForm.length());
|
||||
compoundOffset += morpheme.surfaceForm.length();
|
||||
} else {
|
||||
writeString(morphemes[i].surfaceForm);
|
||||
writeString(morpheme.surfaceForm);
|
||||
}
|
||||
assert compoundOffset <= entry[0].length() : Arrays.toString(entry);
|
||||
}
|
||||
|
|
|
@ -116,6 +116,10 @@ public class TokenInfoDictionaryBuilder {
|
|||
|
||||
// build tokeninfo dictionary
|
||||
for (String[] entry : lines) {
|
||||
String surfaceForm = entry[0].trim();
|
||||
if (surfaceForm.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
int next = dictionary.put(entry);
|
||||
|
||||
if(next == offset){
|
||||
|
@ -123,15 +127,14 @@ public class TokenInfoDictionaryBuilder {
|
|||
continue;
|
||||
}
|
||||
|
||||
String token = entry[0];
|
||||
if (!token.equals(lastValue)) {
|
||||
if (!surfaceForm.equals(lastValue)) {
|
||||
// new word to add to fst
|
||||
ord++;
|
||||
lastValue = token;
|
||||
scratch.grow(token.length());
|
||||
scratch.setLength(token.length());
|
||||
for (int i = 0; i < token.length(); i++) {
|
||||
scratch.setIntAt(i, (int) token.charAt(i));
|
||||
lastValue = surfaceForm;
|
||||
scratch.grow(surfaceForm.length());
|
||||
scratch.setLength(surfaceForm.length());
|
||||
for (int i = 0; i < surfaceForm.length(); i++) {
|
||||
scratch.setIntAt(i, (int) surfaceForm.charAt(i));
|
||||
}
|
||||
fstBuilder.add(scratch.get(), ord);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue