LUCENE-8524: Add the Hangul Letter Araea (interpunct) as a separator in Nori's tokenizer.

This change also removes empty terms and trims surface forms in Nori's Korean dictionary.
Jim Ferenczi 2018-10-26 10:28:37 +02:00
parent f33be7a172
commit 6f291d402b
9 changed files with 44 additions and 21 deletions

lucene/CHANGES.txt

@@ -212,6 +212,9 @@ Bug fixes:
   in the graph if the slop is greater than 0. Span queries cannot be used in this case because
   they don't handle slop the same way as phrase queries. (Steve Rowe, Uwe Schindler, Jim Ferenczi)
 
+* LUCENE-8524: Add the Hangul Letter Araea (interpunct) as a separator in Nori's tokenizer.
+  This change also removes empty terms and trims surface forms in Nori's Korean dictionary. (Trey Jones, Jim Ferenczi)
+
 New Features
 
 * LUCENE-8496: Selective indexing - modify BKDReader/BKDWriter to allow users

KoreanTokenizer.java

@@ -932,6 +932,10 @@ public final class KoreanTokenizer extends Tokenizer {
   }
 
   private static boolean isPunctuation(char ch) {
+    // special case for Hangul Letter Araea (interpunct)
+    if (ch == 0x318D) {
+      return true;
+    }
     switch(Character.getType(ch)) {
       case Character.SPACE_SEPARATOR:
       case Character.LINE_SEPARATOR:
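
For context, here is a quick standalone check (not part of the commit) of why U+318D needs a special case: its Unicode general category is OTHER_LETTER, so none of the punctuation-related categories tested in the switch would ever match it.

public class AraeaCategoryCheck {
  public static void main(String[] args) {
    char araea = 0x318D; // 'ㆍ' HANGUL LETTER ARAEA
    // Prints "true": the interpunct is classified as a letter, not punctuation,
    // which is why isPunctuation() must special-case it before the switch.
    System.out.println(Character.getType(araea) == Character.OTHER_LETTER);
  }
}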

TestKoreanTokenizer.java

@@ -289,6 +289,14 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
     );
   }
 
+  public void testInterpunct() throws IOException {
+    assertAnalyzesTo(analyzer, "도로ㆍ지반ㆍ수자원ㆍ건설환경ㆍ건축ㆍ화재설비연구",
+        new String[]{"도로", "지반", "수자원", "건설", "환경", "건축", "화재", "설비", "연구"},
+        new int[]{0, 3, 6, 10, 12, 15, 18, 20, 22},
+        new int[]{2, 5, 9, 12, 14, 17, 20, 22, 24},
+        new int[]{1, 1, 1, 1, 1, 1, 1, 1, 1}
+    );
+  }
 
   /** blast some random strings through the tokenizer */
   public void testRandomStrings() throws Exception {
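
The same behavior can be observed from the public API. A minimal sketch, not from the commit, assuming lucene-analyzers-nori on the classpath and KoreanAnalyzer's default configuration:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ko.KoreanAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class InterpunctDemo {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new KoreanAnalyzer();
         TokenStream ts = analyzer.tokenStream("f", "도로ㆍ지반ㆍ수자원")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // With the fix, ㆍ acts as a separator; expected output:
        // 도로 [0,2)  지반 [3,5)  수자원 [6,9)
        System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
      }
      ts.end();
    }
  }
}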

TestTokenInfoDictionary.java

@@ -48,6 +48,8 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
         chars[i] = (char)input.ints[input.offset+i];
       }
       String surfaceForm = new String(chars);
+      assertFalse(surfaceForm.isEmpty());
+      assertEquals(surfaceForm.trim(), surfaceForm);
       assertTrue(UnicodeUtil.validUTF16String(surfaceForm));
       Long output = mapping.output;
@@ -96,6 +98,8 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
         int offset = 0;
         for (Dictionary.Morpheme morph : decompound) {
           assertTrue(UnicodeUtil.validUTF16String(morph.surfaceForm));
+          assertFalse(morph.surfaceForm.isEmpty());
+          assertEquals(morph.surfaceForm.trim(), morph.surfaceForm);
           if (type != POS.Type.INFLECT) {
             assertEquals(morph.surfaceForm, surfaceForm.substring(offset, offset + morph.surfaceForm.length()));
             offset += morph.surfaceForm.length();
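
The two pairs of new assertions encode a single invariant on dictionary entries. Expressed as a hypothetical helper (not in the commit): every surface form must be non-empty and carry no leading or trailing whitespace.

static boolean isCleanSurfaceForm(String s) {
  // Non-empty and already trimmed; matches the pair of assertions above.
  return s.isEmpty() == false && s.equals(s.trim());
}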

BinaryDictionaryWriter.java

@@ -26,6 +26,7 @@ import java.nio.channels.Channels;
 import java.nio.channels.WritableByteChannel;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;
 
 import org.apache.lucene.analysis.ko.POS;
 import org.apache.lucene.analysis.ko.dict.Dictionary;
@@ -109,23 +110,23 @@ public abstract class BinaryDictionaryWriter {
     assert existing == null || existing.equals(fullPOSData);
     posDict.set(leftId, fullPOSData);
 
-    final Dictionary.Morpheme[] morphemes;
+    final List<Dictionary.Morpheme> morphemes = new ArrayList<>();
     // true if the POS and decompounds of the token are all the same.
     boolean hasSinglePOS = (leftPOS == rightPOS);
     if (posType != POS.Type.MORPHEME && expression.length() > 0) {
       String[] exprTokens = expression.split("\\+");
-      morphemes = new Dictionary.Morpheme[exprTokens.length];
       for (int i = 0; i < exprTokens.length; i++) {
         String[] tokenSplit = exprTokens[i].split("\\/");
         assert tokenSplit.length == 3;
-        POS.Tag exprTag = POS.resolveTag(tokenSplit[1]);
-        morphemes[i] = new Dictionary.Morpheme(exprTag, tokenSplit[0]);
-        if (leftPOS != exprTag) {
-          hasSinglePOS = false;
+        String surfaceForm = tokenSplit[0].trim();
+        if (surfaceForm.isEmpty() == false) {
+          POS.Tag exprTag = POS.resolveTag(tokenSplit[1]);
+          morphemes.add(new Dictionary.Morpheme(exprTag, tokenSplit[0]));
+          if (leftPOS != exprTag) {
+            hasSinglePOS = false;
+          }
         }
       }
-    } else {
-      morphemes = new Dictionary.Morpheme[0];
     }
 
     int flags = 0;
@@ -151,17 +152,17 @@ public abstract class BinaryDictionaryWriter {
     if (hasSinglePOS == false) {
       buffer.put((byte) rightPOS.ordinal());
     }
-    buffer.put((byte) morphemes.length);
+    buffer.put((byte) morphemes.size());
     int compoundOffset = 0;
-    for (int i = 0; i < morphemes.length; i++) {
+    for (Dictionary.Morpheme morpheme : morphemes) {
       if (hasSinglePOS == false) {
-        buffer.put((byte) morphemes[i].posTag.ordinal());
+        buffer.put((byte) morpheme.posTag.ordinal());
       }
       if (posType != POS.Type.INFLECT) {
-        buffer.put((byte) morphemes[i].surfaceForm.length());
-        compoundOffset += morphemes[i].surfaceForm.length();
+        buffer.put((byte) morpheme.surfaceForm.length());
+        compoundOffset += morpheme.surfaceForm.length();
       } else {
-        writeString(morphemes[i].surfaceForm);
+        writeString(morpheme.surfaceForm);
       }
       assert compoundOffset <= entry[0].length() : Arrays.toString(entry);
     }
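
The switch from Dictionary.Morpheme[] to a List is forced by the new guard: once whitespace-only tokens in the expression can be skipped, the morpheme count is no longer exprTokens.length, so the array cannot be sized up front. A condensed, self-contained sketch of the parsing pattern (hypothetical sample data, simplified to String):

import java.util.ArrayList;
import java.util.List;

public class ExpressionParseSketch {
  public static void main(String[] args) {
    // Hypothetical compound expression in "surface/tag/extra" form joined by '+';
    // the middle token is whitespace-only and must be dropped.
    String expression = "건설/NNG/*+ /NNG/*+환경/NNG/*";
    List<String> surfaces = new ArrayList<>();
    for (String exprToken : expression.split("\\+")) {
      String[] tokenSplit = exprToken.split("\\/");
      String surfaceForm = tokenSplit[0].trim();
      if (surfaceForm.isEmpty() == false) { // the guard this commit adds
        surfaces.add(surfaceForm);
      }
    }
    System.out.println(surfaces); // [건설, 환경]
  }
}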

TokenInfoDictionaryBuilder.java

@@ -116,6 +116,10 @@ public class TokenInfoDictionaryBuilder {
       // build tokeninfo dictionary
       for (String[] entry : lines) {
+        String surfaceForm = entry[0].trim();
+        if (surfaceForm.isEmpty()) {
+          continue;
+        }
         int next = dictionary.put(entry);
         if(next == offset){
@@ -123,15 +127,14 @@ public class TokenInfoDictionaryBuilder {
           continue;
         }
 
-        String token = entry[0];
-        if (!token.equals(lastValue)) {
+        if (!surfaceForm.equals(lastValue)) {
           // new word to add to fst
           ord++;
-          lastValue = token;
-          scratch.grow(token.length());
-          scratch.setLength(token.length());
-          for (int i = 0; i < token.length(); i++) {
-            scratch.setIntAt(i, (int) token.charAt(i));
+          lastValue = surfaceForm;
+          scratch.grow(surfaceForm.length());
+          scratch.setLength(surfaceForm.length());
+          for (int i = 0; i < surfaceForm.length(); i++) {
+            scratch.setIntAt(i, (int) surfaceForm.charAt(i));
           }
           fstBuilder.add(scratch.get(), ord);
         }
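
For readers unfamiliar with the surrounding code: scratch is an IntsRefBuilder and fstBuilder maps each distinct surface form to an ordinal, so the new guard keeps empty and whitespace-only keys out of the FST. A minimal sketch of that mapping, assuming the org.apache.lucene.util.fst API of this era (sample words hypothetical):

import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class FstOrdSketch {
  public static void main(String[] args) throws Exception {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;
    // FST inputs must arrive in sorted order; the real builder gets this for
    // free because the dictionary lines are sorted by surface form.
    for (String surfaceForm : new String[] {"도로", "지반"}) {
      scratch.clear();
      for (int i = 0; i < surfaceForm.length(); i++) {
        scratch.append(surfaceForm.charAt(i));
      }
      fstBuilder.add(scratch.get(), ord++);
    }
    FST<Long> fst = fstBuilder.finish();
    // scratch still holds the last word ("지반"); looking it up returns its ord.
    System.out.println(Util.get(fst, scratch.get())); // prints 1
  }
}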