LUCENE-9581: Japanese tokenizer should discard the compound token instead of disabling the decomposition of long tokens when discardCompoundToken is activated.

jimczi 2020-11-23 08:55:36 +01:00
parent 77a205387f
commit a5d0654a24
5 changed files with 47 additions and 38 deletions
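For orientation: discardCompoundToken (added in LUCENE-9123) controls whether search mode emits the original compound token alongside its decomposed parts. Before this fix, enabling it effectively skipped the decomposition of long tokens altogether; with this fix, decomposition always runs in search mode and only the compound itself is dropped. A minimal sketch of the user-visible difference (the demo class is hypothetical; it assumes the 4-argument JapaneseTokenizer constructor that LUCENE-9123 introduced):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class DiscardCompoundTokenDemo {
  public static void main(String[] args) throws Exception {
    // userDictionary=null, discardPunctuation=true, discardCompoundToken=true
    Tokenizer tok = new JapaneseTokenizer(null, true, true, Mode.SEARCH);
    tok.setReader(new StringReader("京都大学硬式野球部"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    tok.reset();
    while (tok.incrementToken()) {
      System.out.print(term + " ");
    }
    tok.end();
    tok.close();
    // Before this commit: 京都大学 硬式 野球 部 (the long token was left whole)
    // After this commit:  京都 大学 硬式 野球 部 (decomposed; the compound is discarded)
  }
}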

lucene/CHANGES.txt

@@ -247,11 +247,18 @@ Optimizations
 * LUCENE-9536: Reduced memory usage for OrdinalMap when a segment has all
   values. (Julie Tibshirani via Adrien Grand)
 
+Bug Fixes
+---------------------
+
+* LUCENE-9581: Japanese tokenizer should discard the compound token instead of disabling the decomposition
+  of long tokens when discardCompoundToken is activated. (Jim Ferenczi)
+
 Other
 ---------------------
 
 * SOLR-14995: Update Jetty to 9.4.34 (Mike Drob)
 
 ======================= Lucene 8.7.0 =======================
 
 API Changes

lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java

@@ -507,7 +507,7 @@ public final class JapaneseTokenizer extends Tokenizer {
           System.out.println("      + cost=" + leastCost + " wordID=" + wordID + " leftID=" + leftID + " leastIDX=" + leastIDX + " toPos=" + endPos + " toPos.idx=" + positions.get(endPos).count);
         }
 
-        if ((addPenalty || (!outputCompounds && searchMode)) && type != Type.USER) {
+        if (addPenalty && type != Type.USER) {
           final int penalty = computePenalty(fromPosData.pos, endPos - fromPosData.pos);
           if (VERBOSE) {
             if (penalty > 0) {
@@ -1670,7 +1670,7 @@ public final class JapaneseTokenizer extends Tokenizer {
       int backID = posData.backID[bestIDX];
       int nextBestIDX = posData.backIndex[bestIDX];
 
-      if (outputCompounds && searchMode && altToken == null && backType != Type.USER) {
+      if (searchMode && altToken == null && backType != Type.USER) {
 
         // In searchMode, if best path had picked a too-long
         // token, we use the "penalty" to compute the allowed
@@ -1764,34 +1764,35 @@ public final class JapaneseTokenizer extends Tokenizer {
       assert offset >= 0;
 
       if (altToken != null && altToken.getPosition() >= backPos) {
-
-        // We've backtraced to the position where the
-        // compound token starts; add it now:
+        if (outputCompounds) {
+          // We've backtraced to the position where the
+          // compound token starts; add it now:
 
-        // The pruning we did when we created the altToken
-        // ensures that the back trace will align back with
-        // the start of the altToken:
-        assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos;
+          // The pruning we did when we created the altToken
+          // ensures that the back trace will align back with
+          // the start of the altToken:
+          assert altToken.getPosition() == backPos : altToken.getPosition() + " vs " + backPos;
 
-        // NOTE: not quite right: the compound token may
-        // have had all punctuation back traced so far, but
-        // then the decompounded token at this position is
-        // not punctuation. In this case backCount is 0,
-        // but we should maybe add the altToken anyway...?
+          // NOTE: not quite right: the compound token may
+          // have had all punctuation back traced so far, but
+          // then the decompounded token at this position is
+          // not punctuation. In this case backCount is 0,
+          // but we should maybe add the altToken anyway...?
 
-        if (backCount > 0) {
-          backCount++;
-          altToken.setPositionLength(backCount);
-          if (VERBOSE) {
-            System.out.println("    add altToken=" + altToken);
-          }
-          pending.add(altToken);
-        } else {
-          // This means alt token was all punct tokens:
-          if (VERBOSE) {
-            System.out.println("    discard all-punctuation altToken=" + altToken);
-          }
-          assert discardPunctuation;
-        }
+          if (backCount > 0) {
+            backCount++;
+            altToken.setPositionLength(backCount);
+            if (VERBOSE) {
+              System.out.println("    add altToken=" + altToken);
+            }
+            pending.add(altToken);
+          } else {
+            // This means alt token was all punct tokens:
+            if (VERBOSE) {
+              System.out.println("    discard all-punctuation altToken=" + altToken);
+            }
+            assert discardPunctuation;
+          }
+        }
         altToken = null;
       }
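Taken together, the three hunks move the decision point: the compound candidate (altToken) and the decomposition penalty are now always computed in search mode, and the new if (outputCompounds) guard in the backtrace decides whether the compound is actually emitted. When compounds are kept (discardCompoundToken=false), the emitted compound spans its decomposed parts via the position-length attribute set by altToken.setPositionLength above. A small inspection sketch using only stock Lucene token attributes (the class name is ours; the exact emission order of compound vs. parts is an implementation detail, so the comment hedges on it):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

public class CompoundSpanDemo {
  public static void main(String[] args) throws Exception {
    // discardCompoundToken=false: keep both the compound and its parts.
    Tokenizer tok = new JapaneseTokenizer(null, true, false, Mode.SEARCH);
    tok.setReader(new StringReader("京都大学"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = tok.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLen = tok.addAttribute(PositionLengthAttribute.class);
    tok.reset();
    while (tok.incrementToken()) {
      // Expect the compound 京都大学 to carry posLen=2 (spanning 京都 and 大学),
      // while 京都 and 大学 each carry posLen=1; ordering may vary.
      System.out.println(term + " posInc=" + posInc.getPositionIncrement()
          + " posLen=" + posLen.getPositionLength());
    }
    tok.end();
    tok.close();
  }
}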

lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java

@@ -108,13 +108,13 @@ public class TestJapaneseAnalyzer extends BaseTokenStreamTestCase {
     a.close();
 
     a = new JapaneseAnalyzer();
     assertAnalyzesToPositions(a, "京都大学硬式野球部",
-                              new String[] { "京都大学",
-                                             "硬式",
-                                             "野球",
-                                             "部" },
-                              new int[] {1, 1, 1, 1},
-                              new int[] {1, 1, 1, 1});
+                              new String[] { "京都",
+                                             "大学",
+                                             "硬式",
+                                             "野球",
+                                             "部" },
+                              new int[] {1, 1, 1, 1, 1},
+                              new int[] {1, 1, 1, 1, 1});
 
     // toDotFile(a, "成田空港", "/mnt/scratch/out.dot");
     a.close();
   }

lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java

@@ -917,5 +917,11 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
 
     assertAnalyzesTo(extendedModeAnalyzerNoCompound, "株式会社とアカデミア",
         new String[]{"株式", "会社", "と", "ア", "カ", "デ", "ミ", "ア"});
+
+    assertAnalyzesTo(analyzer, "北海道日本ハムファイターズ",
+        new String[]{"北海道", "日本", "ハムファイターズ"});
+
+    assertAnalyzesTo(analyzerNoCompound, "北海道日本ハムファイターズ",
+        new String[]{"北海道", "日本", "ハムファイターズ"});
   }
 }
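The fixtures analyzer, analyzerNoCompound, and extendedModeAnalyzerNoCompound are defined earlier in TestJapaneseTokenizer.java and are not part of this hunk. A plausible reconstruction of one of them, for orientation only (it assumes the same 4-argument constructor as the sketch above; the real definitions may differ in details):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;

// Hypothetical fixture: discardCompoundToken=true decomposes long tokens
// and drops the compound itself.
Analyzer analyzerNoCompound =
    new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new JapaneseTokenizer(null, true, true, Mode.SEARCH);
        return new TokenStreamComponents(tokenizer);
      }
    };

Note that both added assertions expect the same three tokens for 北海道日本ハムファイターズ: when no compound is actually formed, keeping or discarding compounds should no longer change the output.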

lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestSearchMode.java

@@ -122,11 +122,6 @@ public class TestSearchMode extends BaseTokenStreamTestCase {
       String[] fields = line.split("\t", 2);
       String sourceText = fields[0];
       String[] tmpExpectedTokens = fields[1].split("\\s+");
-      if (sourceText.equals("京都大学硬式野球部")) {
-        // This is the only case that tokenization result is different from discardCompoundToken=false
-        tmpExpectedTokens[0] = "京都";
-        tmpExpectedTokens[1] = "大学";
-      }
 
       List<String> expectedTokenList = new ArrayList<>();
       for(int tokIDX=0;tokIDX<tmpExpectedTokens.length;tokIDX++) {