LUCENE-9581: Japanese tokenizer should discard the compound token instead of disabling the decomposition of long tokens when discardCompoundToken is activated.

This commit is contained in:
jimczi 2020-11-23 08:55:36 +01:00
parent 77a205387f
commit a5d0654a24
5 changed files with 47 additions and 38 deletions

View File

@@ -247,11 +247,18 @@ Optimizations
* LUCENE-9536: Reduced memory usage for OrdinalMap when a segment has all
values. (Julie Tibshirani via Adrien Grand)
Bug Fixes
---------------------
* LUCENE-9581: Japanese tokenizer should discard the compound token instead of disabling the decomposition
of long tokens when discardCompoundToken is activated. (Jim Ferenczi)
Other
---------------------
* SOLR-14995: Update Jetty to 9.4.34 (Mike Drob)
======================= Lucene 8.7.0 =======================
API Changes

View File

@@ -507,7 +507,7 @@ public final class JapaneseTokenizer extends Tokenizer {
System.out.println(" + cost=" + leastCost + " wordID=" + wordID + " leftID=" + leftID + " leastIDX=" + leastIDX + " toPos=" + endPos + " toPos.idx=" + positions.get(endPos).count);
}
if ((addPenalty || (!outputCompounds && searchMode)) && type != Type.USER) {
if (addPenalty && type != Type.USER) {
final int penalty = computePenalty(fromPosData.pos, endPos - fromPosData.pos);
if (VERBOSE) {
if (penalty > 0) {
@@ -1670,7 +1670,7 @@ public final class JapaneseTokenizer extends Tokenizer {
int backID = posData.backID[bestIDX];
int nextBestIDX = posData.backIndex[bestIDX];
if (outputCompounds && searchMode && altToken == null && backType != Type.USER) {
if (searchMode && altToken == null && backType != Type.USER) {
// In searchMode, if best path had picked a too-long
// token, we use the "penalty" to compute the allowed
@@ -1764,7 +1764,7 @@ public final class JapaneseTokenizer extends Tokenizer {
assert offset >= 0;
if (altToken != null && altToken.getPosition() >= backPos) {
if (outputCompounds) {
// We've backtraced to the position where the
// compound token starts; add it now:
@ -1793,6 +1793,7 @@ public final class JapaneseTokenizer extends Tokenizer {
}
assert discardPunctuation;
}
}
altToken = null;
}

View File

@@ -108,8 +108,8 @@ public class TestJapaneseAnalyzer extends BaseTokenStreamTestCase {
a.close();
a = new JapaneseAnalyzer();
assertAnalyzesToPositions(a, "京都大学硬式野球部",
new String[] { "京都",
"大学",
new String[] { "京都",
"大学",
"硬式",
"野球",
"部" },

View File

@@ -917,5 +917,11 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
assertAnalyzesTo(extendedModeAnalyzerNoCompound, "株式会社とアカデミア",
new String[]{"株式", "会社", "と", "ア", "カ", "デ", "ミ", "ア"});
assertAnalyzesTo(analyzer, "北海道日本ハムファイターズ",
new String[]{"北海道", "日本", "ハムファイターズ"});
assertAnalyzesTo(analyzerNoCompound, "北海道日本ハムファイターズ",
new String[]{"北海道", "日本", "ハムファイターズ"});
}
}

View File

@@ -122,11 +122,6 @@ public class TestSearchMode extends BaseTokenStreamTestCase {
String[] fields = line.split("\t", 2);
String sourceText = fields[0];
String[] tmpExpectedTokens = fields[1].split("\\s+");
if (sourceText.equals("京都大学硬式野球部")) {
// This is the only case that tokenization result is different from discardCompoundToken=false
tmpExpectedTokens[0] = "京都";
tmpExpectedTokens[1] = "大学";
}
List<String> expectedTokenList = new ArrayList<>();
for(int tokIDX=0;tokIDX<tmpExpectedTokens.length;tokIDX++) {