From 4e8fb2a9df86496a5e157386f4b0d45008d821a5 Mon Sep 17 00:00:00 2001
From: Bruno Roustant <33934988+bruno-roustant@users.noreply.github.com>
Date: Tue, 11 Jun 2024 10:02:58 +0200
Subject: [PATCH] Optimize Japanese UserDictionary. (#13431)

Replace TreeMap by a List of Match. Use compiled Pattern.
---
 .../analysis/ja/dict/UserDictionary.java      | 94 ++++++++++---------
 1 file changed, 49 insertions(+), 45 deletions(-)
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
index a62ffe5d8ac..391ed2ba44b 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
@@ -23,8 +23,7 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
+import java.util.regex.Pattern;
 import org.apache.lucene.analysis.morph.Dictionary;
 import org.apache.lucene.analysis.util.CSVUtil;
 import org.apache.lucene.util.IntsRefBuilder;
@@ -37,6 +36,10 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
 
   public static final String INTERNAL_SEPARATOR = "\u0000";
 
+  private static final Pattern LINE_COMMENT = Pattern.compile("^#.*$");
+  private static final Pattern WHITESPACE = Pattern.compile("\\s");
+  private static final Pattern SPACES = Pattern.compile(" +");
+
   // phrase text -> phrase ID
   private final TokenInfoFST fst;
 
@@ -51,16 +54,16 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
   public static UserDictionary open(Reader reader) throws IOException {
 
     BufferedReader br = new BufferedReader(reader);
-    String line = null;
+    String line;
     List<String[]> featureEntries = new ArrayList<>();
 
     // text, segmentation, readings, POS
     while ((line = br.readLine()) != null) {
       // Remove comments
-      line = line.replaceAll("^#.*$", "");
+      line = LINE_COMMENT.matcher(line).replaceAll("");
 
       // Skip empty lines or comment lines
-      if (line.trim().length() == 0) {
+      if (line.trim().isEmpty()) {
         continue;
       }
       String[] values = CSVUtil.parse(line);
@@ -99,10 +102,10 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
     long ord = 0;
 
     for (String[] values : featureEntries) {
-      String surface = values[0].replaceAll("\\s", "");
-      String concatenatedSegment = values[1].replaceAll("\\s", "");
-      String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
-      String[] readings = values[2].replaceAll("  *", " ").split(" ");
+      String surface = WHITESPACE.matcher(values[0]).replaceAll("");
+      String concatenatedSegment = WHITESPACE.matcher(values[1]).replaceAll("");
+      String[] segmentation = SPACES.split(values[1]);
+      String[] readings = SPACES.split(values[2]);
       String pos = values[3];
 
       if (segmentation.length != readings.length) {
@@ -141,7 +144,7 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
       scratch.growNoCopy(token.length());
       scratch.setLength(token.length());
       for (int i = 0; i < token.length(); i++) {
-        scratch.setIntAt(i, (int) token.charAt(i));
+        scratch.setIntAt(i, token.charAt(i));
       }
       fstCompiler.add(scratch.get(), ord);
       segmentations.add(wordIdAndLength);
@@ -151,7 +154,7 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
         new TokenInfoFST(
             FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()), false);
     this.morphAtts = new UserMorphData(data.toArray(new String[0]));
-    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
+    this.segmentations = segmentations.toArray(new int[0][]);
   }
 
   @Override
@@ -168,33 +171,53 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
    * @return array of {wordId, position, length}
    */
   public int[][] lookup(char[] chars, int off, int len) throws IOException {
-    // TODO: can we avoid this treemap/toIndexArray?
-    TreeMap<Integer, int[]> result = new TreeMap<>(); // index, [length, length...]
-    boolean found = false; // true if we found any results
-
+    List<Match> matches = null;
+    int numResults = 0;
     final FST.BytesReader fstReader = fst.getBytesReader();
-
+    final int end = off + len;
     FST.Arc<Long> arc = new FST.Arc<>();
-    int end = off + len;
     for (int startOffset = off; startOffset < end; startOffset++) {
+      int[] wordIdAndLength = null;
       arc = fst.getFirstArc(arc);
       int output = 0;
-      int remaining = end - startOffset;
-      for (int i = 0; i < remaining; i++) {
+      for (int i = 0, remaining = end - startOffset; i < remaining; i++) {
         int ch = chars[startOffset + i];
         if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
           break; // continue to next position
         }
         output += arc.output().intValue();
         if (arc.isFinal()) {
-          final int finalOutput = output + arc.nextFinalOutput().intValue();
-          result.put(startOffset - off, segmentations[finalOutput]);
-          found = true;
+          int finalOutput = output + arc.nextFinalOutput().intValue();
+          wordIdAndLength = segmentations[finalOutput];
         }
       }
+      if (wordIdAndLength != null) {
+        if (matches == null) {
+          matches = new ArrayList<>();
+        }
+        matches.add(new Match(startOffset - off, wordIdAndLength));
+        numResults += wordIdAndLength.length - 1;
+      }
     }
-
-    return found ? toIndexArray(result) : EMPTY_RESULT;
+    if (numResults == 0) {
+      return EMPTY_RESULT;
+    }
+    int[][] result = new int[numResults][];
+    int index = 0;
+    for (int i = 0; i < matches.size(); i++) {
+      Match match = matches.get(i);
+      int[] wordIdAndLength = match.wordIdAndLength;
+      int wordId = wordIdAndLength[0];
+      // convert length to index
+      int position = match.position;
+      for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
+        // add a {wordId, index, length} token to the results
+        int[] token = {wordId + j - 1, position, wordIdAndLength[j]};
+        result[index++] = token;
+        position += wordIdAndLength[j];
+      }
+    }
+    return result;
   }
 
   public TokenInfoFST getFST() {
@@ -203,28 +226,9 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
 
   private static final int[][] EMPTY_RESULT = new int[0][];
 
-  /**
-   * Convert Map of index and wordIdAndLength to array of {wordId, index, length}
-   *
-   * @return array of {wordId, index, length}
-   */
-  private int[][] toIndexArray(Map<Integer, int[]> input) {
-    ArrayList<int[]> result = new ArrayList<>();
-    for (Map.Entry<Integer, int[]> entry : input.entrySet()) {
-      int[] wordIdAndLength = entry.getValue();
-      int wordId = wordIdAndLength[0];
-      // convert length to index
-      int current = entry.getKey();
-      for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
-        int[] token = {wordId + j - 1, current, wordIdAndLength[j]};
-        result.add(token);
-        current += wordIdAndLength[j];
-      }
-    }
-    return result.toArray(new int[result.size()][]);
-  }
-
   public int[] lookupSegmentation(int phraseID) {
     return segmentations[phraseID];
   }
+
+  private record Match(int position, int[] wordIdAndLength) {}
 }