From 4e8fb2a9df86496a5e157386f4b0d45008d821a5 Mon Sep 17 00:00:00 2001 From: Bruno Roustant <33934988+bruno-roustant@users.noreply.github.com> Date: Tue, 11 Jun 2024 10:02:58 +0200 Subject: [PATCH] Optimize Japanese UserDictionary. (#13431) Replace TreeMap by a List of Match. Use compiled Pattern. --- .../analysis/ja/dict/UserDictionary.java | 94 ++++++++++--------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java index a62ffe5d8ac..391ed2ba44b 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java @@ -23,8 +23,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; -import java.util.Map; -import java.util.TreeMap; +import java.util.regex.Pattern; import org.apache.lucene.analysis.morph.Dictionary; import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.util.IntsRefBuilder; @@ -37,6 +36,10 @@ public final class UserDictionary implements Dictionary { public static final String INTERNAL_SEPARATOR = "\u0000"; + private static final Pattern LINE_COMMENT = Pattern.compile("^#.*$"); + private static final Pattern WHITESPACE = Pattern.compile("\\s"); + private static final Pattern SPACES = Pattern.compile(" +"); + // phrase text -> phrase ID private final TokenInfoFST fst; @@ -51,16 +54,16 @@ public final class UserDictionary implements Dictionary { public static UserDictionary open(Reader reader) throws IOException { BufferedReader br = new BufferedReader(reader); - String line = null; + String line; List featureEntries = new ArrayList<>(); // text, segmentation, readings, POS while ((line = br.readLine()) != null) { // Remove comments - line = line.replaceAll("^#.*$", ""); + line = LINE_COMMENT.matcher(line).replaceAll(""); // Skip empty lines or comment lines - if (line.trim().length() == 0) { + if (line.trim().isEmpty()) { continue; } String[] values = CSVUtil.parse(line); @@ -99,10 +102,10 @@ public final class UserDictionary implements Dictionary { long ord = 0; for (String[] values : featureEntries) { - String surface = values[0].replaceAll("\\s", ""); - String concatenatedSegment = values[1].replaceAll("\\s", ""); - String[] segmentation = values[1].replaceAll(" *", " ").split(" "); - String[] readings = values[2].replaceAll(" *", " ").split(" "); + String surface = WHITESPACE.matcher(values[0]).replaceAll(""); + String concatenatedSegment = WHITESPACE.matcher(values[1]).replaceAll(""); + String[] segmentation = SPACES.split(values[1]); + String[] readings = SPACES.split(values[2]); String pos = values[3]; if (segmentation.length != readings.length) { @@ -141,7 +144,7 @@ public final class UserDictionary implements Dictionary { scratch.growNoCopy(token.length()); scratch.setLength(token.length()); for (int i = 0; i < token.length(); i++) { - scratch.setIntAt(i, (int) token.charAt(i)); + scratch.setIntAt(i, token.charAt(i)); } fstCompiler.add(scratch.get(), ord); segmentations.add(wordIdAndLength); @@ -151,7 +154,7 @@ public final class UserDictionary implements Dictionary { new TokenInfoFST( FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()), false); this.morphAtts = new UserMorphData(data.toArray(new String[0])); - this.segmentations = segmentations.toArray(new int[segmentations.size()][]); + this.segmentations = segmentations.toArray(new int[0][]); } @Override @@ -168,33 +171,53 @@ public final class UserDictionary implements Dictionary { * @return array of {wordId, position, length} */ public int[][] lookup(char[] chars, int off, int len) throws IOException { - // TODO: can we avoid this treemap/toIndexArray? - TreeMap result = new TreeMap<>(); // index, [length, length...] - boolean found = false; // true if we found any results - + List matches = null; + int numResults = 0; final FST.BytesReader fstReader = fst.getBytesReader(); - + final int end = off + len; FST.Arc arc = new FST.Arc<>(); - int end = off + len; for (int startOffset = off; startOffset < end; startOffset++) { + int[] wordIdAndLength = null; arc = fst.getFirstArc(arc); int output = 0; - int remaining = end - startOffset; - for (int i = 0; i < remaining; i++) { + for (int i = 0, remaining = end - startOffset; i < remaining; i++) { int ch = chars[startOffset + i]; if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) { break; // continue to next position } output += arc.output().intValue(); if (arc.isFinal()) { - final int finalOutput = output + arc.nextFinalOutput().intValue(); - result.put(startOffset - off, segmentations[finalOutput]); - found = true; + int finalOutput = output + arc.nextFinalOutput().intValue(); + wordIdAndLength = segmentations[finalOutput]; } } + if (wordIdAndLength != null) { + if (matches == null) { + matches = new ArrayList<>(); + } + matches.add(new Match(startOffset - off, wordIdAndLength)); + numResults += wordIdAndLength.length - 1; + } } - - return found ? toIndexArray(result) : EMPTY_RESULT; + if (numResults == 0) { + return EMPTY_RESULT; + } + int[][] result = new int[numResults][]; + int index = 0; + for (int i = 0; i < matches.size(); i++) { + Match match = matches.get(i); + int[] wordIdAndLength = match.wordIdAndLength; + int wordId = wordIdAndLength[0]; + // convert length to index + int position = match.position; + for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset + // add a {wordId, index, length} token to the results + int[] token = {wordId + j - 1, position, wordIdAndLength[j]}; + result[index++] = token; + position += wordIdAndLength[j]; + } + } + return result; } public TokenInfoFST getFST() { @@ -203,28 +226,9 @@ public final class UserDictionary implements Dictionary { private static final int[][] EMPTY_RESULT = new int[0][]; - /** - * Convert Map of index and wordIdAndLength to array of {wordId, index, length} - * - * @return array of {wordId, index, length} - */ - private int[][] toIndexArray(Map input) { - ArrayList result = new ArrayList<>(); - for (Map.Entry entry : input.entrySet()) { - int[] wordIdAndLength = entry.getValue(); - int wordId = wordIdAndLength[0]; - // convert length to index - int current = entry.getKey(); - for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset - int[] token = {wordId + j - 1, current, wordIdAndLength[j]}; - result.add(token); - current += wordIdAndLength[j]; - } - } - return result.toArray(new int[result.size()][]); - } - public int[] lookupSegmentation(int phraseID) { return segmentations[phraseID]; } + + private record Match(int position, int[] wordIdAndLength) {} }