Optimize Japanese UserDictionary. (#13431)

Replace the TreeMap in lookup() with a List of Match records. Use precompiled Patterns instead of String.replaceAll.
This commit is contained in:
Bruno Roustant 2024-06-11 10:02:58 +02:00 committed by GitHub
parent edba83e636
commit 4e8fb2a9df
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 49 additions and 45 deletions

View File

@ -23,8 +23,7 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.morph.Dictionary;
import org.apache.lucene.analysis.util.CSVUtil;
import org.apache.lucene.util.IntsRefBuilder;
@ -37,6 +36,10 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
public static final String INTERNAL_SEPARATOR = "\u0000";
private static final Pattern LINE_COMMENT = Pattern.compile("^#.*$");
private static final Pattern WHITESPACE = Pattern.compile("\\s");
private static final Pattern SPACES = Pattern.compile(" +");
// phrase text -> phrase ID
private final TokenInfoFST fst;
@ -51,16 +54,16 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
public static UserDictionary open(Reader reader) throws IOException {
BufferedReader br = new BufferedReader(reader);
String line = null;
String line;
List<String[]> featureEntries = new ArrayList<>();
// text, segmentation, readings, POS
while ((line = br.readLine()) != null) {
// Remove comments
line = line.replaceAll("^#.*$", "");
line = LINE_COMMENT.matcher(line).replaceAll("");
// Skip empty lines or comment lines
if (line.trim().length() == 0) {
if (line.trim().isEmpty()) {
continue;
}
String[] values = CSVUtil.parse(line);
@ -99,10 +102,10 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
long ord = 0;
for (String[] values : featureEntries) {
String surface = values[0].replaceAll("\\s", "");
String concatenatedSegment = values[1].replaceAll("\\s", "");
String[] segmentation = values[1].replaceAll(" *", " ").split(" ");
String[] readings = values[2].replaceAll(" *", " ").split(" ");
String surface = WHITESPACE.matcher(values[0]).replaceAll("");
String concatenatedSegment = WHITESPACE.matcher(values[1]).replaceAll("");
String[] segmentation = SPACES.split(values[1]);
String[] readings = SPACES.split(values[2]);
String pos = values[3];
if (segmentation.length != readings.length) {
@ -141,7 +144,7 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
scratch.growNoCopy(token.length());
scratch.setLength(token.length());
for (int i = 0; i < token.length(); i++) {
scratch.setIntAt(i, (int) token.charAt(i));
scratch.setIntAt(i, token.charAt(i));
}
fstCompiler.add(scratch.get(), ord);
segmentations.add(wordIdAndLength);
@ -151,7 +154,7 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
new TokenInfoFST(
FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()), false);
this.morphAtts = new UserMorphData(data.toArray(new String[0]));
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
this.segmentations = segmentations.toArray(new int[0][]);
}
@Override
@ -168,33 +171,53 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
* @return array of {wordId, position, length}
*/
public int[][] lookup(char[] chars, int off, int len) throws IOException {
// TODO: can we avoid this treemap/toIndexArray?
TreeMap<Integer, int[]> result = new TreeMap<>(); // index, [length, length...]
boolean found = false; // true if we found any results
List<Match> matches = null;
int numResults = 0;
final FST.BytesReader fstReader = fst.getBytesReader();
final int end = off + len;
FST.Arc<Long> arc = new FST.Arc<>();
int end = off + len;
for (int startOffset = off; startOffset < end; startOffset++) {
int[] wordIdAndLength = null;
arc = fst.getFirstArc(arc);
int output = 0;
int remaining = end - startOffset;
for (int i = 0; i < remaining; i++) {
for (int i = 0, remaining = end - startOffset; i < remaining; i++) {
int ch = chars[startOffset + i];
if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
break; // continue to next position
}
output += arc.output().intValue();
if (arc.isFinal()) {
final int finalOutput = output + arc.nextFinalOutput().intValue();
result.put(startOffset - off, segmentations[finalOutput]);
found = true;
int finalOutput = output + arc.nextFinalOutput().intValue();
wordIdAndLength = segmentations[finalOutput];
}
}
if (wordIdAndLength != null) {
if (matches == null) {
matches = new ArrayList<>();
}
return found ? toIndexArray(result) : EMPTY_RESULT;
matches.add(new Match(startOffset - off, wordIdAndLength));
numResults += wordIdAndLength.length - 1;
}
}
if (numResults == 0) {
return EMPTY_RESULT;
}
int[][] result = new int[numResults][];
int index = 0;
for (int i = 0; i < matches.size(); i++) {
Match match = matches.get(i);
int[] wordIdAndLength = match.wordIdAndLength;
int wordId = wordIdAndLength[0];
// convert length to index
int position = match.position;
for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
// add a {wordId, index, length} token to the results
int[] token = {wordId + j - 1, position, wordIdAndLength[j]};
result[index++] = token;
position += wordIdAndLength[j];
}
}
return result;
}
public TokenInfoFST getFST() {
@ -203,28 +226,9 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
private static final int[][] EMPTY_RESULT = new int[0][];
/**
* Convert Map of index and wordIdAndLength to array of {wordId, index, length}
*
* @return array of {wordId, index, length}
*/
private int[][] toIndexArray(Map<Integer, int[]> input) {
ArrayList<int[]> result = new ArrayList<>();
for (Map.Entry<Integer, int[]> entry : input.entrySet()) {
int[] wordIdAndLength = entry.getValue();
int wordId = wordIdAndLength[0];
// convert length to index
int current = entry.getKey();
for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
int[] token = {wordId + j - 1, current, wordIdAndLength[j]};
result.add(token);
current += wordIdAndLength[j];
}
}
return result.toArray(new int[result.size()][]);
}
public int[] lookupSegmentation(int phraseID) {
return segmentations[phraseID];
}
private record Match(int position, int[] wordIdAndLength) {}
}