Optimize Japanese UserDictionary. (#13431)

Replace the TreeMap with a List of Match records. Use precompiled Patterns.
This commit is contained in:
Bruno Roustant 2024-06-11 10:02:58 +02:00 committed by GitHub
parent edba83e636
commit 4e8fb2a9df
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 49 additions and 45 deletions

View File

@ -23,8 +23,7 @@ import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.regex.Pattern;
import java.util.TreeMap;
import org.apache.lucene.analysis.morph.Dictionary; import org.apache.lucene.analysis.morph.Dictionary;
import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.analysis.util.CSVUtil;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
@ -37,6 +36,10 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
public static final String INTERNAL_SEPARATOR = "\u0000"; public static final String INTERNAL_SEPARATOR = "\u0000";
// Matches a line that starts with '#'; open() blanks such comment lines before parsing.
private static final Pattern LINE_COMMENT = Pattern.compile("^#.*$");
// Matches one whitespace character; used to strip whitespace from the surface and segment text.
private static final Pattern WHITESPACE = Pattern.compile("\\s");
// Matches a run of one or more spaces; used to split the segmentation and readings fields.
private static final Pattern SPACES = Pattern.compile(" +");
// phrase text -> phrase ID // phrase text -> phrase ID
private final TokenInfoFST fst; private final TokenInfoFST fst;
@ -51,16 +54,16 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
public static UserDictionary open(Reader reader) throws IOException { public static UserDictionary open(Reader reader) throws IOException {
BufferedReader br = new BufferedReader(reader); BufferedReader br = new BufferedReader(reader);
String line = null; String line;
List<String[]> featureEntries = new ArrayList<>(); List<String[]> featureEntries = new ArrayList<>();
// text, segmentation, readings, POS // text, segmentation, readings, POS
while ((line = br.readLine()) != null) { while ((line = br.readLine()) != null) {
// Remove comments // Remove comments
line = line.replaceAll("^#.*$", ""); line = LINE_COMMENT.matcher(line).replaceAll("");
// Skip empty lines or comment lines // Skip empty lines or comment lines
if (line.trim().length() == 0) { if (line.trim().isEmpty()) {
continue; continue;
} }
String[] values = CSVUtil.parse(line); String[] values = CSVUtil.parse(line);
@ -99,10 +102,10 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
long ord = 0; long ord = 0;
for (String[] values : featureEntries) { for (String[] values : featureEntries) {
String surface = values[0].replaceAll("\\s", ""); String surface = WHITESPACE.matcher(values[0]).replaceAll("");
String concatenatedSegment = values[1].replaceAll("\\s", ""); String concatenatedSegment = WHITESPACE.matcher(values[1]).replaceAll("");
String[] segmentation = values[1].replaceAll(" *", " ").split(" "); String[] segmentation = SPACES.split(values[1]);
String[] readings = values[2].replaceAll(" *", " ").split(" "); String[] readings = SPACES.split(values[2]);
String pos = values[3]; String pos = values[3];
if (segmentation.length != readings.length) { if (segmentation.length != readings.length) {
@ -141,7 +144,7 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
scratch.growNoCopy(token.length()); scratch.growNoCopy(token.length());
scratch.setLength(token.length()); scratch.setLength(token.length());
for (int i = 0; i < token.length(); i++) { for (int i = 0; i < token.length(); i++) {
scratch.setIntAt(i, (int) token.charAt(i)); scratch.setIntAt(i, token.charAt(i));
} }
fstCompiler.add(scratch.get(), ord); fstCompiler.add(scratch.get(), ord);
segmentations.add(wordIdAndLength); segmentations.add(wordIdAndLength);
@ -151,7 +154,7 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
new TokenInfoFST( new TokenInfoFST(
FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()), false); FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()), false);
this.morphAtts = new UserMorphData(data.toArray(new String[0])); this.morphAtts = new UserMorphData(data.toArray(new String[0]));
this.segmentations = segmentations.toArray(new int[segmentations.size()][]); this.segmentations = segmentations.toArray(new int[0][]);
} }
@Override @Override
@ -168,33 +171,53 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
* @return array of {wordId, position, length} * @return array of {wordId, position, length}
*/ */
public int[][] lookup(char[] chars, int off, int len) throws IOException { public int[][] lookup(char[] chars, int off, int len) throws IOException {
// TODO: can we avoid this treemap/toIndexArray? List<Match> matches = null;
TreeMap<Integer, int[]> result = new TreeMap<>(); // index, [length, length...] int numResults = 0;
boolean found = false; // true if we found any results
final FST.BytesReader fstReader = fst.getBytesReader(); final FST.BytesReader fstReader = fst.getBytesReader();
final int end = off + len;
FST.Arc<Long> arc = new FST.Arc<>(); FST.Arc<Long> arc = new FST.Arc<>();
int end = off + len;
for (int startOffset = off; startOffset < end; startOffset++) { for (int startOffset = off; startOffset < end; startOffset++) {
int[] wordIdAndLength = null;
arc = fst.getFirstArc(arc); arc = fst.getFirstArc(arc);
int output = 0; int output = 0;
int remaining = end - startOffset; for (int i = 0, remaining = end - startOffset; i < remaining; i++) {
for (int i = 0; i < remaining; i++) {
int ch = chars[startOffset + i]; int ch = chars[startOffset + i];
if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) { if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
break; // continue to next position break; // continue to next position
} }
output += arc.output().intValue(); output += arc.output().intValue();
if (arc.isFinal()) { if (arc.isFinal()) {
final int finalOutput = output + arc.nextFinalOutput().intValue(); int finalOutput = output + arc.nextFinalOutput().intValue();
result.put(startOffset - off, segmentations[finalOutput]); wordIdAndLength = segmentations[finalOutput];
found = true;
} }
} }
if (wordIdAndLength != null) {
if (matches == null) {
matches = new ArrayList<>();
}
matches.add(new Match(startOffset - off, wordIdAndLength));
numResults += wordIdAndLength.length - 1;
}
} }
if (numResults == 0) {
return found ? toIndexArray(result) : EMPTY_RESULT; return EMPTY_RESULT;
}
int[][] result = new int[numResults][];
int index = 0;
for (int i = 0; i < matches.size(); i++) {
Match match = matches.get(i);
int[] wordIdAndLength = match.wordIdAndLength;
int wordId = wordIdAndLength[0];
// convert length to index
int position = match.position;
for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
// add a {wordId, index, length} token to the results
int[] token = {wordId + j - 1, position, wordIdAndLength[j]};
result[index++] = token;
position += wordIdAndLength[j];
}
}
return result;
} }
public TokenInfoFST getFST() { public TokenInfoFST getFST() {
@ -203,28 +226,9 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
private static final int[][] EMPTY_RESULT = new int[0][]; private static final int[][] EMPTY_RESULT = new int[0][];
/**
 * Expands every map entry into one {wordId, index, length} token per segment.
 *
 * @param input start index mapped to {wordId offset, length, length, ...}
 * @return array of {wordId, index, length}
 */
private int[][] toIndexArray(Map<Integer, int[]> input) {
  ArrayList<int[]> tokens = new ArrayList<>();
  for (Map.Entry<Integer, int[]> entry : input.entrySet()) {
    final int[] segments = entry.getValue();
    final int baseWordId = segments[0];
    // running index of the current segment, starting at the entry's key
    int index = entry.getKey();
    // slot 0 is the wordId offset; segment lengths start at slot 1
    int slot = 1;
    while (slot < segments.length) {
      final int length = segments[slot];
      tokens.add(new int[] {baseWordId + slot - 1, index, length});
      index += length;
      slot++;
    }
  }
  return tokens.toArray(new int[tokens.size()][]);
}
public int[] lookupSegmentation(int phraseID) { public int[] lookupSegmentation(int phraseID) {
return segmentations[phraseID]; return segmentations[phraseID];
} }
/**
 * A dictionary hit recorded by lookup: the start position relative to the lookup offset, and the
 * matched segmentations entry.
 */
private record Match(int position, int[] wordIdAndLength) {}
} }