mirror of https://github.com/apache/lucene.git
Optimize Japanese UserDictionary. (#13431)
Replace the TreeMap with a List of Match records, and use compiled Pattern instances.
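For context on the compiled-Pattern change: `String.replaceAll` compiles its regex argument on every invocation, while a `static final Pattern` field is compiled once and reused across all lines of the user dictionary. A minimal standalone sketch of the difference (class and method names here are illustrative, not part of this commit):

```java
import java.util.regex.Pattern;

public class PatternReuseSketch {
  // Compiled once per class load, like the LINE_COMMENT field added below.
  private static final Pattern LINE_COMMENT = Pattern.compile("^#.*$");

  static String stripCommentFast(String line) {
    return LINE_COMMENT.matcher(line).replaceAll(""); // reuses the compiled regex
  }

  static String stripCommentSlow(String line) {
    return line.replaceAll("^#.*$", ""); // recompiles "^#.*$" on every call
  }

  public static void main(String[] args) {
    System.out.println(stripCommentFast("# a comment").isEmpty()); // true
    System.out.println(stripCommentSlow("日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞"));
  }
}
```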
This commit is contained in:
parent edba83e636
commit 4e8fb2a9df
@@ -23,8 +23,7 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
+import java.util.regex.Pattern;
 import org.apache.lucene.analysis.morph.Dictionary;
 import org.apache.lucene.analysis.util.CSVUtil;
 import org.apache.lucene.util.IntsRefBuilder;
@@ -37,6 +36,10 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
 
   public static final String INTERNAL_SEPARATOR = "\u0000";
 
+  private static final Pattern LINE_COMMENT = Pattern.compile("^#.*$");
+  private static final Pattern WHITESPACE = Pattern.compile("\\s");
+  private static final Pattern SPACES = Pattern.compile(" +");
+
   // phrase text -> phrase ID
   private final TokenInfoFST fst;
 
@@ -51,16 +54,16 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
   public static UserDictionary open(Reader reader) throws IOException {
 
     BufferedReader br = new BufferedReader(reader);
-    String line = null;
+    String line;
     List<String[]> featureEntries = new ArrayList<>();
 
     // text, segmentation, readings, POS
     while ((line = br.readLine()) != null) {
       // Remove comments
-      line = line.replaceAll("^#.*$", "");
+      line = LINE_COMMENT.matcher(line).replaceAll("");
 
       // Skip empty lines or comment lines
-      if (line.trim().length() == 0) {
+      if (line.trim().isEmpty()) {
         continue;
       }
       String[] values = CSVUtil.parse(line);
@@ -99,10 +102,10 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
     long ord = 0;
 
     for (String[] values : featureEntries) {
-      String surface = values[0].replaceAll("\\s", "");
-      String concatenatedSegment = values[1].replaceAll("\\s", "");
-      String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
-      String[] readings = values[2].replaceAll("  *", " ").split(" ");
+      String surface = WHITESPACE.matcher(values[0]).replaceAll("");
+      String concatenatedSegment = WHITESPACE.matcher(values[1]).replaceAll("");
+      String[] segmentation = SPACES.split(values[1]);
+      String[] readings = SPACES.split(values[2]);
       String pos = values[3];
 
       if (segmentation.length != readings.length) {
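The segmentation parsing above also drops an intermediate string: the old code collapsed runs of spaces and then split on a single space (two passes plus a temporary `String`), while `SPACES.split(...)` with the pattern `" +"` splits on each run of spaces directly. A standalone sketch of the equivalence, with an illustrative dictionary segment:

```java
import java.util.Arrays;
import java.util.regex.Pattern;

public class SpaceSplitSketch {
  private static final Pattern SPACES = Pattern.compile(" +");

  public static void main(String[] args) {
    String segment = "日本  経済 新聞"; // segment column with uneven spacing
    // Old approach: normalize runs of spaces, then split on a single space.
    String[] oldWay = segment.replaceAll("  *", " ").split(" ");
    // New approach: split on runs of spaces in a single pass.
    String[] newWay = SPACES.split(segment);
    System.out.println(Arrays.equals(oldWay, newWay)); // true
  }
}
```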
@@ -141,7 +144,7 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
       scratch.growNoCopy(token.length());
       scratch.setLength(token.length());
       for (int i = 0; i < token.length(); i++) {
-        scratch.setIntAt(i, (int) token.charAt(i));
+        scratch.setIntAt(i, token.charAt(i));
       }
       fstCompiler.add(scratch.get(), ord);
       segmentations.add(wordIdAndLength);
@@ -151,7 +154,7 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
         new TokenInfoFST(
             FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()), false);
     this.morphAtts = new UserMorphData(data.toArray(new String[0]));
-    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
+    this.segmentations = segmentations.toArray(new int[0][]);
   }
 
   @Override
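The `toArray(new int[0][])` change follows the usual JVM advice: pass a zero-length array and let the collection allocate the correctly sized result internally; the argument then serves only as a type witness, and there is no need to call `size()` or pre-zero a full-size array. A small illustration (variable names are hypothetical):

```java
import java.util.ArrayList;
import java.util.List;

public class ToArraySketch {
  public static void main(String[] args) {
    List<int[]> segmentations = new ArrayList<>();
    segmentations.add(new int[] {1, 2});
    // The zero-length int[0][] only conveys the element type; the list
    // allocates and returns a right-sized int[1][] itself.
    int[][] asArray = segmentations.toArray(new int[0][]);
    System.out.println(asArray.length); // 1
  }
}
```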
@@ -168,33 +171,53 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
    * @return array of {wordId, position, length}
    */
   public int[][] lookup(char[] chars, int off, int len) throws IOException {
-    // TODO: can we avoid this treemap/toIndexArray?
-    TreeMap<Integer, int[]> result = new TreeMap<>(); // index, [length, length...]
-    boolean found = false; // true if we found any results
-
+    List<Match> matches = null;
+    int numResults = 0;
     final FST.BytesReader fstReader = fst.getBytesReader();
 
-    final int end = off + len;
     FST.Arc<Long> arc = new FST.Arc<>();
+    int end = off + len;
     for (int startOffset = off; startOffset < end; startOffset++) {
+      int[] wordIdAndLength = null;
       arc = fst.getFirstArc(arc);
       int output = 0;
-      int remaining = end - startOffset;
-      for (int i = 0; i < remaining; i++) {
+      for (int i = 0, remaining = end - startOffset; i < remaining; i++) {
         int ch = chars[startOffset + i];
         if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
           break; // continue to next position
         }
         output += arc.output().intValue();
         if (arc.isFinal()) {
-          final int finalOutput = output + arc.nextFinalOutput().intValue();
-          result.put(startOffset - off, segmentations[finalOutput]);
-          found = true;
+          int finalOutput = output + arc.nextFinalOutput().intValue();
+          wordIdAndLength = segmentations[finalOutput];
        }
      }
+      if (wordIdAndLength != null) {
+        if (matches == null) {
+          matches = new ArrayList<>();
+        }
+        matches.add(new Match(startOffset - off, wordIdAndLength));
+        numResults += wordIdAndLength.length - 1;
+      }
     }
-    return found ? toIndexArray(result) : EMPTY_RESULT;
+    if (numResults == 0) {
+      return EMPTY_RESULT;
+    }
+    int[][] result = new int[numResults][];
+    int index = 0;
+    for (int i = 0; i < matches.size(); i++) {
+      Match match = matches.get(i);
+      int[] wordIdAndLength = match.wordIdAndLength;
+      int wordId = wordIdAndLength[0];
+      // convert length to index
+      int position = match.position;
+      for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
+        // add a {wordId, index, length} token to the results
+        int[] token = {wordId + j - 1, position, wordIdAndLength[j]};
+        result[index++] = token;
+        position += wordIdAndLength[j];
+      }
+    }
+    return result;
   }
 
   public TokenInfoFST getFST() {
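The rewritten `lookup` replaces the TreeMap with a count-then-fill scheme: the FST scan tallies `numResults` while collecting matches, and a second pass fills an exactly sized `int[numResults][]`. Because start offsets are visited left to right, the match list is already ordered by position, so the TreeMap's sorting was never actually needed. A distilled standalone sketch of the idiom (the class is hypothetical; the row layout `{wordId, length, length, ...}` mirrors the diff):

```java
import java.util.ArrayList;
import java.util.List;

public class CountThenFillSketch {
  // Each row is {wordId, length, length, ...}; positions arrive in ascending order.
  record Match(int position, int[] wordIdAndLength) {}

  static int[][] flatten(List<Match> matches) {
    int numResults = 0;
    for (Match m : matches) {
      numResults += m.wordIdAndLength().length - 1; // first slot is the wordId
    }
    int[][] result = new int[numResults][]; // sized exactly once, no resizing
    int index = 0;
    for (Match m : matches) {
      int wordId = m.wordIdAndLength()[0];
      int position = m.position();
      for (int j = 1; j < m.wordIdAndLength().length; j++) {
        result[index++] = new int[] {wordId + j - 1, position, m.wordIdAndLength()[j]};
        position += m.wordIdAndLength()[j];
      }
    }
    return result;
  }

  public static void main(String[] args) {
    List<Match> matches = new ArrayList<>();
    matches.add(new Match(0, new int[] {5, 2, 2})); // expands to wordIds 5 and 6
    System.out.println(flatten(matches).length); // 2
  }
}
```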
@@ -203,28 +226,9 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
 
   private static final int[][] EMPTY_RESULT = new int[0][];
 
-  /**
-   * Convert Map of index and wordIdAndLength to array of {wordId, index, length}
-   *
-   * @return array of {wordId, index, length}
-   */
-  private int[][] toIndexArray(Map<Integer, int[]> input) {
-    ArrayList<int[]> result = new ArrayList<>();
-    for (Map.Entry<Integer, int[]> entry : input.entrySet()) {
-      int[] wordIdAndLength = entry.getValue();
-      int wordId = wordIdAndLength[0];
-      // convert length to index
-      int current = entry.getKey();
-      for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
-        int[] token = {wordId + j - 1, current, wordIdAndLength[j]};
-        result.add(token);
-        current += wordIdAndLength[j];
-      }
-    }
-    return result.toArray(new int[result.size()][]);
-  }
-
   public int[] lookupSegmentation(int phraseID) {
     return segmentations[phraseID];
   }
+
+  private record Match(int position, int[] wordIdAndLength) {}
 }
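One detail worth noting about the new `Match` record: the loop in `lookup` reads its components as fields (`match.position`, `match.wordIdAndLength`) rather than through the generated accessors. That compiles because private members of a nested record are visible throughout the enclosing top-level class. A minimal sketch (the enclosing class name is hypothetical):

```java
public class Enclosing {
  private record Match(int position, int[] wordIdAndLength) {}

  static int firstPosition(Match m) {
    // Legal: the enclosing class may touch the record's private field directly.
    return m.position;
  }

  public static void main(String[] args) {
    System.out.println(firstPosition(new Match(7, new int[] {1, 2}))); // 7
  }
}
```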