mirror of https://github.com/apache/lucene.git
Optimize Japanese UserDictionary. (#13431)
Replace the TreeMap in lookup() with a List of Match records. Use precompiled Pattern constants instead of String.replaceAll.
This commit is contained in:
parent
edba83e636
commit
4e8fb2a9df
|
@ -23,8 +23,7 @@ import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.regex.Pattern;
|
||||||
import java.util.TreeMap;
|
|
||||||
import org.apache.lucene.analysis.morph.Dictionary;
|
import org.apache.lucene.analysis.morph.Dictionary;
|
||||||
import org.apache.lucene.analysis.util.CSVUtil;
|
import org.apache.lucene.analysis.util.CSVUtil;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
|
@ -37,6 +36,10 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
|
||||||
|
|
||||||
public static final String INTERNAL_SEPARATOR = "\u0000";
|
public static final String INTERNAL_SEPARATOR = "\u0000";
|
||||||
|
|
||||||
|
private static final Pattern LINE_COMMENT = Pattern.compile("^#.*$");
|
||||||
|
private static final Pattern WHITESPACE = Pattern.compile("\\s");
|
||||||
|
private static final Pattern SPACES = Pattern.compile(" +");
|
||||||
|
|
||||||
// phrase text -> phrase ID
|
// phrase text -> phrase ID
|
||||||
private final TokenInfoFST fst;
|
private final TokenInfoFST fst;
|
||||||
|
|
||||||
|
@ -51,16 +54,16 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
|
||||||
public static UserDictionary open(Reader reader) throws IOException {
|
public static UserDictionary open(Reader reader) throws IOException {
|
||||||
|
|
||||||
BufferedReader br = new BufferedReader(reader);
|
BufferedReader br = new BufferedReader(reader);
|
||||||
String line = null;
|
String line;
|
||||||
List<String[]> featureEntries = new ArrayList<>();
|
List<String[]> featureEntries = new ArrayList<>();
|
||||||
|
|
||||||
// text, segmentation, readings, POS
|
// text, segmentation, readings, POS
|
||||||
while ((line = br.readLine()) != null) {
|
while ((line = br.readLine()) != null) {
|
||||||
// Remove comments
|
// Remove comments
|
||||||
line = line.replaceAll("^#.*$", "");
|
line = LINE_COMMENT.matcher(line).replaceAll("");
|
||||||
|
|
||||||
// Skip empty lines or comment lines
|
// Skip empty lines or comment lines
|
||||||
if (line.trim().length() == 0) {
|
if (line.trim().isEmpty()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
String[] values = CSVUtil.parse(line);
|
String[] values = CSVUtil.parse(line);
|
||||||
|
@ -99,10 +102,10 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
|
||||||
long ord = 0;
|
long ord = 0;
|
||||||
|
|
||||||
for (String[] values : featureEntries) {
|
for (String[] values : featureEntries) {
|
||||||
String surface = values[0].replaceAll("\\s", "");
|
String surface = WHITESPACE.matcher(values[0]).replaceAll("");
|
||||||
String concatenatedSegment = values[1].replaceAll("\\s", "");
|
String concatenatedSegment = WHITESPACE.matcher(values[1]).replaceAll("");
|
||||||
String[] segmentation = values[1].replaceAll(" *", " ").split(" ");
|
String[] segmentation = SPACES.split(values[1]);
|
||||||
String[] readings = values[2].replaceAll(" *", " ").split(" ");
|
String[] readings = SPACES.split(values[2]);
|
||||||
String pos = values[3];
|
String pos = values[3];
|
||||||
|
|
||||||
if (segmentation.length != readings.length) {
|
if (segmentation.length != readings.length) {
|
||||||
|
@ -141,7 +144,7 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
|
||||||
scratch.growNoCopy(token.length());
|
scratch.growNoCopy(token.length());
|
||||||
scratch.setLength(token.length());
|
scratch.setLength(token.length());
|
||||||
for (int i = 0; i < token.length(); i++) {
|
for (int i = 0; i < token.length(); i++) {
|
||||||
scratch.setIntAt(i, (int) token.charAt(i));
|
scratch.setIntAt(i, token.charAt(i));
|
||||||
}
|
}
|
||||||
fstCompiler.add(scratch.get(), ord);
|
fstCompiler.add(scratch.get(), ord);
|
||||||
segmentations.add(wordIdAndLength);
|
segmentations.add(wordIdAndLength);
|
||||||
|
@ -151,7 +154,7 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
|
||||||
new TokenInfoFST(
|
new TokenInfoFST(
|
||||||
FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()), false);
|
FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()), false);
|
||||||
this.morphAtts = new UserMorphData(data.toArray(new String[0]));
|
this.morphAtts = new UserMorphData(data.toArray(new String[0]));
|
||||||
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
|
this.segmentations = segmentations.toArray(new int[0][]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -168,33 +171,53 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
|
||||||
* @return array of {wordId, position, length}
|
* @return array of {wordId, position, length}
|
||||||
*/
|
*/
|
||||||
public int[][] lookup(char[] chars, int off, int len) throws IOException {
|
public int[][] lookup(char[] chars, int off, int len) throws IOException {
|
||||||
// TODO: can we avoid this treemap/toIndexArray?
|
List<Match> matches = null;
|
||||||
TreeMap<Integer, int[]> result = new TreeMap<>(); // index, [length, length...]
|
int numResults = 0;
|
||||||
boolean found = false; // true if we found any results
|
|
||||||
|
|
||||||
final FST.BytesReader fstReader = fst.getBytesReader();
|
final FST.BytesReader fstReader = fst.getBytesReader();
|
||||||
|
final int end = off + len;
|
||||||
FST.Arc<Long> arc = new FST.Arc<>();
|
FST.Arc<Long> arc = new FST.Arc<>();
|
||||||
int end = off + len;
|
|
||||||
for (int startOffset = off; startOffset < end; startOffset++) {
|
for (int startOffset = off; startOffset < end; startOffset++) {
|
||||||
|
int[] wordIdAndLength = null;
|
||||||
arc = fst.getFirstArc(arc);
|
arc = fst.getFirstArc(arc);
|
||||||
int output = 0;
|
int output = 0;
|
||||||
int remaining = end - startOffset;
|
for (int i = 0, remaining = end - startOffset; i < remaining; i++) {
|
||||||
for (int i = 0; i < remaining; i++) {
|
|
||||||
int ch = chars[startOffset + i];
|
int ch = chars[startOffset + i];
|
||||||
if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
|
if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
|
||||||
break; // continue to next position
|
break; // continue to next position
|
||||||
}
|
}
|
||||||
output += arc.output().intValue();
|
output += arc.output().intValue();
|
||||||
if (arc.isFinal()) {
|
if (arc.isFinal()) {
|
||||||
final int finalOutput = output + arc.nextFinalOutput().intValue();
|
int finalOutput = output + arc.nextFinalOutput().intValue();
|
||||||
result.put(startOffset - off, segmentations[finalOutput]);
|
wordIdAndLength = segmentations[finalOutput];
|
||||||
found = true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (wordIdAndLength != null) {
|
||||||
|
if (matches == null) {
|
||||||
|
matches = new ArrayList<>();
|
||||||
}
|
}
|
||||||
|
matches.add(new Match(startOffset - off, wordIdAndLength));
|
||||||
return found ? toIndexArray(result) : EMPTY_RESULT;
|
numResults += wordIdAndLength.length - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (numResults == 0) {
|
||||||
|
return EMPTY_RESULT;
|
||||||
|
}
|
||||||
|
int[][] result = new int[numResults][];
|
||||||
|
int index = 0;
|
||||||
|
for (int i = 0; i < matches.size(); i++) {
|
||||||
|
Match match = matches.get(i);
|
||||||
|
int[] wordIdAndLength = match.wordIdAndLength;
|
||||||
|
int wordId = wordIdAndLength[0];
|
||||||
|
// convert length to index
|
||||||
|
int position = match.position;
|
||||||
|
for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
|
||||||
|
// add a {wordId, index, length} token to the results
|
||||||
|
int[] token = {wordId + j - 1, position, wordIdAndLength[j]};
|
||||||
|
result[index++] = token;
|
||||||
|
position += wordIdAndLength[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
public TokenInfoFST getFST() {
|
public TokenInfoFST getFST() {
|
||||||
|
@ -203,28 +226,9 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
|
||||||
|
|
||||||
private static final int[][] EMPTY_RESULT = new int[0][];
|
private static final int[][] EMPTY_RESULT = new int[0][];
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert Map of index and wordIdAndLength to array of {wordId, index, length}
|
|
||||||
*
|
|
||||||
* @return array of {wordId, index, length}
|
|
||||||
*/
|
|
||||||
private int[][] toIndexArray(Map<Integer, int[]> input) {
|
|
||||||
ArrayList<int[]> result = new ArrayList<>();
|
|
||||||
for (Map.Entry<Integer, int[]> entry : input.entrySet()) {
|
|
||||||
int[] wordIdAndLength = entry.getValue();
|
|
||||||
int wordId = wordIdAndLength[0];
|
|
||||||
// convert length to index
|
|
||||||
int current = entry.getKey();
|
|
||||||
for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
|
|
||||||
int[] token = {wordId + j - 1, current, wordIdAndLength[j]};
|
|
||||||
result.add(token);
|
|
||||||
current += wordIdAndLength[j];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result.toArray(new int[result.size()][]);
|
|
||||||
}
|
|
||||||
|
|
||||||
public int[] lookupSegmentation(int phraseID) {
|
public int[] lookupSegmentation(int phraseID) {
|
||||||
return segmentations[phraseID];
|
return segmentations[phraseID];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private record Match(int position, int[] wordIdAndLength) {}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue