mirror of https://github.com/apache/lucene.git
hunspell (minor): reduce allocations when reading the dictionary's morphological data (#12323)
There can be many entries with morph data, so we'd better avoid compiling and matching regexes, and even allocating streams.
This commit is contained in:
parent
ac8c1870fa
commit
4bf1b94209
|
@@ -992,7 +992,7 @@ public class Dictionary {
|
|||
// if we haven't seen any custom morphological data, try to parse one
|
||||
if (!hasCustomMorphData) {
|
||||
int morphStart = line.indexOf(MORPH_SEPARATOR);
|
||||
if (morphStart >= 0 && morphStart < line.length()) {
|
||||
if (morphStart >= 0) {
|
||||
String data = line.substring(morphStart + 1);
|
||||
hasCustomMorphData =
|
||||
splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
|
||||
|
@@ -1321,14 +1321,22 @@ public class Dictionary {
|
|||
if (morphData.isBlank()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return Arrays.stream(morphData.split("\\s+"))
|
||||
.filter(
|
||||
s ->
|
||||
s.length() > 3
|
||||
&& Character.isLetter(s.charAt(0))
|
||||
&& Character.isLetter(s.charAt(1))
|
||||
&& s.charAt(2) == ':')
|
||||
.collect(Collectors.toList());
|
||||
|
||||
List<String> result = null;
|
||||
int start = 0;
|
||||
for (int i = 0; i <= morphData.length(); i++) {
|
||||
if (i == morphData.length() || Character.isWhitespace(morphData.charAt(i))) {
|
||||
if (i - start > 3
|
||||
&& Character.isLetter(morphData.charAt(start))
|
||||
&& Character.isLetter(morphData.charAt(start + 1))
|
||||
&& morphData.charAt(start + 2) == ':') {
|
||||
if (result == null) result = new ArrayList<>();
|
||||
result.add(morphData.substring(start, i));
|
||||
}
|
||||
start = i + 1;
|
||||
}
|
||||
}
|
||||
return result == null ? List.of() : result;
|
||||
}
|
||||
|
||||
boolean hasFlag(IntsRef forms, char flag) {
|
||||
|
|
Loading…
Reference in New Issue