hunspell (minor): reduce allocations when reading the dictionary's morphological data (#12323)

there can be many entries with morph data, so we'd better avoid compiling and matching regexes and even stream allocation
2023-06-01 11:37:38 +02:00 · 2023-06-01 11:37:38 +02:00 · 4bf1b94209
parent ac8c1870fa
commit 4bf1b94209
1 changed files with 17 additions and 9 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -992,7 +992,7 @@ public class Dictionary {
          // if we haven't seen any custom morphological data, try to parse one
          if (!hasCustomMorphData) {
            int morphStart = line.indexOf(MORPH_SEPARATOR);
-            if (morphStart >= 0 && morphStart < line.length()) {
+            if (morphStart >= 0) {
              String data = line.substring(morphStart + 1);
              hasCustomMorphData =
                  splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
@ -1321,14 +1321,22 @@ public class Dictionary {
    if (morphData.isBlank()) {
      return Collections.emptyList();
    }
-    return Arrays.stream(morphData.split("\\s+"))
-        .filter(
-            s ->
-                s.length() > 3
-                    && Character.isLetter(s.charAt(0))
-                    && Character.isLetter(s.charAt(1))
-                    && s.charAt(2) == ':')
-        .collect(Collectors.toList());
+
+    List<String> result = null;
+    int start = 0;
+    for (int i = 0; i <= morphData.length(); i++) {
+      if (i == morphData.length() || Character.isWhitespace(morphData.charAt(i))) {
+        if (i - start > 3
+            && Character.isLetter(morphData.charAt(start))
+            && Character.isLetter(morphData.charAt(start + 1))
+            && morphData.charAt(start + 2) == ':') {
+          if (result == null) result = new ArrayList<>();
+          result.add(morphData.substring(start, i));
+        }
+        start = i + 1;
+      }
+    }
+    return result == null ? List.of() : result;
  }

  boolean hasFlag(IntsRef forms, char flag) {