Fix a bug in the TestAutomaton.testRandomFinite test, which was calling StringsToAutomaton.build on a Collection of BytesRefs containing a too-massive (> 1000 UTF-8 bytes) term; also correct the exception message to make it clear that the limit is in UTF-8 bytes, not Java (UTF-16) characters

2023-10-29 11:41:40 -04:00 · 2023-10-29 11:41:40 -04:00 · 11436a848c
parent a8c52e2e19
commit 11436a848c
3 changed files with 12 additions and 3 deletions
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -269,7 +269,7 @@ final class StringsToAutomaton {
      throw new IllegalArgumentException(
          "This builder doesn't allow terms that are larger than "
              + Automata.MAX_STRING_UNION_TERM_LENGTH
-              + " characters, got "
+              + " UTF-8 bytes, got "
              + current);
    }
    assert stateRegistry != null : "Automaton already built.";
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
@@ -790,9 +790,18 @@ public class TestAutomaton extends LuceneTestCase {
    return null;
  }

+  private static boolean hasMassiveTerm(Collection<BytesRef> terms) {
+    for (BytesRef term : terms) {
+      if (term.length > Automata.MAX_STRING_UNION_TERM_LENGTH) {
+        return true;
+      }
+    }
+    return false;
+  }
+
  private Automaton unionTerms(Collection<BytesRef> terms) {
    Automaton a;
-    if (random().nextBoolean()) {
+    if (random().nextBoolean() || hasMassiveTerm(terms)) {
      if (VERBOSE) {
        System.out.println("TEST: unionTerms: use union");
      }
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@@ -103,7 +103,7 @@ public class TestStringsToAutomaton extends LuceneTestCase {
            .startsWith(
                "This builder doesn't allow terms that are larger than "
                    + Automata.MAX_STRING_UNION_TERM_LENGTH
-                    + " characters"));
+                    + " UTF-8 bytes"));

    byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000);
    build(Collections.singleton(new BytesRef(b1k)), false); // no exception