GITHUB#12451: Update TestStringsToAutomaton validation to work around GH#12458 (#12461)

2023-07-25 11:56:18 -07:00 · 2023-07-25 11:56:18 -07:00 · 2b3b028734
parent 20e97fbd00
commit 2b3b028734
2 changed files with 20 additions and 5 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -172,6 +172,9 @@ Bug Fixes
 * GITHUB#12423: Respect timeouts in ExitableDirectoryReader when searching with byte[] vectors (Ben Trent).
 * GITHUB#12451: Change TestStringsToAutomaton validation to avoid automaton conversion bug discovered in GH#12458
  (Greg Miller).
 Other
 ---------------------
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@@ -33,6 +33,7 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.fst.Util;
 public class TestStringsToAutomaton extends LuceneTestCase {
@@ -141,11 +142,22 @@ public class TestStringsToAutomaton extends LuceneTestCase {
    }
    // Make sure every term produced by the automaton is expected
-    BytesRefBuilder scratch = new BytesRefBuilder();
+    FiniteStringsIterator it = new FiniteStringsIterator(a);
-    FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
+    if (isBinary) {
-    for (IntsRef r = it.next(); r != null; r = it.next()) {
+      BytesRefBuilder scratch = new BytesRefBuilder();
-      BytesRef t = Util.toBytesRef(r, scratch);
+      for (IntsRef r = it.next(); r != null; r = it.next()) {
-      assertTrue(expected.contains(t));
+        BytesRef t = Util.toBytesRef(r, scratch);
        assertTrue(t + " unexpectedly produced by automaton", expected.contains(t));
      }
    } else {
      // Note that we validate against the original automaton, not the compiled one, as the
      // compiled automaton can incorrectly produce invalid/overlong UTF-8 terms (see: GH#12458).
      // This means we need slightly different logic here since the automaton "speaks" code points
      // and not UTF-8 bytes.
      for (IntsRef r = it.next(); r != null; r = it.next()) {
        BytesRef t = newBytesRef(UnicodeUtil.newString(r.ints, r.offset, r.length));
        assertTrue(t + " unexpectedly produced by automaton", expected.contains(t));
      }
    }
  }