GITHUB#12451: Update TestStringsToAutomaton validation to work around GH#12458 (#12461)

This commit is contained in:
Greg Miller 2023-07-25 11:56:18 -07:00 committed by GitHub
parent 20e97fbd00
commit 2b3b028734
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 5 deletions

View File

@ -172,6 +172,9 @@ Bug Fixes
* GITHUB#12423: Respect timeouts in ExitableDirectoryReader when searching with byte[] vectors (Ben Trent).
* GITHUB#12451: Change TestStringsToAutomaton validation to avoid automaton conversion bug discovered in GH#12458
(Greg Miller).
Other
---------------------

View File

@ -33,6 +33,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.Util;
public class TestStringsToAutomaton extends LuceneTestCase {
@ -141,11 +142,22 @@ public class TestStringsToAutomaton extends LuceneTestCase {
}
// Make sure every term produced by the automaton is expected
BytesRefBuilder scratch = new BytesRefBuilder();
FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
for (IntsRef r = it.next(); r != null; r = it.next()) {
BytesRef t = Util.toBytesRef(r, scratch);
assertTrue(expected.contains(t));
FiniteStringsIterator it = new FiniteStringsIterator(a);
if (isBinary) {
BytesRefBuilder scratch = new BytesRefBuilder();
for (IntsRef r = it.next(); r != null; r = it.next()) {
BytesRef t = Util.toBytesRef(r, scratch);
assertTrue(t + " unexpectedly produced by automaton", expected.contains(t));
}
} else {
// Note that we validate against the original automaton rather than the compiled one, because
// the compiled automaton can incorrectly produce invalid/overlong UTF-8 terms (see: GH#12458).
// This means we need slightly different logic here, since this automaton "speaks" code points
// and not UTF-8 bytes.
for (IntsRef r = it.next(); r != null; r = it.next()) {
BytesRef t = newBytesRef(UnicodeUtil.newString(r.ints, r.offset, r.length));
assertTrue(t + " unexpectedly produced by automaton", expected.contains(t));
}
}
}