GITHUB#12451: Update TestStringsToAutomaton validation to work around GH#12458 (#12461)

This commit is contained in:
Greg Miller 2023-07-25 11:56:18 -07:00 committed by GitHub
parent 20e97fbd00
commit 2b3b028734
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 5 deletions

View File

@ -172,6 +172,9 @@ Bug Fixes
* GITHUB#12423: Respect timeouts in ExitableDirectoryReader when searching with byte[] vectors (Ben Trent).
* GITHUB#12451: Change TestStringsToAutomaton validation to avoid automaton conversion bug discovered in GH#12458
(Greg Miller).
Other
---------------------

View File

@ -33,6 +33,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.fst.Util;
public class TestStringsToAutomaton extends LuceneTestCase { public class TestStringsToAutomaton extends LuceneTestCase {
@ -141,11 +142,22 @@ public class TestStringsToAutomaton extends LuceneTestCase {
} }
// Make sure every term produced by the automaton is expected // Make sure every term produced by the automaton is expected
BytesRefBuilder scratch = new BytesRefBuilder(); FiniteStringsIterator it = new FiniteStringsIterator(a);
FiniteStringsIterator it = new FiniteStringsIterator(c.automaton); if (isBinary) {
for (IntsRef r = it.next(); r != null; r = it.next()) { BytesRefBuilder scratch = new BytesRefBuilder();
BytesRef t = Util.toBytesRef(r, scratch); for (IntsRef r = it.next(); r != null; r = it.next()) {
assertTrue(expected.contains(t)); BytesRef t = Util.toBytesRef(r, scratch);
assertTrue(t + " unexpectedly produced by automaton", expected.contains(t));
}
} else {
// Note that we validate against the original automaton, not the compiled one as the compiled
// automaton can incorrectly produce invalid/overlong utf8 terms (see: GH#12458). This means
// we need slightly different logic here since the automaton "speaks" code points and not
// utf8 bytes.
for (IntsRef r = it.next(); r != null; r = it.next()) {
BytesRef t = newBytesRef(UnicodeUtil.newString(r.ints, r.offset, r.length));
assertTrue(t + " unexpectedly produced by automaton", expected.contains(t));
}
} }
} }