Improve search equivalence tests. (#14036)

This addresses an existing TODO about giving terms a zipfian distribution, and
disables query caching to make sure that two-phase iterators are properly
tested.
This commit is contained in:
Adrien Grand 2024-12-04 15:12:04 +01:00
parent 356a534c0b
commit 4947c0f746
1 changed file with 11 additions and 2 deletions

View File

@@ -94,7 +94,11 @@ public abstract class SearchEquivalenceTestBase extends LuceneTestCase {
reader = iw.getReader(); reader = iw.getReader();
s1 = newSearcher(reader); s1 = newSearcher(reader);
// Disable the query cache, which converts two-phase iterators to normal iterators, while we
// want to make sure two-phase iterators are exercised.
s1.setQueryCache(null);
s2 = newSearcher(reader); s2 = newSearcher(reader);
s2.setQueryCache(null);
iw.close(); iw.close();
} }
@@ -114,7 +118,6 @@ public abstract class SearchEquivalenceTestBase extends LuceneTestCase {
* tokenization can be assumed to be on whitespace. * tokenization can be assumed to be on whitespace.
*/ */
static String randomFieldContents() { static String randomFieldContents() {
// TODO: zipf-like distribution
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
int numTerms = random().nextInt(15); int numTerms = random().nextInt(15);
for (int i = 0; i < numTerms; i++) { for (int i = 0; i < numTerms; i++) {
@@ -128,7 +131,13 @@ public abstract class SearchEquivalenceTestBase extends LuceneTestCase {
/** returns random character (a-z) */ /** returns random character (a-z) */
static char randomChar() { static char randomChar() {
return (char) TestUtil.nextInt(random(), 'a', 'z'); char c = (char) TestUtil.nextInt(random(), 'a', 'z');
if (random().nextBoolean()) {
// bias towards earlier chars, so that chars have a ~ zipfian distribution with earlier chars
// having a higher frequency
c = (char) TestUtil.nextInt(random(), 'a', c);
}
return c;
} }
/** returns a term suitable for searching. terms are single characters in lowercase (a-z) */ /** returns a term suitable for searching. terms are single characters in lowercase (a-z) */