Improve search equivalence tests. (#14036)

This addresses an existing TODO about giving terms a zipfian distribution, and disables query caching to make sure that two-phase iterators are properly tested.
2024-12-04 15:12:04 +01:00 · 2024-12-04 15:12:04 +01:00 · 4947c0f746
parent 356a534c0b
commit 4947c0f746
1 changed files with 11 additions and 2 deletions
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/SearchEquivalenceTestBase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/SearchEquivalenceTestBase.java
@ -94,7 +94,11 @@ public abstract class SearchEquivalenceTestBase extends LuceneTestCase {

    reader = iw.getReader();
    s1 = newSearcher(reader);
+    // Disable the query cache: it converts two-phase iterators to normal iterators, and we want
+    // to make sure that two-phase iterators are exercised.
+    s1.setQueryCache(null);
    s2 = newSearcher(reader);
+    s2.setQueryCache(null);
    iw.close();
  }

@ -114,7 +118,6 @@ public abstract class SearchEquivalenceTestBase extends LuceneTestCase {
   * tokenization can be assumed to be on whitespace.
   */
  static String randomFieldContents() {
-    // TODO: zipf-like distribution
    StringBuilder sb = new StringBuilder();
    int numTerms = random().nextInt(15);
    for (int i = 0; i < numTerms; i++) {
@ -128,7 +131,13 @@ public abstract class SearchEquivalenceTestBase extends LuceneTestCase {

  /** returns random character (a-z) */
  static char randomChar() {
-    return (char) TestUtil.nextInt(random(), 'a', 'z');
+    char c = (char) TestUtil.nextInt(random(), 'a', 'z');
+    if (random().nextBoolean()) {
+      // Half of the time, re-draw uniformly from ['a', c] to bias towards earlier chars, so that
+      // chars have a roughly zipfian distribution where earlier chars are more frequent
+      c = (char) TestUtil.nextInt(random(), 'a', c);
+    }
+    return c;
  }

  /** returns a term suitable for searching. terms are single characters in lowercase (a-z) */