SOLR-13486: Fix trivial test bug in TestTlogReplayVsRecovery

Add TODOs for future test improvements once the underlying race condition is fixed in core code
Chris Hostetter 2020-01-11 16:48:05 -07:00
parent 0b072ecedb
commit 9a2497f637
1 changed file with 22 additions and 9 deletions


@@ -48,11 +48,10 @@ import org.junit.Before;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
- @AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-13486")
+ @AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-13486;https://issues.apache.org/jira/browse/SOLR-14183")
public class TestTlogReplayVsRecovery extends SolrCloudTestCase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final String COLLECTION = "collecion_with_slow_tlog_recovery";
private JettySolrRunner NODE0;
@@ -60,12 +59,16 @@ public class TestTlogReplayVsRecovery extends SolrCloudTestCase {
private Map<JettySolrRunner, SocketProxy> proxies;
private Map<URI, JettySolrRunner> jettys;
- @Before
- public void setupCluster() throws Exception {
// we want to ensure there is tlog replay on the leader after we restart it,
// so in addition to not committing the docs we add during network partition
// we also want to ensure that our leader doesn't do a "Commit on close"
- DirectUpdateHandler2.commitOnClose = false;
+ //
+ // TODO: once SOLR-13486 is fixed, we should randomize this...
+ private static final boolean TEST_VALUE_FOR_COMMIT_ON_CLOSE = false;
+ @Before
+ public void setupCluster() throws Exception {
+ DirectUpdateHandler2.commitOnClose = TEST_VALUE_FOR_COMMIT_ON_CLOSE;
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
System.setProperty("solr.ulog.numRecordsToKeep", "1000");
@@ -113,6 +116,14 @@ public class TestTlogReplayVsRecovery extends SolrCloudTestCase {
}
public void testManyDocsInTlogReplayWhileReplicaIsTryingToRecover() throws Exception {
+ // TODO: Once the basic problem in SOLR-13486 is fixed, this test can be made more robust by:
+ // 1) randomizing the number of committedDocs (pre net split) & uncommittedDocs (post net split)
+ //    to trigger diff recovery strategies & shutdown behavior
+ // 2) replace "committedDocs + uncommittedDocs" with 4 variables:
+ //    a: docs committed before network split (add + commit)
+ //    b: docs not committed before network split (add w/o commit)
+ //    c: docs committed after network split (add + commit)
+ //    d: docs not committed after network split (add w/o commit)
final int committedDocs = 3;
final int uncommittedDocs = 50;
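As a rough illustration of the four-variable scheme described in the new TODO, the randomized setup could eventually look something like the sketch below. The variable names and bounds are illustrative only (atLeast() and random() are the stock LuceneTestCase helpers) and are not part of this commit:

    // hypothetical randomization for the a/b/c/d split described in the TODO
    final int committedBeforeSplit   = atLeast(3);   // a: add + commit before the network split
    final int uncommittedBeforeSplit = atLeast(10);  // b: add w/o commit before the split
    final int committedAfterSplit    = atLeast(3);   // c: add + commit after the split
    final int uncommittedAfterSplit  = atLeast(10);  // d: add w/o commit after the split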
@@ -140,8 +151,9 @@ public class TestTlogReplayVsRecovery extends SolrCloudTestCase {
final Replica leader = getCollectionState(COLLECTION).getSlice("shard1").getLeader();
assertEquals("Sanity check failed", NODE0.getNodeName(), leader.getNodeName());
log.info("Add and commit a {} docs...", committedDocs);
log.info("Add and commit {} docs...", committedDocs);
addDocs(true, committedDocs, 1);
assertDocsExistInBothReplicas(1, committedDocs);
log.info("Partition nodes...");
proxies.get(NODE0).close();
@@ -151,7 +163,8 @@ public class TestTlogReplayVsRecovery extends SolrCloudTestCase {
addDocs(false, uncommittedDocs, committedDocs + 1);
log.info("Stopping leader node...");
assertEquals("Something broke our expected commitOnClose", false, DirectUpdateHandler2.commitOnClose);
assertEquals("Something broke our expected commitOnClose",
TEST_VALUE_FOR_COMMIT_ON_CLOSE, DirectUpdateHandler2.commitOnClose);
NODE0.stop();
cluster.waitForJettyToStop(NODE0);
@@ -188,7 +201,7 @@ public class TestTlogReplayVsRecovery extends SolrCloudTestCase {
cluster.waitForActiveCollection(COLLECTION, 1, 2);
log.info("Check docs on both replicas...");
- assertDocsExistInBothReplicas(1, uncommittedDocs + uncommittedDocs);
+ assertDocsExistInBothReplicas(1, committedDocs + uncommittedDocs);
log.info("Test ok, delete collection...");
CollectionAdminRequest.deleteCollection(COLLECTION).process(cluster.getSolrClient());
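For the values used in this test the corrected assertion lines up with the documents actually indexed: addDocs(true, committedDocs, 1) writes ids 1..3 and addDocs(false, uncommittedDocs, committedDocs + 1) writes ids 4..53, so checking 1 through committedDocs + uncommittedDocs (53) covers exactly the documents that exist. The old upper bound of uncommittedDocs + uncommittedDocs (100) asserted 47 ids that were never added (assuming assertDocsExistInBothReplicas walks every id in the inclusive range).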