SOLR-13599: additional 'checkpoint' logging to try and help diagnose strange failures

This commit is contained in:
Chris Hostetter 2019-07-02 14:52:13 -07:00
parent f686167e71
commit b4a602f6b2
1 changed file with 45 additions and 18 deletions

View File

@ -271,12 +271,12 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
} }
protected void testRf3() throws Exception { protected void testRf3() throws Exception {
int numShards = 1; final int numShards = 1;
int replicationFactor = 3; final int replicationFactor = 3;
int maxShardsPerNode = 1; final int maxShardsPerNode = 1;
String testCollectionName = "repfacttest_c8n_1x3"; final String testCollectionName = "repfacttest_c8n_1x3";
String shardId = "shard1"; final String shardId = "shard1";
int minRf = 2; final int minRf = 2;
createCollectionWithRetry(testCollectionName, "conf1", numShards, replicationFactor, maxShardsPerNode); createCollectionWithRetry(testCollectionName, "conf1", numShards, replicationFactor, maxShardsPerNode);
cloudClient.setDefaultCollection(testCollectionName); cloudClient.setDefaultCollection(testCollectionName);
@ -285,6 +285,7 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
ensureAllReplicasAreActive(testCollectionName, shardId, numShards, replicationFactor, 30); ensureAllReplicasAreActive(testCollectionName, shardId, numShards, replicationFactor, 30);
assertTrue("Expected 2 active replicas for "+testCollectionName, replicas.size() == 2); assertTrue("Expected 2 active replicas for "+testCollectionName, replicas.size() == 2);
log.info("Indexing docId=1");
int rf = sendDoc(1, minRf); int rf = sendDoc(1, minRf);
assertRf(3, "all replicas should be active", rf); assertRf(3, "all replicas should be active", rf);
@ -292,9 +293,10 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
doDBIdWithRetry(3, 5, "deletes should have propagated to all 3 replicas", 1); doDBIdWithRetry(3, 5, "deletes should have propagated to all 3 replicas", 1);
doDBQWithRetry(3, 5, "deletes should have propagated to all 3 replicas", 1); doDBQWithRetry(3, 5, "deletes should have propagated to all 3 replicas", 1);
log.info("Closing one proxy port");
getProxyForReplica(replicas.get(0)).close(); getProxyForReplica(replicas.get(0)).close();
log.info("Indexing docId=2");
rf = sendDoc(2, minRf); rf = sendDoc(2, minRf);
assertRf(2, "one replica should be down", rf); assertRf(2, "one replica should be down", rf);
@ -303,8 +305,22 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
doDBIdWithRetry(2, 5, "deletes should have propagated to 2 replicas", 1); doDBIdWithRetry(2, 5, "deletes should have propagated to 2 replicas", 1);
// SOLR-13599 sanity check if problem is related to sending a batch
List<SolrInputDocument> batch = new ArrayList<SolrInputDocument>(10);
for (int i=30; i < 45; i++) {
SolrInputDocument doc = new SolrInputDocument();
doc.addField(id, String.valueOf(i));
doc.addField("a_t", "hello" + i);
batch.add(doc);
}
log.info("Indexing batch of documents (30-45)");
int batchRf = sendDocsWithRetry(batch, minRf, 5, 1);
assertRf(2, "batch should have succeded, only one replica should be down", batchRf);
log.info("Closing second proxy port");
getProxyForReplica(replicas.get(1)).close(); getProxyForReplica(replicas.get(1)).close();
log.info("Indexing docId=3");
rf = sendDoc(3, minRf); rf = sendDoc(3, minRf);
assertRf(1, "both replicas should be down", rf); assertRf(1, "both replicas should be down", rf);
@ -312,6 +328,7 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
doDBIdWithRetry(1, 5, "deletes should have propagated to only 1 replica", 1); doDBIdWithRetry(1, 5, "deletes should have propagated to only 1 replica", 1);
// heal the partitions // heal the partitions
log.info("Re-opening closed proxy ports");
getProxyForReplica(replicas.get(0)).reopen(); getProxyForReplica(replicas.get(0)).reopen();
getProxyForReplica(replicas.get(1)).reopen(); getProxyForReplica(replicas.get(1)).reopen();
@ -319,14 +336,15 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
ensureAllReplicasAreActive(testCollectionName, shardId, numShards, replicationFactor, 30); ensureAllReplicasAreActive(testCollectionName, shardId, numShards, replicationFactor, 30);
log.info("Indexing docId=4");
rf = sendDoc(4, minRf); rf = sendDoc(4, minRf);
assertRf(3, "partitions to replicas have been healed", rf); assertRf(3, "all replicas have been healed", rf);
doDBQWithRetry(3, 5, "deletes should have propagated to all 3 replicas", 1); doDBQWithRetry(3, 5, "delete should have propagated to all 3 replicas", 1);
doDBIdWithRetry(3, 5, "deletes should have propagated to all 3 replicas", 1); doDBIdWithRetry(3, 5, "delete should have propagated to all 3 replicas", 1);
// now send a batch // now send a batch
List<SolrInputDocument> batch = new ArrayList<SolrInputDocument>(10); batch = new ArrayList<SolrInputDocument>(10);
for (int i=5; i < 15; i++) { for (int i=5; i < 15; i++) {
SolrInputDocument doc = new SolrInputDocument(); SolrInputDocument doc = new SolrInputDocument();
doc.addField(id, String.valueOf(i)); doc.addField(id, String.valueOf(i));
@ -334,16 +352,24 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
batch.add(doc); batch.add(doc);
} }
int batchRf = sendDocsWithRetry(batch, minRf, 5, 1); log.info("Indexing batch of documents (5-14)");
assertRf(3, "batch should have succeeded on all replicas", batchRf); batchRf = sendDocsWithRetry(batch, minRf, 5, 1);
assertRf(3, "batch add should have succeeded on all replicas", batchRf);
doDBQWithRetry(3, 5, "deletes should have propagated to only 1 replica", 15); doDBQWithRetry(3, 5, "batch deletes should have propagated to all 3 replica", 15);
doDBIdWithRetry(3, 5, "deletes should have propagated to only 1 replica", 15); doDBIdWithRetry(3, 5, "batch deletes should have propagated to all 3 replica", 15);
// add some chaos to the batch // add some chaos to the batch
log.info("Closing one proxy port (again)");
getProxyForReplica(replicas.get(0)).close(); getProxyForReplica(replicas.get(0)).close();
// send a single doc (again)
// SOLR-13599 sanity check if problem is related to "re-closing" a port on the proxy
log.info("Indexing docId=5");
rf = sendDoc(5, minRf);
assertRf(2, "doc should have succeded, only one replica should be down", rf);
// now send a batch // now send a batch (again)
batch = new ArrayList<SolrInputDocument>(10); batch = new ArrayList<SolrInputDocument>(10);
for (int i=15; i < 30; i++) { for (int i=15; i < 30; i++) {
SolrInputDocument doc = new SolrInputDocument(); SolrInputDocument doc = new SolrInputDocument();
@ -351,14 +377,15 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
doc.addField("a_t", "hello" + i); doc.addField("a_t", "hello" + i);
batch.add(doc); batch.add(doc);
} }
log.info("Indexing batch of documents (15-29)");
batchRf = sendDocsWithRetry(batch, minRf, 5, 1); batchRf = sendDocsWithRetry(batch, minRf, 5, 1);
assertRf(2, "batch should have succeeded on 2 replicas (only one replica should be down)", batchRf); assertRf(2, "batch should have succeded, only one replica should be down", batchRf);
doDBQWithRetry(2, 5, "deletes should have propagated to only 1 replica", 15); doDBQWithRetry(2, 5, "deletes should have propagated to only 1 replica", 15);
doDBIdWithRetry(2, 5, "deletes should have propagated to only 1 replica", 15); doDBIdWithRetry(2, 5, "deletes should have propagated to only 1 replica", 15);
// close the 2nd replica, and send a 3rd batch with expected achieved rf=1 // close the 2nd replica, and send a 3rd batch with expected achieved rf=1
log.info("Closing second proxy port (again)");
getProxyForReplica(replicas.get(1)).close(); getProxyForReplica(replicas.get(1)).close();
batch = new ArrayList<SolrInputDocument>(10); batch = new ArrayList<SolrInputDocument>(10);