mirror of https://github.com/apache/lucene.git
SOLR-13599: additional 'checkpoint' logging to try and help diagnose strange failures
This commit is contained in:
parent
f686167e71
commit
b4a602f6b2
|
@ -271,12 +271,12 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void testRf3() throws Exception {
|
protected void testRf3() throws Exception {
|
||||||
int numShards = 1;
|
final int numShards = 1;
|
||||||
int replicationFactor = 3;
|
final int replicationFactor = 3;
|
||||||
int maxShardsPerNode = 1;
|
final int maxShardsPerNode = 1;
|
||||||
String testCollectionName = "repfacttest_c8n_1x3";
|
final String testCollectionName = "repfacttest_c8n_1x3";
|
||||||
String shardId = "shard1";
|
final String shardId = "shard1";
|
||||||
int minRf = 2;
|
final int minRf = 2;
|
||||||
|
|
||||||
createCollectionWithRetry(testCollectionName, "conf1", numShards, replicationFactor, maxShardsPerNode);
|
createCollectionWithRetry(testCollectionName, "conf1", numShards, replicationFactor, maxShardsPerNode);
|
||||||
cloudClient.setDefaultCollection(testCollectionName);
|
cloudClient.setDefaultCollection(testCollectionName);
|
||||||
|
@ -285,6 +285,7 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
|
||||||
ensureAllReplicasAreActive(testCollectionName, shardId, numShards, replicationFactor, 30);
|
ensureAllReplicasAreActive(testCollectionName, shardId, numShards, replicationFactor, 30);
|
||||||
assertTrue("Expected 2 active replicas for "+testCollectionName, replicas.size() == 2);
|
assertTrue("Expected 2 active replicas for "+testCollectionName, replicas.size() == 2);
|
||||||
|
|
||||||
|
log.info("Indexing docId=1");
|
||||||
int rf = sendDoc(1, minRf);
|
int rf = sendDoc(1, minRf);
|
||||||
assertRf(3, "all replicas should be active", rf);
|
assertRf(3, "all replicas should be active", rf);
|
||||||
|
|
||||||
|
@ -292,9 +293,10 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
|
||||||
doDBIdWithRetry(3, 5, "deletes should have propagated to all 3 replicas", 1);
|
doDBIdWithRetry(3, 5, "deletes should have propagated to all 3 replicas", 1);
|
||||||
doDBQWithRetry(3, 5, "deletes should have propagated to all 3 replicas", 1);
|
doDBQWithRetry(3, 5, "deletes should have propagated to all 3 replicas", 1);
|
||||||
|
|
||||||
|
log.info("Closing one proxy port");
|
||||||
getProxyForReplica(replicas.get(0)).close();
|
getProxyForReplica(replicas.get(0)).close();
|
||||||
|
|
||||||
|
log.info("Indexing docId=2");
|
||||||
rf = sendDoc(2, minRf);
|
rf = sendDoc(2, minRf);
|
||||||
assertRf(2, "one replica should be down", rf);
|
assertRf(2, "one replica should be down", rf);
|
||||||
|
|
||||||
|
@ -303,8 +305,22 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
|
||||||
doDBIdWithRetry(2, 5, "deletes should have propagated to 2 replicas", 1);
|
doDBIdWithRetry(2, 5, "deletes should have propagated to 2 replicas", 1);
|
||||||
|
|
||||||
|
|
||||||
|
// SOLR-13599 sanity check if problem is related to sending a batch
|
||||||
|
List<SolrInputDocument> batch = new ArrayList<SolrInputDocument>(10);
|
||||||
|
for (int i=30; i < 45; i++) {
|
||||||
|
SolrInputDocument doc = new SolrInputDocument();
|
||||||
|
doc.addField(id, String.valueOf(i));
|
||||||
|
doc.addField("a_t", "hello" + i);
|
||||||
|
batch.add(doc);
|
||||||
|
}
|
||||||
|
log.info("Indexing batch of documents (30-45)");
|
||||||
|
int batchRf = sendDocsWithRetry(batch, minRf, 5, 1);
|
||||||
|
assertRf(2, "batch should have succeded, only one replica should be down", batchRf);
|
||||||
|
|
||||||
|
log.info("Closing second proxy port");
|
||||||
getProxyForReplica(replicas.get(1)).close();
|
getProxyForReplica(replicas.get(1)).close();
|
||||||
|
|
||||||
|
log.info("Indexing docId=3");
|
||||||
rf = sendDoc(3, minRf);
|
rf = sendDoc(3, minRf);
|
||||||
assertRf(1, "both replicas should be down", rf);
|
assertRf(1, "both replicas should be down", rf);
|
||||||
|
|
||||||
|
@ -312,6 +328,7 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
|
||||||
doDBIdWithRetry(1, 5, "deletes should have propagated to only 1 replica", 1);
|
doDBIdWithRetry(1, 5, "deletes should have propagated to only 1 replica", 1);
|
||||||
|
|
||||||
// heal the partitions
|
// heal the partitions
|
||||||
|
log.info("Re-opening closed proxy ports");
|
||||||
getProxyForReplica(replicas.get(0)).reopen();
|
getProxyForReplica(replicas.get(0)).reopen();
|
||||||
getProxyForReplica(replicas.get(1)).reopen();
|
getProxyForReplica(replicas.get(1)).reopen();
|
||||||
|
|
||||||
|
@ -319,14 +336,15 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
|
||||||
|
|
||||||
ensureAllReplicasAreActive(testCollectionName, shardId, numShards, replicationFactor, 30);
|
ensureAllReplicasAreActive(testCollectionName, shardId, numShards, replicationFactor, 30);
|
||||||
|
|
||||||
|
log.info("Indexing docId=4");
|
||||||
rf = sendDoc(4, minRf);
|
rf = sendDoc(4, minRf);
|
||||||
assertRf(3, "partitions to replicas have been healed", rf);
|
assertRf(3, "all replicas have been healed", rf);
|
||||||
|
|
||||||
doDBQWithRetry(3, 5, "deletes should have propagated to all 3 replicas", 1);
|
doDBQWithRetry(3, 5, "delete should have propagated to all 3 replicas", 1);
|
||||||
doDBIdWithRetry(3, 5, "deletes should have propagated to all 3 replicas", 1);
|
doDBIdWithRetry(3, 5, "delete should have propagated to all 3 replicas", 1);
|
||||||
|
|
||||||
// now send a batch
|
// now send a batch
|
||||||
List<SolrInputDocument> batch = new ArrayList<SolrInputDocument>(10);
|
batch = new ArrayList<SolrInputDocument>(10);
|
||||||
for (int i=5; i < 15; i++) {
|
for (int i=5; i < 15; i++) {
|
||||||
SolrInputDocument doc = new SolrInputDocument();
|
SolrInputDocument doc = new SolrInputDocument();
|
||||||
doc.addField(id, String.valueOf(i));
|
doc.addField(id, String.valueOf(i));
|
||||||
|
@ -334,16 +352,24 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
|
||||||
batch.add(doc);
|
batch.add(doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
int batchRf = sendDocsWithRetry(batch, minRf, 5, 1);
|
log.info("Indexing batch of documents (5-14)");
|
||||||
assertRf(3, "batch should have succeeded on all replicas", batchRf);
|
batchRf = sendDocsWithRetry(batch, minRf, 5, 1);
|
||||||
|
assertRf(3, "batch add should have succeeded on all replicas", batchRf);
|
||||||
|
|
||||||
doDBQWithRetry(3, 5, "deletes should have propagated to only 1 replica", 15);
|
doDBQWithRetry(3, 5, "batch deletes should have propagated to all 3 replica", 15);
|
||||||
doDBIdWithRetry(3, 5, "deletes should have propagated to only 1 replica", 15);
|
doDBIdWithRetry(3, 5, "batch deletes should have propagated to all 3 replica", 15);
|
||||||
|
|
||||||
// add some chaos to the batch
|
// add some chaos to the batch
|
||||||
|
log.info("Closing one proxy port (again)");
|
||||||
getProxyForReplica(replicas.get(0)).close();
|
getProxyForReplica(replicas.get(0)).close();
|
||||||
|
|
||||||
|
// send a single doc (again)
|
||||||
|
// SOLR-13599 sanity check if problem is related to "re-closing" a port on the proxy
|
||||||
|
log.info("Indexing docId=5");
|
||||||
|
rf = sendDoc(5, minRf);
|
||||||
|
assertRf(2, "doc should have succeded, only one replica should be down", rf);
|
||||||
|
|
||||||
// now send a batch
|
// now send a batch (again)
|
||||||
batch = new ArrayList<SolrInputDocument>(10);
|
batch = new ArrayList<SolrInputDocument>(10);
|
||||||
for (int i=15; i < 30; i++) {
|
for (int i=15; i < 30; i++) {
|
||||||
SolrInputDocument doc = new SolrInputDocument();
|
SolrInputDocument doc = new SolrInputDocument();
|
||||||
|
@ -351,14 +377,15 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
|
||||||
doc.addField("a_t", "hello" + i);
|
doc.addField("a_t", "hello" + i);
|
||||||
batch.add(doc);
|
batch.add(doc);
|
||||||
}
|
}
|
||||||
|
log.info("Indexing batch of documents (15-29)");
|
||||||
batchRf = sendDocsWithRetry(batch, minRf, 5, 1);
|
batchRf = sendDocsWithRetry(batch, minRf, 5, 1);
|
||||||
assertRf(2, "batch should have succeeded on 2 replicas (only one replica should be down)", batchRf);
|
assertRf(2, "batch should have succeded, only one replica should be down", batchRf);
|
||||||
|
|
||||||
doDBQWithRetry(2, 5, "deletes should have propagated to only 1 replica", 15);
|
doDBQWithRetry(2, 5, "deletes should have propagated to only 1 replica", 15);
|
||||||
doDBIdWithRetry(2, 5, "deletes should have propagated to only 1 replica", 15);
|
doDBIdWithRetry(2, 5, "deletes should have propagated to only 1 replica", 15);
|
||||||
|
|
||||||
// close the 2nd replica, and send a 3rd batch with expected achieved rf=1
|
// close the 2nd replica, and send a 3rd batch with expected achieved rf=1
|
||||||
|
log.info("Closing second proxy port (again)");
|
||||||
getProxyForReplica(replicas.get(1)).close();
|
getProxyForReplica(replicas.get(1)).close();
|
||||||
|
|
||||||
batch = new ArrayList<SolrInputDocument>(10);
|
batch = new ArrayList<SolrInputDocument>(10);
|
||||||
|
|
Loading…
Reference in New Issue