SOLR-13815: enhance live split test to fail more often

Authored by yonik on 2019-10-13 16:45:32 -04:00; committed by Yonik Seeley
parent 509b74fa4e
commit 1d43bda284
1 changed file with 56 additions and 37 deletions

solr/core/src/test/org/apache/solr/cloud/SplitShardTest.java

@@ -20,6 +20,9 @@ package org.apache.solr.cloud;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -31,6 +34,8 @@ import org.apache.solr.client.solrj.impl.HttpSolrClient;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.UpdateResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
@@ -194,39 +199,46 @@ public class SplitShardTest extends SolrCloudTestCase {
     return totCount;
   }
 
-  void doLiveSplitShard(String collectionName, int repFactor) throws Exception {
+  void doLiveSplitShard(String collectionName, int repFactor, int nThreads) throws Exception {
     final CloudSolrClient client = createCollection(collectionName, repFactor);
 
+    final ConcurrentHashMap<String,Long> model = new ConcurrentHashMap<>();  // what the index should contain
     final AtomicBoolean doIndex = new AtomicBoolean(true);
     final AtomicInteger docsIndexed = new AtomicInteger();
-    Thread indexThread = null;
+    Thread[] indexThreads = new Thread[nThreads];
     try {
-      // start indexing client before we initiate a shard split
-      indexThread = new Thread(() -> {
-        while (doIndex.get()) {
-          try {
-            // Thread.sleep(10);  // uncomment this to cap indexing rate at 100 docs per second...
-            int currDoc = docsIndexed.get();
-
-            // Try all docs in the same update request
-            UpdateRequest updateReq = new UpdateRequest();
-            updateReq.add(sdoc("id", "doc_" + currDoc));
-            UpdateResponse ursp = updateReq.commit(client, collectionName);
-            assertEquals(0, ursp.getStatus());  // for now, don't accept any failures
-            if (ursp.getStatus() == 0) {
-              docsIndexed.incrementAndGet();
+      for (int i = 0; i < nThreads; i++) {
+        indexThreads[i] = new Thread(() -> {
+          while (doIndex.get()) {
+            try {
+              // Thread.sleep(10);  // cap indexing rate at 100 docs per second per thread
+              int currDoc = docsIndexed.incrementAndGet();
+              String docId = "doc_" + currDoc;
+
+              // Try all docs in the same update request
+              UpdateRequest updateReq = new UpdateRequest();
+              updateReq.add(sdoc("id", docId));
+              // UpdateResponse ursp = updateReq.commit(client, collectionName);  // uncomment this if you want a commit each time
+              UpdateResponse ursp = updateReq.process(client, collectionName);
+              assertEquals(0, ursp.getStatus());  // for now, don't accept any failures
+              if (ursp.getStatus() == 0) {
+                model.put(docId, 1L);  // in the future, keep track of a version per document and reuse ids to keep index from growing too large
+              }
+            } catch (Exception e) {
+              fail(e.getMessage());
+              break;
             }
-          } catch (Exception e) {
-            fail(e.getMessage());
-            break;
           }
-        }
-      });
-      indexThread.start();
+        });
+      }
+
+      for (Thread thread : indexThreads) {
+        thread.start();
+      }
 
       Thread.sleep(100);  // wait for a few docs to be indexed before invoking split
-      int docCount = docsIndexed.get();
+      int docCount = model.size();
 
       CollectionAdminRequest.SplitShard splitShard = CollectionAdminRequest.splitShard(collectionName)
           .setShardName("shard1");
@@ -235,39 +247,46 @@ public class SplitShardTest extends SolrCloudTestCase {
         collectionName, activeClusterShape(2, 3*repFactor));  // 2 repFactor for the new split shards, 1 repFactor for old replicas
 
       // make sure that docs were able to be indexed during the split
-      assertTrue(docsIndexed.get() > docCount);
+      assertTrue(model.size() > docCount);
 
       Thread.sleep(100);  // wait for a few more docs to be indexed after split
 
     } finally {
-      // shut down the indexer
+      // shut down the indexers
       doIndex.set(false);
-      if (indexThread != null) {
-        indexThread.join();
+      for (Thread thread : indexThreads) {
+        thread.join();
       }
     }
 
-    assertTrue(docsIndexed.get() > 0);
+    client.commit();  // final commit is needed for visibility
 
     long numDocs = getNumDocs(client);
-    if (numDocs != docsIndexed.get()) {
-      // Find out what docs are missing.
-      for (int i = 0; i < docsIndexed.get(); i++) {
-        String id = "doc_" + i;
-        long cloudClientDocs = client.query(new SolrQuery("id:" + id)).getResults().getNumFound();
-        if (cloudClientDocs != 1) {
-          log.error("MISSING DOCUMENT " + id);
-        }
-      }
+    if (numDocs != model.size()) {
+      SolrDocumentList results = client.query(new SolrQuery("q", "*:*", "fl", "id", "rows", Integer.toString(model.size()))).getResults();
+      Map<String,Long> leftover = new HashMap<>(model);
+      for (SolrDocument doc : results) {
+        String id = (String) doc.get("id");
+        leftover.remove(id);
+      }
+      log.error("MISSING DOCUMENTS: " + leftover);
     }
 
     assertEquals("Documents are missing!", docsIndexed.get(), numDocs);
     log.info("Number of documents indexed and queried : " + numDocs);
   }
 
   @Test
   public void testLiveSplit() throws Exception {
-    doLiveSplitShard("livesplit1", 1);
+    // Debugging tips: if this fails, it may be easier to debug by lowering the number of threads to 1 and looping the test
+    // until you get another failure.
+    // You may need to further instrument things like DistributedZkUpdateProcessor to display the cluster state for the collection, etc.
+    // Using more threads increases the chance to hit a concurrency bug, but too many threads can overwhelm single-threaded buffering
+    // replay after the low level index split and result in subShard leaders that can't catch up and
+    // become active (a known issue that still needs to be resolved).
+    doLiveSplitShard("livesplit1", 1, 4);
  }
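
The debugging comment added to testLiveSplit() suggests dropping to a single indexing thread and looping the test until a failure reproduces. A minimal sketch of what such a loop could look like is below; it is not part of this commit, and the method name, loop count, and per-iteration collection names are illustrative (each iteration needs a fresh collection name because doLiveSplitShard creates the collection it tests).

  // Hypothetical debugging variant, not part of SOLR-13815:
  // repeat the live split with one indexing thread until it fails.
  @Test
  public void testLiveSplitSingleThreadLoop() throws Exception {
    for (int i = 0; i < 20; i++) {
      // repFactor=1 keeps the cluster small; nThreads=1 narrows the interleavings
      doLiveSplitShard("livesplit_debug_" + i, 1, 1);
    }
  }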