HBASE-24128 [Flakey Tests] Add retry on thrift cmdline if client fails plus misc debug (#1442)
hbase-server/src/test/java/org/apache/hadoop/hbase/TestClusterPortAssignment.java Saw case where Master failed startup but it came out as an IOE so we did not trip the retry logic. hbase-server/src/test/java/org/apache/hadoop/hbase/TestInfoServers.java Add some debug and up timeouts. This test fails frequently for me locally. hbase-server/src/test/java/org/apache/hadoop/hbase/client/locking/TestEntityLocks.java Up the wait from 2x 200ms to 10x in case a pause on hardware or GC. This test fails locally and up on jenkins. hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestClearRegionBlockCache.java Debug. Have assert say what bad count was. hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactingToCellFlatMapMemStore.java Fails on occasion. Found count is off by a few. Tricky to debug. HBASE-24129 to reenable. hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionMergeTransactionOnCluster.java Debug. Add wait and check before moving to assert. hbase-thrift/src/test/java/org/apache/hadoop/hbase/thrift/TestThriftHttpServer.java Check for null before shutting; can be null if failed start. hbase-thrift/src/test/java/org/apache/hadoop/hbase/thrift/TestThriftServerCmdLine.java Add retry if client messes up connection. Fails for me locally.
This commit is contained in:
parent
af659564f3
commit
9da7f95fa7
|
@ -65,7 +65,7 @@ public class TestClusterPortAssignment {
|
|||
cluster.getRegionServer(0).getRpcServer().getListenerAddress().getPort());
|
||||
assertEquals("RS info port is incorrect", rsInfoPort,
|
||||
cluster.getRegionServer(0).getInfoServer().getPort());
|
||||
} catch (BindException|UnsupportedOperationException e) {
|
||||
} catch (Exception e) {
|
||||
if (e instanceof BindException || e.getCause() != null &&
|
||||
(e.getCause() instanceof BindException || e.getCause().getCause() != null &&
|
||||
e.getCause().getCause() instanceof BindException)) {
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/**
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
|
@ -20,12 +20,12 @@ package org.apache.hadoop.hbase;
|
|||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.hbase.client.Admin;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MiscTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
|
@ -85,8 +85,8 @@ public class TestInfoServers {
|
|||
}
|
||||
|
||||
/**
|
||||
* Ensure when we go to top level index pages that we get redirected to an info-server specific status
|
||||
* page.
|
||||
* Ensure when we go to top level index pages that we get redirected to an info-server specific
|
||||
* status page.
|
||||
*/
|
||||
@Test
|
||||
public void testInfoServersRedirect() throws Exception {
|
||||
|
@ -121,9 +121,10 @@ public class TestInfoServers {
|
|||
byte[] cf = Bytes.toBytes("d");
|
||||
UTIL.createTable(tableName, cf);
|
||||
UTIL.waitTableAvailable(tableName);
|
||||
int port = UTIL.getHBaseCluster().getMaster().getInfoServer().getPort();
|
||||
assertDoesNotContainContent(new URL("http://localhost:" + port + "/table.jsp?name=" + tableName
|
||||
+ "&action=split&key="), "Table action request accepted");
|
||||
HMaster master = UTIL.getHBaseCluster().getMaster();
|
||||
int port = master.getRegionServerInfoPort(master.getServerName());
|
||||
assertDoesNotContainContent(new URL("http://localhost:" + port + "/table.jsp?name=" +
|
||||
tableName + "&action=split&key="), "Table action request accepted");
|
||||
assertDoesNotContainContent(
|
||||
new URL("http://localhost:" + port + "/table.jsp?name=" + tableName), "Actions:");
|
||||
}
|
||||
|
@ -143,11 +144,11 @@ public class TestInfoServers {
|
|||
|
||||
private String getUrlContent(URL u) throws IOException {
|
||||
java.net.URLConnection c = u.openConnection();
|
||||
c.setConnectTimeout(2000);
|
||||
c.setReadTimeout(2000);
|
||||
c.setConnectTimeout(20000);
|
||||
c.setReadTimeout(20000);
|
||||
c.connect();
|
||||
try (InputStream in = c.getInputStream()) {
|
||||
return IOUtils.toString(in);
|
||||
return IOUtils.toString(in, HConstants.UTF8_ENCODING);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -173,8 +173,8 @@ public class TestEntityLocks {
|
|||
lock.requestLock();
|
||||
lock.await();
|
||||
assertTrue(lock.isLocked());
|
||||
// Should get unlocked in next heartbeat i.e. after workerSleepTime. Wait 2x time.
|
||||
assertTrue(waitLockTimeOut(lock, 2 * workerSleepTime));
|
||||
// Should get unlocked in next heartbeat i.e. after workerSleepTime. Wait 10x time to be sure.
|
||||
assertTrue(waitLockTimeOut(lock, 10 * workerSleepTime));
|
||||
assertFalse(lock.getWorker().isAlive());
|
||||
verify(abortable, times(1)).abort(any(), eq(null));
|
||||
}
|
||||
|
|
|
@ -46,6 +46,8 @@ import org.junit.BeforeClass;
|
|||
import org.junit.ClassRule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Confirm that we will do backoff when retrying on closing a region, to avoid consuming all the
|
||||
|
@ -53,6 +55,7 @@ import org.junit.experimental.categories.Category;
|
|||
*/
|
||||
@Category({ MasterTests.class, MediumTests.class })
|
||||
public class TestCloseRegionWhileRSCrash {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TestCloseRegionWhileRSCrash.class);
|
||||
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
|
@ -176,6 +179,7 @@ public class TestCloseRegionWhileRSCrash {
|
|||
try {
|
||||
UTIL.getAdmin().move(region.getEncodedNameAsBytes(), dstRs.getServerName());
|
||||
} catch (IOException e) {
|
||||
LOG.info("Failed move of {}", region.getRegionNameAsString(), e);
|
||||
}
|
||||
});
|
||||
t.start();
|
||||
|
@ -185,12 +189,13 @@ public class TestCloseRegionWhileRSCrash {
|
|||
// wait until the timeout value increase three times
|
||||
ProcedureTestUtil.waitUntilProcedureTimeoutIncrease(UTIL, TransitRegionStateProcedure.class, 3);
|
||||
// close connection to make sure that we can not finish the TRSP
|
||||
HMaster master = UTIL.getMiniHBaseCluster().getMaster();
|
||||
final HMaster master = UTIL.getMiniHBaseCluster().getMaster();
|
||||
master.getConnection().close();
|
||||
RESUME.countDown();
|
||||
UTIL.waitFor(30000, () -> !master.isAlive());
|
||||
// here we start a new master
|
||||
UTIL.getMiniHBaseCluster().startMaster();
|
||||
HMaster master2 = UTIL.getMiniHBaseCluster().startMaster().getMaster();
|
||||
LOG.info("Master2 {}, joining move thread", master2.getServerName());
|
||||
t.join();
|
||||
// Make sure that the region is online, it may not on the original target server, as we will set
|
||||
// forceNewPlan to true if there is a server crash
|
||||
|
|
|
@ -117,8 +117,10 @@ public class TestClearRegionBlockCache {
|
|||
HTU.getNumHFilesForRS(rs2, TABLE_NAME, FAMILY));
|
||||
clearRegionBlockCache(rs2);
|
||||
|
||||
assertEquals(initialBlockCount1, blockCache1.getBlockCount());
|
||||
assertEquals(initialBlockCount2, blockCache2.getBlockCount());
|
||||
assertEquals("" + blockCache1.getBlockCount(),
|
||||
initialBlockCount1, blockCache1.getBlockCount());
|
||||
assertEquals("" + blockCache2.getBlockCount(),
|
||||
initialBlockCount2, blockCache2.getBlockCount());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -839,7 +839,9 @@ public class TestCompactingToCellFlatMapMemStore extends TestCompactingMemStore
|
|||
* testForceCopyOfBigCellIntoImmutableSegment checks that the
|
||||
* ImmutableMemStoreLAB's forceCopyOfBigCellInto does what it's supposed to do.
|
||||
*/
|
||||
@Test
|
||||
@org.junit.Ignore @Test // Flakey. Disabled by HBASE-24128. HBASE-24129 is for reenable.
|
||||
// TestCompactingToCellFlatMapMemStore.testForceCopyOfBigCellIntoImmutableSegment:902 i=1
|
||||
// expected:<8389924> but was:<8389992>
|
||||
public void testForceCopyOfBigCellIntoImmutableSegment() throws IOException {
|
||||
|
||||
if (toCellChunkMap == false) {
|
||||
|
|
|
@ -21,13 +21,14 @@ import static org.junit.Assert.assertEquals;
|
|||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.commons.lang3.RandomUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
@ -66,6 +67,7 @@ import org.apache.hadoop.hbase.util.FutureUtils;
|
|||
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.hadoop.hbase.util.PairOfSameType;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.junit.AfterClass;
|
||||
|
@ -77,11 +79,9 @@ import org.junit.experimental.categories.Category;
|
|||
import org.junit.rules.TestName;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.base.Joiner;
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.RpcController;
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
|
||||
|
@ -130,7 +130,9 @@ public class TestRegionMergeTransactionOnCluster {
|
|||
@AfterClass
|
||||
public static void afterAllTests() throws Exception {
|
||||
TEST_UTIL.shutdownMiniCluster();
|
||||
if (ADMIN != null) ADMIN.close();
|
||||
if (ADMIN != null) {
|
||||
ADMIN.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -286,8 +288,19 @@ public class TestRegionMergeTransactionOnCluster {
|
|||
// cleaned up by the time we got here making the test sometimes flakey.
|
||||
assertTrue(cleaned > 0);
|
||||
|
||||
mergedRegionResult = MetaTableAccessor.getRegionResult(
|
||||
TEST_UTIL.getConnection(), mergedRegionInfo.getRegionName());
|
||||
// Wait around a bit to give stuff a chance to complete.
|
||||
while (true) {
|
||||
mergedRegionResult = MetaTableAccessor
|
||||
.getRegionResult(TEST_UTIL.getConnection(), mergedRegionInfo.getRegionName());
|
||||
if (MetaTableAccessor.hasMergeRegions(mergedRegionResult.rawCells())) {
|
||||
LOG.info("Waiting on cleanup of merge columns {}",
|
||||
Arrays.asList(mergedRegionResult.rawCells()).stream().
|
||||
map(c -> c.toString()).collect(Collectors.joining(",")));
|
||||
Threads.sleep(50);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
assertFalse(MetaTableAccessor.hasMergeRegions(mergedRegionResult.rawCells()));
|
||||
} finally {
|
||||
ADMIN.enableCatalogJanitor(true);
|
||||
|
|
|
@ -239,9 +239,13 @@ public class TestThriftHttpServer {
|
|||
}
|
||||
|
||||
private void stopHttpServerThread() throws Exception {
|
||||
LOG.debug("Stopping " + " Thrift HTTP server");
|
||||
LOG.debug("Stopping Thrift HTTP server {}", thriftServer);
|
||||
if (thriftServer != null) {
|
||||
thriftServer.stop();
|
||||
}
|
||||
if (httpServerThread != null) {
|
||||
httpServerThread.join();
|
||||
}
|
||||
if (httpServerException != null) {
|
||||
LOG.error("Command-line invocation of HBase Thrift server threw an " +
|
||||
"exception", httpServerException);
|
||||
|
|
|
@ -296,8 +296,11 @@ public class TestThriftServerCmdLine {
|
|||
@Test
|
||||
public void testRunThriftServer() throws Exception {
|
||||
ThriftServer thriftServer = createBoundServer();
|
||||
// Add retries in case we see stuff like connection reset
|
||||
for (int i = 0; i < 10; i++) {
|
||||
try {
|
||||
talkToThriftServer();
|
||||
break;
|
||||
} catch (Exception ex) {
|
||||
clientSideException = ex;
|
||||
LOG.info("Exception", ex);
|
||||
|
@ -305,6 +308,7 @@ public class TestThriftServerCmdLine {
|
|||
stopCmdLineThread();
|
||||
thriftServer.stop();
|
||||
}
|
||||
}
|
||||
|
||||
if (clientSideException != null) {
|
||||
LOG.error("Thrift client threw an exception. Parameters:" +
|
||||
|
|
Loading…
Reference in New Issue