HBASE-21144 AssignmentManager.waitForAssignment is not stable

This commit is contained in:
Duo Zhang 2018-09-04 11:55:19 +08:00
parent 83131b1ac4
commit f504c4d797
7 changed files with 63 additions and 95 deletions

View File

@ -36,7 +36,6 @@ import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HBaseIOException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.PleaseHoldException;
import org.apache.hadoop.hbase.RegionException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.UnknownRegionException;
@ -52,7 +51,6 @@ import org.apache.hadoop.hbase.favored.FavoredNodesPromoter;
import org.apache.hadoop.hbase.master.LoadBalancer;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.MetricsAssignmentManager;
import org.apache.hadoop.hbase.master.NoSuchProcedureException;
import org.apache.hadoop.hbase.master.RegionPlan;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.RegionState.State;
@ -612,46 +610,6 @@ public class AssignmentManager implements ServerListener {
return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
}
@VisibleForTesting
public boolean waitForAssignment(final RegionInfo regionInfo) throws IOException {
return waitForAssignment(regionInfo, Long.MAX_VALUE);
}
@VisibleForTesting
// TODO: Remove this?
public boolean waitForAssignment(final RegionInfo regionInfo, final long timeout)
throws IOException {
RegionStateNode node = null;
// This method can be called before the regionInfo has made it into the regionStateMap
// so wait around here a while.
long startTime = System.currentTimeMillis();
// Something badly wrong if takes ten seconds to register a region.
long endTime = startTime + 10000;
while ((node = regionStates.getRegionStateNode(regionInfo)) == null && isRunning() &&
System.currentTimeMillis() < endTime) {
// Presume it not yet added but will be added soon. Let it spew a lot so we can tell if
// we are waiting here alot.
LOG.debug("Waiting on " + regionInfo + " to be added to regionStateMap");
Threads.sleep(10);
}
if (node == null) {
if (!isRunning()) {
return false;
}
throw new RegionException(
regionInfo.getRegionNameAsString() + " never registered with Assigment.");
}
TransitRegionStateProcedure proc = node.getProcedure();
if (proc == null) {
throw new NoSuchProcedureException(node.toString());
}
ProcedureSyncWait.waitForProcedureToCompleteIOE(master.getMasterProcedureExecutor(), proc,
timeout);
return true;
}
// ============================================================================================
// RegionTransition procedures helpers
// ============================================================================================
@ -1904,4 +1862,9 @@ public class AssignmentManager implements ServerListener {
private void killRegionServer(final ServerStateNode serverNode) {
master.getServerManager().expireServer(serverNode.getServerName());
}
@VisibleForTesting
MasterServices getMaster() {
return master;
}
}

View File

@ -101,6 +101,7 @@ import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
import org.apache.hadoop.hbase.master.assignment.RegionStates;
import org.apache.hadoop.hbase.regionserver.BloomType;
@ -3358,17 +3359,15 @@ public class HBaseTestingUtility extends HBaseZKTestingUtility {
}
/**
* Uses directly the assignment manager to assign the region.
* and waits until the specified region has completed assignment.
* @throws IOException
* @throw InterruptedException
* Uses directly the assignment manager to assign the region. and waits until the specified region
* has completed assignment.
* @return true if the region is assigned false otherwise.
*/
public boolean assignRegion(final RegionInfo regionInfo)
throws IOException, InterruptedException {
final AssignmentManager am = getHBaseCluster().getMaster().getAssignmentManager();
am.assign(regionInfo);
return am.waitForAssignment(regionInfo);
return AssignmentTestingUtil.waitForAssignment(am, regionInfo);
}
/**

View File

@ -37,7 +37,6 @@ import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.NoSuchProcedureException;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
@ -92,22 +91,12 @@ public class TestAsyncRegionAdminApi extends TestAsyncAdminBase {
// Expected
assertThat(e.getCause(), instanceOf(DoNotRetryRegionException.class));
}
try {
am.waitForAssignment(hri);
fail("Expected NoSuchProcedureException");
} catch (NoSuchProcedureException e) {
// Expected
}
assertFalse(am.getRegionStates().getRegionStateNode(hri).isInTransition());
assertTrue(regionStates.getRegionState(hri).isOpened());
// unassign region
admin.unassign(hri.getRegionName(), true).get();
try {
am.waitForAssignment(hri);
fail("Expected NoSuchProcedureException");
} catch (NoSuchProcedureException e) {
// Expected
}
assertFalse(am.getRegionStates().getRegionStateNode(hri).isInTransition());
assertTrue(regionStates.getRegionState(hri).isClosed());
}

View File

@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.client;
import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@ -46,8 +47,8 @@ import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.master.NoSuchProcedureException;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.regionserver.StorefileRefresherChore;
import org.apache.hadoop.hbase.testclassification.LargeTests;
@ -103,18 +104,14 @@ public class TestMetaWithReplicas {
getMetaRegionLocation(TEST_UTIL.getZooKeeperWatcher());
LOG.info("HBASE:META DEPLOY: on " + hbaseMetaServerName);
sns.add(hbaseMetaServerName);
for (int replicaId = 1; replicaId < 3; replicaId ++) {
RegionInfo h =
RegionReplicaUtil.getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO,
replicaId);
try {
am.waitForAssignment(h);
ServerName sn = am.getRegionStates().getRegionServerOfRegion(h);
LOG.info("HBASE:META DEPLOY: " + h.getRegionNameAsString() + " on " + sn);
sns.add(sn);
} catch (NoSuchProcedureException e) {
LOG.info("Presume the procedure has been cleaned up so just proceed: " + e.toString());
}
for (int replicaId = 1; replicaId < 3; replicaId++) {
RegionInfo h = RegionReplicaUtil
.getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO, replicaId);
AssignmentTestingUtil.waitForAssignment(am, h);
ServerName sn = am.getRegionStates().getRegionServerOfRegion(h);
assertNotNull(sn);
LOG.info("HBASE:META DEPLOY: " + h.getRegionNameAsString() + " on " + sn);
sns.add(sn);
}
// Fun. All meta region replicas have ended up on the one server. This will cause this test
// to fail ... sometimes.

View File

@ -18,7 +18,6 @@
package org.apache.hadoop.hbase.client;
import java.io.File;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
@ -28,7 +27,7 @@ import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.StartMiniClusterOption;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.NoSuchProcedureException;
import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Bytes;
@ -42,7 +41,6 @@ import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -95,7 +93,7 @@ public class TestSeparateClientZKCluster {
FileUtils.deleteDirectory(clientZkDir);
}
@Test(timeout = 60000)
@Test
public void testBasicOperation() throws Exception {
TableName tn = TableName.valueOf(name.getMethodName());
// create table
@ -122,7 +120,7 @@ public class TestSeparateClientZKCluster {
}
}
@Test(timeout = 60000)
@Test
public void testMasterSwitch() throws Exception {
// get an admin instance and issue some request first
Connection conn = TEST_UTIL.getConnection();
@ -146,7 +144,7 @@ public class TestSeparateClientZKCluster {
}
}
@Test(timeout = 60000)
@Test
public void testMetaRegionMove() throws Exception {
TableName tn = TableName.valueOf(name.getMethodName());
// create table
@ -202,7 +200,7 @@ public class TestSeparateClientZKCluster {
}
}
@Test(timeout = 120000)
@Test
public void testMetaMoveDuringClientZkClusterRestart() throws Exception {
TableName tn = TableName.valueOf(name.getMethodName());
// create table
@ -233,12 +231,8 @@ public class TestSeparateClientZKCluster {
Thread.sleep(200);
}
// wait for meta region online
try {
cluster.getMaster().getAssignmentManager()
.waitForAssignment(RegionInfoBuilder.FIRST_META_REGIONINFO);
} catch (NoSuchProcedureException e) {
// we don't need to take any further action
}
AssignmentTestingUtil.waitForAssignment(cluster.getMaster().getAssignmentManager(),
RegionInfoBuilder.FIRST_META_REGIONINFO);
// wait some long time to make sure we will retry sync data to client ZK until data set
Thread.sleep(10000);
clientZkCluster.startup(clientZkDir);
@ -253,7 +247,7 @@ public class TestSeparateClientZKCluster {
}
}
@Test(timeout = 60000)
@Test
public void testAsyncTable() throws Exception {
TableName tn = TableName.valueOf(name.getMethodName());
ColumnFamilyDescriptorBuilder cfDescBuilder = ColumnFamilyDescriptorBuilder.newBuilder(family);

View File

@ -18,14 +18,18 @@
package org.apache.hadoop.hbase.master.assignment;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import java.io.IOException;
import java.util.Set;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.Waiter.ExplainingPredicate;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.yetus.audience.InterfaceAudience;
import org.apache.yetus.audience.InterfaceStability;
@ -34,7 +38,7 @@ import org.slf4j.LoggerFactory;
@InterfaceAudience.Private
@InterfaceStability.Evolving
public abstract class AssignmentTestingUtil {
public final class AssignmentTestingUtil {
private static final Logger LOG = LoggerFactory.getLogger(AssignmentTestingUtil.class);
private AssignmentTestingUtil() {}
@ -122,4 +126,30 @@ public abstract class AssignmentTestingUtil {
private static HMaster getMaster(final HBaseTestingUtility util) {
return util.getMiniHBaseCluster().getMaster();
}
public static boolean waitForAssignment(AssignmentManager am, RegionInfo regionInfo)
throws IOException {
// This method can be called before the regionInfo has made it into the regionStateMap
// so wait around here a while.
Waiter.waitFor(am.getConfiguration(), 10000,
() -> am.getRegionStates().getRegionStateNode(regionInfo) != null);
RegionStateNode regionNode = am.getRegionStates().getRegionStateNode(regionInfo);
// Wait until the region has already been open, or we have a TRSP along with it.
Waiter.waitFor(am.getConfiguration(), 30000,
() -> regionNode.isInState(State.OPEN) || regionNode.isInTransition());
TransitRegionStateProcedure proc = regionNode.getProcedure();
regionNode.lock();
try {
if (regionNode.isInState(State.OPEN)) {
return true;
}
proc = regionNode.getProcedure();
} finally {
regionNode.unlock();
}
assertNotNull(proc);
ProcedureSyncWait.waitForProcedureToCompleteIOE(am.getMaster().getMasterProcedureExecutor(),
proc, 5L * 60 * 1000);
return true;
}
}

View File

@ -75,10 +75,10 @@ import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.LoadBalancer;
import org.apache.hadoop.hbase.master.MasterRpcServices;
import org.apache.hadoop.hbase.master.NoSuchProcedureException;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
import org.apache.hadoop.hbase.master.assignment.RegionStates;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
@ -167,11 +167,7 @@ public class TestSplitTransactionOnCluster {
throws IOException, InterruptedException {
assertEquals(1, regions.size());
RegionInfo hri = regions.get(0).getRegionInfo();
try {
cluster.getMaster().getAssignmentManager().waitForAssignment(hri, 600000);
} catch (NoSuchProcedureException e) {
LOG.info("Presume the procedure has been cleaned up so just proceed: " + e.toString());
}
AssignmentTestingUtil.waitForAssignment(cluster.getMaster().getAssignmentManager(), hri);
return hri;
}