HBASE-21144 AssignmentManager.waitForAssignment is not stable

This commit is contained in:
Duo Zhang 2018-09-04 11:55:19 +08:00
parent 7e524f633a
commit 6ab9997d1f
9 changed files with 84 additions and 107 deletions

View File

@ -37,7 +37,6 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseIOException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.PleaseHoldException;
import org.apache.hadoop.hbase.RegionException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.YouAreDeadException;
@ -53,7 +52,6 @@ import org.apache.hadoop.hbase.master.AssignmentListener;
import org.apache.hadoop.hbase.master.LoadBalancer;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.MetricsAssignmentManager;
import org.apache.hadoop.hbase.master.NoSuchProcedureException;
import org.apache.hadoop.hbase.master.RegionPlan;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.RegionState.State;
@ -561,11 +559,6 @@ public class AssignmentManager implements ServerListener {
return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
}
@VisibleForTesting
public boolean waitForAssignment(final RegionInfo regionInfo) throws IOException {
return waitForAssignment(regionInfo, Long.MAX_VALUE);
}
/**
* Create round-robin assigns. Use on table creation to distribute out regions across cluster.
* @return AssignProcedures made out of the passed in <code>hris</code> and a call to the balancer
@ -577,38 +570,6 @@ public class AssignmentManager implements ServerListener {
return createRoundRobinAssignProcedures(hris, null);
}
@VisibleForTesting
// TODO: Remove this?
public boolean waitForAssignment(final RegionInfo regionInfo, final long timeout)
throws IOException {
RegionStateNode node = null;
// This method can be called before the regionInfo has made it into the regionStateMap
// so wait around here a while.
long startTime = System.currentTimeMillis();
// Something badly wrong if takes ten seconds to register a region.
long endTime = startTime + 10000;
while ((node = regionStates.getRegionStateNode(regionInfo)) == null && isRunning() &&
System.currentTimeMillis() < endTime) {
// Presume it not yet added but will be added soon. Let it spew a lot so we can tell if
// we are waiting here alot.
LOG.debug("Waiting on " + regionInfo + " to be added to regionStateMap");
Threads.sleep(10);
}
if (node == null) {
if (!isRunning()) return false;
throw new RegionException(regionInfo.getRegionNameAsString() + " never registered with Assigment.");
}
RegionTransitionProcedure proc = node.getProcedure();
if (proc == null) {
throw new NoSuchProcedureException(node.toString());
}
ProcedureSyncWait.waitForProcedureToCompleteIOE(
master.getMasterProcedureExecutor(), proc, timeout);
return true;
}
// ============================================================================================
// RegionTransition procedures helpers
// ============================================================================================
@ -1918,4 +1879,9 @@ public class AssignmentManager implements ServerListener {
new ServerCrashException(rtp.getProcId(), serverName));
}
}
@VisibleForTesting
MasterServices getMaster() {
return master;
}
}

View File

@ -497,7 +497,7 @@ public class RegionStates {
return regionsMap.get(regionName);
}
protected RegionStateNode getRegionStateNode(final RegionInfo regionInfo) {
public RegionStateNode getRegionStateNode(final RegionInfo regionInfo) {
return getRegionStateNodeFromName(regionInfo.getRegionName());
}

View File

@ -40,6 +40,8 @@ import org.apache.yetus.audience.InterfaceStability;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState;
/**
* Helper to synchronously wait on conditions.
* This will be removed in the future (mainly when the AssignmentManager will be
@ -135,18 +137,25 @@ public final class ProcedureSyncWait {
}
public static byte[] waitForProcedureToComplete(
final ProcedureExecutor<MasterProcedureEnv> procExec,
final Procedure<?> proc, final long timeout)
throws IOException {
final ProcedureExecutor<MasterProcedureEnv> procExec, final Procedure<?> proc,
final long timeout) throws IOException {
waitFor(procExec.getEnvironment(), "pid=" + proc.getProcId(),
new ProcedureSyncWait.Predicate<Boolean>() {
@Override
public Boolean evaluate() throws IOException {
return !procExec.isRunning() || procExec.isFinished(proc.getProcId());
if (!procExec.isRunning()) {
return true;
}
ProcedureState state = proc.getState();
if (state == ProcedureState.INITIALIZING || state == ProcedureState.RUNNABLE) {
// under these states the procedure may have not been added to procExec yet, so do not
// use isFinished to test whether it is finished, as this method will just check if the
// procedure is in the running procedure list
return false;
}
return procExec.isFinished(proc.getProcId());
}
}
);
});
if (!procExec.isRunning()) {
throw new IOException("The Master is Aborting");
}

View File

@ -99,6 +99,7 @@ import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
import org.apache.hadoop.hbase.master.assignment.RegionStates;
import org.apache.hadoop.hbase.regionserver.BloomType;
@ -3210,17 +3211,15 @@ public class HBaseTestingUtility extends HBaseZKTestingUtility {
}
/**
* Uses directly the assignment manager to assign the region.
* and waits until the specified region has completed assignment.
* @throws IOException
* @throw InterruptedException
* Uses directly the assignment manager to assign the region. and waits until the specified region
* has completed assignment.
* @return true if the region is assigned false otherwise.
*/
public boolean assignRegion(final RegionInfo regionInfo)
throws IOException, InterruptedException {
final AssignmentManager am = getHBaseCluster().getMaster().getAssignmentManager();
am.assign(regionInfo);
return am.waitForAssignment(regionInfo);
return AssignmentTestingUtil.waitForAssignment(am, regionInfo);
}
/**

View File

@ -33,7 +33,6 @@ import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.NoSuchProcedureException;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
@ -82,22 +81,10 @@ public class TestAsyncRegionAdminApi extends TestAsyncAdminBase {
// Region is assigned now. Let's assign it again.
// Master should not abort, and region should stay assigned.
admin.assign(hri.getRegionName()).get();
try {
am.waitForAssignment(hri);
fail("Expected NoSuchProcedureException");
} catch (NoSuchProcedureException e) {
// Expected
}
assertTrue(regionStates.getRegionState(hri).isOpened());
// unassign region
admin.unassign(hri.getRegionName(), true).get();
try {
am.waitForAssignment(hri);
fail("Expected NoSuchProcedureException");
} catch (NoSuchProcedureException e) {
// Expected
}
assertTrue(regionStates.getRegionState(hri).isClosed());
}

View File

@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.client;
import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@ -46,8 +47,8 @@ import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.master.NoSuchProcedureException;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.regionserver.StorefileRefresherChore;
import org.apache.hadoop.hbase.testclassification.LargeTests;
@ -103,18 +104,14 @@ public class TestMetaWithReplicas {
getMetaRegionLocation(TEST_UTIL.getZooKeeperWatcher());
LOG.info("HBASE:META DEPLOY: on " + hbaseMetaServerName);
sns.add(hbaseMetaServerName);
for (int replicaId = 1; replicaId < 3; replicaId ++) {
RegionInfo h =
RegionReplicaUtil.getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO,
replicaId);
try {
am.waitForAssignment(h);
ServerName sn = am.getRegionStates().getRegionServerOfRegion(h);
LOG.info("HBASE:META DEPLOY: " + h.getRegionNameAsString() + " on " + sn);
sns.add(sn);
} catch (NoSuchProcedureException e) {
LOG.info("Presume the procedure has been cleaned up so just proceed: " + e.toString());
}
for (int replicaId = 1; replicaId < 3; replicaId++) {
RegionInfo h = RegionReplicaUtil
.getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO, replicaId);
AssignmentTestingUtil.waitForAssignment(am, h);
ServerName sn = am.getRegionStates().getRegionServerOfRegion(h);
assertNotNull(sn);
LOG.info("HBASE:META DEPLOY: " + h.getRegionNameAsString() + " on " + sn);
sns.add(sn);
}
// Fun. All meta region replicas have ended up on the one server. This will cause this test
// to fail ... sometimes.

View File

@ -18,7 +18,6 @@
package org.apache.hadoop.hbase.client;
import java.io.File;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
@ -27,7 +26,7 @@ import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.NoSuchProcedureException;
import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Bytes;
@ -41,7 +40,6 @@ import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -92,7 +90,7 @@ public class TestSeparateClientZKCluster {
FileUtils.deleteDirectory(clientZkDir);
}
@Test(timeout = 60000)
@Test
public void testBasicOperation() throws Exception {
TableName tn = TableName.valueOf(name.getMethodName());
// create table
@ -119,7 +117,7 @@ public class TestSeparateClientZKCluster {
}
}
@Test(timeout = 60000)
@Test
public void testMasterSwitch() throws Exception {
// get an admin instance and issue some request first
Connection conn = TEST_UTIL.getConnection();
@ -143,7 +141,7 @@ public class TestSeparateClientZKCluster {
}
}
@Test(timeout = 60000)
@Test
public void testMetaRegionMove() throws Exception {
TableName tn = TableName.valueOf(name.getMethodName());
// create table
@ -199,7 +197,7 @@ public class TestSeparateClientZKCluster {
}
}
@Test(timeout = 120000)
@Test
public void testMetaMoveDuringClientZkClusterRestart() throws Exception {
TableName tn = TableName.valueOf(name.getMethodName());
// create table
@ -230,12 +228,8 @@ public class TestSeparateClientZKCluster {
Thread.sleep(200);
}
// wait for meta region online
try {
cluster.getMaster().getAssignmentManager()
.waitForAssignment(RegionInfoBuilder.FIRST_META_REGIONINFO);
} catch (NoSuchProcedureException e) {
// we don't need to take any further action
}
AssignmentTestingUtil.waitForAssignment(cluster.getMaster().getAssignmentManager(),
RegionInfoBuilder.FIRST_META_REGIONINFO);
// wait some long time to make sure we will retry sync data to client ZK until data set
Thread.sleep(10000);
clientZkCluster.startup(clientZkDir);
@ -250,7 +244,7 @@ public class TestSeparateClientZKCluster {
}
}
@Test(timeout = 60000)
@Test
public void testAsyncTable() throws Exception {
TableName tn = TableName.valueOf(name.getMethodName());
ColumnFamilyDescriptorBuilder cfDescBuilder = ColumnFamilyDescriptorBuilder.newBuilder(family);

View File

@ -18,14 +18,19 @@
package org.apache.hadoop.hbase.master.assignment;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import java.io.IOException;
import java.util.Set;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.Waiter.ExplainingPredicate;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.yetus.audience.InterfaceAudience;
import org.apache.yetus.audience.InterfaceStability;
@ -34,7 +39,7 @@ import org.slf4j.LoggerFactory;
@InterfaceAudience.Private
@InterfaceStability.Evolving
public abstract class AssignmentTestingUtil {
public final class AssignmentTestingUtil {
private static final Logger LOG = LoggerFactory.getLogger(AssignmentTestingUtil.class);
private AssignmentTestingUtil() {}
@ -122,4 +127,27 @@ public abstract class AssignmentTestingUtil {
private static HMaster getMaster(final HBaseTestingUtility util) {
return util.getMiniHBaseCluster().getMaster();
}
public static boolean waitForAssignment(AssignmentManager am, RegionInfo regionInfo)
throws IOException {
// This method can be called before the regionInfo has made it into the regionStateMap
// so wait around here a while.
Waiter.waitFor(am.getConfiguration(), 10000,
() -> am.getRegionStates().getRegionStateNode(regionInfo) != null);
RegionStateNode regionNode = am.getRegionStates().getRegionStateNode(regionInfo);
// Wait until the region has already been open, or we have a TRSP along with it.
Waiter.waitFor(am.getConfiguration(), 30000,
() -> regionNode.isInState(State.OPEN) || regionNode.isInTransition());
RegionTransitionProcedure proc = regionNode.getProcedure();
synchronized (regionNode) {
if (regionNode.isInState(State.OPEN)) {
return true;
}
proc = regionNode.getProcedure();
}
assertNotNull(proc);
ProcedureSyncWait.waitForProcedureToCompleteIOE(am.getMaster().getMasterProcedureExecutor(),
proc, 5L * 60 * 1000);
return true;
}
}

View File

@ -33,7 +33,6 @@ import java.util.Optional;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@ -73,18 +72,14 @@ import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.LoadBalancer;
import org.apache.hadoop.hbase.master.MasterRpcServices;
import org.apache.hadoop.hbase.master.NoSuchProcedureException;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
import org.apache.hadoop.hbase.master.assignment.RegionStates;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.apache.hadoop.hbase.util.Bytes;
@ -94,8 +89,6 @@ import org.apache.hadoop.hbase.util.HBaseFsck;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hbase.thirdparty.com.google.protobuf.RpcController;
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.junit.After;
@ -111,6 +104,14 @@ import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.protobuf.RpcController;
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
/**
* The below tests are testing split region against a running cluster
*/
@ -161,11 +162,7 @@ public class TestSplitTransactionOnCluster {
throws IOException, InterruptedException {
assertEquals(1, regions.size());
RegionInfo hri = regions.get(0).getRegionInfo();
try {
cluster.getMaster().getAssignmentManager().waitForAssignment(hri, 600000);
} catch (NoSuchProcedureException e) {
LOG.info("Presume the procedure has been cleaned up so just proceed: " + e.toString());
}
AssignmentTestingUtil.waitForAssignment(cluster.getMaster().getAssignmentManager(), hri);
return hri;
}