HBASE-26459 HMaster should move non-meta region only if meta is ONLINE (#3875)

Signed-off-by: Viraj Jasani <vjasani@apache.org>
This commit is contained in:
Yutong Xiao 2021-12-04 01:30:16 +08:00 committed by GitHub
parent 7192423ae7
commit c472329460
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 82 additions and 1 deletions

View File

@ -136,6 +136,12 @@ public final class HConstants {
/** Default value for the max percent of regions in transition */
public static final double DEFAULT_HBASE_MASTER_BALANCER_MAX_RIT_PERCENT = 1.0;
/**
 * Config key for the time, in milliseconds, to wait for meta region assignment when moving
 * non-meta regions.
 */
public static final String HBASE_MASTER_WAITING_META_ASSIGNMENT_TIMEOUT =
"hbase.master.waiting.meta.assignment.timeout";
/**
 * Default value, in milliseconds, used when
 * {@link #HBASE_MASTER_WAITING_META_ASSIGNMENT_TIMEOUT} is not set.
 */
public static final long HBASE_MASTER_WAITING_META_ASSIGNMENT_TIMEOUT_DEFAULT = 10000;
/** Config for the max balancing time */
public static final String HBASE_BALANCER_MAX_BALANCING = "hbase.balancer.max.balancing";

View File

@ -393,6 +393,9 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
// Cached clusterId on stand by masters to serve clusterID requests from clients.
private final CachedClusterId cachedClusterId;
// How long, in milliseconds, a non-meta region move waits when a meta region is in transition
// before failing fast. Read from HConstants.HBASE_MASTER_WAITING_META_ASSIGNMENT_TIMEOUT.
private final long timeoutWaitMetaRegionAssignment;
public static class RedirectServlet extends HttpServlet {
private static final long serialVersionUID = 2894774810058302473L;
private final int regionServerInfoPort;
@ -498,6 +501,9 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
this.maxBalancingTime = getMaxBalancingTime();
this.maxRitPercent = conf.getDouble(HConstants.HBASE_MASTER_BALANCER_MAX_RIT_PERCENT,
HConstants.DEFAULT_HBASE_MASTER_BALANCER_MAX_RIT_PERCENT);
this.timeoutWaitMetaRegionAssignment =
conf.getLong(HConstants.HBASE_MASTER_WAITING_META_ASSIGNMENT_TIMEOUT,
HConstants.HBASE_MASTER_WAITING_META_ASSIGNMENT_TIMEOUT_DEFAULT);
// Do we publish the status?
@ -1845,12 +1851,20 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
// closed
serverManager.sendRegionWarmup(rp.getDestination(), hri);
// If a meta region is in transition, wait for the configured timeout, then fail fast if it
// is still in transition.
if (!hri.isMetaRegion() && assignmentManager.getRegionStates().isMetaRegionInTransition()) {
Thread.sleep(timeoutWaitMetaRegionAssignment);
if (assignmentManager.getRegionStates().isMetaRegionInTransition()) {
throw new HBaseIOException("Fail-fast of the region move, " +
" because hbase:meta region is still in transition. Failed region move info:" + rp);
}
}
LOG.info(getClientIdAuditPrefix() + " move " + rp + ", running balancer");
this.assignmentManager.balance(rp);
if (this.cpHost != null) {
this.cpHost.postMove(hri, rp.getSource(), rp.getDestination());
}
} catch (IOException ioe) {
} catch (IOException | InterruptedException ioe) {
if (ioe instanceof HBaseIOException) {
throw (HBaseIOException)ioe;
}

View File

@ -28,6 +28,7 @@ import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HBaseIOException;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
@ -46,7 +47,9 @@ import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.util.StringUtils;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@ -66,6 +69,9 @@ public class TestMaster {
public static void beforeAllTests() throws Exception {
// we will retry operations when PleaseHoldException is thrown
TEST_UTIL.getConfiguration().setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 3);
// Wait only 1 ms for meta assignment so the fail-fast region move path stays quick in tests.
TEST_UTIL.getConfiguration().
setLong(HConstants.HBASE_MASTER_WAITING_META_ASSIGNMENT_TIMEOUT, 1);
// Set hbase.min.version.move.system.tables as version 0 so that
// testMoveRegionWhenNotInitialized never fails even if hbase-default has valid default
// value present for production use-case.
@ -188,5 +194,60 @@ public class TestMaster {
TEST_UTIL.deleteTable(tableName);
}
}
/**
 * Checks that moving a user region is rejected with a "Fail-fast" error while the hbase:meta
 * region is in transition, and that the same move is carried out once hbase:meta has been
 * assigned again.
 */
@Test (timeout = 300000)
public void testMoveRegionWhenMetaRegionInTransition()
    throws IOException, InterruptedException, KeeperException {
  TableName tableName = TableName.valueOf("testMoveRegionWhenMetaRegionInTransition");
  HMaster master = TEST_UTIL.getMiniHBaseCluster().getMaster();
  HTableDescriptor tableDescriptor = new HTableDescriptor(tableName);
  RegionStates regionStates = master.getAssignmentManager().getRegionStates();
  tableDescriptor.addFamily(new HColumnDescriptor("value"));
  admin.createTable(tableDescriptor, null);
  try {
    HRegionInfo userRegion = admin.getTableRegions(tableName).get(0);
    HRegionInfo metaRegion = admin.getTableRegions(TableName.META_TABLE_NAME).get(0);
    ServerName firstServer = TEST_UTIL.getHBaseCluster().getRegionServer(0).getServerName();
    ServerName secondServer = TEST_UTIL.getHBaseCluster().getRegionServer(1).getServerName();
    byte[] firstServerBytes = firstServer.getServerName().getBytes();
    byte[] secondServerBytes = secondServer.getServerName().getBytes();

    // Pin the user region to the first server and let that move settle.
    admin.move(userRegion.getEncodedNameAsBytes(), firstServerBytes);
    while (regionStates.isRegionInTransition(userRegion)) {
      Thread.sleep(1000);
    }

    // Put the meta region into transition by unassigning it.
    master.assignmentManager.unassign(metaRegion);

    // A move requested now must be rejected by the master.
    HBaseIOException rejection = null;
    try {
      master.move(userRegion.getEncodedNameAsBytes(), secondServerBytes);
    } catch (HBaseIOException expected) {
      rejection = expected;
    }
    if (rejection == null) {
      Assert.fail("Admin move should not be successful here.");
    }
    assertTrue(rejection.getMessage().contains("Fail-fast"));

    // Give any (unexpected) movement time to happen, then confirm the region stayed put.
    Thread.sleep(HConstants.HBASE_MASTER_WAITING_META_ASSIGNMENT_TIMEOUT_DEFAULT);
    TEST_UTIL.assertRegionOnServer(userRegion, firstServer, 5000);

    // Bring the meta region back and wait for it to leave transition.
    admin.assign(metaRegion.getEncodedNameAsBytes());
    while (regionStates.isMetaRegionInTransition()) {
      Thread.sleep(1000);
    }

    // With meta no longer in transition, the same move should now land on the second server.
    admin.move(userRegion.getEncodedNameAsBytes(), secondServerBytes);
    Thread.sleep(HConstants.HBASE_MASTER_WAITING_META_ASSIGNMENT_TIMEOUT_DEFAULT);
    TEST_UTIL.assertRegionOnServer(userRegion, secondServer, 5000);
  } finally {
    TEST_UTIL.deleteTable(tableName);
  }
}
}