HBASE-20202 [AMv2] Don't move region if its a split parent or offlined

M hbase-client/src/main/java/org/apache/hadoop/hbase/client/DoNotRetryRegionException.java
M hbase-client/src/main/java/org/apache/hadoop/hbase/exceptions/MergeRegionException.java
 Allow passing cause to Constructor.

M hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
 Add prepare step to move procedure.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java
 Add check that regions to merge are actually online to the Constructor
so we can fail fast if they are offline

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MoveRegionProcedure.java
 Add prepare step. Check regions and context and skip move if not right.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
 Add check parent region is online to constructor.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java
 Add generic check region is online utility function for use by subclasses.

M hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionMove.java
 Add test that we fail if we try to move an offlined region.
This commit is contained in:
Michael Stack 2018-03-15 13:33:27 -07:00
parent e0bdc87b27
commit 79d47dd57a
12 changed files with 104 additions and 17 deletions

View File

@ -37,4 +37,7 @@ public class DoNotRetryRegionException extends DoNotRetryIOException {
super(s);
}
public DoNotRetryRegionException(Throwable cause) {
super(cause);
}
}

View File

@ -42,4 +42,8 @@ public class MergeRegionException extends DoNotRetryRegionException {
public MergeRegionException(String s) {
super(s);
}
public MergeRegionException(Throwable cause) {
super(cause);
}
}

View File

@ -338,6 +338,7 @@ message UnassignRegionStateData {
}
enum MoveRegionState {
MOVE_REGION_PREPARE = 0;
MOVE_REGION_UNASSIGN = 1;
MOVE_REGION_ASSIGN = 2;
}

View File

@ -2399,7 +2399,7 @@ public class HMaster extends HRegionServer implements MasterServices {
checkTableExists(tableName);
TableState ts = getTableStateManager().getTableState(tableName);
if (!ts.isDisabled()) {
throw new TableNotDisabledException("Not DISABLE tableState=" + ts);
throw new TableNotDisabledException("Not DISABLED; " + ts);
}
}

View File

@ -33,6 +33,7 @@ import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.UnknownRegionException;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.DoNotRetryRegionException;
import org.apache.hadoop.hbase.client.MasterSwitchType;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.RegionInfo;
@ -108,8 +109,8 @@ public class MergeTableRegionsProcedure
super(env);
// Check daughter regions and make sure that we have valid daughter regions
// before doing the real work.
checkRegionsToMerge(regionsToMerge, forcible);
// before doing the real work. This check calls the super method #checkOnline also.
checkRegionsToMerge(env, regionsToMerge, forcible);
// WARN: make sure there is no parent region of the two merging regions in
// hbase:meta If exists, fixing up daughters would cause daughter regions(we
@ -122,7 +123,7 @@ public class MergeTableRegionsProcedure
this.forcible = forcible;
}
private static void checkRegionsToMerge(final RegionInfo[] regionsToMerge,
private static void checkRegionsToMerge(MasterProcedureEnv env, final RegionInfo[] regionsToMerge,
final boolean forcible) throws MergeRegionException {
// For now, we only merge 2 regions.
// It could be extended to more than 2 regions in the future.
@ -131,10 +132,13 @@ public class MergeTableRegionsProcedure
Arrays.toString(regionsToMerge));
}
checkRegionsToMerge(regionsToMerge[0], regionsToMerge[1], forcible);
checkRegionsToMerge(env, regionsToMerge[0], regionsToMerge[1], forcible);
}
private static void checkRegionsToMerge(final RegionInfo regionToMergeA,
/**
* One time checks.
*/
private static void checkRegionsToMerge(MasterProcedureEnv env, final RegionInfo regionToMergeA,
final RegionInfo regionToMergeB, final boolean forcible) throws MergeRegionException {
if (!regionToMergeA.getTable().equals(regionToMergeB.getTable())) {
throw new MergeRegionException("Can't merge regions from two different tables: " +
@ -146,6 +150,13 @@ public class MergeTableRegionsProcedure
throw new MergeRegionException("Can't merge non-default replicas");
}
try {
checkOnline(env, regionToMergeA);
checkOnline(env, regionToMergeB);
} catch (DoNotRetryRegionException dnrre) {
throw new MergeRegionException(dnrre);
}
if (!RegionInfo.areAdjacent(regionToMergeA, regionToMergeB)) {
String msg = "Unable to merge non-adjacent regions " + regionToMergeA.getShortNameToLog() +
", " + regionToMergeB.getShortNameToLog() + " where forcible = " + forcible;
@ -156,6 +167,7 @@ public class MergeTableRegionsProcedure
}
}
private static RegionInfo createMergedRegionInfo(final RegionInfo[] regionsToMerge) {
return createMergedRegionInfo(regionsToMerge[0], regionsToMerge[1]);
}
@ -457,7 +469,6 @@ public class MergeTableRegionsProcedure
private boolean prepareMergeRegion(final MasterProcedureEnv env) throws IOException {
// Note: the following logic assumes that we only have 2 regions to merge. In the future,
// if we want to extend to more than 2 regions, the code needs to be modified a little bit.
//
CatalogJanitor catalogJanitor = env.getMasterServices().getCatalogJanitor();
boolean regionAHasMergeQualifier = !catalogJanitor.cleanMergeQualifier(regionsToMerge[0]);
if (regionAHasMergeQualifier
@ -492,7 +503,6 @@ public class MergeTableRegionsProcedure
return false;
}
// Ask the remote regionserver if regions are mergeable. If we get an IOE, report it
// along with the failure, so we can see why regions are not mergeable at this time.
IOException mergeableCheckIOE = null;

View File

@ -64,6 +64,7 @@ public class MoveRegionProcedure extends AbstractStateMachineRegionProcedure<Mov
super(env, plan.getRegionInfo());
this.plan = plan;
preflightChecks(env, true);
checkOnline(env, plan.getRegionInfo());
}
@Override
@ -73,6 +74,16 @@ public class MoveRegionProcedure extends AbstractStateMachineRegionProcedure<Mov
LOG.trace(this + " execute state=" + state);
}
switch (state) {
case MOVE_REGION_PREPARE:
// Check context again and that region is online; do it here after we have lock on region.
try {
preflightChecks(env, true);
checkOnline(env, this.plan.getRegionInfo());
} catch (HBaseIOException e) {
LOG.warn(this.toString() + " FAILED because " + e.toString());
return Flow.NO_MORE_STATE;
}
break;
case MOVE_REGION_UNASSIGN:
addChildProcedure(new UnassignProcedure(plan.getRegionInfo(), plan.getSource(),
plan.getDestination(), true));

View File

@ -107,6 +107,9 @@ public class SplitTableRegionProcedure
final RegionInfo regionToSplit, final byte[] splitRow) throws IOException {
super(env, regionToSplit);
preflightChecks(env, true);
// When procedure goes to run in its prepare step, it also does these checkOnline checks. Here
// we fail-fast on construction. There it skips the split with just a warning.
checkOnline(env, regionToSplit);
this.bestSplitRow = splitRow;
checkSplittable(env, regionToSplit, bestSplitRow);
final TableName table = regionToSplit.getTable();

View File

@ -26,11 +26,16 @@ import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotDisabledException;
import org.apache.hadoop.hbase.TableNotEnabledException;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.UnknownRegionException;
import org.apache.hadoop.hbase.client.DoNotRetryRegionException;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionOfflineException;
import org.apache.hadoop.hbase.client.TableState;
import org.apache.hadoop.hbase.master.MasterFileSystem;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.TableStateManager;
import org.apache.hadoop.hbase.master.assignment.RegionStates;
import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.FSUtils;
@ -173,4 +178,30 @@ public abstract class AbstractStateMachineTableProcedure<TState>
throw new HBaseIOException(ioe);
}
}
/**
* Check region is online.
*/
protected static void checkOnline(MasterProcedureEnv env, final RegionInfo ri)
throws DoNotRetryRegionException {
RegionStates regionStates = env.getAssignmentManager().getRegionStates();
RegionState rs = regionStates.getRegionState(ri);
if (rs == null) {
throw new UnknownRegionException("No RegionState found for " + ri.getEncodedName());
}
if (!rs.isOpened()) {
throw new DoNotRetryRegionException(ri.getEncodedName() + " is not OPEN");
}
if (ri.isSplitParent()) {
throw new DoNotRetryRegionException(ri.getEncodedName() +
" is not online (splitParent=true)");
}
if (ri.isSplit()) {
throw new DoNotRetryRegionException(ri.getEncodedName() + " has split=true");
}
if (ri.isOffline()) {
// RegionOfflineException is not instance of DNRIOE so wrap it.
throw new DoNotRetryRegionException(new RegionOfflineException(ri.getEncodedName()));
}
}
}

View File

@ -51,6 +51,7 @@ import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.DoNotRetryRegionException;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
@ -388,7 +389,12 @@ public class TestNamespaceAuditor {
Collections.sort(hris);
// verify that we cannot split
HRegionInfo hriToSplit2 = hris.get(1);
try {
ADMIN.split(tableTwo, Bytes.toBytes("6"));
fail();
} catch (DoNotRetryRegionException e) {
// Expected
}
Thread.sleep(2000);
assertEquals(initialRegions, ADMIN.getTableRegions(tableTwo).size());
}

View File

@ -26,7 +26,6 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.lang3.RandomUtils;
@ -43,6 +42,7 @@ import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.UnknownRegionException;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.DoNotRetryRegionException;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
@ -326,11 +326,9 @@ public class TestRegionMergeTransactionOnCluster {
admin.mergeRegionsAsync(a.getEncodedNameAsBytes(), b.getEncodedNameAsBytes(), false)
.get(syncWaitTimeout, TimeUnit.MILLISECONDS);
fail("Offline regions should not be able to merge");
} catch (ExecutionException ie) {
} catch (DoNotRetryRegionException ie) {
System.out.println(ie);
assertTrue("Exception should mention regions not online",
StringUtils.stringifyException(ie.getCause()).contains("regions that are not online")
&& ie.getCause() instanceof MergeRegionException);
assertTrue(ie instanceof MergeRegionException);
}
try {

View File

@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hbase.regionserver;
import static junit.framework.TestCase.fail;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
@ -29,6 +30,7 @@ import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotEnabledException;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.DoNotRetryRegionException;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
@ -115,6 +117,18 @@ public class TestRegionMove {
+ ", but found none", !regionsOnRS1ForTable.isEmpty());
final RegionInfo regionToMove = regionsOnRS1ForTable.get(0);
// Offline the region and then try to move it. Should fail.
admin.unassign(regionToMove.getRegionName(), true);
try {
admin.move(regionToMove.getEncodedNameAsBytes(),
Bytes.toBytes(rs2.getServerName().toString()));
fail();
} catch (DoNotRetryRegionException e) {
// We got expected exception
}
// Reassign for next stage of test.
admin.assign(regionToMove.getRegionName());
// Disable the table
admin.disableTable(tableName);

View File

@ -52,6 +52,7 @@ import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Consistency;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.DoNotRetryRegionException;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
@ -320,9 +321,14 @@ public class TestSplitTransactionOnCluster {
// Now try splitting.... should fail. And each should successfully
// rollback.
// We don't roll back here anymore. Instead we fail-fast on construction of the
// split transaction. Catch the exception instead.
try {
this.admin.splitRegion(hri.getRegionName());
this.admin.splitRegion(hri.getRegionName());
this.admin.splitRegion(hri.getRegionName());
fail();
} catch (DoNotRetryRegionException e) {
// Expected
}
// Wait around a while and assert count of regions remains constant.
for (int i = 0; i < 10; i++) {
Thread.sleep(100);