HBASE-20893 Data loss if splitting region while ServerCrashProcedure executing ADDENDUM: Rather than rollback, just do region reopens.

In split, reopen the parent if recovered.edits and in merge, reopen the
parent region or regions that happened to have recovered.edits on close.
This commit is contained in:
Michael Stack 2018-07-26 22:39:52 -07:00
parent daeec8657e
commit 0f4e857c7a
4 changed files with 43 additions and 43 deletions

View File

@ -68,6 +68,7 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProto
* failure. Should we ignore rollback calls to Assign/Unassign then? Or just
* remove rollback here?
*/
// TODO: Add being able to assign a region to open read-only.
@InterfaceAudience.Private
public class AssignProcedure extends RegionTransitionProcedure {
private static final Logger LOG = LoggerFactory.getLogger(AssignProcedure.class);

View File

@ -238,8 +238,21 @@ public class MergeTableRegionsProcedure
setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_CHECK_CLOSED_REGIONS);
break;
case MERGE_TABLE_REGIONS_CHECK_CLOSED_REGIONS:
checkClosedRegions(env);
setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_CREATE_MERGED_REGION);
List<RegionInfo> ris = hasRecoveredEdits(env);
if (ris.isEmpty()) {
setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_CREATE_MERGED_REGION);
} else {
// Need to reopen parent regions to pickup missed recovered.edits. Do it by creating
// child assigns and then stepping back to MERGE_TABLE_REGIONS_CLOSE_REGIONS.
// Just assign the primary regions recovering the missed recovered.edits -- no replicas.
// May need to cycle here a few times if heavy writes.
// TODO: Add an assign read-only.
for (RegionInfo ri: ris) {
LOG.info("Found recovered.edits under {}, reopen to pickup missed edits!", ri);
addChildProcedure(env.getAssignmentManager().createAssignProcedure(ri));
}
setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_CLOSE_REGIONS);
}
break;
case MERGE_TABLE_REGIONS_CREATE_MERGED_REGION:
createMergedRegion(env);
@ -458,30 +471,19 @@ public class MergeTableRegionsProcedure
}
/**
* check the closed regions
* Return list of regions that have recovered.edits... usually its an empty list.
* @param env the master env
* @throws IOException IOException
*/
private void checkClosedRegions(final MasterProcedureEnv env) throws IOException {
checkClosedRegion(env, regionsToMerge[0]);
checkClosedRegion(env, regionsToMerge[1]);
}
/**
* Check whether there is recovered.edits in the closed region
* If any, that means this region is not closed property, we need
* to abort region merge to prevent data loss
* @param env master env
* @param regionInfo regioninfo
* @throws IOException IOException
*/
private void checkClosedRegion(final MasterProcedureEnv env,
RegionInfo regionInfo) throws IOException {
if (WALSplitter.hasRecoveredEdits(env.getMasterServices().getFileSystem(),
env.getMasterConfiguration(), regionInfo)) {
throw new IOException("Recovered.edits are found in Region: " + regionInfo
+ ", abort merge to prevent data loss");
private List<RegionInfo> hasRecoveredEdits(final MasterProcedureEnv env) throws IOException {
List<RegionInfo> ris = new ArrayList<RegionInfo>(regionsToMerge.length);
for (int i = 0; i < regionsToMerge.length; i++) {
RegionInfo ri = regionsToMerge[i];
if (SplitTableRegionProcedure.hasRecoveredEdits(env, ri)) {
ris.add(ri);
}
}
return ris;
}
/**

View File

@ -139,18 +139,13 @@ public class SplitTableRegionProcedure
}
/**
* Check whether there is recovered.edits in the closed region
* If any, that means this region is not closed property, we need
* to abort region split to prevent data loss
* Check whether there are recovered.edits in the parent closed region.
* @param env master env
* @throws IOException IOException
*/
private void checkClosedRegion(final MasterProcedureEnv env) throws IOException {
if (WALSplitter.hasRecoveredEdits(env.getMasterServices().getFileSystem(),
env.getMasterConfiguration(), getRegion())) {
throw new IOException("Recovered.edits are found in Region: " + getRegion()
+ ", abort split to prevent data loss");
}
static boolean hasRecoveredEdits(MasterProcedureEnv env, RegionInfo ri) throws IOException {
return WALSplitter.hasRecoveredEdits(env.getMasterServices().getFileSystem(),
env.getMasterConfiguration(), ri);
}
/**
@ -256,8 +251,18 @@ public class SplitTableRegionProcedure
setNextState(SplitTableRegionState.SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS);
break;
case SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS:
checkClosedRegion(env);
setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_CREATE_DAUGHTER_REGIONS);
if (hasRecoveredEdits(env, getRegion())) {
// If recovered edits, reopen parent region and then re-run the close by going back to
// SPLIT_TABLE_REGION_CLOSE_PARENT_REGION. We might have to cycle here a few times
// (TODO: Add being able to open a region in read-only mode). Open the primary replica
// in this case only where we just want to pickup the left-out replicated.edits.
LOG.info("Found recovered.edits under {}, reopen so we pickup these missed edits!",
getRegion().getEncodedName());
addChildProcedure(env.getAssignmentManager().createAssignProcedure(getParentRegion()));
setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_CLOSE_PARENT_REGION);
} else {
setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_CREATE_DAUGHTER_REGIONS);
}
break;
case SPLIT_TABLE_REGION_CREATE_DAUGHTER_REGIONS:
createDaughterRegions(env);
@ -290,10 +295,9 @@ public class SplitTableRegionProcedure
throw new UnsupportedOperationException(this + " unhandled state=" + state);
}
} catch (IOException e) {
String msg = "Error trying to split region " + getParentRegion().getEncodedName() +
" in the table " + getTableName() + " (in state=" + state + ")";
String msg = "Splitting " + getParentRegion().getEncodedName() + ", " + this;
if (!isRollbackSupported(state)) {
// We reach a state that cannot be rolled back. We just need to keep retry.
// We reach a state that cannot be rolled back. We just need to keep retrying.
LOG.warn(msg, e);
} else {
LOG.error(msg, e);

View File

@ -119,12 +119,5 @@ public class TestMergeTableRegionsWhileRSCrash {
count++;
}
Assert.assertEquals("There should be 10 rows!", 10, count);
}
}