From 0f4e857c7a532bce262451eccd05a97fb6df61ce Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Thu, 26 Jul 2018 22:39:52 -0700 Subject: [PATCH] HBASE-20893 Data loss if splitting region while ServerCrashProcedure executing ADDENDUM: Rather than rollback, just do region reopens. In split, reopen the parent if recovered.edits and in merge, reopen the parent region or regions that happened to have recovered.edits on close. --- .../master/assignment/AssignProcedure.java | 1 + .../MergeTableRegionsProcedure.java | 46 ++++++++++--------- .../assignment/SplitTableRegionProcedure.java | 32 +++++++------ .../TestMergeTableRegionsWhileRSCrash.java | 7 --- 4 files changed, 43 insertions(+), 43 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java index 86f0a3ff591..55aee4ad071 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java @@ -68,6 +68,7 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProto * failure. Should we ignore rollback calls to Assign/Unassign then? Or just * remove rollback here? */ +// TODO: Add being able to assign a region to open read-only. @InterfaceAudience.Private public class AssignProcedure extends RegionTransitionProcedure { private static final Logger LOG = LoggerFactory.getLogger(AssignProcedure.class); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java index 580b9a96d48..20ae444256b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java @@ -238,8 +238,21 @@ public class MergeTableRegionsProcedure setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_CHECK_CLOSED_REGIONS); break; case MERGE_TABLE_REGIONS_CHECK_CLOSED_REGIONS: - checkClosedRegions(env); - setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_CREATE_MERGED_REGION); + List ris = hasRecoveredEdits(env); + if (ris.isEmpty()) { + setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_CREATE_MERGED_REGION); + } else { + // Need to reopen parent regions to pickup missed recovered.edits. Do it by creating + // child assigns and then stepping back to MERGE_TABLE_REGIONS_CLOSE_REGIONS. + // Just assign the primary regions recovering the missed recovered.edits -- no replicas. + // May need to cycle here a few times if heavy writes. + // TODO: Add an assign read-only. + for (RegionInfo ri: ris) { + LOG.info("Found recovered.edits under {}, reopen to pickup missed edits!", ri); + addChildProcedure(env.getAssignmentManager().createAssignProcedure(ri)); + } + setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_CLOSE_REGIONS); + } break; case MERGE_TABLE_REGIONS_CREATE_MERGED_REGION: createMergedRegion(env); @@ -458,30 +471,19 @@ public class MergeTableRegionsProcedure } /** - * check the closed regions + * Return list of regions that have recovered.edits... usually its an empty list. * @param env the master env * @throws IOException IOException */ - private void checkClosedRegions(final MasterProcedureEnv env) throws IOException { - checkClosedRegion(env, regionsToMerge[0]); - checkClosedRegion(env, regionsToMerge[1]); - } - - /** - * Check whether there is recovered.edits in the closed region - * If any, that means this region is not closed property, we need - * to abort region merge to prevent data loss - * @param env master env - * @param regionInfo regioninfo - * @throws IOException IOException - */ - private void checkClosedRegion(final MasterProcedureEnv env, - RegionInfo regionInfo) throws IOException { - if (WALSplitter.hasRecoveredEdits(env.getMasterServices().getFileSystem(), - env.getMasterConfiguration(), regionInfo)) { - throw new IOException("Recovered.edits are found in Region: " + regionInfo - + ", abort merge to prevent data loss"); + private List hasRecoveredEdits(final MasterProcedureEnv env) throws IOException { + List ris = new ArrayList(regionsToMerge.length); + for (int i = 0; i < regionsToMerge.length; i++) { + RegionInfo ri = regionsToMerge[i]; + if (SplitTableRegionProcedure.hasRecoveredEdits(env, ri)) { + ris.add(ri); + } } + return ris; } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java index f36b4c5e3e1..29dbabb6311 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java @@ -139,18 +139,13 @@ public class SplitTableRegionProcedure } /** - * Check whether there is recovered.edits in the closed region - * If any, that means this region is not closed property, we need - * to abort region split to prevent data loss + * Check whether there are recovered.edits in the parent closed region. * @param env master env * @throws IOException IOException */ - private void checkClosedRegion(final MasterProcedureEnv env) throws IOException { - if (WALSplitter.hasRecoveredEdits(env.getMasterServices().getFileSystem(), - env.getMasterConfiguration(), getRegion())) { - throw new IOException("Recovered.edits are found in Region: " + getRegion() - + ", abort split to prevent data loss"); - } + static boolean hasRecoveredEdits(MasterProcedureEnv env, RegionInfo ri) throws IOException { + return WALSplitter.hasRecoveredEdits(env.getMasterServices().getFileSystem(), + env.getMasterConfiguration(), ri); } /** @@ -256,8 +251,18 @@ public class SplitTableRegionProcedure setNextState(SplitTableRegionState.SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS); break; case SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS: - checkClosedRegion(env); - setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_CREATE_DAUGHTER_REGIONS); + if (hasRecoveredEdits(env, getRegion())) { + // If recovered edits, reopen parent region and then re-run the close by going back to + // SPLIT_TABLE_REGION_CLOSE_PARENT_REGION. We might have to cycle here a few times + // (TODO: Add being able to open a region in read-only mode). Open the primary replica + // in this case only where we just want to pickup the left-out replicated.edits. + LOG.info("Found recovered.edits under {}, reopen so we pickup these missed edits!", + getRegion().getEncodedName()); + addChildProcedure(env.getAssignmentManager().createAssignProcedure(getParentRegion())); + setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_CLOSE_PARENT_REGION); + } else { + setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_CREATE_DAUGHTER_REGIONS); + } break; case SPLIT_TABLE_REGION_CREATE_DAUGHTER_REGIONS: createDaughterRegions(env); @@ -290,10 +295,9 @@ public class SplitTableRegionProcedure throw new UnsupportedOperationException(this + " unhandled state=" + state); } } catch (IOException e) { - String msg = "Error trying to split region " + getParentRegion().getEncodedName() + - " in the table " + getTableName() + " (in state=" + state + ")"; + String msg = "Splitting " + getParentRegion().getEncodedName() + ", " + this; if (!isRollbackSupported(state)) { - // We reach a state that cannot be rolled back. We just need to keep retry. + // We reach a state that cannot be rolled back. We just need to keep retrying. LOG.warn(msg, e); } else { LOG.error(msg, e); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMergeTableRegionsWhileRSCrash.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMergeTableRegionsWhileRSCrash.java index b979bb86fbf..9608e5cc7e4 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMergeTableRegionsWhileRSCrash.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMergeTableRegionsWhileRSCrash.java @@ -119,12 +119,5 @@ public class TestMergeTableRegionsWhileRSCrash { count++; } Assert.assertEquals("There should be 10 rows!", 10, count); - - - - } - - - }