From af2742fcf2fc31945c8bdd09f0f6bd52f857b77f Mon Sep 17 00:00:00 2001
From: Allan Yang
Date: Mon, 23 Jul 2018 14:33:59 +0800
Subject: [PATCH] HBASE-20893 Data loss if splitting region while ServerCrashProcedure executing

---
 .../src/main/protobuf/MasterProcedure.proto   |   1 +
 .../assignment/SplitTableRegionProcedure.java |  23 ++++
 .../apache/hadoop/hbase/wal/WALSplitter.java  |  24 ++++
 .../master/TestSplitRegionWhileRSCrash.java   | 122 ++++++++++++++++++
 4 files changed, 170 insertions(+)
 create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitRegionWhileRSCrash.java

diff --git a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
index 39d2824394e..1b137c7dff9 100644
--- a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
+++ b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
@@ -248,6 +248,7 @@ enum SplitTableRegionState {
   SPLIT_TABLE_REGION_PRE_OPERATION_AFTER_META = 8;
   SPLIT_TABLE_REGION_OPEN_CHILD_REGIONS = 9;
   SPLIT_TABLE_REGION_POST_OPERATION = 10;
+  SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS = 11;
 }
 
 message SplitTableRegionStateData {
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
index 230603721fa..f0ea25bc37d 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
@@ -138,6 +138,21 @@ public class SplitTableRegionProcedure
     }
   }
 
+  /**
+   * Check whether there are recovered.edits under the closed parent region.
+   * If any are present, the region was not closed properly, so we need to
+   * abort the split to prevent data loss.
+   * @param env the master procedure environment
+   * @throws IOException if recovered.edits are found, aborting the split
+   */
+  private void checkClosedRegion(final MasterProcedureEnv env) throws IOException {
+    if (WALSplitter.hasRecoveredEdits(env.getMasterServices().getFileSystem(),
+        env.getMasterConfiguration(), getRegion())) {
+      throw new IOException("Recovered.edits are found in Region: " + getRegion() +
+          ", abort split to prevent data loss");
+    }
+  }
+
   /**
    * Check whether the region is splittable
    * @param env MasterProcedureEnv
@@ -238,6 +253,10 @@ public class SplitTableRegionProcedure
           break;
         case SPLIT_TABLE_REGION_CLOSE_PARENT_REGION:
           addChildProcedure(createUnassignProcedures(env, getRegionReplication(env)));
+          setNextState(SplitTableRegionState.SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS);
+          break;
+        case SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS:
+          checkClosedRegion(env);
           setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_CREATE_DAUGHTER_REGIONS);
           break;
         case SPLIT_TABLE_REGION_CREATE_DAUGHTER_REGIONS:
@@ -312,6 +331,10 @@ public class SplitTableRegionProcedure
         case SPLIT_TABLE_REGION_WRITE_MAX_SEQUENCE_ID_FILE:
           // Doing nothing, as re-open parent region would clean up daughter region directories.
          break;
+        case SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS:
+          // Doing nothing; rolling back SPLIT_TABLE_REGION_CLOSE_PARENT_REGION
+          // will bring the parent region back online
+          break;
        case SPLIT_TABLE_REGION_CLOSE_PARENT_REGION:
          openParentRegion(env);
          break;
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALSplitter.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALSplitter.java
index a49b96b95cd..634f44f8d0b 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALSplitter.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALSplitter.java
@@ -63,6 +63,7 @@ import org.apache.hadoop.hbase.client.Delete;
 import org.apache.hadoop.hbase.client.Durability;
 import org.apache.hadoop.hbase.client.Mutation;
 import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.coordination.SplitLogWorkerCoordination;
 import org.apache.hadoop.hbase.io.HeapSize;
 import org.apache.hadoop.hbase.log.HBaseMarkers;
@@ -542,6 +543,29 @@ public class WALSplitter {
     return new Path(regiondir, HConstants.RECOVERED_EDITS_DIR);
   }
 
+  /**
+   * Check whether there are recovered.edits in the region dir
+   * @param fs FileSystem
+   * @param conf the Configuration used to locate the HBase root dir
+   * @param regionInfo the region to check
+   * @throws IOException on filesystem errors
+   * @return true if recovered.edits exist in the region dir
+   */
+  public static boolean hasRecoveredEdits(final FileSystem fs,
+      final Configuration conf, final RegionInfo regionInfo) throws IOException {
+    // No recovered.edits for non-default replica regions
+    if (regionInfo.getReplicaId() != RegionInfo.DEFAULT_REPLICA_ID) {
+      return false;
+    }
+    Path rootDir = FSUtils.getRootDir(conf);
+    // Only the default replica region can reach here, so we can use regionInfo
+    // directly without converting it to the default replica's regionInfo.
+    Path regionDir = HRegion.getRegionDir(rootDir, regionInfo);
+    NavigableSet<Path> files = getSplitEditFilesSorted(fs, regionDir);
+    return files != null && !files.isEmpty();
+  }
+
+
   /**
    * Returns sorted set of edit files made by splitter, excluding files
    * with '.temp' suffix.
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitRegionWhileRSCrash.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitRegionWhileRSCrash.java
new file mode 100644
index 00000000000..a881575c12e
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitRegionWhileRSCrash.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import java.util.List;
+import java.util.concurrent.CountDownLatch;
+
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.master.assignment.SplitTableRegionProcedure;
+import org.apache.hadoop.hbase.master.assignment.UnassignProcedure;
+import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
+import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Category({MasterTests.class, MediumTests.class})
+public class TestSplitRegionWhileRSCrash {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+      HBaseClassTestRule.forClass(TestSplitRegionWhileRSCrash.class);
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TestSplitRegionWhileRSCrash.class);
+
+  protected static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+  private static TableName TABLE_NAME = TableName.valueOf("test");
+  private static Admin admin;
+  private static byte[] CF = Bytes.toBytes("cf");
+  private static CountDownLatch mergeCommitArrive = new CountDownLatch(1);
+  private static Table TABLE;
+
+
+  @BeforeClass
+  public static void setupCluster() throws Exception {
+    UTIL.startMiniCluster(1);
+    admin = UTIL.getHBaseAdmin();
+    TABLE = UTIL.createTable(TABLE_NAME, CF);
+    UTIL.waitTableAvailable(TABLE_NAME);
+  }
+
+  @AfterClass
+  public static void cleanupTest() throws Exception {
+    try {
+      UTIL.shutdownMiniCluster();
+    } catch (Exception e) {
+      LOG.warn("failure shutting down cluster", e);
+    }
+  }
+
+  @Test
+  public void test() throws Exception {
+    MasterProcedureEnv env = UTIL.getMiniHBaseCluster().getMaster()
+        .getMasterProcedureExecutor().getEnvironment();
+    final ProcedureExecutor<MasterProcedureEnv> executor = UTIL.getMiniHBaseCluster()
+        .getMaster().getMasterProcedureExecutor();
+    List<RegionInfo> regionInfos = admin.getRegions(TABLE_NAME);
+    // Since a flush request will be sent while initializing SplitTableRegionProcedure,
+    // create the SplitTableRegionProcedure before putting data
+    SplitTableRegionProcedure splitProcedure = new SplitTableRegionProcedure(
+        env, regionInfos.get(0), Bytes.toBytes("row5"));
+    // Write some rows to the table
+    LOG.info("Begin to put data");
+    for (int i = 0; i < 10; i++) {
+      byte[] row = Bytes.toBytes("row" + i);
+      Put put = new Put(row);
+      put.addColumn(CF, CF, CF);
+      TABLE.put(put);
+    }
+    executor.submitProcedure(splitProcedure);
+    LOG.info("SplitProcedure submitted");
+    UTIL.waitFor(30000, () -> executor.getProcedures().stream()
+        .filter(p -> p instanceof UnassignProcedure)
+        .map(p -> (UnassignProcedure) p)
+        .anyMatch(p -> TABLE_NAME.equals(p.getTableName())));
+    UTIL.getMiniHBaseCluster().killRegionServer(
+        UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName());
+    UTIL.getMiniHBaseCluster().startRegionServer();
+    UTIL.waitUntilNoRegionsInTransition();
+    Scan scan = new Scan();
+    ResultScanner results = TABLE.getScanner(scan);
+    int count = 0;
+    Result result = null;
+    while ((result = results.next()) != null) {
+      count++;
+    }
+    Assert.assertEquals("There should be 10 rows!", 10, count);
+  }
+}
\ No newline at end of file
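
Below is a minimal, illustrative sketch, not part of the patch, of how the new WALSplitter.hasRecoveredEdits() guard is meant to be used: after the parent region has been unassigned and before the daughter regions are created, the split bails out if the parent directory still contains unreplayed recovered.edits. The class and method names SplitGuardSketch and ensureCleanlyClosed are hypothetical helpers for illustration; only WALSplitter.hasRecoveredEdits() comes from the patch above.

// Illustrative sketch only; hypothetical helper, not part of the patch.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.wal.WALSplitter;

public class SplitGuardSketch {
  /**
   * Refuse to continue a split when the parent region directory still holds
   * recovered.edits, i.e. the region was closed by a server crash rather than
   * cleanly, and those edits have not been replayed yet.
   */
  public static void ensureCleanlyClosed(FileSystem fs, Configuration conf,
      RegionInfo parent) throws IOException {
    if (WALSplitter.hasRecoveredEdits(fs, conf, parent)) {
      // Mirrors checkClosedRegion() in SplitTableRegionProcedure: throwing here
      // makes the procedure roll back, and rolling back the close-parent step
      // re-opens the parent so the pending edits are replayed instead of lost.
      throw new IOException("Recovered.edits found for region " +
          parent.getRegionNameAsString() + ", aborting split to prevent data loss");
    }
  }
}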