diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManagerUtil.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManagerUtil.java index 36ac6cfb3de..97ae7eadd6b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManagerUtil.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManagerUtil.java @@ -25,7 +25,6 @@ import java.util.ListIterator; import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; - import org.apache.commons.lang3.ArrayUtils; import org.apache.hadoop.hbase.HBaseIOException; import org.apache.hadoop.hbase.ServerName; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java index d3b209e18ce..7811d9bf906 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java @@ -480,6 +480,13 @@ public class MergeTableRegionsProcedure * Prepare merge and do some check */ private boolean prepareMergeRegion(final MasterProcedureEnv env) throws IOException { + // Fail if we are taking snapshot for the given table + if (env.getMasterServices().getSnapshotManager() + .isTakingSnapshot(regionsToMerge[0].getTable())) { + throw new MergeRegionException( + "Skip merging regions " + RegionInfo.getShortNameToLog(regionsToMerge) + + ", because we are taking snapshot for the table " + regionsToMerge[0].getTable()); + } // Note: the following logic assumes that we only have 2 regions to merge. In the future, // if we want to extend to more than 2 regions, the code needs to be modified a little bit. CatalogJanitor catalogJanitor = env.getMasterServices().getCatalogJanitor(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java index bcee8979cc2..b66d91fd966 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java @@ -454,6 +454,13 @@ public class SplitTableRegionProcedure */ @VisibleForTesting public boolean prepareSplitRegion(final MasterProcedureEnv env) throws IOException { + // Fail if we are taking snapshot for the given table + if (env.getMasterServices().getSnapshotManager() + .isTakingSnapshot(getParentRegion().getTable())) { + setFailure(new IOException("Skip splitting region " + getParentRegion().getShortNameToLog() + + ", because we are taking snapshot for the table " + getParentRegion().getTable())); + return false; + } // Check whether the region is splittable RegionStateNode node = env.getAssignmentManager().getRegionStates().getRegionStateNode(getParentRegion()); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/DisabledTableSnapshotHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/DisabledTableSnapshotHandler.java index b1b6886930a..39b84252910 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/DisabledTableSnapshotHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/DisabledTableSnapshotHandler.java @@ -122,4 +122,11 @@ public class DisabledTableSnapshotHandler extends TakeSnapshotHandler { + " as finished."); } } + + @Override + protected boolean downgradeToSharedTableLock() { + // for taking snapshot on disabled table, it is OK to always hold the exclusive lock, as we do + // not need to assign the regions when there are region server crashes. + return false; + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/EnabledTableSnapshotHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/EnabledTableSnapshotHandler.java index 445685ced89..3a6a8136119 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/EnabledTableSnapshotHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/EnabledTableSnapshotHandler.java @@ -132,4 +132,12 @@ public class EnabledTableSnapshotHandler extends TakeSnapshotHandler { monitor.rethrowException(); status.setStatus("Completed referencing HFiles for the mob region of table: " + snapshotTable); } + + @Override + protected boolean downgradeToSharedTableLock() { + // return true here to change from exclusive lock to shared lock, so we can still assign regions + // while taking snapshots. This is important, as region server crash can happen at any time, if + // we can not assign regions then the cluster will be in trouble as the regions can not online. + return true; + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotManager.java index e496b4485b4..82acc7f6b9f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotManager.java @@ -420,7 +420,7 @@ public class SnapshotManager extends MasterProcedureManager implements Stoppable * @param tableName name of the table being snapshotted. * @return true if there is a snapshot in progress on the specified table. */ - synchronized boolean isTakingSnapshot(final TableName tableName) { + public synchronized boolean isTakingSnapshot(final TableName tableName) { SnapshotSentinel handler = this.snapshotHandlers.get(tableName); return handler != null && !handler.isFinished(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/TakeSnapshotHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/TakeSnapshotHandler.java index f6b48cbbcd7..d44312ab9ac 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/TakeSnapshotHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/TakeSnapshotHandler.java @@ -44,6 +44,7 @@ import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.MetricsSnapshot; import org.apache.hadoop.hbase.master.SnapshotSentinel; import org.apache.hadoop.hbase.master.locking.LockManager; +import org.apache.hadoop.hbase.master.locking.LockManager.MasterLock; import org.apache.hadoop.hbase.monitoring.MonitoredTask; import org.apache.hadoop.hbase.monitoring.TaskMonitor; import org.apache.hadoop.hbase.procedure2.LockType; @@ -88,7 +89,7 @@ public abstract class TakeSnapshotHandler extends EventHandler implements Snapsh protected final Path workingDir; private final MasterSnapshotVerifier verifier; protected final ForeignExceptionDispatcher monitor; - protected final LockManager.MasterLock tableLock; + private final LockManager.MasterLock tableLock; protected final MonitoredTask status; protected final TableName snapshotTable; protected final SnapshotManifest snapshotManifest; @@ -174,8 +175,16 @@ public abstract class TakeSnapshotHandler extends EventHandler implements Snapsh + eventType + " on table " + snapshotTable; LOG.info(msg); ReentrantLock lock = snapshotManager.getLocks().acquireLock(snapshot.getName()); + MasterLock tableLockToRelease = this.tableLock; status.setStatus(msg); try { + if (downgradeToSharedTableLock()) { + // release the exclusive lock and hold the shared lock instead + tableLockToRelease = master.getLockManager().createMasterLock(snapshotTable, + LockType.SHARED, this.getClass().getName() + ": take snapshot " + snapshot.getName()); + tableLock.release(); + tableLockToRelease.acquire(); + } // If regions move after this meta scan, the region specific snapshot should fail, triggering // an external exception that gets captured here. @@ -243,7 +252,7 @@ public abstract class TakeSnapshotHandler extends EventHandler implements Snapsh LOG.error("Couldn't delete snapshot working directory:" + workingDir); } lock.unlock(); - tableLock.release(); + tableLockToRelease.release(); } } @@ -282,6 +291,16 @@ public abstract class TakeSnapshotHandler extends EventHandler implements Snapsh finished = true; } + /** + * When taking snapshot, first we must acquire the exclusive table lock to confirm that there are + * no ongoing merge/split procedures. But later, we should try our best to release the exclusive + * lock as this may hurt the availability, because we need to hold the shared lock when assigning + * regions. + *

+ * See HBASE-21480 for more details. + */ + protected abstract boolean downgradeToSharedTableLock(); + /** * Snapshot the specified regions */ diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestSnapshotWhileRSCrashes.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestSnapshotWhileRSCrashes.java new file mode 100644 index 00000000000..46dc99f8609 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestSnapshotWhileRSCrashes.java @@ -0,0 +1,98 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.snapshot; + +import static org.junit.Assert.assertNull; + +import java.io.IOException; +import java.io.UncheckedIOException; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.master.locking.LockManager.MasterLock; +import org.apache.hadoop.hbase.master.locking.LockProcedure; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.procedure2.LockType; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category({ MasterTests.class, MediumTests.class }) +public class TestSnapshotWhileRSCrashes { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestSnapshotWhileRSCrashes.class); + + private static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); + + private static TableName NAME = TableName.valueOf("Cleanup"); + + private static byte[] CF = Bytes.toBytes("cf"); + + @BeforeClass + public static void setUp() throws Exception { + UTIL.startMiniCluster(3); + UTIL.createMultiRegionTable(NAME, CF); + UTIL.waitTableAvailable(NAME); + } + + @AfterClass + public static void tearDown() throws Exception { + UTIL.shutdownMiniCluster(); + } + + @Test + public void test() throws InterruptedException, IOException { + String snName = "sn"; + MasterLock lock = UTIL.getMiniHBaseCluster().getMaster().getLockManager().createMasterLock(NAME, + LockType.EXCLUSIVE, "for testing"); + lock.acquire(); + Thread t = new Thread(() -> { + try { + UTIL.getAdmin().snapshot(snName, NAME); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }); + t.setDaemon(true); + t.start(); + ProcedureExecutor procExec = + UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); + UTIL.waitFor(10000, + () -> procExec.getProcedures().stream().filter(p -> !p.isFinished()) + .filter(p -> p instanceof LockProcedure).map(p -> (LockProcedure) p) + .filter(p -> NAME.equals(p.getTableName())).anyMatch(p -> !p.isLocked())); + UTIL.getMiniHBaseCluster().stopRegionServer(0); + lock.release(); + // the snapshot can not work properly when there are rs crashes, so here we just want to make + // sure that the regions could online + try (Table table = UTIL.getConnection().getTable(NAME); + ResultScanner scanner = table.getScanner(CF)) { + assertNull(scanner.next()); + } + } +}