From 422e98957b9b4a0a98db8bf5d4a457a2a0ce5c22 Mon Sep 17 00:00:00 2001 From: tianjingyun Date: Sat, 27 Oct 2018 10:06:56 +0800 Subject: [PATCH] HBASE-21322 Add a scheduleServerCrashProcedure() API to HbckService Signed-off-by: Michael Stack --- .../apache/hadoop/hbase/client/HBaseHbck.java | 18 +++++++ .../org/apache/hadoop/hbase/client/Hbck.java | 5 ++ .../shaded/protobuf/RequestConverter.java | 7 +++ .../src/main/protobuf/Master.proto | 12 +++++ .../hbase/master/MasterRpcServices.java | 54 +++++++++++++++++++ .../master/assignment/AssignmentManager.java | 11 ++-- .../apache/hadoop/hbase/client/TestHbck.java | 21 ++++++++ 7 files changed, 123 insertions(+), 5 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java index 2d088253a4b..eb39a2d8195 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java @@ -26,6 +26,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.ipc.RpcControllerFactory; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.shaded.protobuf.RequestConverter; +import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.GetTableStateResponse; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.HbckService.BlockingInterface; @@ -154,4 +155,21 @@ public class HBaseHbck implements Hbck { }); return response.getBypassedList(); } + + @Override + public List scheduleServerCrashProcedure(List serverNames) + throws IOException { + try { + MasterProtos.ScheduleServerCrashProcedureResponse response = + this.hbck.scheduleServerCrashProcedure(rpcControllerFactory.newController(), + RequestConverter.toScheduleServerCrashProcedureRequest(serverNames)); + return response.getPidList(); + } catch (ServiceException se) { + LOG.debug(toCommaDelimitedString( + serverNames.stream().map(serverName -> ProtobufUtil.toServerName(serverName).toString()) + .collect(Collectors.toList())), + se); + throw new IOException(se); + } + } } diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java index 5c97d97daab..e88805cdcc3 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java @@ -25,6 +25,8 @@ import org.apache.hadoop.hbase.Abortable; import org.apache.hadoop.hbase.HBaseInterfaceAudience; import org.apache.yetus.audience.InterfaceAudience; +import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos; + /** * Hbck fixup tool APIs. Obtain an instance from {@link ClusterConnection#getHbck()} and call * {@link #close()} when done. @@ -101,4 +103,7 @@ public interface Hbck extends Abortable, Closeable { */ List bypassProcedure(List pids, long waitTime, boolean override, boolean recursive) throws IOException; + + List scheduleServerCrashProcedure(List serverNames) + throws IOException; } diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java index 7db8b0225d5..a4bf8990c05 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java @@ -1897,6 +1897,13 @@ public final class RequestConverter { setOverride(override).build(); } + public static MasterProtos.ScheduleServerCrashProcedureRequest + toScheduleServerCrashProcedureRequest(List serverNames) { + MasterProtos.ScheduleServerCrashProcedureRequest.Builder b = + MasterProtos.ScheduleServerCrashProcedureRequest.newBuilder(); + return b.addAllServerName(serverNames).build(); + } + private static List toEncodedRegionNameRegionSpecifiers( List encodedRegionNames) { return encodedRegionNames.stream(). diff --git a/hbase-protocol-shaded/src/main/protobuf/Master.proto b/hbase-protocol-shaded/src/main/protobuf/Master.proto index afc6e64cd32..f895ca0c8b8 100644 --- a/hbase-protocol-shaded/src/main/protobuf/Master.proto +++ b/hbase-protocol-shaded/src/main/protobuf/Master.proto @@ -1042,6 +1042,14 @@ message BypassProcedureResponse { repeated bool bypassed = 1; } +message ScheduleServerCrashProcedureRequest { + repeated ServerName serverName = 1; +} + +message ScheduleServerCrashProcedureResponse { + repeated uint64 pid = 1; +} + service HbckService { /** Update state of the table in meta only*/ rpc SetTableStateInMeta(SetTableStateInMetaRequest) @@ -1068,4 +1076,8 @@ service HbckService { /** Bypass a procedure to completion, procedure is completed but no actual work is done*/ rpc BypassProcedure(BypassProcedureRequest) returns(BypassProcedureResponse); + + /** Schedule a ServerCrashProcedure to help recover a crash server */ + rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest) + returns(ScheduleServerCrashProcedureResponse); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java index 17af66d2e9d..778a4379b1d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java @@ -18,6 +18,8 @@ */ package org.apache.hadoop.hbase.master; +import static org.apache.hadoop.hbase.master.MasterWalManager.META_FILTER; + import java.io.IOException; import java.net.BindException; import java.net.InetAddress; @@ -32,6 +34,7 @@ import java.util.Set; import java.util.function.BiFunction; import java.util.stream.Collectors; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.ClusterMetricsBuilder; import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.HConstants; @@ -65,8 +68,10 @@ import org.apache.hadoop.hbase.ipc.RpcServerInterface; import org.apache.hadoop.hbase.ipc.ServerRpcController; import org.apache.hadoop.hbase.master.assignment.RegionStates; import org.apache.hadoop.hbase.master.locking.LockProcedure; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil; import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil.NonceProcedureRunnable; +import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; import org.apache.hadoop.hbase.mob.MobUtils; import org.apache.hadoop.hbase.procedure.MasterProcedureManager; import org.apache.hadoop.hbase.procedure2.LockType; @@ -97,6 +102,7 @@ import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.ForeignExceptionUtil; import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.yetus.audience.InterfaceAudience; import org.apache.zookeeper.KeeperException; import org.slf4j.Logger; @@ -2424,4 +2430,52 @@ public class MasterRpcServices extends RSRpcServices throw new ServiceException(e); } } + + @Override + public MasterProtos.ScheduleServerCrashProcedureResponse scheduleServerCrashProcedure( + RpcController controller, MasterProtos.ScheduleServerCrashProcedureRequest request) + throws ServiceException { + List serverNames = request.getServerNameList(); + List pids = new ArrayList<>(); + try { + for (HBaseProtos.ServerName serverName : serverNames) { + ServerName server = ProtobufUtil.toServerName(serverName); + if (shouldSubmitSCP(server)) { + ProcedureExecutor procExec = this.master.getMasterProcedureExecutor(); + pids.add(procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), + server, true, containMetaWals(server)))); + } else { + pids.add(-1L); + } + } + return MasterProtos.ScheduleServerCrashProcedureResponse.newBuilder().addAllPid(pids).build(); + } catch (IOException e) { + throw new ServiceException(e); + } + } + + private boolean containMetaWals(ServerName serverName) throws IOException { + Path logDir = new Path(master.getWALRootDir(), + AbstractFSWALProvider.getWALDirectoryName(serverName.toString())); + Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT); + Path checkDir = master.getFileSystem().exists(splitDir) ? splitDir : logDir; + return master.getFileSystem().listStatus(checkDir, META_FILTER).length > 0; + } + + private boolean shouldSubmitSCP(ServerName serverName) { + // check if there is already a SCP of this server running + List> procedures = + master.getMasterProcedureExecutor().getProcedures(); + for (Procedure procedure : procedures) { + if (procedure instanceof ServerCrashProcedure) { + if (serverName.compareTo(((ServerCrashProcedure) procedure).getServerName()) == 0 + && !procedure.isFinished()) { + LOG.info("there is already a SCP of this server {} running, pid {}", serverName, + procedure.getProcId()); + return false; + } + } + } + return true; + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index d3976e82084..22fff58d43e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -1330,13 +1330,14 @@ public class AssignmentManager implements ServerListener { return 0; } - public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) { + public long submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) { boolean carryingMeta = isCarryingMeta(serverName); ProcedureExecutor procExec = this.master.getMasterProcedureExecutor(); - procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName, - shouldSplitWal, carryingMeta)); - LOG.debug("Added=" + serverName + - " to dead servers, submitted shutdown handler to be executed meta=" + carryingMeta); + long pid = procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), + serverName, shouldSplitWal, carryingMeta)); + LOG.debug("Added=" + serverName + + " to dead servers, submitted shutdown handler to be executed meta=" + carryingMeta); + return pid; } public void offlineRegion(final RegionInfo regionInfo) { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java index 7680845cce5..8d9380f9745 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java @@ -28,6 +28,7 @@ import java.util.stream.Collectors; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; @@ -36,6 +37,7 @@ import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; +import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.testclassification.ClientTests; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.util.Bytes; @@ -50,6 +52,8 @@ import org.junit.rules.TestName; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; + /** * Class to test HBaseHbck. * Spins up the minicluster once at test start and then takes it down afterward. @@ -173,6 +177,23 @@ public class TestHbck { } } + @Test + public void testScheduleSCP() throws Exception { + HRegionServer testRs = TEST_UTIL.getRSForFirstRegionInTable(TABLE_NAME); + TEST_UTIL.loadTable(TEST_UTIL.getConnection().getTable(TABLE_NAME), Bytes.toBytes("family1"), + true); + ServerName serverName = testRs.getServerName(); + Hbck hbck = TEST_UTIL.getHbck(); + List pids = + hbck.scheduleServerCrashProcedure(Arrays.asList(ProtobufUtil.toServerName(serverName))); + assertTrue(pids.get(0) > 0); + LOG.info("pid is {}", pids.get(0)); + + pids = hbck.scheduleServerCrashProcedure(Arrays.asList(ProtobufUtil.toServerName(serverName))); + assertTrue(pids.get(0) == -1); + LOG.info("pid is {}", pids.get(0)); + } + private void waitOnPids(List pids) { for (Long pid: pids) { while (!TEST_UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor().