From af8dc2e4dfd3befa88907258282fdf7a53ad150c Mon Sep 17 00:00:00 2001 From: Peter Somogyi Date: Thu, 11 Mar 2021 17:20:36 +0100 Subject: [PATCH] HBASE-25587 [hbck2] Schedule SCP for all unknown servers (#2978) Signed-off-by: Wellington Chevreuil --- .../apache/hadoop/hbase/client/HBaseHbck.java | 16 ++++++ .../org/apache/hadoop/hbase/client/Hbck.java | 2 + .../src/main/protobuf/Master.proto | 10 ++++ .../hbase/master/MasterRpcServices.java | 25 +++++++++ .../hadoop/hbase/master/ServerManager.java | 11 ++++ .../hbase/master/procedure/TestHBCKSCP.java | 30 ++++++---- .../master/procedure/TestHBCKSCPUnknown.java | 55 +++++++++++++++++++ 7 files changed, 137 insertions(+), 12 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCPUnknown.java diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java index d146a905a75..85920b66d11 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java @@ -45,6 +45,8 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.HbckServic import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RegionSpecifierAndState; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreRequest; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreResponse; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleSCPsForUnknownServersRequest; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleSCPsForUnknownServersResponse; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleServerCrashProcedureResponse; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.UnassignsResponse; @@ -201,6 +203,20 @@ public class HBaseHbck 
implements Hbck { } } + @Override + public List scheduleSCPsForUnknownServers() throws IOException { + try { + ScheduleSCPsForUnknownServersResponse response = + this.hbck.scheduleSCPsForUnknownServers( + rpcControllerFactory.newController(), + ScheduleSCPsForUnknownServersRequest.newBuilder().build()); + return response.getPidList(); + } catch (ServiceException se) { + LOG.debug("Failed to run ServerCrashProcedures for unknown servers", se); + throw new IOException(se); + } + } + @Override public boolean runHbckChore() throws IOException { try { diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java index 57d2912c206..99befa4aa11 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java @@ -130,6 +130,8 @@ public interface Hbck extends Abortable, Closeable { List scheduleServerCrashProcedures(List serverNames) throws IOException; + List scheduleSCPsForUnknownServers() throws IOException; + /** * Request HBCK chore to run at master side. 
* diff --git a/hbase-protocol-shaded/src/main/protobuf/Master.proto b/hbase-protocol-shaded/src/main/protobuf/Master.proto index ac1e8db08f9..4a66e2d74a4 100644 --- a/hbase-protocol-shaded/src/main/protobuf/Master.proto +++ b/hbase-protocol-shaded/src/main/protobuf/Master.proto @@ -1181,6 +1181,12 @@ message ScheduleServerCrashProcedureResponse { repeated uint64 pid = 1; } +message ScheduleSCPsForUnknownServersRequest {} + +message ScheduleSCPsForUnknownServersResponse { + repeated uint64 pid = 1; +} + message FixMetaRequest {} message FixMetaResponse {} @@ -1220,6 +1226,10 @@ service HbckService { rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest) returns(ScheduleServerCrashProcedureResponse); + /** Schedule a ServerCrashProcedure for each unknown server */ + rpc ScheduleSCPsForUnknownServers(ScheduleSCPsForUnknownServersRequest) + returns(ScheduleSCPsForUnknownServersResponse); + /** * Request HBCK chore to run at master side. */ diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java index c3b94d98919..9f7974e8998 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java @@ -2700,6 +2700,31 @@ public class MasterRpcServices extends RSRpcServices implements return MasterProtos.ScheduleServerCrashProcedureResponse.newBuilder().addAllPid(pids).build(); } + @Override + public MasterProtos.ScheduleSCPsForUnknownServersResponse scheduleSCPsForUnknownServers( + RpcController controller, MasterProtos.ScheduleSCPsForUnknownServersRequest request) + throws ServiceException { + + List pids = new ArrayList<>(); + final Set serverNames = + master.getAssignmentManager().getRegionStates().getRegionStates().stream() + .map(RegionState::getServerName).collect(Collectors.toSet()); + + final Set unknownServerNames = 
serverNames.stream() + .filter(sn -> master.getServerManager().isServerUnknown(sn)).collect(Collectors.toSet()); + + for (ServerName sn: unknownServerNames) { + LOG.info("{} schedule ServerCrashProcedure for unknown {}", + this.master.getClientIdAuditPrefix(), sn); + if (shouldSubmitSCP(sn)) { + pids.add(this.master.getServerManager().expireServer(sn, true)); + } else { + pids.add(Procedure.NO_PROC_ID); + } + } + return MasterProtos.ScheduleSCPsForUnknownServersResponse.newBuilder().addAllPid(pids).build(); + } + @Override public FixMetaResponse fixMeta(RpcController controller, FixMetaRequest request) throws ServiceException { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 16cef3e3eaf..6b9a0a566ed 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -917,6 +917,17 @@ public class ServerManager { return serverName == null || deadservers.isDeadServer(serverName); } + /** + * Check if a server is unknown. A server can be online, + * or known to be dead, or unknown to this manager (i.e., not online, + * not known to be dead either; it is simply not tracked by the + * master any more, for example, a very old previous instance). 
+ */ + public boolean isServerUnknown(ServerName serverName) { + return serverName == null + || (!onlineServers.containsKey(serverName) && !deadservers.isDeadServer(serverName)); + } + public void shutdownCluster() { String statusStr = "Cluster shutdown requested of master=" + this.master.getServerName(); LOG.info(statusStr); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java index a4d251f64f5..beb4fe36071 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java @@ -40,12 +40,8 @@ import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.regionserver.HRegionServer; -import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; -import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos; - import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.testclassification.MasterTests; - import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.util.Threads; @@ -57,6 +53,10 @@ import org.junit.rules.TestName; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException; +import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos; + /** * Test of the HBCK-version of SCP. @@ -109,15 +109,16 @@ public class TestHBCKSCP extends TestSCPBase { // not be processing this server 'normally'. Remove it from processing by // calling 'finish' and then remove it from dead servers so rsServerName // becomes an 'Unknown Server' even though it is still around. 
+ LOG.info("Killing {}", rsServerName); + cluster.killRegionServer(rsServerName); + master.getServerManager().moveFromOnlineToDeadServers(rsServerName); master.getServerManager().getDeadServers().finish(rsServerName); master.getServerManager().getDeadServers().removeDeadServer(rsServerName); master.getAssignmentManager().getRegionStates().removeServer(rsServerName); // Kill the server. Nothing should happen since an 'Unknown Server' as far // as the Master is concerned; i.e. no SCP. - LOG.info("Killing {}", rsServerName); HRegionServer hrs = cluster.getRegionServer(rsServerName); - hrs.abort("KILLED"); while (!hrs.isStopped()) { Threads.sleep(10); } @@ -135,12 +136,7 @@ public class TestHBCKSCP extends TestSCPBase { // I now have 'Unknown Server' references in hbase:meta; i.e. Server references // with no corresponding SCP. Queue one. - MasterProtos.ScheduleServerCrashProcedureResponse response = - master.getMasterRpcServices().scheduleServerCrashProcedure(null, - MasterProtos.ScheduleServerCrashProcedureRequest.newBuilder(). - addServerName(ProtobufUtil.toServerName(rsServerName)).build()); - assertEquals(1, response.getPidCount()); - long pid = response.getPid(0); + long pid = scheduleHBCKSCP(rsServerName, master); assertNotEquals(Procedure.NO_PROC_ID, pid); while (master.getMasterProcedureExecutor().getActiveProcIds().contains(pid)) { Threads.sleep(10); @@ -156,6 +152,16 @@ public class TestHBCKSCP extends TestSCPBase { assertFalse(searchMeta(master, rsServerName)); } + protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException { + MasterProtos.ScheduleServerCrashProcedureResponse response = + master.getMasterRpcServices().scheduleServerCrashProcedure(null, + MasterProtos.ScheduleServerCrashProcedureRequest.newBuilder(). 
+ addServerName(ProtobufUtil.toServerName(rsServerName)).build()); + assertEquals(1, response.getPidCount()); + long pid = response.getPid(0); + return pid; + } + /** * @return True if we find reference to sn in meta table. */ diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCPUnknown.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCPUnknown.java new file mode 100644 index 00000000000..6702f402339 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCPUnknown.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hbase.master.procedure; + +import static org.junit.Assert.assertEquals; + +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.testclassification.LargeTests; +import org.apache.hadoop.hbase.testclassification.MasterTests; + +import org.junit.ClassRule; +import org.junit.experimental.categories.Category; +import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos; + + +/** + * Test of the HBCK-version of SCP, scheduled through the scheduleSCPsForUnknownServers RPC. + * The HBCKSCP is an SCP, except that it reads hbase:meta for the list of Regions that were + * on the server-to-process rather than consulting the Master's in-memory state. + */ +@Category({ MasterTests.class, LargeTests.class }) +public class TestHBCKSCPUnknown extends TestHBCKSCP { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestHBCKSCPUnknown.class); + + @Override + protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException { + MasterProtos.ScheduleSCPsForUnknownServersResponse response = + master.getMasterRpcServices().scheduleSCPsForUnknownServers(null, + MasterProtos.ScheduleSCPsForUnknownServersRequest.newBuilder().build()); + assertEquals(1, response.getPidCount()); + long pid = response.getPid(0); + return pid; + } +}