HBASE-25587 [hbck2] Schedule SCP for all unknown servers (#2978)

Signed-off-by: Wellington Chevreuil <wchevreuil@apache.org>
This commit is contained in:
Peter Somogyi 2021-03-11 17:20:36 +01:00
parent b5fc5e17e2
commit af8dc2e4df
7 changed files with 137 additions and 12 deletions

View File

@ -45,6 +45,8 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.HbckServic
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RegionSpecifierAndState; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RegionSpecifierAndState;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreRequest; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreRequest;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreResponse; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleSCPsForUnknownServersRequest;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleSCPsForUnknownServersResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleServerCrashProcedureResponse; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleServerCrashProcedureResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.UnassignsResponse; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.UnassignsResponse;
@ -201,6 +203,20 @@ public class HBaseHbck implements Hbck {
} }
} }
/**
 * Asks the Master, through the Hbck service stub, to schedule ServerCrashProcedures for the
 * servers it considers unknown.
 * @return the procedure ids carried by the Master's response
 * @throws IOException wrapping the {@link ServiceException} raised when the call fails
 */
@Override
public List<Long> scheduleSCPsForUnknownServers() throws IOException {
  // The request message has no fields, so an empty instance is all that is needed.
  final ScheduleSCPsForUnknownServersRequest emptyRequest =
    ScheduleSCPsForUnknownServersRequest.newBuilder().build();
  try {
    return this.hbck
      .scheduleSCPsForUnknownServers(rpcControllerFactory.newController(), emptyRequest)
      .getPidList();
  } catch (ServiceException se) {
    LOG.debug("Failed to run ServerCrashProcedures for unknown servers", se);
    throw new IOException(se);
  }
}
@Override @Override
public boolean runHbckChore() throws IOException { public boolean runHbckChore() throws IOException {
try { try {

View File

@ -130,6 +130,8 @@ public interface Hbck extends Abortable, Closeable {
List<Long> scheduleServerCrashProcedures(List<ServerName> serverNames) throws IOException; List<Long> scheduleServerCrashProcedures(List<ServerName> serverNames) throws IOException;
/**
 * Request the Master to schedule ServerCrashProcedures for all unknown servers. Unlike
 * {@link #scheduleServerCrashProcedures(List)}, the caller does not name the servers; the
 * Master determines which servers are unknown.
 * @return the procedure ids reported by the Master, one entry per unknown server it processed
 * @throws IOException if the request to the Master fails
 */
List<Long> scheduleSCPsForUnknownServers() throws IOException;
/** /**
* Request HBCK chore to run at master side. * Request HBCK chore to run at master side.
* *

View File

@ -1181,6 +1181,12 @@ message ScheduleServerCrashProcedureResponse {
repeated uint64 pid = 1; repeated uint64 pid = 1;
} }
// Request to schedule ServerCrashProcedures for unknown servers. Carries no fields: the
// receiver decides which servers qualify.
message ScheduleSCPsForUnknownServersRequest {}
// Response to ScheduleSCPsForUnknownServersRequest.
message ScheduleSCPsForUnknownServersResponse {
// Procedure ids, one entry per server that was processed.
repeated uint64 pid = 1;
}
message FixMetaRequest {} message FixMetaRequest {}
message FixMetaResponse {} message FixMetaResponse {}
@ -1220,6 +1226,10 @@ service HbckService {
rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest) rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest)
returns(ScheduleServerCrashProcedureResponse); returns(ScheduleServerCrashProcedureResponse);
/** Schedule a ServerCrashProcedure for each unknown server. */
rpc ScheduleSCPsForUnknownServers(ScheduleSCPsForUnknownServersRequest)
returns(ScheduleSCPsForUnknownServersResponse);
/** /**
* Request HBCK chore to run at master side. * Request HBCK chore to run at master side.
*/ */

View File

@ -2700,6 +2700,31 @@ public class MasterRpcServices extends RSRpcServices implements
return MasterProtos.ScheduleServerCrashProcedureResponse.newBuilder().addAllPid(pids).build(); return MasterProtos.ScheduleServerCrashProcedureResponse.newBuilder().addAllPid(pids).build();
} }
/**
 * Schedules a ServerCrashProcedure for every unknown server that is still referenced by a
 * region state held by the AssignmentManager.
 * <p>
 * The response holds one pid per unknown server: the id returned by
 * {@code ServerManager#expireServer} when an SCP is submitted, or
 * {@link Procedure#NO_PROC_ID} when {@code shouldSubmitSCP} declines to submit one.
 * @param controller not used by this method
 * @param request not used by this method (the message carries no fields)
 * @return response carrying the collected procedure ids
 */
@Override
public MasterProtos.ScheduleSCPsForUnknownServersResponse scheduleSCPsForUnknownServers(
    RpcController controller, MasterProtos.ScheduleSCPsForUnknownServersRequest request)
    throws ServiceException {
  // isServerUnknown(null) is true, so a region state that reports no server name would
  // otherwise reach shouldSubmitSCP/expireServer with a null server. Drop those entries
  // first: there is no server to expire for them.
  final Set<ServerName> unknownServerNames =
    master.getAssignmentManager().getRegionStates().getRegionStates().stream()
      .map(RegionState::getServerName)
      .filter(sn -> sn != null)
      .filter(sn -> master.getServerManager().isServerUnknown(sn))
      .collect(Collectors.toSet());

  final List<Long> pids = new ArrayList<>(unknownServerNames.size());
  for (ServerName sn : unknownServerNames) {
    LOG.info("{} schedule ServerCrashProcedure for unknown {}",
      this.master.getClientIdAuditPrefix(), sn);
    if (shouldSubmitSCP(sn)) {
      pids.add(this.master.getServerManager().expireServer(sn, true));
    } else {
      pids.add(Procedure.NO_PROC_ID);
    }
  }
  return MasterProtos.ScheduleSCPsForUnknownServersResponse.newBuilder().addAllPid(pids).build();
}
@Override @Override
public FixMetaResponse fixMeta(RpcController controller, FixMetaRequest request) public FixMetaResponse fixMeta(RpcController controller, FixMetaRequest request)
throws ServiceException { throws ServiceException {

View File

@ -917,6 +917,17 @@ public class ServerManager {
return serverName == null || deadservers.isDeadServer(serverName); return serverName == null || deadservers.isDeadServer(serverName);
} }
/**
 * Check if a server is unknown. A server can be online, known to be dead, or unknown to
 * this manager (i.e., neither online nor known to be dead; it is simply not tracked by the
 * master any more, for example, a very old previous instance).
 * @param serverName the server to check; {@code null} is reported as unknown
 * @return {@code true} if the server is neither online nor in the dead servers list
 */
public boolean isServerUnknown(ServerName serverName) {
  if (serverName == null) {
    return true;
  }
  if (onlineServers.containsKey(serverName)) {
    return false;
  }
  return !deadservers.isDeadServer(serverName);
}
public void shutdownCluster() { public void shutdownCluster() {
String statusStr = "Cluster shutdown requested of master=" + this.master.getServerName(); String statusStr = "Cluster shutdown requested of master=" + this.master.getServerName();
LOG.info(statusStr); LOG.info(statusStr);

View File

@ -40,12 +40,8 @@ import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.util.Threads;
@ -57,6 +53,10 @@ import org.junit.rules.TestName;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
/** /**
* Test of the HBCK-version of SCP. * Test of the HBCK-version of SCP.
@ -109,15 +109,16 @@ public class TestHBCKSCP extends TestSCPBase {
// not be processing this server 'normally'. Remove it from processing by // not be processing this server 'normally'. Remove it from processing by
// calling 'finish' and then remove it from dead servers so rsServerName // calling 'finish' and then remove it from dead servers so rsServerName
// becomes an 'Unknown Server' even though it is still around. // becomes an 'Unknown Server' even though it is still around.
LOG.info("Killing {}", rsServerName);
cluster.killRegionServer(rsServerName);
master.getServerManager().moveFromOnlineToDeadServers(rsServerName); master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
master.getServerManager().getDeadServers().finish(rsServerName); master.getServerManager().getDeadServers().finish(rsServerName);
master.getServerManager().getDeadServers().removeDeadServer(rsServerName); master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
master.getAssignmentManager().getRegionStates().removeServer(rsServerName); master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
// Kill the server. Nothing should happen since an 'Unknown Server' as far // Kill the server. Nothing should happen since an 'Unknown Server' as far
// as the Master is concerned; i.e. no SCP. // as the Master is concerned; i.e. no SCP.
LOG.info("Killing {}", rsServerName);
HRegionServer hrs = cluster.getRegionServer(rsServerName); HRegionServer hrs = cluster.getRegionServer(rsServerName);
hrs.abort("KILLED");
while (!hrs.isStopped()) { while (!hrs.isStopped()) {
Threads.sleep(10); Threads.sleep(10);
} }
@ -135,12 +136,7 @@ public class TestHBCKSCP extends TestSCPBase {
// I now have 'Unknown Server' references in hbase:meta; i.e. Server references // I now have 'Unknown Server' references in hbase:meta; i.e. Server references
// with no corresponding SCP. Queue one. // with no corresponding SCP. Queue one.
MasterProtos.ScheduleServerCrashProcedureResponse response = long pid = scheduleHBCKSCP(rsServerName, master);
master.getMasterRpcServices().scheduleServerCrashProcedure(null,
MasterProtos.ScheduleServerCrashProcedureRequest.newBuilder().
addServerName(ProtobufUtil.toServerName(rsServerName)).build());
assertEquals(1, response.getPidCount());
long pid = response.getPid(0);
assertNotEquals(Procedure.NO_PROC_ID, pid); assertNotEquals(Procedure.NO_PROC_ID, pid);
while (master.getMasterProcedureExecutor().getActiveProcIds().contains(pid)) { while (master.getMasterProcedureExecutor().getActiveProcIds().contains(pid)) {
Threads.sleep(10); Threads.sleep(10);
@ -156,6 +152,16 @@ public class TestHBCKSCP extends TestSCPBase {
assertFalse(searchMeta(master, rsServerName)); assertFalse(searchMeta(master, rsServerName));
} }
/**
 * Schedules a ServerCrashProcedure for <code>rsServerName</code> by invoking
 * scheduleServerCrashProcedure on the Master's rpc services directly. Asserts that exactly
 * one pid comes back. Protected so subclasses can schedule the procedure a different way.
 * @return the single procedure id in the response
 */
protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
  MasterProtos.ScheduleServerCrashProcedureRequest scpRequest =
    MasterProtos.ScheduleServerCrashProcedureRequest.newBuilder()
      .addServerName(ProtobufUtil.toServerName(rsServerName))
      .build();
  MasterProtos.ScheduleServerCrashProcedureResponse scpResponse =
    master.getMasterRpcServices().scheduleServerCrashProcedure(null, scpRequest);
  assertEquals(1, scpResponse.getPidCount());
  return scpResponse.getPid(0);
}
/** /**
* @return True if we find reference to <code>sn</code> in meta table. * @return True if we find reference to <code>sn</code> in meta table.
*/ */

View File

@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master.procedure;
import static org.junit.Assert.assertEquals;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.junit.ClassRule;
import org.junit.experimental.categories.Category;
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
/**
 * Variant of {@link TestHBCKSCP} that schedules the SCP through
 * scheduleSCPsForUnknownServers, which takes no server name, instead of naming the
 * server to process explicitly.
 */
@Category({ MasterTests.class, LargeTests.class })
public class TestHBCKSCPUnknown extends TestHBCKSCP {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestHBCKSCPUnknown.class);

  /**
   * Schedules SCPs for unknown servers; <code>rsServerName</code> is not passed to the Master.
   * Asserts that exactly one pid comes back.
   * @return the single procedure id in the response
   */
  @Override
  protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
    MasterProtos.ScheduleSCPsForUnknownServersRequest emptyRequest =
      MasterProtos.ScheduleSCPsForUnknownServersRequest.newBuilder().build();
    MasterProtos.ScheduleSCPsForUnknownServersResponse scpResponse =
      master.getMasterRpcServices().scheduleSCPsForUnknownServers(null, emptyRequest);
    assertEquals(1, scpResponse.getPidCount());
    return scpResponse.getPid(0);
  }
}