HBASE-25587 [hbck2] Schedule SCP for all unknown servers (#2978)
Signed-off-by: Wellington Chevreuil <wchevreuil@apache.org>
This commit is contained in:
parent
b5fc5e17e2
commit
af8dc2e4df
|
@ -45,6 +45,8 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.HbckServic
|
|||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RegionSpecifierAndState;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreRequest;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreResponse;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleSCPsForUnknownServersRequest;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleSCPsForUnknownServersResponse;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleServerCrashProcedureResponse;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.UnassignsResponse;
|
||||
|
||||
|
@ -201,6 +203,20 @@ public class HBaseHbck implements Hbck {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Long> scheduleSCPsForUnknownServers() throws IOException {
|
||||
try {
|
||||
ScheduleSCPsForUnknownServersResponse response =
|
||||
this.hbck.scheduleSCPsForUnknownServers(
|
||||
rpcControllerFactory.newController(),
|
||||
ScheduleSCPsForUnknownServersRequest.newBuilder().build());
|
||||
return response.getPidList();
|
||||
} catch (ServiceException se) {
|
||||
LOG.debug("Failed to run ServerCrashProcedures for unknown servers", se);
|
||||
throw new IOException(se);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean runHbckChore() throws IOException {
|
||||
try {
|
||||
|
|
|
@ -130,6 +130,8 @@ public interface Hbck extends Abortable, Closeable {
|
|||
|
||||
List<Long> scheduleServerCrashProcedures(List<ServerName> serverNames) throws IOException;
|
||||
|
||||
/**
 * Asks the master to schedule ServerCrashProcedures for unknown servers. Unlike
 * {@link #scheduleServerCrashProcedures(List)}, the caller does not name the servers.
 *
 * @return the procedure ids (pids) reported back by the master
 * @throws IOException if the request to the master fails
 */
List<Long> scheduleSCPsForUnknownServers() throws IOException;
|
||||
|
||||
/**
|
||||
* Request HBCK chore to run at master side.
|
||||
*
|
||||
|
|
|
@ -1181,6 +1181,12 @@ message ScheduleServerCrashProcedureResponse {
|
|||
repeated uint64 pid = 1;
|
||||
}
|
||||
|
||||
// Request for HbckService.ScheduleSCPsForUnknownServers. It carries no fields.
message ScheduleSCPsForUnknownServersRequest {}
|
||||
|
||||
// Response for HbckService.ScheduleSCPsForUnknownServers.
message ScheduleSCPsForUnknownServersResponse {
|
||||
// Procedure ids returned by the master, one entry per server it processed.
repeated uint64 pid = 1;
|
||||
}
|
||||
|
||||
message FixMetaRequest {}
|
||||
|
||||
message FixMetaResponse {}
|
||||
|
@ -1220,6 +1226,10 @@ service HbckService {
|
|||
rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest)
|
||||
returns(ScheduleServerCrashProcedureResponse);
|
||||
|
||||
/** Schedule a ServerCrashProcedure for each unknown server */
|
||||
rpc ScheduleSCPsForUnknownServers(ScheduleSCPsForUnknownServersRequest)
|
||||
returns(ScheduleSCPsForUnknownServersResponse);
|
||||
|
||||
/**
|
||||
* Request HBCK chore to run at master side.
|
||||
*/
|
||||
|
|
|
@ -2700,6 +2700,31 @@ public class MasterRpcServices extends RSRpcServices implements
|
|||
return MasterProtos.ScheduleServerCrashProcedureResponse.newBuilder().addAllPid(pids).build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public MasterProtos.ScheduleSCPsForUnknownServersResponse scheduleSCPsForUnknownServers(
|
||||
RpcController controller, MasterProtos.ScheduleSCPsForUnknownServersRequest request)
|
||||
throws ServiceException {
|
||||
|
||||
List<Long> pids = new ArrayList<>();
|
||||
final Set<ServerName> serverNames =
|
||||
master.getAssignmentManager().getRegionStates().getRegionStates().stream()
|
||||
.map(RegionState::getServerName).collect(Collectors.toSet());
|
||||
|
||||
final Set<ServerName> unknownServerNames = serverNames.stream()
|
||||
.filter(sn -> master.getServerManager().isServerUnknown(sn)).collect(Collectors.toSet());
|
||||
|
||||
for (ServerName sn: unknownServerNames) {
|
||||
LOG.info("{} schedule ServerCrashProcedure for unknown {}",
|
||||
this.master.getClientIdAuditPrefix(), sn);
|
||||
if (shouldSubmitSCP(sn)) {
|
||||
pids.add(this.master.getServerManager().expireServer(sn, true));
|
||||
} else {
|
||||
pids.add(Procedure.NO_PROC_ID);
|
||||
}
|
||||
}
|
||||
return MasterProtos.ScheduleSCPsForUnknownServersResponse.newBuilder().addAllPid(pids).build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public FixMetaResponse fixMeta(RpcController controller, FixMetaRequest request)
|
||||
throws ServiceException {
|
||||
|
|
|
@ -917,6 +917,17 @@ public class ServerManager {
|
|||
return serverName == null || deadservers.isDeadServer(serverName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a server is unknown. A server can be online,
|
||||
* or known to be dead, or unknown to this manager (i.e, not online,
|
||||
* not known to be dead either; it is simply not tracked by the
|
||||
* master any more, for example, a very old previous instance).
|
||||
*/
|
||||
public boolean isServerUnknown(ServerName serverName) {
|
||||
return serverName == null
|
||||
|| (!onlineServers.containsKey(serverName) && !deadservers.isDeadServer(serverName));
|
||||
}
|
||||
|
||||
public void shutdownCluster() {
|
||||
String statusStr = "Cluster shutdown requested of master=" + this.master.getServerName();
|
||||
LOG.info(statusStr);
|
||||
|
|
|
@ -40,12 +40,8 @@ import org.apache.hadoop.hbase.master.HMaster;
|
|||
import org.apache.hadoop.hbase.master.RegionState;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
|
||||
|
||||
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
|
@ -57,6 +53,10 @@ import org.junit.rules.TestName;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
|
||||
|
||||
|
||||
/**
|
||||
* Test of the HBCK-version of SCP.
|
||||
|
@ -109,15 +109,16 @@ public class TestHBCKSCP extends TestSCPBase {
|
|||
// not be processing this server 'normally'. Remove it from processing by
|
||||
// calling 'finish' and then remove it from dead servers so rsServerName
|
||||
// becomes an 'Unknown Server' even though it is still around.
|
||||
LOG.info("Killing {}", rsServerName);
|
||||
cluster.killRegionServer(rsServerName);
|
||||
|
||||
master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
|
||||
master.getServerManager().getDeadServers().finish(rsServerName);
|
||||
master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
|
||||
master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
|
||||
// Kill the server. Nothing should happen since an 'Unknown Server' as far
|
||||
// as the Master is concerned; i.e. no SCP.
|
||||
LOG.info("Killing {}", rsServerName);
|
||||
HRegionServer hrs = cluster.getRegionServer(rsServerName);
|
||||
hrs.abort("KILLED");
|
||||
while (!hrs.isStopped()) {
|
||||
Threads.sleep(10);
|
||||
}
|
||||
|
@ -135,12 +136,7 @@ public class TestHBCKSCP extends TestSCPBase {
|
|||
|
||||
// I now have 'Unknown Server' references in hbase:meta; i.e. Server references
|
||||
// with no corresponding SCP. Queue one.
|
||||
MasterProtos.ScheduleServerCrashProcedureResponse response =
|
||||
master.getMasterRpcServices().scheduleServerCrashProcedure(null,
|
||||
MasterProtos.ScheduleServerCrashProcedureRequest.newBuilder().
|
||||
addServerName(ProtobufUtil.toServerName(rsServerName)).build());
|
||||
assertEquals(1, response.getPidCount());
|
||||
long pid = response.getPid(0);
|
||||
long pid = scheduleHBCKSCP(rsServerName, master);
|
||||
assertNotEquals(Procedure.NO_PROC_ID, pid);
|
||||
while (master.getMasterProcedureExecutor().getActiveProcIds().contains(pid)) {
|
||||
Threads.sleep(10);
|
||||
|
@ -156,6 +152,16 @@ public class TestHBCKSCP extends TestSCPBase {
|
|||
assertFalse(searchMeta(master, rsServerName));
|
||||
}
|
||||
|
||||
protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
|
||||
MasterProtos.ScheduleServerCrashProcedureResponse response =
|
||||
master.getMasterRpcServices().scheduleServerCrashProcedure(null,
|
||||
MasterProtos.ScheduleServerCrashProcedureRequest.newBuilder().
|
||||
addServerName(ProtobufUtil.toServerName(rsServerName)).build());
|
||||
assertEquals(1, response.getPidCount());
|
||||
long pid = response.getPid(0);
|
||||
return pid;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return True if we find reference to <code>sn</code> in meta table.
|
||||
*/
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.procedure;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
|
||||
|
||||
|
||||
/**
|
||||
* Test of the HBCK-version of SCP.
|
||||
* The HBCKSCP is an SCP only it reads hbase:meta for list of Regions that were
|
||||
* on the server-to-process rather than consult Master in-memory-state.
|
||||
*/
|
||||
@Category({ MasterTests.class, LargeTests.class })
|
||||
public class TestHBCKSCPUnknown extends TestHBCKSCP {
|
||||
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestHBCKSCPUnknown.class);
|
||||
|
||||
@Override
|
||||
protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
|
||||
MasterProtos.ScheduleSCPsForUnknownServersResponse response =
|
||||
master.getMasterRpcServices().scheduleSCPsForUnknownServers(null,
|
||||
MasterProtos.ScheduleSCPsForUnknownServersRequest.newBuilder().build());
|
||||
assertEquals(1, response.getPidCount());
|
||||
long pid = response.getPid(0);
|
||||
return pid;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue