HBASE-25587 [hbck2] Schedule SCP for all unknown servers (#2978)
Signed-off-by: Wellington Chevreuil <wchevreuil@apache.org>
This commit is contained in:
parent
b5fc5e17e2
commit
af8dc2e4df
|
@ -45,6 +45,8 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.HbckServic
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RegionSpecifierAndState;
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RegionSpecifierAndState;
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreRequest;
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreRequest;
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreResponse;
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreResponse;
|
||||||
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleSCPsForUnknownServersRequest;
|
||||||
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleSCPsForUnknownServersResponse;
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleServerCrashProcedureResponse;
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleServerCrashProcedureResponse;
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.UnassignsResponse;
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.UnassignsResponse;
|
||||||
|
|
||||||
|
@ -201,6 +203,20 @@ public class HBaseHbck implements Hbck {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Long> scheduleSCPsForUnknownServers() throws IOException {
|
||||||
|
try {
|
||||||
|
ScheduleSCPsForUnknownServersResponse response =
|
||||||
|
this.hbck.scheduleSCPsForUnknownServers(
|
||||||
|
rpcControllerFactory.newController(),
|
||||||
|
ScheduleSCPsForUnknownServersRequest.newBuilder().build());
|
||||||
|
return response.getPidList();
|
||||||
|
} catch (ServiceException se) {
|
||||||
|
LOG.debug("Failed to run ServerCrashProcedures for unknown servers", se);
|
||||||
|
throw new IOException(se);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean runHbckChore() throws IOException {
|
public boolean runHbckChore() throws IOException {
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -130,6 +130,8 @@ public interface Hbck extends Abortable, Closeable {
|
||||||
|
|
||||||
List<Long> scheduleServerCrashProcedures(List<ServerName> serverNames) throws IOException;
|
List<Long> scheduleServerCrashProcedures(List<ServerName> serverNames) throws IOException;
|
||||||
|
|
||||||
|
/**
 * Asks the Master to schedule a ServerCrashProcedure for every server it considers unknown.
 *
 * @return the procedure ids reported back by the Master
 * @throws IOException if the request to the Master fails
 */
List<Long> scheduleSCPsForUnknownServers() throws IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Request HBCK chore to run at master side.
|
* Request HBCK chore to run at master side.
|
||||||
*
|
*
|
||||||
|
|
|
@ -1181,6 +1181,12 @@ message ScheduleServerCrashProcedureResponse {
|
||||||
repeated uint64 pid = 1;
|
repeated uint64 pid = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
message ScheduleSCPsForUnknownServersRequest {}
|
||||||
|
|
||||||
|
message ScheduleSCPsForUnknownServersResponse {
|
||||||
|
repeated uint64 pid = 1;
|
||||||
|
}
|
||||||
|
|
||||||
message FixMetaRequest {}
|
message FixMetaRequest {}
|
||||||
|
|
||||||
message FixMetaResponse {}
|
message FixMetaResponse {}
|
||||||
|
@ -1220,6 +1226,10 @@ service HbckService {
|
||||||
rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest)
|
rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest)
|
||||||
returns(ScheduleServerCrashProcedureResponse);
|
returns(ScheduleServerCrashProcedureResponse);
|
||||||
|
|
||||||
|
/** Schedule a ServerCrashProcedure for each unknown server; the response carries one pid per server. */
|
||||||
|
rpc ScheduleSCPsForUnknownServers(ScheduleSCPsForUnknownServersRequest)
|
||||||
|
returns(ScheduleSCPsForUnknownServersResponse);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Request HBCK chore to run at master side.
|
* Request HBCK chore to run at master side.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -2700,6 +2700,31 @@ public class MasterRpcServices extends RSRpcServices implements
|
||||||
return MasterProtos.ScheduleServerCrashProcedureResponse.newBuilder().addAllPid(pids).build();
|
return MasterProtos.ScheduleServerCrashProcedureResponse.newBuilder().addAllPid(pids).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MasterProtos.ScheduleSCPsForUnknownServersResponse scheduleSCPsForUnknownServers(
|
||||||
|
RpcController controller, MasterProtos.ScheduleSCPsForUnknownServersRequest request)
|
||||||
|
throws ServiceException {
|
||||||
|
|
||||||
|
List<Long> pids = new ArrayList<>();
|
||||||
|
final Set<ServerName> serverNames =
|
||||||
|
master.getAssignmentManager().getRegionStates().getRegionStates().stream()
|
||||||
|
.map(RegionState::getServerName).collect(Collectors.toSet());
|
||||||
|
|
||||||
|
final Set<ServerName> unknownServerNames = serverNames.stream()
|
||||||
|
.filter(sn -> master.getServerManager().isServerUnknown(sn)).collect(Collectors.toSet());
|
||||||
|
|
||||||
|
for (ServerName sn: unknownServerNames) {
|
||||||
|
LOG.info("{} schedule ServerCrashProcedure for unknown {}",
|
||||||
|
this.master.getClientIdAuditPrefix(), sn);
|
||||||
|
if (shouldSubmitSCP(sn)) {
|
||||||
|
pids.add(this.master.getServerManager().expireServer(sn, true));
|
||||||
|
} else {
|
||||||
|
pids.add(Procedure.NO_PROC_ID);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return MasterProtos.ScheduleSCPsForUnknownServersResponse.newBuilder().addAllPid(pids).build();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FixMetaResponse fixMeta(RpcController controller, FixMetaRequest request)
|
public FixMetaResponse fixMeta(RpcController controller, FixMetaRequest request)
|
||||||
throws ServiceException {
|
throws ServiceException {
|
||||||
|
|
|
@ -917,6 +917,17 @@ public class ServerManager {
|
||||||
return serverName == null || deadservers.isDeadServer(serverName);
|
return serverName == null || deadservers.isDeadServer(serverName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if a server is unknown. A server can be online,
|
||||||
|
* or known to be dead, or unknown to this manager (i.e, not online,
|
||||||
|
* not known to be dead either; it is simply not tracked by the
|
||||||
|
* master any more, for example, a very old previous instance).
|
||||||
|
*/
|
||||||
|
public boolean isServerUnknown(ServerName serverName) {
|
||||||
|
return serverName == null
|
||||||
|
|| (!onlineServers.containsKey(serverName) && !deadservers.isDeadServer(serverName));
|
||||||
|
}
|
||||||
|
|
||||||
public void shutdownCluster() {
|
public void shutdownCluster() {
|
||||||
String statusStr = "Cluster shutdown requested of master=" + this.master.getServerName();
|
String statusStr = "Cluster shutdown requested of master=" + this.master.getServerName();
|
||||||
LOG.info(statusStr);
|
LOG.info(statusStr);
|
||||||
|
|
|
@ -40,12 +40,8 @@ import org.apache.hadoop.hbase.master.HMaster;
|
||||||
import org.apache.hadoop.hbase.master.RegionState;
|
import org.apache.hadoop.hbase.master.RegionState;
|
||||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||||
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
|
|
||||||
|
|
||||||
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
||||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||||
|
|
||||||
import org.apache.hadoop.hbase.util.Bytes;
|
import org.apache.hadoop.hbase.util.Bytes;
|
||||||
import org.apache.hadoop.hbase.util.Pair;
|
import org.apache.hadoop.hbase.util.Pair;
|
||||||
import org.apache.hadoop.hbase.util.Threads;
|
import org.apache.hadoop.hbase.util.Threads;
|
||||||
|
@ -57,6 +53,10 @@ import org.junit.rules.TestName;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
|
||||||
|
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||||
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test of the HBCK-version of SCP.
|
* Test of the HBCK-version of SCP.
|
||||||
|
@ -109,15 +109,16 @@ public class TestHBCKSCP extends TestSCPBase {
|
||||||
// not be processing this server 'normally'. Remove it from processing by
|
// not be processing this server 'normally'. Remove it from processing by
|
||||||
// calling 'finish' and then remove it from dead servers so rsServerName
|
// calling 'finish' and then remove it from dead servers so rsServerName
|
||||||
// becomes an 'Unknown Server' even though it is still around.
|
// becomes an 'Unknown Server' even though it is still around.
|
||||||
|
LOG.info("Killing {}", rsServerName);
|
||||||
|
cluster.killRegionServer(rsServerName);
|
||||||
|
|
||||||
master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
|
master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
|
||||||
master.getServerManager().getDeadServers().finish(rsServerName);
|
master.getServerManager().getDeadServers().finish(rsServerName);
|
||||||
master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
|
master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
|
||||||
master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
|
master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
|
||||||
// Kill the server. Nothing should happen since an 'Unknown Server' as far
|
// Kill the server. Nothing should happen since an 'Unknown Server' as far
|
||||||
// as the Master is concerned; i.e. no SCP.
|
// as the Master is concerned; i.e. no SCP.
|
||||||
LOG.info("Killing {}", rsServerName);
|
|
||||||
HRegionServer hrs = cluster.getRegionServer(rsServerName);
|
HRegionServer hrs = cluster.getRegionServer(rsServerName);
|
||||||
hrs.abort("KILLED");
|
|
||||||
while (!hrs.isStopped()) {
|
while (!hrs.isStopped()) {
|
||||||
Threads.sleep(10);
|
Threads.sleep(10);
|
||||||
}
|
}
|
||||||
|
@ -135,12 +136,7 @@ public class TestHBCKSCP extends TestSCPBase {
|
||||||
|
|
||||||
// I now have 'Unknown Server' references in hbase:meta; i.e. Server references
|
// I now have 'Unknown Server' references in hbase:meta; i.e. Server references
|
||||||
// with no corresponding SCP. Queue one.
|
// with no corresponding SCP. Queue one.
|
||||||
MasterProtos.ScheduleServerCrashProcedureResponse response =
|
long pid = scheduleHBCKSCP(rsServerName, master);
|
||||||
master.getMasterRpcServices().scheduleServerCrashProcedure(null,
|
|
||||||
MasterProtos.ScheduleServerCrashProcedureRequest.newBuilder().
|
|
||||||
addServerName(ProtobufUtil.toServerName(rsServerName)).build());
|
|
||||||
assertEquals(1, response.getPidCount());
|
|
||||||
long pid = response.getPid(0);
|
|
||||||
assertNotEquals(Procedure.NO_PROC_ID, pid);
|
assertNotEquals(Procedure.NO_PROC_ID, pid);
|
||||||
while (master.getMasterProcedureExecutor().getActiveProcIds().contains(pid)) {
|
while (master.getMasterProcedureExecutor().getActiveProcIds().contains(pid)) {
|
||||||
Threads.sleep(10);
|
Threads.sleep(10);
|
||||||
|
@ -156,6 +152,16 @@ public class TestHBCKSCP extends TestSCPBase {
|
||||||
assertFalse(searchMeta(master, rsServerName));
|
assertFalse(searchMeta(master, rsServerName));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
|
||||||
|
MasterProtos.ScheduleServerCrashProcedureResponse response =
|
||||||
|
master.getMasterRpcServices().scheduleServerCrashProcedure(null,
|
||||||
|
MasterProtos.ScheduleServerCrashProcedureRequest.newBuilder().
|
||||||
|
addServerName(ProtobufUtil.toServerName(rsServerName)).build());
|
||||||
|
assertEquals(1, response.getPidCount());
|
||||||
|
long pid = response.getPid(0);
|
||||||
|
return pid;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return True if we find reference to <code>sn</code> in meta table.
|
* @return True if we find reference to <code>sn</code> in meta table.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -0,0 +1,55 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.hbase.master.procedure;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
|
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||||
|
import org.apache.hadoop.hbase.ServerName;
|
||||||
|
import org.apache.hadoop.hbase.master.HMaster;
|
||||||
|
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
||||||
|
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||||
|
|
||||||
|
import org.junit.ClassRule;
|
||||||
|
import org.junit.experimental.categories.Category;
|
||||||
|
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
|
||||||
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test of the HBCK-version of SCP.
|
||||||
|
* The HBCKSCP is an SCP only it reads hbase:meta for list of Regions that were
|
||||||
|
* on the server-to-process rather than consult Master in-memory-state.
|
||||||
|
*/
|
||||||
|
@Category({ MasterTests.class, LargeTests.class })
|
||||||
|
public class TestHBCKSCPUnknown extends TestHBCKSCP {
|
||||||
|
|
||||||
|
@ClassRule
|
||||||
|
public static final HBaseClassTestRule CLASS_RULE =
|
||||||
|
HBaseClassTestRule.forClass(TestHBCKSCPUnknown.class);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
|
||||||
|
MasterProtos.ScheduleSCPsForUnknownServersResponse response =
|
||||||
|
master.getMasterRpcServices().scheduleSCPsForUnknownServers(null,
|
||||||
|
MasterProtos.ScheduleSCPsForUnknownServersRequest.newBuilder().build());
|
||||||
|
assertEquals(1, response.getPidCount());
|
||||||
|
long pid = response.getPid(0);
|
||||||
|
return pid;
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue