HBASE-22041 master should resolve regionserver's ip again when ConnectionException

This commit is contained in:
youchan 2023-03-23 20:12:21 +08:00
parent 7d6a79b768
commit a5f2691b71
3 changed files with 19 additions and 2 deletions

View File

@ -301,6 +301,10 @@ public class AsyncConnectionImpl implements AsyncConnection {
() -> createAdminServerStub(serverName));
}
/**
 * Removes the cached admin stub entry for the given server from {@code adminStubs}.
 * The entry is looked up under the key built from the {@code AdminService} descriptor name
 * and the server name.
 * NOTE(review): presumably the next admin stub lookup for this server creates a fresh stub;
 * confirm against the stub getter.
 *
 * @param serverName the server whose cached admin stub should be dropped
 */
void removeAdminStub(ServerName serverName) {
  String adminServiceName = AdminService.getDescriptor().getName();
  adminStubs.remove(getStubKey(adminServiceName, serverName));
}
CompletableFuture<MasterService.Interface> getMasterStub() {
return ConnectionUtils.getOrFetch(masterStub, masterStubMakeFuture, false, () -> {
CompletableFuture<MasterService.Interface> future = new CompletableFuture<>();

View File

@ -216,4 +216,8 @@ public class AsyncRegionServerAdmin {
executeProcedures(ExecuteProceduresRequest request) {
return call((stub, controller, done) -> stub.executeProcedures(controller, request, done));
}
/**
 * Asks the underlying connection to remove its admin stub for the given server by delegating
 * to {@code conn.removeAdminStub(serverName)}.
 *
 * @param serverName the server whose admin stub should be removed from the connection
 */
public void removeRsStub(ServerName serverName) {
  this.conn.removeAdminStub(serverName);
}
}

View File

@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master.procedure;
import java.io.IOException;
import java.lang.Thread.UncaughtExceptionHandler;
import java.net.ConnectException;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
@ -257,7 +258,7 @@ public class RSProcedureDispatcher extends RemoteProcedureDispatcher<MasterProce
DEFAULT_RS_RPC_RETRY_INTERVAL);
}
private AsyncRegionServerAdmin getRsAdmin() throws IOException {
private AsyncRegionServerAdmin getRsAdmin() {
return master.getAsyncClusterConnection().getRegionServerAdmin(serverName);
}
@ -306,6 +307,14 @@ public class RSProcedureDispatcher extends RemoteProcedureDispatcher<MasterProce
serverName, e.toString(), numberOfAttemptsSoFar);
return false;
}
// A ConnectException may mean that the master resolved the region server to a wrong IP address.
// Remove the cached stub so that the master can resolve the region server's IP again.
if (e instanceof ConnectException) {
getRsAdmin().removeRsStub(serverName);
LOG.warn("Request to {} failed due to {}, try={} retry get new rs admin... ",
serverName, e.toString(), numberOfAttemptsSoFar);
}
if (e instanceof RegionServerAbortedException || e instanceof RegionServerStoppedException) {
// A better way is to return true here to let the upper layer quit, and then schedule a
// background task to check whether the region server is dead. And if it is dead, call
@ -313,7 +322,7 @@ public class RSProcedureDispatcher extends RemoteProcedureDispatcher<MasterProce
// result, but waste some resources.
LOG.warn("{} is aborted or stopped, for safety we still need to"
+ " wait until it is fully dead, try={}", serverName, numberOfAttemptsSoFar);
} else {
} else if (!(e instanceof ConnectException)) {
LOG.warn("request to {} failed due to {}, try={}, retrying...", serverName, e.toString(),
numberOfAttemptsSoFar);
}