HBASE-22041 master should resolve regionserver's ip again when ConnectionException
This commit is contained in:
parent
7d6a79b768
commit
a5f2691b71
|
@ -301,6 +301,10 @@ public class AsyncConnectionImpl implements AsyncConnection {
|
||||||
() -> createAdminServerStub(serverName));
|
() -> createAdminServerStub(serverName));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Removes the entry for the given server from {@code adminStubs}. The key is built by
 * {@code getStubKey} from the AdminService descriptor name and the server name.
 * NOTE(review): presumably a later admin stub lookup for this server then creates a new stub
 * instead of reusing the removed one -- confirm against the stub lookup method.
 * @param serverName the server whose admin stub entry should be removed
 */
void removeAdminStub(ServerName serverName) {
|
||||||
|
adminStubs.remove(getStubKey(AdminService.getDescriptor().getName(), serverName));
|
||||||
|
}
|
||||||
|
|
||||||
CompletableFuture<MasterService.Interface> getMasterStub() {
|
CompletableFuture<MasterService.Interface> getMasterStub() {
|
||||||
return ConnectionUtils.getOrFetch(masterStub, masterStubMakeFuture, false, () -> {
|
return ConnectionUtils.getOrFetch(masterStub, masterStubMakeFuture, false, () -> {
|
||||||
CompletableFuture<MasterService.Interface> future = new CompletableFuture<>();
|
CompletableFuture<MasterService.Interface> future = new CompletableFuture<>();
|
||||||
|
|
|
@ -216,4 +216,8 @@ public class AsyncRegionServerAdmin {
|
||||||
executeProcedures(ExecuteProceduresRequest request) {
|
executeProcedures(ExecuteProceduresRequest request) {
|
||||||
return call((stub, controller, done) -> stub.executeProcedures(controller, request, done));
|
return call((stub, controller, done) -> stub.executeProcedures(controller, request, done));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Asks the underlying connection to remove the admin stub for the given server by delegating
 * to {@code conn.removeAdminStub(serverName)}.
 * NOTE(review): the stub removed is the one for the {@code serverName} argument; confirm that
 * callers pass the server this admin instance talks to.
 * @param serverName the region server whose admin stub should be removed
 */
public void removeRsStub(ServerName serverName) {
|
||||||
|
conn.removeAdminStub(serverName);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master.procedure;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.lang.Thread.UncaughtExceptionHandler;
|
import java.lang.Thread.UncaughtExceptionHandler;
|
||||||
|
import java.net.ConnectException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
@ -257,7 +258,7 @@ public class RSProcedureDispatcher extends RemoteProcedureDispatcher<MasterProce
|
||||||
DEFAULT_RS_RPC_RETRY_INTERVAL);
|
DEFAULT_RS_RPC_RETRY_INTERVAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
private AsyncRegionServerAdmin getRsAdmin() throws IOException {
|
private AsyncRegionServerAdmin getRsAdmin() {
|
||||||
return master.getAsyncClusterConnection().getRegionServerAdmin(serverName);
|
return master.getAsyncClusterConnection().getRegionServerAdmin(serverName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -306,6 +307,14 @@ public class RSProcedureDispatcher extends RemoteProcedureDispatcher<MasterProce
|
||||||
serverName, e.toString(), numberOfAttemptsSoFar);
|
serverName, e.toString(), numberOfAttemptsSoFar);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
// A ConnectException may mean that the master has resolved the region server to the wrong IP address.
|
||||||
|
// Remove the cached admin stub so that the master resolves the region server's IP again.
|
||||||
|
if (e instanceof ConnectException) {
|
||||||
|
getRsAdmin().removeRsStub(serverName);
|
||||||
|
LOG.warn("Request to {} failed due to {}, try={} retry get new rs admin... ",
|
||||||
|
serverName, e.toString(), numberOfAttemptsSoFar);
|
||||||
|
}
|
||||||
|
|
||||||
if (e instanceof RegionServerAbortedException || e instanceof RegionServerStoppedException) {
|
if (e instanceof RegionServerAbortedException || e instanceof RegionServerStoppedException) {
|
||||||
// A better way is to return true here to let the upper layer quit, and then schedule a
|
// A better way is to return true here to let the upper layer quit, and then schedule a
|
||||||
// background task to check whether the region server is dead. And if it is dead, call
|
// background task to check whether the region server is dead. And if it is dead, call
|
||||||
|
@ -313,7 +322,7 @@ public class RSProcedureDispatcher extends RemoteProcedureDispatcher<MasterProce
|
||||||
// result, but waste some resources.
|
// result, but waste some resources.
|
||||||
LOG.warn("{} is aborted or stopped, for safety we still need to"
|
LOG.warn("{} is aborted or stopped, for safety we still need to"
|
||||||
+ " wait until it is fully dead, try={}", serverName, numberOfAttemptsSoFar);
|
+ " wait until it is fully dead, try={}", serverName, numberOfAttemptsSoFar);
|
||||||
} else {
|
} else if (!(e instanceof ConnectException)) {
|
||||||
LOG.warn("request to {} failed due to {}, try={}, retrying...", serverName, e.toString(),
|
LOG.warn("request to {} failed due to {}, try={}, retrying...", serverName, e.toString(),
|
||||||
numberOfAttemptsSoFar);
|
numberOfAttemptsSoFar);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue