HBASE-27277 TestRaceBetweenSCPAndTRSP fails in pre commit (#5248)
Signed-off-by: GeorryHuang <huangzhuoyue@apache.org>
This commit is contained in:
parent
e4e79179d4
commit
dc30ca552b
|
@ -17,6 +17,7 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.hadoop.hbase.procedure2;
|
package org.apache.hadoop.hbase.procedure2;
|
||||||
|
|
||||||
|
import com.google.errorprone.annotations.RestrictedApi;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.lang.Thread.UncaughtExceptionHandler;
|
import java.lang.Thread.UncaughtExceptionHandler;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
@ -296,6 +297,12 @@ public abstract class RemoteProcedureDispatcher<TEnv, TRemote extends Comparable
|
||||||
return (List<T>) requestByType.removeAll(type);
|
return (List<T>) requestByType.removeAll(type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether {@code nodeMap} currently contains an entry for the given remote key.
 * The {@code @RestrictedApi} annotation below limits callers to paths under src/test.
 * @param key the remote identifier to look up in {@code nodeMap}
 * @return {@code true} if {@code nodeMap} contains {@code key}, {@code false} otherwise
 */
@RestrictedApi(explanation = "Should only be called in tests", link = "",
|
||||||
|
allowedOnPath = ".*/src/test/.*")
|
||||||
|
public boolean hasNode(TRemote key) {
|
||||||
|
return nodeMap.containsKey(key);
|
||||||
|
}
|
||||||
|
|
||||||
// ============================================================================================
|
// ============================================================================================
|
||||||
// Timeout Helpers
|
// Timeout Helpers
|
||||||
// ============================================================================================
|
// ============================================================================================
|
||||||
|
|
|
@ -31,6 +31,7 @@ import org.apache.hadoop.hbase.client.RegionInfo;
|
||||||
import org.apache.hadoop.hbase.master.HMaster;
|
import org.apache.hadoop.hbase.master.HMaster;
|
||||||
import org.apache.hadoop.hbase.master.MasterServices;
|
import org.apache.hadoop.hbase.master.MasterServices;
|
||||||
import org.apache.hadoop.hbase.master.RegionPlan;
|
import org.apache.hadoop.hbase.master.RegionPlan;
|
||||||
|
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
|
||||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
||||||
import org.apache.hadoop.hbase.master.region.MasterRegion;
|
import org.apache.hadoop.hbase.master.region.MasterRegion;
|
||||||
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
|
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
|
||||||
|
@ -147,16 +148,32 @@ public class TestRaceBetweenSCPAndTRSP {
|
||||||
Future<byte[]> moveFuture = am.moveAsync(new RegionPlan(region, sn, sn));
|
Future<byte[]> moveFuture = am.moveAsync(new RegionPlan(region, sn, sn));
|
||||||
arriveRegionOpening.await();
|
arriveRegionOpening.await();
|
||||||
|
|
||||||
|
// Kill the region server and trigger a SCP
|
||||||
UTIL.getMiniHBaseCluster().killRegionServer(sn);
|
UTIL.getMiniHBaseCluster().killRegionServer(sn);
|
||||||
|
// Wait until the SCP reaches the getRegionsOnServer call
|
||||||
arriveGetRegionsOnServer.await();
|
arriveGetRegionsOnServer.await();
|
||||||
RESUME_REGION_OPENING.countDown();
|
RSProcedureDispatcher remoteDispatcher = UTIL.getMiniHBaseCluster().getMaster()
|
||||||
|
.getMasterProcedureExecutor().getEnvironment().getRemoteDispatcher();
|
||||||
|
// this is necessary for making the UT stable, the problem here is that, in
|
||||||
|
// ServerManager.expireServer, we will submit the SCP and then the SCP will be executed in
|
||||||
|
// another thread(the PEWorker), so when we reach the above getRegionsOnServer call in SCP, it
|
||||||
|
// is still possible that the expireServer call has not finished yet so the remote dispatcher
|
||||||
|
// still thinks it can dispatch the TRSP, in this way we will be in a deadlock as the TRSP will
|
||||||
|
// not schedule a new ORP since it relies on SCP to wake it up after everything is OK. This is
|
||||||
|
// not what we want to test in this UT so we need to wait here to prevent this from happening.
|
||||||
|
// See HBASE-27277 for more detailed analysis.
|
||||||
|
UTIL.waitFor(15000, () -> !remoteDispatcher.hasNode(sn));
|
||||||
|
|
||||||
|
// Resume the TRSP, it should be able to finish
|
||||||
|
RESUME_REGION_OPENING.countDown();
|
||||||
moveFuture.get();
|
moveFuture.get();
|
||||||
|
|
||||||
ProcedureExecutor<?> procExec =
|
ProcedureExecutor<?> procExec =
|
||||||
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
|
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
|
||||||
long scpProcId =
|
long scpProcId =
|
||||||
procExec.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure)
|
procExec.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure)
|
||||||
.map(p -> (ServerCrashProcedure) p).findAny().get().getProcId();
|
.map(p -> (ServerCrashProcedure) p).findAny().get().getProcId();
|
||||||
|
// Resume the SCP and make sure it can finish too
|
||||||
RESUME_GET_REGIONS_ON_SERVER.countDown();
|
RESUME_GET_REGIONS_ON_SERVER.countDown();
|
||||||
UTIL.waitFor(60000, () -> procExec.isFinished(scpProcId));
|
UTIL.waitFor(60000, () -> procExec.isFinished(scpProcId));
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue