From 9ef115163be65860599dcabcc5ba885a758c2c03 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Wed, 13 Dec 2017 23:06:23 -0800 Subject: [PATCH] Revert "master hangs forever if RecoverMeta send assign meta region request to target server fail" Reverting because missing issue JIRA # This reverts commit d3aeaeffa455f32b1eb2fbb871b39707457f684d. --- .../hadoop/hbase/master/ServerManager.java | 4 -- .../master/assignment/AssignmentManager.java | 32 +-------- .../hbase/master/MockNoopMasterServices.java | 7 +- .../master/assignment/MockMasterServices.java | 25 ------- .../assignment/TestAssignmentManager.java | 67 ------------------- 5 files changed, 2 insertions(+), 133 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 78661d253ef..79ffc8a5825 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -572,10 +572,6 @@ public class ServerManager { if (!master.isServerCrashProcessingEnabled()) { LOG.info("Master doesn't enable ServerShutdownHandler during initialization, " + "delay expiring server " + serverName); - // Even we delay expire this server, we still need to handle Meta's RIT - // that are against the crashed server; since when we do RecoverMetaProcedure, - // the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287 - master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName); this.queuedDeadServers.add(serverName); return; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 7a1c8afbc16..cebe0b04658 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -48,7 +48,6 @@ import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.YouAreDeadException; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; -import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.client.TableState; import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.favored.FavoredNodesManager; @@ -71,7 +70,6 @@ import org.apache.hadoop.hbase.master.normalizer.RegionNormalizer; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler; import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait; -import org.apache.hadoop.hbase.master.procedure.ServerCrashException; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureEvent; @@ -80,7 +78,6 @@ import org.apache.hadoop.hbase.procedure2.ProcedureInMemoryChore; import org.apache.hadoop.hbase.procedure2.util.StringUtils; import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; -import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest; @@ -1325,7 +1322,7 @@ public class AssignmentManager implements ServerListener { } public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) { - boolean carryingMeta = isCarryingMeta(serverName); + boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(serverName); ProcedureExecutor procExec = this.master.getMasterProcedureExecutor(); procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName, shouldSplitWal, carryingMeta)); @@ -1856,31 +1853,4 @@ public class AssignmentManager implements ServerListener { }*/ master.getServerManager().expireServer(serverNode.getServerName()); } - - /** - * Handle RIT of meta region against crashed server - * Only used when ServerCrashProcedure is not enabled. - * - * @param serverName Server that has already crashed - */ - public void handleMetaRITOnCrashedServer(ServerName serverName) { - RegionInfo hri = RegionReplicaUtil - .getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO, - RegionInfo.DEFAULT_REPLICA_ID); - RegionState regionStateNode = getRegionStates().getRegionState(hri); - if (!regionStateNode.getServerName().equals(serverName)) { - return; - } - // meta has been assigned to crashed server. - LOG.info("Meta has been assigned to crashed server: " + serverName + "; will do re-assign"); - // handle failure and wake event - RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri); - // Not need to consider for REGION_TRANSITION_QUEUE step - if (rtp != null && rtp.isMeta() - && rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) { - LOG.info("Re-do rit procedure: " + rtp.toString()); - rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName, - new ServerCrashException(rtp.getProcId(), serverName)); - } - } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java index 413abe39ad3..6a3d5c76b09 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java @@ -212,14 +212,9 @@ public class MockNoopMasterServices implements MasterServices, Server { return null; } - private boolean serverCrashProcessingEnabled = true; - - public void setServerCrashProcessingEnabled(boolean b) { - serverCrashProcessingEnabled = b; - } @Override public boolean isServerCrashProcessingEnabled() { - return serverCrashProcessingEnabled; + return true; } @Override diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java index 764aa4a1850..6a75729b6ed 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java @@ -21,7 +21,6 @@ import static org.mockito.ArgumentMatchers.any; import java.io.IOException; import java.util.HashSet; -import java.util.List; import java.util.Map; import java.util.NavigableMap; import java.util.SortedSet; @@ -174,30 +173,6 @@ public class MockMasterServices extends MockNoopMasterServices { this.procedureExecutor.getEnvironment().setEventReady(initialized, true); } - /** - * Call this restart method only after running MockMasterServices#start() - * The RSs can be differentiated by the port number, see - * ServerName in MockMasterServices#start() method above. - * Restart of region server will have new startcode in server name - * - * @param serverName Server name to be restarted - */ - public void restartRegionServer(ServerName serverName) throws IOException { - List onlineServers = serverManager.getOnlineServersList(); - long startCode = -1; - for (ServerName s : onlineServers) { - if (s.getAddress().equals(serverName.getAddress())) { - startCode = s.getStartcode() + 1; - break; - } - } - if (startCode == -1) { - return; - } - ServerName sn = ServerName.valueOf(serverName.getAddress().toString(), startCode); - serverManager.regionServerReport(sn, ServerLoad.EMPTY_SERVERLOAD); - } - @Override public void stop(String why) { stopProcedureExecutor(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java index c328c711a6f..1912d1168d7 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java @@ -444,34 +444,6 @@ public class TestAssignmentManager { assertEquals(unassignFailedCount, unassignProcMetrics.getFailedCounter().getCount()); } - /** - * It is possible that when AM send assign meta request to a RS successfully, - * but RS can not send back any response, which cause master startup hangs forever - */ - @Test - public void testAssignMetaAndCrashBeforeResponse() throws Exception { - tearDown(); - // See setUp(), start HBase until set up meta - UTIL = new HBaseTestingUtility(); - this.executor = Executors.newSingleThreadScheduledExecutor(); - setupConfiguration(UTIL.getConfiguration()); - master = new MockMasterServices(UTIL.getConfiguration(), this.regionsToRegionServers); - rsDispatcher = new MockRSProcedureDispatcher(master); - master.start(NSERVERS, rsDispatcher); - am = master.getAssignmentManager(); - - // Assign meta - master.setServerCrashProcessingEnabled(false); - rsDispatcher.setMockRsExecutor(new HangThenRSRestartExecutor()); - am.assign(RegionInfoBuilder.FIRST_META_REGIONINFO); - assertEquals(true, am.isMetaInitialized()); - - // set it back as default, see setUpMeta() - master.setServerCrashProcessingEnabled(true); - am.wakeMetaLoadedEvent(); - am.setFailoverCleanupDone(true); - } - private Future submitProcedure(final Procedure proc) { return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc); } @@ -555,14 +527,6 @@ public class TestAssignmentManager { this.am.submitServerCrash(serverName, false/*No WALs here*/); } - private void doRestart(final ServerName serverName) { - try { - this.master.restartRegionServer(serverName); - } catch (IOException e) { - LOG.warn("Can not restart RS with new startcode"); - } - } - private class NoopRsExecutor implements MockRSExecutor { public ExecuteProceduresResponse sendRequest(ServerName server, ExecuteProceduresRequest request) throws IOException { @@ -714,37 +678,6 @@ public class TestAssignmentManager { } } - /** - * Takes open request and then returns nothing so acts like a RS that went zombie. - * No response (so proc is stuck/suspended on the Master and won't wake up.). - * Different with HangThenRSCrashExecutor, HangThenRSCrashExecutor will create - * ServerCrashProcedure to handle the server crash. However, this HangThenRSRestartExecutor - * will restart RS directly, situation for RS crashed when SCP is not enabled. - */ - private class HangThenRSRestartExecutor extends GoodRsExecutor { - private int invocations; - - @Override - protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq) - throws IOException { - if (this.invocations++ > 0) { - // Return w/o problem the second time through here. - return super.execOpenRegion(server, openReq); - } - // The procedure on master will just hang forever because nothing comes back - // from the RS in this case. - LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout"); - executor.schedule(new Runnable() { - @Override - public void run() { - LOG.info("Restarting RS of " + server); - doRestart(server); - } - }, 1, TimeUnit.SECONDS); - return null; - } - } - private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor { public static final int TYPES_OF_FAILURE = 6; private int invocations;