HBASE-21330 ReopenTableRegionsProcedure will enter an infinite loop if we schedule a TRSP at the same time

This commit is contained in:
Duo Zhang 2018-10-17 18:04:24 +08:00
parent 3563f48900
commit d7d3beb6bc
3 changed files with 114 additions and 6 deletions

View File

@ -322,7 +322,8 @@ public abstract class Procedure<TEnvironment> implements Comparable<Procedure<TE
* @see #holdLock(Object)
* @return true if the procedure has the lock, false otherwise.
*/
protected final boolean hasLock() {
@VisibleForTesting
public final boolean hasLock() {
return locked;
}

View File

@ -70,6 +70,18 @@ public class ReopenTableRegionsProcedure
return TableOperationType.REGION_EDIT;
}
private boolean canSchedule(MasterProcedureEnv env, HRegionLocation loc) {
if (loc.getSeqNum() < 0) {
return false;
}
RegionStateNode regionNode =
env.getAssignmentManager().getRegionStates().getRegionStateNode(loc.getRegion());
// If the region node is null, then at least in the next round we can remove this region to make
// progress. And the second condition is a normal one, if there are no TRSP with it then we can
// schedule one to make progress.
return regionNode == null || !regionNode.isInTransition();
}
@Override
protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState state)
throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
@ -85,8 +97,12 @@ public class ReopenTableRegionsProcedure
return Flow.HAS_MORE_STATE;
case REOPEN_TABLE_REGIONS_REOPEN_REGIONS:
for (HRegionLocation loc : regions) {
RegionStateNode regionNode = env.getAssignmentManager().getRegionStates()
.getOrCreateRegionStateNode(loc.getRegion());
RegionStateNode regionNode =
env.getAssignmentManager().getRegionStates().getRegionStateNode(loc.getRegion());
// this possible, maybe the region has already been merged or split, see HBASE-20921
if (regionNode == null) {
continue;
}
TransitRegionStateProcedure proc;
regionNode.lock();
try {
@ -108,13 +124,13 @@ public class ReopenTableRegionsProcedure
if (regions.isEmpty()) {
return Flow.NO_MORE_STATE;
}
if (regions.stream().anyMatch(l -> l.getSeqNum() >= 0)) {
if (regions.stream().anyMatch(loc -> canSchedule(env, loc))) {
attempt = 0;
setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS);
return Flow.HAS_MORE_STATE;
}
// All the regions need to reopen are in OPENING state which means we can not schedule any
// MRPs.
// We can not schedule TRSP for all the regions need to reopen, wait for a while and retry
// again.
long backoff = ProcedureUtil.getBackoffTimeMs(this.attempt++);
LOG.info(
"There are still {} region(s) which need to be reopened for table {} are in " +
@ -138,6 +154,7 @@ public class ReopenTableRegionsProcedure
env.getProcedureScheduler().addFront(this);
return false; // 'false' means that this procedure handled the timeout
}
@Override
protected void rollbackState(MasterProcedureEnv env, ReopenTableRegionsState state)
throws IOException, InterruptedException {

View File

@ -0,0 +1,90 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master.procedure;
import java.io.IOException;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
/**
* Testcase for HBASE-21330.
*/
@Category({ MasterTests.class, MediumTests.class })
public class TestReopenTableRegionsProcedureInfiniteLoop {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestReopenTableRegionsProcedureInfiniteLoop.class);
private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
private static TableName TABLE_NAME = TableName.valueOf("InfiniteLoop");
private static byte[] CF = Bytes.toBytes("cf");
@BeforeClass
public static void setUp() throws Exception {
UTIL.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
UTIL.startMiniCluster(1);
UTIL.createTable(TABLE_NAME, CF);
}
@AfterClass
public static void tearDown() throws Exception {
UTIL.shutdownMiniCluster();
}
@Test
public void testInfiniteLoop() throws IOException {
HMaster master = UTIL.getMiniHBaseCluster().getMaster();
AssignmentManager am = master.getAssignmentManager();
ProcedureExecutor<MasterProcedureEnv> exec = master.getMasterProcedureExecutor();
RegionInfo regionInfo = UTIL.getAdmin().getRegions(TABLE_NAME).get(0);
RegionStateNode regionNode = am.getRegionStates().getRegionStateNode(regionInfo);
long procId;
ReopenTableRegionsProcedure proc = new ReopenTableRegionsProcedure(TABLE_NAME);
regionNode.lock();
try {
procId = exec.submitProcedure(proc);
UTIL.waitFor(30000, () -> proc.hasLock());
TransitRegionStateProcedure trsp =
TransitRegionStateProcedure.reopen(exec.getEnvironment(), regionInfo);
regionNode.setProcedure(trsp);
exec.submitProcedure(trsp);
} finally {
regionNode.unlock();
}
UTIL.waitFor(60000, () -> exec.isFinished(procId));
}
}