HBASE-13845 Expire of one region server carrying meta can bring down the master

This commit is contained in:
Jerry He 2015-06-09 09:47:33 -07:00
parent 14db858a28
commit d37d9c43de
2 changed files with 146 additions and 1 deletions

View File

@ -707,7 +707,9 @@ implements ServerProcedureInterface {
services.getAssignmentManager().assignMeta(HRegionInfo.FIRST_META_REGIONINFO);
} else if (serverName.equals(services.getMetaTableLocator().
getMetaRegionLocation(services.getZooKeeper()))) {
throw new IOException("hbase:meta is onlined on the dead server " + this.serverName);
// hbase:meta still appears to be online on the server that the master is expiring
// and believes to be dying. Re-assign hbase:meta anyway.
services.getAssignmentManager().assignMeta(HRegionInfo.FIRST_META_REGIONINFO);
} else {
LOG.info("Skip assigning hbase:meta because it is online at "
+ services.getMetaTableLocator().getMetaRegionLocation(services.getZooKeeper()));

View File

@ -0,0 +1,143 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.CoordinatedStateManager;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.MiniHBaseCluster.MiniHBaseClusterRegionServer;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
/**
 * Tests handling of meta-carrying region server failover.
 *
 * <p>The mini cluster is started with {@link MyRegionServer} as the region server
 * implementation, which delays its own abort. This lets the test observe the master's
 * server-shutdown handling (SSH) while the expired region server is still running.
 */
@Category(MediumTests.class)
public class TestMetaShutdownHandler {
  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  // Not read inside this class; kept (package-private) so the class surface is unchanged.
  static final Configuration conf = TEST_UTIL.getConfiguration();

  /**
   * Starts the shared mini cluster using the slow-aborting {@link MyRegionServer}.
   * NOTE(review): the arguments (1, 3) are presumably the master and region server
   * counts -- confirm against HBaseTestingUtility#startMiniCluster.
   */
  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
    TEST_UTIL.startMiniCluster(1, 3, null, null, MyRegionServer.class);
  }

  /** Shuts down the shared mini cluster. */
  @AfterClass
  public static void tearDownAfterClass() throws Exception {
    TEST_UTIL.shutdownMiniCluster();
  }

  /**
   * Tests the expire handling of a meta-carrying region server.
   *
   * <p>After the mini cluster is up, the ephemeral znode of the meta-carrying region
   * server is deleted, which triggers the expiration of that region server on the master.
   * Meanwhile the abort on the region server is slowed down (see {@link MyRegionServer})
   * so that the server is still up while the master runs its SSH.
   * The test then checks that the master SSH still completes successfully and that
   * hbase:meta ends up OPEN on a different server.
   */
  @Test (timeout=180000)
  public void testExpireMetaRegionServer() throws Exception {
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    HMaster master = cluster.getMaster();
    RegionStates regionStates = master.getAssignmentManager().getRegionStates();
    ServerName metaServerName = regionStates.getRegionServerOfRegion(
      HRegionInfo.FIRST_META_REGIONINFO);

    // Relocate meta when it is unknown to the master, hosted on the master itself,
    // or when the master's view disagrees with the cluster's view of the meta holder.
    if (metaServerName == null
        || master.getServerName().equals(metaServerName)
        || !metaServerName.equals(cluster.getServerHoldingMeta())) {
      // Move meta off master
      metaServerName = cluster.getLiveRegionServerThreads()
        .get(0).getRegionServer().getServerName();
      master.move(HRegionInfo.FIRST_META_REGIONINFO.getEncodedNameAsBytes(),
        Bytes.toBytes(metaServerName.getServerName()));
      TEST_UTIL.waitUntilNoRegionsInTransition(60000);
    }

    RegionState metaState =
      MetaTableLocator.getMetaRegionState(master.getZooKeeper());
    // JUnit order is (message, expected, actual) / (message, unexpected, actual).
    assertEquals("Meta should be not in transition",
      RegionState.State.OPEN, metaState.getState());
    assertNotEquals("Meta should be moved off master",
      master.getServerName(), metaServerName);

    // Delete the ephemeral node of the meta-carrying region server.
    // This triggers the expiration of this region server on the master.
    String rsEphemeralNodePath =
      ZKUtil.joinZNode(master.getZooKeeper().rsZNode, metaServerName.toString());
    ZKUtil.deleteNode(master.getZooKeeper(), rsEphemeralNodePath);

    // Wait for SSH to finish: the expired server must be offline on the master
    // and no dead server may still be in processing.
    final ServerManager serverManager = master.getServerManager();
    final ServerName priorMetaServerName = metaServerName;
    TEST_UTIL.waitFor(120000, 200, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        return !serverManager.isServerOnline(priorMetaServerName)
          && !serverManager.areDeadServersInProgress();
      }
    });
    TEST_UTIL.waitUntilNoRegionsInTransition(60000);

    // Now, make sure meta is assigned
    assertTrue("Meta should be assigned",
      regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));

    // Now, make sure meta is registered in zk
    metaState = MetaTableLocator.getMetaRegionState(master.getZooKeeper());
    assertEquals("Meta should be not in transition",
      RegionState.State.OPEN, metaState.getState());
    // The ZooKeeper record (actual) must agree with the master's in-memory state (expected).
    assertEquals("Meta should be assigned",
      regionStates.getRegionServerOfRegion(HRegionInfo.FIRST_META_REGIONINFO),
      metaState.getServerName());
    assertNotEquals("Meta should be assigned on a different server",
      metaServerName, metaState.getServerName());
  }

  /**
   * Region server whose {@link #abort(String, Throwable)} sleeps before delegating to the
   * parent implementation, so the server stays up for a while after an abort is requested.
   */
  public static class MyRegionServer extends MiniHBaseClusterRegionServer {
    /** How long an abort request is delayed before it is actually carried out. */
    private static final long ABORT_DELAY_MS = 30 * 1000L;

    public MyRegionServer(Configuration conf, CoordinatedStateManager cp)
      throws IOException, KeeperException,
        InterruptedException {
      super(conf, cp);
    }

    /**
     * Delays the abort by {@link #ABORT_DELAY_MS} and then delegates to the parent.
     * If the delay is interrupted, the abort is skipped (as before) but the thread's
     * interrupt status is restored so callers can still observe the interruption.
     *
     * @param reason the reason passed through to the parent abort
     * @param cause the cause passed through to the parent abort
     */
    @Override
    public void abort(String reason, Throwable cause) {
      // sleep to slow down the region server abort
      try {
        Thread.sleep(ABORT_DELAY_MS);
      } catch (InterruptedException e) {
        // Do not swallow the interrupt: re-assert it before bailing out.
        Thread.currentThread().interrupt();
        return;
      }
      super.abort(reason, cause);
    }
  }
}