HBASE-19974 Fix: decommissioned servers cannot be removed by remove_servers_rsgroup methods

Signed-off-by: tedyu <yuzhihong@gmail.com>
This commit is contained in:
haxiaolin 2018-02-26 14:25:01 +08:00 committed by tedyu
parent ef02762dd8
commit 0bf33c802d
3 changed files with 40 additions and 21 deletions

View File

@ -675,9 +675,13 @@ public class RSGroupAdminServer implements RSGroupAdmin {
private void checkForDeadOrOnlineServers(Set<Address> servers) throws ConstraintException { private void checkForDeadOrOnlineServers(Set<Address> servers) throws ConstraintException {
// This ugliness is because we only have Address, not ServerName. // This ugliness is because we only have Address, not ServerName.
Set<Address> onlineServers = new HashSet<>(); Set<Address> onlineServers = new HashSet<>();
List<ServerName> drainingServers = master.getServerManager().getDrainingServersList();
for (ServerName server : master.getServerManager().getOnlineServers().keySet()) { for (ServerName server : master.getServerManager().getOnlineServers().keySet()) {
// Only online but not decommissioned servers are really online
if (!drainingServers.contains(server)) {
onlineServers.add(server.getAddress()); onlineServers.add(server.getAddress());
} }
}
Set<Address> deadServers = new HashSet<>(); Set<Address> deadServers = new HashSet<>();
for(ServerName server: master.getServerManager().getDeadServers().copyServerNames()) { for(ServerName server: master.getServerManager().getDeadServers().copyServerNames()) {

View File

@ -36,7 +36,6 @@ import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.Waiter.Predicate; import org.apache.hadoop.hbase.Waiter.Predicate;
import org.apache.hadoop.hbase.client.ClusterConnection; import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.master.snapshot.SnapshotManager; import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
import org.apache.hadoop.hbase.net.Address; import org.apache.hadoop.hbase.net.Address;
@ -66,7 +65,6 @@ public class TestRSGroups extends TestRSGroupsBase {
HBaseClassTestRule.forClass(TestRSGroups.class); HBaseClassTestRule.forClass(TestRSGroups.class);
protected static final Logger LOG = LoggerFactory.getLogger(TestRSGroups.class); protected static final Logger LOG = LoggerFactory.getLogger(TestRSGroups.class);
private static HMaster master;
private static boolean INIT = false; private static boolean INIT = false;
private static RSGroupAdminEndpoint rsGroupAdminEndpoint; private static RSGroupAdminEndpoint rsGroupAdminEndpoint;
@ -126,6 +124,11 @@ public class TestRSGroups extends TestRSGroupsBase {
deleteNamespaceIfNecessary(); deleteNamespaceIfNecessary();
deleteGroups(); deleteGroups();
for(ServerName sn : admin.listDecommissionedRegionServers()){
admin.recommissionRegionServer(sn, null);
}
assertTrue(admin.listDecommissionedRegionServers().isEmpty());
int missing = NUM_SLAVES_BASE - getNumServers(); int missing = NUM_SLAVES_BASE - getNumServers();
LOG.info("Restoring servers: "+missing); LOG.info("Restoring servers: "+missing);
for(int i=0; i<missing; i++) { for(int i=0; i<missing; i++) {

View File

@ -25,8 +25,10 @@ import static org.junit.Assert.fail;
import java.io.IOException; import java.io.IOException;
import java.security.SecureRandom; import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.EnumSet; import java.util.EnumSet;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -81,9 +83,11 @@ public abstract class TestRSGroupsBase {
protected static Admin admin; protected static Admin admin;
protected static HBaseCluster cluster; protected static HBaseCluster cluster;
protected static RSGroupAdmin rsGroupAdmin; protected static RSGroupAdmin rsGroupAdmin;
protected static HMaster master;
public final static long WAIT_TIMEOUT = 60000*5; public final static long WAIT_TIMEOUT = 60000*5;
public final static int NUM_SLAVES_BASE = 4; //number of slaves for the smallest cluster public final static int NUM_SLAVES_BASE = 4; //number of slaves for the smallest cluster
public static int NUM_DEAD_SERVERS = 0;
// Per test variables // Per test variables
TableName tableName; TableName tableName;
@ -271,10 +275,10 @@ public abstract class TestRSGroupsBase {
public int getNumServers() throws IOException { public int getNumServers() throws IOException {
ClusterMetrics status = ClusterMetrics status =
admin.getClusterMetrics(EnumSet.of(Option.MASTER, Option.LIVE_SERVERS)); admin.getClusterMetrics(EnumSet.of(Option.MASTER, Option.LIVE_SERVERS));
ServerName master = status.getMasterName(); ServerName masterName = status.getMasterName();
int count = 0; int count = 0;
for (ServerName sn : status.getLiveServerMetrics().keySet()) { for (ServerName sn : status.getLiveServerMetrics().keySet()) {
if (!sn.equals(master)) { if (!sn.equals(masterName)) {
count++; count++;
} }
} }
@ -883,6 +887,7 @@ public abstract class TestRSGroupsBase {
public void testClearDeadServers() throws Exception { public void testClearDeadServers() throws Exception {
LOG.info("testClearDeadServers"); LOG.info("testClearDeadServers");
final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 3); final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 3);
NUM_DEAD_SERVERS = cluster.getClusterMetrics().getDeadServerNames().size();
ServerName targetServer = ServerName.parseServerName( ServerName targetServer = ServerName.parseServerName(
newGroup.getServers().iterator().next().toString()); newGroup.getServers().iterator().next().toString());
@ -895,15 +900,15 @@ public abstract class TestRSGroupsBase {
//due to the connection loss //due to the connection loss
targetRS.stopServer(null, targetRS.stopServer(null,
AdminProtos.StopServerRequest.newBuilder().setReason("Die").build()); AdminProtos.StopServerRequest.newBuilder().setReason("Die").build());
NUM_DEAD_SERVERS ++;
} catch(Exception e) { } catch(Exception e) {
} }
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
//wait for stopped regionserver to appear in dead server list //wait for stopped regionserver to appear in dead server list
TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
@Override @Override
public boolean evaluate() throws Exception { public boolean evaluate() throws Exception {
return !master.getServerManager().areDeadServersInProgress() return !master.getServerManager().areDeadServersInProgress()
&& cluster.getClusterMetrics().getDeadServerNames().size() > 0; && cluster.getClusterMetrics().getDeadServerNames().size() == NUM_DEAD_SERVERS;
} }
}); });
assertFalse(cluster.getClusterMetrics().getLiveServerMetrics().containsKey(targetServer)); assertFalse(cluster.getClusterMetrics().getLiveServerMetrics().containsKey(targetServer));
@ -923,8 +928,10 @@ public abstract class TestRSGroupsBase {
public void testRemoveServers() throws Exception { public void testRemoveServers() throws Exception {
LOG.info("testRemoveServers"); LOG.info("testRemoveServers");
final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 3); final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 3);
ServerName targetServer = ServerName.parseServerName( Iterator<Address> iterator = newGroup.getServers().iterator();
newGroup.getServers().iterator().next().toString()); ServerName targetServer = ServerName.parseServerName(iterator.next().toString());
// remove online servers
try { try {
rsGroupAdmin.removeServers(Sets.newHashSet(targetServer.getAddress())); rsGroupAdmin.removeServers(Sets.newHashSet(targetServer.getAddress()));
fail("Online servers shouldn't have been successfully removed."); fail("Online servers shouldn't have been successfully removed.");
@ -936,6 +943,8 @@ public abstract class TestRSGroupsBase {
} }
assertTrue(newGroup.getServers().contains(targetServer.getAddress())); assertTrue(newGroup.getServers().contains(targetServer.getAddress()));
// remove dead servers
NUM_DEAD_SERVERS = cluster.getClusterMetrics().getDeadServerNames().size();
AdminProtos.AdminService.BlockingInterface targetRS = AdminProtos.AdminService.BlockingInterface targetRS =
((ClusterConnection) admin.getConnection()).getAdmin(targetServer); ((ClusterConnection) admin.getConnection()).getAdmin(targetServer);
try { try {
@ -943,18 +952,19 @@ public abstract class TestRSGroupsBase {
GetServerInfoRequest.newBuilder().build()).getServerInfo().getServerName()); GetServerInfoRequest.newBuilder().build()).getServerInfo().getServerName());
//stopping may cause an exception //stopping may cause an exception
//due to the connection loss //due to the connection loss
LOG.info("stopping server " + targetServer.getHostAndPort());
targetRS.stopServer(null, targetRS.stopServer(null,
AdminProtos.StopServerRequest.newBuilder().setReason("Die").build()); AdminProtos.StopServerRequest.newBuilder().setReason("Die").build());
NUM_DEAD_SERVERS ++;
} catch(Exception e) { } catch(Exception e) {
} }
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
//wait for stopped regionserver to appear in dead server list //wait for stopped regionserver to appear in dead server list
TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
@Override @Override
public boolean evaluate() throws Exception { public boolean evaluate() throws Exception {
return !master.getServerManager().areDeadServersInProgress() return !master.getServerManager().areDeadServersInProgress()
&& cluster.getClusterMetrics().getDeadServerNames().size() > 0; && cluster.getClusterMetrics().getDeadServerNames().size() == NUM_DEAD_SERVERS;
} }
}); });
@ -969,17 +979,19 @@ public abstract class TestRSGroupsBase {
} }
assertTrue(newGroup.getServers().contains(targetServer.getAddress())); assertTrue(newGroup.getServers().contains(targetServer.getAddress()));
ServerName sn = TEST_UTIL.getHBaseClusterInterface().getClusterMetrics().getMasterName(); // remove decommissioned servers
TEST_UTIL.getHBaseClusterInterface().stopMaster(sn); List<ServerName> serversToDecommission = new ArrayList<>();
TEST_UTIL.getHBaseClusterInterface().waitForMasterToStop(sn, 60000); targetServer = ServerName.parseServerName(iterator.next().toString());
TEST_UTIL.getHBaseClusterInterface().startMaster(sn.getHostname(), 0); targetRS = ((ClusterConnection) admin.getConnection()).getAdmin(targetServer);
TEST_UTIL.getHBaseClusterInterface().waitForActiveAndReadyMaster(60000); targetServer = ProtobufUtil.toServerName(targetRS.getServerInfo(null,
GetServerInfoRequest.newBuilder().build()).getServerInfo().getServerName());
assertTrue(master.getServerManager().getOnlineServers().containsKey(targetServer));
serversToDecommission.add(targetServer);
admin.decommissionRegionServers(serversToDecommission, true);
assertEquals(1, admin.listDecommissionedRegionServers().size());
assertEquals(3, cluster.getClusterMetrics().getLiveServerMetrics().size());
assertFalse(cluster.getClusterMetrics().getLiveServerMetrics().containsKey(targetServer));
assertFalse(cluster.getClusterMetrics().getDeadServerNames().contains(targetServer));
assertTrue(newGroup.getServers().contains(targetServer.getAddress())); assertTrue(newGroup.getServers().contains(targetServer.getAddress()));
rsGroupAdmin.removeServers(Sets.newHashSet(targetServer.getAddress())); rsGroupAdmin.removeServers(Sets.newHashSet(targetServer.getAddress()));
Set<Address> newGroupServers = rsGroupAdmin.getRSGroupInfo(newGroup.getName()).getServers(); Set<Address> newGroupServers = rsGroupAdmin.getRSGroupInfo(newGroup.getName()).getServers();
assertFalse(newGroupServers.contains(targetServer.getAddress())); assertFalse(newGroupServers.contains(targetServer.getAddress()));