HBASE-19974 Fix decommissioned servers cannot be removed by remove_servers_rsgroup methods
Signed-off-by: tedyu <yuzhihong@gmail.com>
This commit is contained in:
parent
ef02762dd8
commit
0bf33c802d
|
@ -675,9 +675,13 @@ public class RSGroupAdminServer implements RSGroupAdmin {
|
||||||
private void checkForDeadOrOnlineServers(Set<Address> servers) throws ConstraintException {
|
private void checkForDeadOrOnlineServers(Set<Address> servers) throws ConstraintException {
|
||||||
// This ugliness is because we only have Address, not ServerName.
|
// This ugliness is because we only have Address, not ServerName.
|
||||||
Set<Address> onlineServers = new HashSet<>();
|
Set<Address> onlineServers = new HashSet<>();
|
||||||
|
List<ServerName> drainingServers = master.getServerManager().getDrainingServersList();
|
||||||
for (ServerName server : master.getServerManager().getOnlineServers().keySet()) {
|
for (ServerName server : master.getServerManager().getOnlineServers().keySet()) {
|
||||||
|
// Only online but not decommissioned servers are really online
|
||||||
|
if (!drainingServers.contains(server)) {
|
||||||
onlineServers.add(server.getAddress());
|
onlineServers.add(server.getAddress());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Set<Address> deadServers = new HashSet<>();
|
Set<Address> deadServers = new HashSet<>();
|
||||||
for(ServerName server: master.getServerManager().getDeadServers().copyServerNames()) {
|
for(ServerName server: master.getServerManager().getDeadServers().copyServerNames()) {
|
||||||
|
|
|
@ -36,7 +36,6 @@ import org.apache.hadoop.hbase.Waiter;
|
||||||
import org.apache.hadoop.hbase.Waiter.Predicate;
|
import org.apache.hadoop.hbase.Waiter.Predicate;
|
||||||
import org.apache.hadoop.hbase.client.ClusterConnection;
|
import org.apache.hadoop.hbase.client.ClusterConnection;
|
||||||
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
|
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
|
||||||
import org.apache.hadoop.hbase.master.HMaster;
|
|
||||||
import org.apache.hadoop.hbase.master.ServerManager;
|
import org.apache.hadoop.hbase.master.ServerManager;
|
||||||
import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
|
import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
|
||||||
import org.apache.hadoop.hbase.net.Address;
|
import org.apache.hadoop.hbase.net.Address;
|
||||||
|
@ -66,7 +65,6 @@ public class TestRSGroups extends TestRSGroupsBase {
|
||||||
HBaseClassTestRule.forClass(TestRSGroups.class);
|
HBaseClassTestRule.forClass(TestRSGroups.class);
|
||||||
|
|
||||||
protected static final Logger LOG = LoggerFactory.getLogger(TestRSGroups.class);
|
protected static final Logger LOG = LoggerFactory.getLogger(TestRSGroups.class);
|
||||||
private static HMaster master;
|
|
||||||
private static boolean INIT = false;
|
private static boolean INIT = false;
|
||||||
private static RSGroupAdminEndpoint rsGroupAdminEndpoint;
|
private static RSGroupAdminEndpoint rsGroupAdminEndpoint;
|
||||||
|
|
||||||
|
@ -126,6 +124,11 @@ public class TestRSGroups extends TestRSGroupsBase {
|
||||||
deleteNamespaceIfNecessary();
|
deleteNamespaceIfNecessary();
|
||||||
deleteGroups();
|
deleteGroups();
|
||||||
|
|
||||||
|
for(ServerName sn : admin.listDecommissionedRegionServers()){
|
||||||
|
admin.recommissionRegionServer(sn, null);
|
||||||
|
}
|
||||||
|
assertTrue(admin.listDecommissionedRegionServers().isEmpty());
|
||||||
|
|
||||||
int missing = NUM_SLAVES_BASE - getNumServers();
|
int missing = NUM_SLAVES_BASE - getNumServers();
|
||||||
LOG.info("Restoring servers: "+missing);
|
LOG.info("Restoring servers: "+missing);
|
||||||
for(int i=0; i<missing; i++) {
|
for(int i=0; i<missing; i++) {
|
||||||
|
|
|
@ -25,8 +25,10 @@ import static org.junit.Assert.fail;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.security.SecureRandom;
|
import java.security.SecureRandom;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.EnumSet;
|
import java.util.EnumSet;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.Iterator;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -81,9 +83,11 @@ public abstract class TestRSGroupsBase {
|
||||||
protected static Admin admin;
|
protected static Admin admin;
|
||||||
protected static HBaseCluster cluster;
|
protected static HBaseCluster cluster;
|
||||||
protected static RSGroupAdmin rsGroupAdmin;
|
protected static RSGroupAdmin rsGroupAdmin;
|
||||||
|
protected static HMaster master;
|
||||||
|
|
||||||
public final static long WAIT_TIMEOUT = 60000*5;
|
public final static long WAIT_TIMEOUT = 60000*5;
|
||||||
public final static int NUM_SLAVES_BASE = 4; //number of slaves for the smallest cluster
|
public final static int NUM_SLAVES_BASE = 4; //number of slaves for the smallest cluster
|
||||||
|
public static int NUM_DEAD_SERVERS = 0;
|
||||||
|
|
||||||
// Per test variables
|
// Per test variables
|
||||||
TableName tableName;
|
TableName tableName;
|
||||||
|
@ -271,10 +275,10 @@ public abstract class TestRSGroupsBase {
|
||||||
public int getNumServers() throws IOException {
|
public int getNumServers() throws IOException {
|
||||||
ClusterMetrics status =
|
ClusterMetrics status =
|
||||||
admin.getClusterMetrics(EnumSet.of(Option.MASTER, Option.LIVE_SERVERS));
|
admin.getClusterMetrics(EnumSet.of(Option.MASTER, Option.LIVE_SERVERS));
|
||||||
ServerName master = status.getMasterName();
|
ServerName masterName = status.getMasterName();
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for (ServerName sn : status.getLiveServerMetrics().keySet()) {
|
for (ServerName sn : status.getLiveServerMetrics().keySet()) {
|
||||||
if (!sn.equals(master)) {
|
if (!sn.equals(masterName)) {
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -883,6 +887,7 @@ public abstract class TestRSGroupsBase {
|
||||||
public void testClearDeadServers() throws Exception {
|
public void testClearDeadServers() throws Exception {
|
||||||
LOG.info("testClearDeadServers");
|
LOG.info("testClearDeadServers");
|
||||||
final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 3);
|
final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 3);
|
||||||
|
NUM_DEAD_SERVERS = cluster.getClusterMetrics().getDeadServerNames().size();
|
||||||
|
|
||||||
ServerName targetServer = ServerName.parseServerName(
|
ServerName targetServer = ServerName.parseServerName(
|
||||||
newGroup.getServers().iterator().next().toString());
|
newGroup.getServers().iterator().next().toString());
|
||||||
|
@ -895,15 +900,15 @@ public abstract class TestRSGroupsBase {
|
||||||
//due to the connection loss
|
//due to the connection loss
|
||||||
targetRS.stopServer(null,
|
targetRS.stopServer(null,
|
||||||
AdminProtos.StopServerRequest.newBuilder().setReason("Die").build());
|
AdminProtos.StopServerRequest.newBuilder().setReason("Die").build());
|
||||||
|
NUM_DEAD_SERVERS ++;
|
||||||
} catch(Exception e) {
|
} catch(Exception e) {
|
||||||
}
|
}
|
||||||
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
|
|
||||||
//wait for stopped regionserver to be added to the dead server list
|
//wait for stopped regionserver to be added to the dead server list
|
||||||
TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
|
TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
|
||||||
@Override
|
@Override
|
||||||
public boolean evaluate() throws Exception {
|
public boolean evaluate() throws Exception {
|
||||||
return !master.getServerManager().areDeadServersInProgress()
|
return !master.getServerManager().areDeadServersInProgress()
|
||||||
&& cluster.getClusterMetrics().getDeadServerNames().size() > 0;
|
&& cluster.getClusterMetrics().getDeadServerNames().size() == NUM_DEAD_SERVERS;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
assertFalse(cluster.getClusterMetrics().getLiveServerMetrics().containsKey(targetServer));
|
assertFalse(cluster.getClusterMetrics().getLiveServerMetrics().containsKey(targetServer));
|
||||||
|
@ -923,8 +928,10 @@ public abstract class TestRSGroupsBase {
|
||||||
public void testRemoveServers() throws Exception {
|
public void testRemoveServers() throws Exception {
|
||||||
LOG.info("testRemoveServers");
|
LOG.info("testRemoveServers");
|
||||||
final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 3);
|
final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 3);
|
||||||
ServerName targetServer = ServerName.parseServerName(
|
Iterator<Address> iterator = newGroup.getServers().iterator();
|
||||||
newGroup.getServers().iterator().next().toString());
|
ServerName targetServer = ServerName.parseServerName(iterator.next().toString());
|
||||||
|
|
||||||
|
// remove online servers
|
||||||
try {
|
try {
|
||||||
rsGroupAdmin.removeServers(Sets.newHashSet(targetServer.getAddress()));
|
rsGroupAdmin.removeServers(Sets.newHashSet(targetServer.getAddress()));
|
||||||
fail("Online servers shouldn't have been successfully removed.");
|
fail("Online servers shouldn't have been successfully removed.");
|
||||||
|
@ -936,6 +943,8 @@ public abstract class TestRSGroupsBase {
|
||||||
}
|
}
|
||||||
assertTrue(newGroup.getServers().contains(targetServer.getAddress()));
|
assertTrue(newGroup.getServers().contains(targetServer.getAddress()));
|
||||||
|
|
||||||
|
// remove dead servers
|
||||||
|
NUM_DEAD_SERVERS = cluster.getClusterMetrics().getDeadServerNames().size();
|
||||||
AdminProtos.AdminService.BlockingInterface targetRS =
|
AdminProtos.AdminService.BlockingInterface targetRS =
|
||||||
((ClusterConnection) admin.getConnection()).getAdmin(targetServer);
|
((ClusterConnection) admin.getConnection()).getAdmin(targetServer);
|
||||||
try {
|
try {
|
||||||
|
@ -943,18 +952,19 @@ public abstract class TestRSGroupsBase {
|
||||||
GetServerInfoRequest.newBuilder().build()).getServerInfo().getServerName());
|
GetServerInfoRequest.newBuilder().build()).getServerInfo().getServerName());
|
||||||
//stopping may cause an exception
|
//stopping may cause an exception
|
||||||
//due to the connection loss
|
//due to the connection loss
|
||||||
|
LOG.info("stopping server " + targetServer.getHostAndPort());
|
||||||
targetRS.stopServer(null,
|
targetRS.stopServer(null,
|
||||||
AdminProtos.StopServerRequest.newBuilder().setReason("Die").build());
|
AdminProtos.StopServerRequest.newBuilder().setReason("Die").build());
|
||||||
|
NUM_DEAD_SERVERS ++;
|
||||||
} catch(Exception e) {
|
} catch(Exception e) {
|
||||||
}
|
}
|
||||||
|
|
||||||
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
|
|
||||||
//wait for stopped regionserver to be added to the dead server list
|
//wait for stopped regionserver to be added to the dead server list
|
||||||
TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
|
TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
|
||||||
@Override
|
@Override
|
||||||
public boolean evaluate() throws Exception {
|
public boolean evaluate() throws Exception {
|
||||||
return !master.getServerManager().areDeadServersInProgress()
|
return !master.getServerManager().areDeadServersInProgress()
|
||||||
&& cluster.getClusterMetrics().getDeadServerNames().size() > 0;
|
&& cluster.getClusterMetrics().getDeadServerNames().size() == NUM_DEAD_SERVERS;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -969,17 +979,19 @@ public abstract class TestRSGroupsBase {
|
||||||
}
|
}
|
||||||
assertTrue(newGroup.getServers().contains(targetServer.getAddress()));
|
assertTrue(newGroup.getServers().contains(targetServer.getAddress()));
|
||||||
|
|
||||||
ServerName sn = TEST_UTIL.getHBaseClusterInterface().getClusterMetrics().getMasterName();
|
// remove decommissioned servers
|
||||||
TEST_UTIL.getHBaseClusterInterface().stopMaster(sn);
|
List<ServerName> serversToDecommission = new ArrayList<>();
|
||||||
TEST_UTIL.getHBaseClusterInterface().waitForMasterToStop(sn, 60000);
|
targetServer = ServerName.parseServerName(iterator.next().toString());
|
||||||
TEST_UTIL.getHBaseClusterInterface().startMaster(sn.getHostname(), 0);
|
targetRS = ((ClusterConnection) admin.getConnection()).getAdmin(targetServer);
|
||||||
TEST_UTIL.getHBaseClusterInterface().waitForActiveAndReadyMaster(60000);
|
targetServer = ProtobufUtil.toServerName(targetRS.getServerInfo(null,
|
||||||
|
GetServerInfoRequest.newBuilder().build()).getServerInfo().getServerName());
|
||||||
|
assertTrue(master.getServerManager().getOnlineServers().containsKey(targetServer));
|
||||||
|
serversToDecommission.add(targetServer);
|
||||||
|
|
||||||
|
admin.decommissionRegionServers(serversToDecommission, true);
|
||||||
|
assertEquals(1, admin.listDecommissionedRegionServers().size());
|
||||||
|
|
||||||
assertEquals(3, cluster.getClusterMetrics().getLiveServerMetrics().size());
|
|
||||||
assertFalse(cluster.getClusterMetrics().getLiveServerMetrics().containsKey(targetServer));
|
|
||||||
assertFalse(cluster.getClusterMetrics().getDeadServerNames().contains(targetServer));
|
|
||||||
assertTrue(newGroup.getServers().contains(targetServer.getAddress()));
|
assertTrue(newGroup.getServers().contains(targetServer.getAddress()));
|
||||||
|
|
||||||
rsGroupAdmin.removeServers(Sets.newHashSet(targetServer.getAddress()));
|
rsGroupAdmin.removeServers(Sets.newHashSet(targetServer.getAddress()));
|
||||||
Set<Address> newGroupServers = rsGroupAdmin.getRSGroupInfo(newGroup.getName()).getServers();
|
Set<Address> newGroupServers = rsGroupAdmin.getRSGroupInfo(newGroup.getName()).getServers();
|
||||||
assertFalse(newGroupServers.contains(targetServer.getAddress()));
|
assertFalse(newGroupServers.contains(targetServer.getAddress()));
|
||||||
|
|
Loading…
Reference in New Issue