HBASE-27104 Add a tool command list_unknownservers (#4523)

Signed-off-by: Duo Zhang <zhangduo@apache.org>
This commit is contained in:
LiangJun He 2022-08-20 21:48:58 +08:00 committed by GitHub
parent 32c135d8a5
commit 61cd63c7b6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 240 additions and 8 deletions

View File

@ -76,6 +76,9 @@ public interface ClusterMetrics {
/** Returns the names of region servers on the dead list */ /** Returns the names of region servers on the dead list */
List<ServerName> getDeadServerNames(); List<ServerName> getDeadServerNames();
/** Returns the names of region servers on the unknown list */
List<ServerName> getUnknownServerNames();
/** Returns the names of region servers on the decommissioned list */ /** Returns the names of region servers on the decommissioned list */
List<ServerName> getDecommissionedServerNames(); List<ServerName> getDecommissionedServerNames();
@ -179,6 +182,10 @@ public interface ClusterMetrics {
* metrics about dead region servers * metrics about dead region servers
*/ */
DEAD_SERVERS, DEAD_SERVERS,
/**
* metrics about unknown region servers
*/
UNKNOWN_SERVERS,
/** /**
* metrics about master name * metrics about master name
*/ */

View File

@ -47,6 +47,8 @@ public final class ClusterMetricsBuilder {
.collect(Collectors.toList())) .collect(Collectors.toList()))
.addAllDeadServers(metrics.getDeadServerNames().stream().map(ProtobufUtil::toServerName) .addAllDeadServers(metrics.getDeadServerNames().stream().map(ProtobufUtil::toServerName)
.collect(Collectors.toList())) .collect(Collectors.toList()))
.addAllUnknownServers(metrics.getUnknownServerNames().stream()
.map(ProtobufUtil::toServerName).collect(Collectors.toList()))
.addAllLiveServers(metrics.getLiveServerMetrics().entrySet().stream() .addAllLiveServers(metrics.getLiveServerMetrics().entrySet().stream()
.map(s -> ClusterStatusProtos.LiveServerInfo.newBuilder() .map(s -> ClusterStatusProtos.LiveServerInfo.newBuilder()
.setServer(ProtobufUtil.toServerName(s.getKey())) .setServer(ProtobufUtil.toServerName(s.getKey()))
@ -89,6 +91,7 @@ public final class ClusterMetricsBuilder {
builder.setHbaseVersion( builder.setHbaseVersion(
FSProtos.HBaseVersionFileContent.newBuilder().setVersion(metrics.getHBaseVersion())); FSProtos.HBaseVersionFileContent.newBuilder().setVersion(metrics.getHBaseVersion()));
} }
return builder.build(); return builder.build();
} }
@ -100,6 +103,8 @@ public final class ClusterMetricsBuilder {
ServerMetricsBuilder::toServerMetrics))) ServerMetricsBuilder::toServerMetrics)))
.setDeadServerNames(proto.getDeadServersList().stream().map(ProtobufUtil::toServerName) .setDeadServerNames(proto.getDeadServersList().stream().map(ProtobufUtil::toServerName)
.collect(Collectors.toList())) .collect(Collectors.toList()))
.setUnknownServerNames(proto.getUnknownServersList().stream().map(ProtobufUtil::toServerName)
.collect(Collectors.toList()))
.setBackerMasterNames(proto.getBackupMastersList().stream().map(ProtobufUtil::toServerName) .setBackerMasterNames(proto.getBackupMastersList().stream().map(ProtobufUtil::toServerName)
.collect(Collectors.toList())) .collect(Collectors.toList()))
.setRegionsInTransition(proto.getRegionsInTransitionList().stream() .setRegionsInTransition(proto.getRegionsInTransitionList().stream()
@ -151,6 +156,8 @@ public final class ClusterMetricsBuilder {
return ClusterMetrics.Option.LIVE_SERVERS; return ClusterMetrics.Option.LIVE_SERVERS;
case DEAD_SERVERS: case DEAD_SERVERS:
return ClusterMetrics.Option.DEAD_SERVERS; return ClusterMetrics.Option.DEAD_SERVERS;
case UNKNOWN_SERVERS:
return ClusterMetrics.Option.UNKNOWN_SERVERS;
case REGIONS_IN_TRANSITION: case REGIONS_IN_TRANSITION:
return ClusterMetrics.Option.REGIONS_IN_TRANSITION; return ClusterMetrics.Option.REGIONS_IN_TRANSITION;
case CLUSTER_ID: case CLUSTER_ID:
@ -192,6 +199,8 @@ public final class ClusterMetricsBuilder {
return ClusterStatusProtos.Option.LIVE_SERVERS; return ClusterStatusProtos.Option.LIVE_SERVERS;
case DEAD_SERVERS: case DEAD_SERVERS:
return ClusterStatusProtos.Option.DEAD_SERVERS; return ClusterStatusProtos.Option.DEAD_SERVERS;
case UNKNOWN_SERVERS:
return ClusterStatusProtos.Option.UNKNOWN_SERVERS;
case REGIONS_IN_TRANSITION: case REGIONS_IN_TRANSITION:
return ClusterStatusProtos.Option.REGIONS_IN_TRANSITION; return ClusterStatusProtos.Option.REGIONS_IN_TRANSITION;
case CLUSTER_ID: case CLUSTER_ID:
@ -246,6 +255,7 @@ public final class ClusterMetricsBuilder {
@Nullable @Nullable
private String hbaseVersion; private String hbaseVersion;
private List<ServerName> deadServerNames = Collections.emptyList(); private List<ServerName> deadServerNames = Collections.emptyList();
private List<ServerName> unknownServerNames = Collections.emptyList();
private Map<ServerName, ServerMetrics> liveServerMetrics = new TreeMap<>(); private Map<ServerName, ServerMetrics> liveServerMetrics = new TreeMap<>();
@Nullable @Nullable
private ServerName masterName; private ServerName masterName;
@ -276,6 +286,11 @@ public final class ClusterMetricsBuilder {
return this; return this;
} }
/**
 * Sets the names of the unknown region servers to be reported by the built metrics.
 * The given list reference is stored as-is; no copy is made here.
 * @param value names of the unknown region servers; must be non-null by the time
 *              {@link #build()} runs, because the ClusterMetricsImpl constructor null-checks it
 * @return this builder, to allow call chaining
 */
public ClusterMetricsBuilder setUnknownServerNames(List<ServerName> value) {
this.unknownServerNames = value;
return this;
}
public ClusterMetricsBuilder setLiveServerMetrics(Map<ServerName, ServerMetrics> value) { public ClusterMetricsBuilder setLiveServerMetrics(Map<ServerName, ServerMetrics> value) {
liveServerMetrics.putAll(value); liveServerMetrics.putAll(value);
return this; return this;
@ -338,15 +353,17 @@ public final class ClusterMetricsBuilder {
} }
public ClusterMetrics build() { public ClusterMetrics build() {
return new ClusterMetricsImpl(hbaseVersion, deadServerNames, liveServerMetrics, masterName, return new ClusterMetricsImpl(hbaseVersion, deadServerNames, unknownServerNames,
backupMasterNames, regionsInTransition, clusterId, masterCoprocessorNames, balancerOn, liveServerMetrics, masterName, backupMasterNames, regionsInTransition, clusterId,
masterInfoPort, serversName, tableRegionStatesCount, masterTasks, decommissionedServerNames); masterCoprocessorNames, balancerOn, masterInfoPort, serversName, tableRegionStatesCount,
masterTasks, decommissionedServerNames);
} }
private static class ClusterMetricsImpl implements ClusterMetrics { private static class ClusterMetricsImpl implements ClusterMetrics {
@Nullable @Nullable
private final String hbaseVersion; private final String hbaseVersion;
private final List<ServerName> deadServerNames; private final List<ServerName> deadServerNames;
private final List<ServerName> unknownServerNames;
private final List<ServerName> decommissionedServerNames; private final List<ServerName> decommissionedServerNames;
private final Map<ServerName, ServerMetrics> liveServerMetrics; private final Map<ServerName, ServerMetrics> liveServerMetrics;
@Nullable @Nullable
@ -364,13 +381,15 @@ public final class ClusterMetricsBuilder {
private final List<ServerTask> masterTasks; private final List<ServerTask> masterTasks;
ClusterMetricsImpl(String hbaseVersion, List<ServerName> deadServerNames, ClusterMetricsImpl(String hbaseVersion, List<ServerName> deadServerNames,
Map<ServerName, ServerMetrics> liveServerMetrics, ServerName masterName, List<ServerName> unknownServerNames, Map<ServerName, ServerMetrics> liveServerMetrics,
List<ServerName> backupMasterNames, List<RegionState> regionsInTransition, String clusterId, ServerName masterName, List<ServerName> backupMasterNames,
List<String> masterCoprocessorNames, Boolean balancerOn, int masterInfoPort, List<RegionState> regionsInTransition, String clusterId, List<String> masterCoprocessorNames,
List<ServerName> serversName, Map<TableName, RegionStatesCount> tableRegionStatesCount, Boolean balancerOn, int masterInfoPort, List<ServerName> serversName,
List<ServerTask> masterTasks, List<ServerName> decommissionedServerNames) { Map<TableName, RegionStatesCount> tableRegionStatesCount, List<ServerTask> masterTasks,
List<ServerName> decommissionedServerNames) {
this.hbaseVersion = hbaseVersion; this.hbaseVersion = hbaseVersion;
this.deadServerNames = Preconditions.checkNotNull(deadServerNames); this.deadServerNames = Preconditions.checkNotNull(deadServerNames);
this.unknownServerNames = Preconditions.checkNotNull(unknownServerNames);
this.decommissionedServerNames = Preconditions.checkNotNull(decommissionedServerNames); this.decommissionedServerNames = Preconditions.checkNotNull(decommissionedServerNames);
this.liveServerMetrics = Preconditions.checkNotNull(liveServerMetrics); this.liveServerMetrics = Preconditions.checkNotNull(liveServerMetrics);
this.masterName = masterName; this.masterName = masterName;
@ -395,6 +414,11 @@ public final class ClusterMetricsBuilder {
return Collections.unmodifiableList(deadServerNames); return Collections.unmodifiableList(deadServerNames);
} }
// Hands out a read-only view so callers cannot modify the list held by this instance.
@Override
public List<ServerName> getUnknownServerNames() {
return Collections.unmodifiableList(unknownServerNames);
}
@Override @Override
public List<ServerName> getDecommissionedServerNames() { public List<ServerName> getDecommissionedServerNames() {
return Collections.unmodifiableList(decommissionedServerNames); return Collections.unmodifiableList(decommissionedServerNames);
@ -490,6 +514,14 @@ public final class ClusterMetricsBuilder {
} }
} }
int unknownServerSize = getUnknownServerNames().size();
sb.append("\nNumber of unknown region servers: " + unknownServerSize);
if (unknownServerSize > 0) {
for (ServerName serverName : getUnknownServerNames()) {
sb.append("\n " + serverName);
}
}
sb.append("\nAverage load: " + getAverageLoad()); sb.append("\nAverage load: " + getAverageLoad());
sb.append("\nNumber of requests: " + getRequestCount()); sb.append("\nNumber of requests: " + getRequestCount());
sb.append("\nNumber of regions: " + getRegionCount()); sb.append("\nNumber of regions: " + getRegionCount());

View File

@ -2228,6 +2228,14 @@ public interface Admin extends Abortable, Closeable {
return getClusterMetrics(EnumSet.of(Option.DEAD_SERVERS)).getDeadServerNames(); return getClusterMetrics(EnumSet.of(Option.DEAD_SERVERS)).getDeadServerNames();
} }
/**
 * Fetch the region servers that the cluster metrics report as unknown.
 * <p>
 * Only the {@link Option#UNKNOWN_SERVERS} section of the cluster metrics is requested.
 * @return the names of the unknown region servers
 * @throws IOException if the cluster metrics cannot be retrieved
 */
default List<ServerName> listUnknownServers() throws IOException {
  final EnumSet<Option> unknownOnly = EnumSet.of(Option.UNKNOWN_SERVERS);
  return getClusterMetrics(unknownOnly).getUnknownServerNames();
}
/** /**
* Clear dead region servers from master. * Clear dead region servers from master.
* @param servers list of dead region servers. * @param servers list of dead region servers.

View File

@ -1455,6 +1455,14 @@ public interface AsyncAdmin {
.thenApply(ClusterMetrics::getDeadServerNames); .thenApply(ClusterMetrics::getDeadServerNames);
} }
/**
 * List all the unknown region servers.
 * <p>
 * Only the {@link Option#UNKNOWN_SERVERS} section of the cluster metrics is requested.
 * @return a {@link CompletableFuture} completed with the names of the unknown region servers
 */
default CompletableFuture<List<ServerName>> listUnknownServers() {
  final EnumSet<Option> unknownOnly = EnumSet.of(Option.UNKNOWN_SERVERS);
  return getClusterMetrics(unknownOnly).thenApply(metrics -> metrics.getUnknownServerNames());
}
/** /**
* Clear dead region servers from master. * Clear dead region servers from master.
* @param servers list of dead region servers. * @param servers list of dead region servers.

View File

@ -775,6 +775,11 @@ class AsyncHBaseAdmin implements AsyncAdmin {
return wrap(rawAdmin.listDeadServers()); return wrap(rawAdmin.listDeadServers());
} }
// Delegates to the raw admin and passes the resulting future through wrap(), like the
// neighbouring admin calls in this class.
@Override
public CompletableFuture<List<ServerName>> listUnknownServers() {
  final CompletableFuture<List<ServerName>> rawResult = rawAdmin.listUnknownServers();
  return wrap(rawResult);
}
@Override @Override
public CompletableFuture<List<ServerName>> clearDeadServers(List<ServerName> servers) { public CompletableFuture<List<ServerName>> clearDeadServers(List<ServerName> servers) {
return wrap(rawAdmin.clearDeadServers(servers)); return wrap(rawAdmin.clearDeadServers(servers));

View File

@ -350,6 +350,7 @@ message ClusterStatus {
repeated TableRegionStatesCount table_region_states_count = 12; repeated TableRegionStatesCount table_region_states_count = 12;
repeated ServerTask master_tasks = 13; repeated ServerTask master_tasks = 13;
repeated ServerName decommissioned_servers = 14; repeated ServerName decommissioned_servers = 14;
repeated ServerName unknown_servers = 15;
} }
enum Option { enum Option {
@ -367,4 +368,5 @@ enum Option {
TABLE_TO_REGIONS_COUNT = 11; TABLE_TO_REGIONS_COUNT = 11;
TASKS = 12; TASKS = 12;
DECOMMISSIONED_SERVERS = 13; DECOMMISSIONED_SERVERS = 13;
UNKNOWN_SERVERS = 14;
} }

View File

@ -2863,6 +2863,12 @@ public class HMaster extends HBaseServerBase<MasterRpcServices> implements Maste
} }
break; break;
} }
case UNKNOWN_SERVERS: {
if (serverManager != null) {
builder.setUnknownServerNames(getUnknownServers());
}
break;
}
case MASTER_COPROCESSORS: { case MASTER_COPROCESSORS: {
if (cpHost != null) { if (cpHost != null) {
builder.setMasterCoprocessorNames(Arrays.asList(getMasterCoprocessors())); builder.setMasterCoprocessorNames(Arrays.asList(getMasterCoprocessors()));
@ -2928,6 +2934,17 @@ public class HMaster extends HBaseServerBase<MasterRpcServices> implements Maste
return builder.build(); return builder.build();
} }
/**
 * Collects the distinct servers referenced by the current region states that the
 * {@link ServerManager} reports as unknown.
 * @return the unknown servers; an empty list (never {@code null}) when the server manager is not
 *         available
 */
private List<ServerName> getUnknownServers() {
  if (serverManager == null) {
    // Return an empty list rather than null: the value ends up in ClusterMetricsBuilder, whose
    // built ClusterMetricsImpl rejects a null unknown-server list.
    return java.util.Collections.emptyList();
  }
  // Go through a Set first so each server is checked once, however many regions reference it.
  final Set<ServerName> serverNames = getAssignmentManager().getRegionStates().getRegionStates()
    .stream().map(RegionState::getServerName).collect(Collectors.toSet());
  // A region state may carry no server name; skip those before asking the server manager.
  return serverNames.stream().filter(sn -> sn != null && serverManager.isServerUnknown(sn))
    .collect(Collectors.toList());
}
private Map<ServerName, ServerMetrics> getOnlineServers() { private Map<ServerName, ServerMetrics> getOnlineServers() {
if (serverManager != null) { if (serverManager != null) {
final Map<ServerName, ServerMetrics> map = new HashMap<>(); final Map<ServerName, ServerMetrics> map = new HashMap<>();

View File

@ -227,6 +227,11 @@ public class TestRegionsRecoveryChore {
return null; return null;
} }
// Stub implementation: returns null, like the neighbouring stubbed getters of this test
// ClusterMetrics.
@Override
public List<ServerName> getUnknownServerNames() {
return null;
}
@Override @Override
public List<ServerName> getDecommissionedServerNames() { public List<ServerName> getDecommissionedServerNames() {
return null; return null;

View File

@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
/**
 * Verifies {@link Admin#listUnknownServers()} against a mini cluster whose master uses a
 * {@link ServerManager} that reports every server as unknown (or as known) depending on a flag
 * controlled by the test.
 */
@Category(MediumTests.class)
public class TestUnknownServers {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestUnknownServers.class);

  private static HBaseTestingUtil UTIL;
  private static Admin ADMIN;
  private static final int SLAVES = 1;

  /**
   * Answer returned by {@link ServerManagerForTest#isServerUnknown(ServerName)}. It is written by
   * the test and read from inside the master's server manager, so it is volatile to make the
   * update visible across threads.
   */
  private static volatile boolean IS_UNKNOWN_SERVER = true;

  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
    UTIL = new HBaseTestingUtil();
    // Swap in a master whose ServerManager answer is controlled by IS_UNKNOWN_SERVER.
    UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL,
      TestUnknownServers.HMasterForTest.class, HMaster.class);
    UTIL.startMiniCluster(SLAVES);
    ADMIN = UTIL.getAdmin();
  }

  @Test
  public void testListUnknownServers() throws Exception {
    // While every server is reported as unknown, the region server must be listed.
    Assert.assertEquals(SLAVES, ADMIN.listUnknownServers().size());
    IS_UNKNOWN_SERVER = false;
    // Once no server is reported as unknown, the list must be empty.
    Assert.assertEquals(0, ADMIN.listUnknownServers().size());
  }

  @AfterClass
  public static void tearDownAfterClass() throws Exception {
    if (ADMIN != null) {
      ADMIN.close();
    }
    if (UTIL != null) {
      UTIL.shutdownMiniCluster();
    }
  }

  /** Master that installs {@link ServerManagerForTest} as its server manager. */
  public static final class HMasterForTest extends HMaster {

    public HMasterForTest(Configuration conf) throws IOException {
      super(conf);
    }

    @Override
    protected ServerManager createServerManager(MasterServices master, RegionServerList storage)
      throws IOException {
      setupClusterConnection();
      return new TestUnknownServers.ServerManagerForTest(master, storage);
    }
  }

  /** Server manager whose "unknown server" answer is dictated by {@link #IS_UNKNOWN_SERVER}. */
  private static final class ServerManagerForTest extends ServerManager {

    public ServerManagerForTest(MasterServices master, RegionServerList storage) {
      super(master, storage);
    }

    @Override
    public boolean isServerUnknown(ServerName serverName) {
      return IS_UNKNOWN_SERVER;
    }
  }
}

View File

@ -1631,6 +1631,12 @@ module Hbase
@admin.clearDeadServers(servers).to_a @admin.clearDeadServers(servers).to_a
end end
#----------------------------------------------------------------------------------------------
# List unknown region servers.
# Returns the result of @admin.listUnknownServers converted to a Ruby array.
def list_unknownservers
@admin.listUnknownServers.to_a
end
#---------------------------------------------------------------------------------------------- #----------------------------------------------------------------------------------------------
# List live region servers # List live region servers
def list_liveservers def list_liveservers

View File

@ -475,6 +475,7 @@ Shell.load_command_group(
clear_compaction_queues clear_compaction_queues
list_deadservers list_deadservers
list_liveservers list_liveservers
list_unknownservers
clear_deadservers clear_deadservers
clear_block_cache clear_block_cache
stop_master stop_master

View File

@ -0,0 +1,44 @@
#
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
module Shell
  module Commands
    # Shell command that prints every region server returned by admin.list_unknownservers.
    class ListUnknownservers < Command
      # Help text shown for this command.
      def help
        <<~EOF
          List all unknown region servers
          Examples:
          hbase> list_unknownservers
        EOF
      end

      # Prints a SERVERNAME header, one row per unknown server, and a footer carrying the
      # number of servers listed.
      def command
        formatter.header(['SERVERNAME'])
        servers = admin.list_unknownservers
        servers.each do |server|
          formatter.row([server.toString])
        end
        formatter.footer(servers.size)
      end
    end
  end
end