HDFS-15757 RBF: Improving Router Connection Management (#2651)
This commit is contained in:
parent
2960d83c25
commit
43bf009112
|
@ -76,6 +76,21 @@ public interface FederationRPCMBean {
|
||||||
*/
|
*/
|
||||||
int getRpcClientNumActiveConnections();
|
int getRpcClientNumActiveConnections();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the number of idle RPC connections between the Router and the NNs.
|
||||||
|
* @return Number of idle RPC connections between the Router and the NNs.
|
||||||
|
*/
|
||||||
|
int getRpcClientNumIdleConnections();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the number of recently active RPC connections between
|
||||||
|
* the Router and the NNs.
|
||||||
|
*
|
||||||
|
* @return Number of recently active RPC connections between
|
||||||
|
* the Router and the NNs.
|
||||||
|
*/
|
||||||
|
int getRpcClientNumActiveConnectionsRecently();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the number of RPC connections to be created.
|
* Get the number of RPC connections to be created.
|
||||||
* @return Number of RPC connections to be created.
|
* @return Number of RPC connections to be created.
|
||||||
|
|
|
@ -208,6 +208,16 @@ public class FederationRPCMetrics implements FederationRPCMBean {
|
||||||
return rpcServer.getRPCClient().getNumActiveConnections();
|
return rpcServer.getRPCClient().getNumActiveConnections();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getRpcClientNumIdleConnections() {
|
||||||
|
return rpcServer.getRPCClient().getNumIdleConnections();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getRpcClientNumActiveConnectionsRecently() {
|
||||||
|
return rpcServer.getRPCClient().getNumActiveConnectionsRecently();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int getRpcClientNumCreatingConnections() {
|
public int getRpcClientNumCreatingConnections() {
|
||||||
return rpcServer.getRPCClient().getNumCreatingConnections();
|
return rpcServer.getRPCClient().getNumCreatingConnections();
|
||||||
|
|
|
@ -18,9 +18,13 @@
|
||||||
package org.apache.hadoop.hdfs.server.federation.router;
|
package org.apache.hadoop.hdfs.server.federation.router;
|
||||||
|
|
||||||
import java.net.InetSocketAddress;
|
import java.net.InetSocketAddress;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
import org.apache.hadoop.hdfs.NameNodeProxiesClient.ProxyAndInfo;
|
import org.apache.hadoop.hdfs.NameNodeProxiesClient.ProxyAndInfo;
|
||||||
import org.apache.hadoop.ipc.RPC;
|
import org.apache.hadoop.ipc.RPC;
|
||||||
|
import org.apache.hadoop.util.Time;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Context to track a connection in a {@link ConnectionPool}. When a client uses
|
* Context to track a connection in a {@link ConnectionPool}. When a client uses
|
||||||
|
@ -36,13 +40,19 @@ import org.apache.hadoop.ipc.RPC;
|
||||||
*/
|
*/
|
||||||
public class ConnectionContext {
|
public class ConnectionContext {
|
||||||
|
|
||||||
|
private static final Logger LOG =
|
||||||
|
LoggerFactory.getLogger(ConnectionContext.class);
|
||||||
|
|
||||||
/** Client for the connection. */
|
/** Client for the connection. */
|
||||||
private final ProxyAndInfo<?> client;
|
private final ProxyAndInfo<?> client;
|
||||||
/** How many threads are using this connection. */
|
/** How many threads are using this connection. */
|
||||||
private int numThreads = 0;
|
private int numThreads = 0;
|
||||||
/** If the connection is closed. */
|
/** If the connection is closed. */
|
||||||
private boolean closed = false;
|
private boolean closed = false;
|
||||||
|
/** Last timestamp the connection was active. */
|
||||||
|
private long lastActiveTs = 0;
|
||||||
|
/** The connection's active status would expire after this window. */
|
||||||
|
private final static long ACTIVE_WINDOW_TIME = TimeUnit.SECONDS.toMillis(30);
|
||||||
|
|
||||||
public ConnectionContext(ProxyAndInfo<?> connection) {
|
public ConnectionContext(ProxyAndInfo<?> connection) {
|
||||||
this.client = connection;
|
this.client = connection;
|
||||||
|
@ -57,6 +67,16 @@ public class ConnectionContext {
|
||||||
return this.numThreads > 0;
|
return this.numThreads > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if the connection is/was active recently.
|
||||||
|
*
|
||||||
|
* @return True if the connection is active or
|
||||||
|
* was active in the past period of time.
|
||||||
|
*/
|
||||||
|
public synchronized boolean isActiveRecently() {
|
||||||
|
return Time.monotonicNow() - this.lastActiveTs <= ACTIVE_WINDOW_TIME;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if the connection is closed.
|
* Check if the connection is closed.
|
||||||
*
|
*
|
||||||
|
@ -83,30 +103,41 @@ public class ConnectionContext {
|
||||||
*/
|
*/
|
||||||
public synchronized ProxyAndInfo<?> getClient() {
|
public synchronized ProxyAndInfo<?> getClient() {
|
||||||
this.numThreads++;
|
this.numThreads++;
|
||||||
|
this.lastActiveTs = Time.monotonicNow();
|
||||||
return this.client;
|
return this.client;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Release this connection. If the connection was closed, close the proxy.
|
* Release this connection.
|
||||||
* Otherwise, mark the connection as not used by us anymore.
|
|
||||||
*/
|
*/
|
||||||
public synchronized void release() {
|
public synchronized void release() {
|
||||||
if (--this.numThreads == 0 && this.closed) {
|
if (this.numThreads > 0) {
|
||||||
close();
|
this.numThreads--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* We will not use this connection anymore. If it's not being used, we close
|
* Close a connection. Only idle connections can be closed since
|
||||||
* it. Otherwise, we let release() do it once we are done with it.
|
* the RPC proxy would be shut down immediately.
|
||||||
|
*
|
||||||
|
* @param force whether the connection should be closed anyway.
|
||||||
*/
|
*/
|
||||||
public synchronized void close() {
|
public synchronized void close(boolean force) {
|
||||||
this.closed = true;
|
if (!force && this.numThreads > 0) {
|
||||||
if (this.numThreads == 0) {
|
// this is an erroneous case but we have to close the connection
|
||||||
Object proxy = this.client.getProxy();
|
// anyway since there will be connection leak if we don't do so
|
||||||
// Nobody should be using this anymore so it should close right away
|
// the connection has been moved out of the pool
|
||||||
RPC.stopProxy(proxy);
|
LOG.error("Active connection with {} handlers will be closed",
|
||||||
|
this.numThreads);
|
||||||
}
|
}
|
||||||
|
this.closed = true;
|
||||||
|
Object proxy = this.client.getProxy();
|
||||||
|
// Nobody should be using this anymore so it should close right away
|
||||||
|
RPC.stopProxy(proxy);
|
||||||
|
}
|
||||||
|
|
||||||
|
public synchronized void close() {
|
||||||
|
close(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -281,6 +281,42 @@ public class ConnectionManager {
|
||||||
return total;
|
return total;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get number of idle connections.
|
||||||
|
*
|
||||||
|
* @return Number of active connections.
|
||||||
|
*/
|
||||||
|
public int getNumIdleConnections() {
|
||||||
|
int total = 0;
|
||||||
|
readLock.lock();
|
||||||
|
try {
|
||||||
|
for (ConnectionPool pool : this.pools.values()) {
|
||||||
|
total += pool.getNumIdleConnections();
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
readLock.unlock();
|
||||||
|
}
|
||||||
|
return total;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get number of recently active connections.
|
||||||
|
*
|
||||||
|
* @return Number of recently active connections.
|
||||||
|
*/
|
||||||
|
public int getNumActiveConnectionsRecently() {
|
||||||
|
int total = 0;
|
||||||
|
readLock.lock();
|
||||||
|
try {
|
||||||
|
for (ConnectionPool pool : this.pools.values()) {
|
||||||
|
total += pool.getNumActiveConnectionsRecently();
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
readLock.unlock();
|
||||||
|
}
|
||||||
|
return total;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the number of connections to be created.
|
* Get the number of connections to be created.
|
||||||
*
|
*
|
||||||
|
@ -327,12 +363,21 @@ public class ConnectionManager {
|
||||||
// Check if the pool hasn't been active in a while or not 50% are used
|
// Check if the pool hasn't been active in a while or not 50% are used
|
||||||
long timeSinceLastActive = Time.now() - pool.getLastActiveTime();
|
long timeSinceLastActive = Time.now() - pool.getLastActiveTime();
|
||||||
int total = pool.getNumConnections();
|
int total = pool.getNumConnections();
|
||||||
int active = pool.getNumActiveConnections();
|
// Active is a transient status in many cases for a connection since
|
||||||
|
// the handler thread uses the connection very quickly. Thus the number
|
||||||
|
// of connections with handlers using at the call time is constantly low.
|
||||||
|
// Recently active is more lasting status and it shows how many
|
||||||
|
// connections have been used with a recent time period. (i.e. 30 seconds)
|
||||||
|
int active = pool.getNumActiveConnectionsRecently();
|
||||||
float poolMinActiveRatio = pool.getMinActiveRatio();
|
float poolMinActiveRatio = pool.getMinActiveRatio();
|
||||||
if (timeSinceLastActive > connectionCleanupPeriodMs ||
|
if (timeSinceLastActive > connectionCleanupPeriodMs ||
|
||||||
active < poolMinActiveRatio * total) {
|
active < poolMinActiveRatio * total) {
|
||||||
// Remove and close 1 connection
|
// Be greedy here to close as many connections as possible in one shot
|
||||||
List<ConnectionContext> conns = pool.removeConnections(1);
|
// The number should at least be 1
|
||||||
|
int targetConnectionsCount = Math.max(1,
|
||||||
|
(int)(poolMinActiveRatio * total) - active);
|
||||||
|
List<ConnectionContext> conns =
|
||||||
|
pool.removeConnections(targetConnectionsCount);
|
||||||
for (ConnectionContext conn : conns) {
|
for (ConnectionContext conn : conns) {
|
||||||
conn.close();
|
conn.close();
|
||||||
}
|
}
|
||||||
|
@ -414,7 +459,7 @@ public class ConnectionManager {
|
||||||
ConnectionPool pool = this.queue.take();
|
ConnectionPool pool = this.queue.take();
|
||||||
try {
|
try {
|
||||||
int total = pool.getNumConnections();
|
int total = pool.getNumConnections();
|
||||||
int active = pool.getNumActiveConnections();
|
int active = pool.getNumActiveConnectionsRecently();
|
||||||
float poolMinActiveRatio = pool.getMinActiveRatio();
|
float poolMinActiveRatio = pool.getMinActiveRatio();
|
||||||
if (pool.getNumConnections() < pool.getMaxSize() &&
|
if (pool.getNumConnections() < pool.getMaxSize() &&
|
||||||
active >= poolMinActiveRatio * total) {
|
active >= poolMinActiveRatio * total) {
|
||||||
|
|
|
@ -252,19 +252,23 @@ public class ConnectionPool {
|
||||||
*/
|
*/
|
||||||
public synchronized List<ConnectionContext> removeConnections(int num) {
|
public synchronized List<ConnectionContext> removeConnections(int num) {
|
||||||
List<ConnectionContext> removed = new LinkedList<>();
|
List<ConnectionContext> removed = new LinkedList<>();
|
||||||
|
if (this.connections.size() > this.minSize) {
|
||||||
// Remove and close the last connection
|
int targetCount = Math.min(num, this.connections.size() - this.minSize);
|
||||||
List<ConnectionContext> tmpConnections = new ArrayList<>();
|
// Remove and close targetCount of connections
|
||||||
for (int i=0; i<this.connections.size(); i++) {
|
List<ConnectionContext> tmpConnections = new ArrayList<>();
|
||||||
ConnectionContext conn = this.connections.get(i);
|
for (int i = 0; i < this.connections.size(); i++) {
|
||||||
if (i < this.minSize || i < this.connections.size() - num) {
|
ConnectionContext conn = this.connections.get(i);
|
||||||
tmpConnections.add(conn);
|
// Only pick idle connections to close
|
||||||
} else {
|
if (removed.size() < targetCount && conn.isUsable()) {
|
||||||
removed.add(conn);
|
removed.add(conn);
|
||||||
|
} else {
|
||||||
|
tmpConnections.add(conn);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
this.connections = tmpConnections;
|
||||||
}
|
}
|
||||||
this.connections = tmpConnections;
|
LOG.debug("Expected to remove {} connection " +
|
||||||
|
"and actually removed {} connections", num, removed.size());
|
||||||
return removed;
|
return removed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -278,7 +282,7 @@ public class ConnectionPool {
|
||||||
this.connectionPoolId, timeSinceLastActive);
|
this.connectionPoolId, timeSinceLastActive);
|
||||||
|
|
||||||
for (ConnectionContext connection : this.connections) {
|
for (ConnectionContext connection : this.connections) {
|
||||||
connection.close();
|
connection.close(true);
|
||||||
}
|
}
|
||||||
this.connections.clear();
|
this.connections.clear();
|
||||||
}
|
}
|
||||||
|
@ -309,6 +313,39 @@ public class ConnectionPool {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of usable i.e. no active thread connections.
|
||||||
|
*
|
||||||
|
* @return Number of idle connections
|
||||||
|
*/
|
||||||
|
protected int getNumIdleConnections() {
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
List<ConnectionContext> tmpConnections = this.connections;
|
||||||
|
for (ConnectionContext conn : tmpConnections) {
|
||||||
|
if (conn.isUsable()) {
|
||||||
|
ret++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of active connections recently in the pool.
|
||||||
|
*
|
||||||
|
* @return Number of active connections recently.
|
||||||
|
*/
|
||||||
|
protected int getNumActiveConnectionsRecently() {
|
||||||
|
int ret = 0;
|
||||||
|
List<ConnectionContext> tmpConnections = this.connections;
|
||||||
|
for (ConnectionContext conn : tmpConnections) {
|
||||||
|
if (conn.isActiveRecently()) {
|
||||||
|
ret++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the last time the connection pool was used.
|
* Get the last time the connection pool was used.
|
||||||
*
|
*
|
||||||
|
@ -331,12 +368,18 @@ public class ConnectionPool {
|
||||||
public String getJSON() {
|
public String getJSON() {
|
||||||
final Map<String, String> info = new LinkedHashMap<>();
|
final Map<String, String> info = new LinkedHashMap<>();
|
||||||
info.put("active", Integer.toString(getNumActiveConnections()));
|
info.put("active", Integer.toString(getNumActiveConnections()));
|
||||||
|
info.put("recent_active",
|
||||||
|
Integer.toString(getNumActiveConnectionsRecently()));
|
||||||
|
info.put("idle", Integer.toString(getNumIdleConnections()));
|
||||||
info.put("total", Integer.toString(getNumConnections()));
|
info.put("total", Integer.toString(getNumConnections()));
|
||||||
if (LOG.isDebugEnabled()) {
|
if (LOG.isDebugEnabled()) {
|
||||||
List<ConnectionContext> tmpConnections = this.connections;
|
List<ConnectionContext> tmpConnections = this.connections;
|
||||||
for (int i=0; i<tmpConnections.size(); i++) {
|
for (int i=0; i<tmpConnections.size(); i++) {
|
||||||
ConnectionContext connection = tmpConnections.get(i);
|
ConnectionContext connection = tmpConnections.get(i);
|
||||||
info.put(i + " active", Boolean.toString(connection.isActive()));
|
info.put(i + " active", Boolean.toString(connection.isActive()));
|
||||||
|
info.put(i + " recent_active",
|
||||||
|
Integer.toString(getNumActiveConnectionsRecently()));
|
||||||
|
info.put(i + " idle", Boolean.toString(connection.isUsable()));
|
||||||
info.put(i + " closed", Boolean.toString(connection.isClosed()));
|
info.put(i + " closed", Boolean.toString(connection.isClosed()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -260,6 +260,24 @@ public class RouterRpcClient {
|
||||||
return this.connectionManager.getNumActiveConnections();
|
return this.connectionManager.getNumActiveConnections();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Total number of idle sockets between the router and NNs.
|
||||||
|
*
|
||||||
|
* @return Number of namenode clients.
|
||||||
|
*/
|
||||||
|
public int getNumIdleConnections() {
|
||||||
|
return this.connectionManager.getNumIdleConnections();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Total number of active sockets between the router and NNs.
|
||||||
|
*
|
||||||
|
* @return Number of recently active namenode clients.
|
||||||
|
*/
|
||||||
|
public int getNumActiveConnectionsRecently() {
|
||||||
|
return this.connectionManager.getNumActiveConnectionsRecently();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Total number of open connection pools to a NN. Each connection pool.
|
* Total number of open connection pools to a NN. Each connection pool.
|
||||||
* represents one user + one NN.
|
* represents one user + one NN.
|
||||||
|
|
|
@ -255,6 +255,9 @@ public class TestConnectionManager {
|
||||||
if (e.getKey().getUgi() == ugi) {
|
if (e.getKey().getUgi() == ugi) {
|
||||||
assertEquals(numOfConns, e.getValue().getNumConnections());
|
assertEquals(numOfConns, e.getValue().getNumConnections());
|
||||||
assertEquals(numOfActiveConns, e.getValue().getNumActiveConnections());
|
assertEquals(numOfActiveConns, e.getValue().getNumActiveConnections());
|
||||||
|
// idle + active = total connections
|
||||||
|
assertEquals(numOfConns - numOfActiveConns,
|
||||||
|
e.getValue().getNumIdleConnections());
|
||||||
connPoolFoundForUser = true;
|
connPoolFoundForUser = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -265,13 +268,19 @@ public class TestConnectionManager {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testConfigureConnectionActiveRatio() throws IOException {
|
public void testConfigureConnectionActiveRatio() throws IOException {
|
||||||
final int totalConns = 10;
|
// test 1 conn below the threshold and these conns are closed
|
||||||
int activeConns = 7;
|
testConnectionCleanup(0.8f, 10, 7, 9);
|
||||||
|
|
||||||
|
// test 2 conn below the threshold and these conns are closed
|
||||||
|
testConnectionCleanup(0.8f, 10, 6, 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testConnectionCleanup(float ratio, int totalConns,
|
||||||
|
int activeConns, int leftConns) throws IOException {
|
||||||
Configuration tmpConf = new Configuration();
|
Configuration tmpConf = new Configuration();
|
||||||
// Set dfs.federation.router.connection.min-active-ratio 0.8f
|
// Set dfs.federation.router.connection.min-active-ratio
|
||||||
tmpConf.setFloat(
|
tmpConf.setFloat(
|
||||||
RBFConfigKeys.DFS_ROUTER_NAMENODE_CONNECTION_MIN_ACTIVE_RATIO, 0.8f);
|
RBFConfigKeys.DFS_ROUTER_NAMENODE_CONNECTION_MIN_ACTIVE_RATIO, ratio);
|
||||||
ConnectionManager tmpConnManager = new ConnectionManager(tmpConf);
|
ConnectionManager tmpConnManager = new ConnectionManager(tmpConf);
|
||||||
tmpConnManager.start();
|
tmpConnManager.start();
|
||||||
|
|
||||||
|
@ -284,21 +293,20 @@ public class TestConnectionManager {
|
||||||
TEST_NN_ADDRESS, NamenodeProtocol.class);
|
TEST_NN_ADDRESS, NamenodeProtocol.class);
|
||||||
ConnectionPool pool = poolMap.get(connectionPoolId);
|
ConnectionPool pool = poolMap.get(connectionPoolId);
|
||||||
|
|
||||||
// Test min active ratio is 0.8f
|
// Test min active ratio is as set value
|
||||||
assertEquals(0.8f, pool.getMinActiveRatio(), 0.001f);
|
assertEquals(ratio, pool.getMinActiveRatio(), 0.001f);
|
||||||
|
|
||||||
pool.getConnection().getClient();
|
pool.getConnection().getClient();
|
||||||
// Test there is one active connection in pool
|
// Test there is one active connection in pool
|
||||||
assertEquals(1, pool.getNumActiveConnections());
|
assertEquals(1, pool.getNumActiveConnections());
|
||||||
|
|
||||||
// Add other 6 active/9 total connections to pool
|
// Add other active-1 connections / totalConns-1 connections to pool
|
||||||
addConnectionsToPool(pool, totalConns - 1, activeConns - 1);
|
addConnectionsToPool(pool, totalConns - 1, activeConns - 1);
|
||||||
|
|
||||||
// There are 7 active connections.
|
// There are activeConn connections.
|
||||||
// The active number is less than totalConns(10) * minActiveRatio(0.8f).
|
|
||||||
// We can cleanup the pool
|
// We can cleanup the pool
|
||||||
tmpConnManager.cleanup(pool);
|
tmpConnManager.cleanup(pool);
|
||||||
assertEquals(totalConns - 1, pool.getNumConnections());
|
assertEquals(leftConns, pool.getNumConnections());
|
||||||
|
|
||||||
tmpConnManager.close();
|
tmpConnManager.close();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue