From 7e63e37dc5cbe330082a6a42598ffb76e0770fc1 Mon Sep 17 00:00:00 2001
From: Inigo Goiri
Date: Tue, 12 Feb 2019 10:44:02 -0800
Subject: [PATCH] HDFS-14230. RBF: Throw RetriableException instead of IOException when no namenodes available. Contributed by Fei Hui.

---
 .../metrics/FederationRPCMBean.java           |  2 +
 .../metrics/FederationRPCMetrics.java         | 11 +++
 .../FederationRPCPerformanceMonitor.java      |  5 ++
 .../router/NoNamenodesAvailableException.java | 33 +++++++
 .../federation/router/RouterRpcClient.java    | 16 +++-
 .../federation/router/RouterRpcMonitor.java   |  5 ++
 .../federation/FederationTestUtils.java       | 38 ++++++++
 .../TestRouterClientRejectOverload.java       | 86 +++++++++++++++++--
 .../router/TestRouterRPCClientRetries.java    |  2 +-
 9 files changed, 188 insertions(+), 10 deletions(-)
 create mode 100644 hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NoNamenodesAvailableException.java

diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMBean.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMBean.java
index 973c3983f55..76b3ca64044 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMBean.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMBean.java
@@ -46,6 +46,8 @@ public interface FederationRPCMBean {
 
   long getProxyOpRetries();
 
+  long getProxyOpNoNamenodes();
+
   long getRouterFailureStateStoreOps();
 
   long getRouterFailureReadOnlyOps();
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMetrics.java
index cce4b86ce1f..8e57c6b3809 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMetrics.java
@@ -60,6 +60,8 @@ public class FederationRPCMetrics implements FederationRPCMBean {
   private MutableCounterLong proxyOpNotImplemented;
   @Metric("Number of operation retries")
   private MutableCounterLong proxyOpRetries;
+  @Metric("Number of operations that found no namenodes available")
+  private MutableCounterLong proxyOpNoNamenodes;
 
   @Metric("Failed requests due to State Store unavailable")
   private MutableCounterLong routerFailureStateStore;
@@ -138,6 +140,15 @@ public class FederationRPCMetrics implements FederationRPCMBean {
     return proxyOpRetries.value();
   }
 
+  public void incrProxyOpNoNamenodes() {
+    proxyOpNoNamenodes.incr();
+  }
+
+  @Override
+  public long getProxyOpNoNamenodes() {
+    return proxyOpNoNamenodes.value();
+  }
+
   public void incrRouterFailureStateStore() {
     routerFailureStateStore.incr();
   }
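The new counter follows the existing proxyOp* pattern: incrProxyOpNoNamenodes() bumps the counter on the server side and the MBean getter exposes it. A minimal sketch of that contract, assuming a started Router instance such as the one the tests below obtain via cluster.getRouters().get(0).getRouter():

```java
import static org.junit.Assert.assertEquals;

import org.apache.hadoop.hdfs.server.federation.metrics.FederationRPCMetrics;
import org.apache.hadoop.hdfs.server.federation.router.Router;

class ProxyOpNoNamenodesCounterSketch {
  // Sketch only: verifies the increment/read contract of the new metric.
  static void checkNoNamenodesCounter(Router router) {
    FederationRPCMetrics metrics = router.getRpcServer().getRPCMetrics();
    long before = metrics.getProxyOpNoNamenodes();
    metrics.incrProxyOpNoNamenodes();
    assertEquals(before + 1, metrics.getProxyOpNoNamenodes());
  }
}
```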
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCPerformanceMonitor.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCPerformanceMonitor.java
index 15725d14ce9..cbd63de5d2e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCPerformanceMonitor.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCPerformanceMonitor.java
@@ -170,6 +170,11 @@ public class FederationRPCPerformanceMonitor implements RouterRpcMonitor {
     metrics.incrProxyOpRetries();
   }
 
+  @Override
+  public void proxyOpNoNamenodes() {
+    metrics.incrProxyOpNoNamenodes();
+  }
+
   @Override
   public void routerFailureStateStore() {
     metrics.incrRouterFailureStateStore();
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NoNamenodesAvailableException.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NoNamenodesAvailableException.java
new file mode 100644
index 00000000000..7eabf00356f
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NoNamenodesAvailableException.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.federation.router;
+
+import java.io.IOException;
+
+
+/**
+ * Exception when no namenodes are available.
+ */
+public class NoNamenodesAvailableException extends IOException {
+
+  private static final long serialVersionUID = 1L;
+
+  public NoNamenodesAvailableException(String nsId, IOException ioe) {
+    super("No namenodes available under nameservice " + nsId, ioe);
+  }
+}
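A quick illustration of what the new exception type carries; the nameservice id and the wrapped cause below are invented for the example:

```java
import java.io.IOException;
import java.net.ConnectException;

import org.apache.hadoop.hdfs.server.federation.router.NoNamenodesAvailableException;

class NoNamenodesAvailableExceptionDemo {
  public static void main(String[] args) {
    // The constructor builds the message and chains the original failure.
    IOException cause = new ConnectException("Connection refused");
    IOException e = new NoNamenodesAvailableException("ns0", cause);
    System.out.println(e.getMessage()); // No namenodes available under nameservice ns0
    System.out.println(e.getCause());   // the original ConnectException
  }
}
```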
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java
index f5985ee81ad..d21bde3d679 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java
@@ -61,6 +61,7 @@ import org.apache.hadoop.io.retry.RetryPolicies;
 import org.apache.hadoop.io.retry.RetryPolicy;
 import org.apache.hadoop.io.retry.RetryPolicy.RetryAction.RetryDecision;
 import org.apache.hadoop.ipc.RemoteException;
+import org.apache.hadoop.ipc.RetriableException;
 import org.apache.hadoop.ipc.StandbyException;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.slf4j.Logger;
@@ -302,8 +303,8 @@ public class RouterRpcClient {
    * @param retryCount Number of retries.
    * @param nsId Nameservice ID.
    * @return Retry decision.
-   * @throws IOException Original exception if the retry policy generates one
-   *         or IOException for no available namenodes.
+   * @throws NoNamenodesAvailableException Exception that the retry policy
+   *         generates for no available namenodes.
    */
   private RetryDecision shouldRetry(final IOException ioe, final int retryCount,
       final String nsId) throws IOException {
@@ -313,8 +314,7 @@ public class RouterRpcClient {
       if (retryCount == 0) {
         return RetryDecision.RETRY;
       } else {
-        throw new IOException("No namenode available under nameservice " + nsId,
-            ioe);
+        throw new NoNamenodesAvailableException(nsId, ioe);
       }
     }
@@ -405,6 +405,14 @@ public class RouterRpcClient {
         StandbyException se = new StandbyException(ioe.getMessage());
         se.initCause(ioe);
         throw se;
+      } else if (ioe instanceof NoNamenodesAvailableException) {
+        if (this.rpcMonitor != null) {
+          this.rpcMonitor.proxyOpNoNamenodes();
+        }
+        LOG.error("Cannot get an available namenode for {} {}, error: {}",
+            nsId, rpcAddress, ioe.getMessage());
+        // Throw RetriableException so that the client can retry
+        throw new RetriableException(ioe);
       } else {
         // Other communication error, this is a failure
         // Communication retries are handled by the retry policy
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcMonitor.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcMonitor.java
index 7af71af0792..5a2adb9e54e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcMonitor.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcMonitor.java
@@ -92,6 +92,11 @@ public interface RouterRpcMonitor {
    */
   void proxyOpRetries();
 
+  /**
+   * Failed to proxy an operation because no namenodes are available.
+   */
+  void proxyOpNoNamenodes();
+
   /**
    * If the Router cannot contact the State Store in an operation.
    */
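Why wrapping in RetriableException matters: over the wire the router's exception reaches the DFSClient as a RemoteException, and the client's failover retry policy only retries failures it recognizes as retriable. A hedged sketch of that client-side decision (an illustration of the pattern, not the actual FailoverOnNetworkExceptionRetry source):

```java
import java.io.IOException;

import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.ipc.RetriableException;

class RetriableCheckSketch {
  // Sketch of the client-side effect of the change above: a RemoteException
  // that unwraps to RetriableException is worth retrying against the router,
  // whereas the plain IOException thrown before this patch is treated as fatal.
  static boolean shouldClientRetry(RemoteException re) {
    IOException unwrapped = re.unwrapRemoteException(RetriableException.class);
    return unwrapped instanceof RetriableException;
  }
}
```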
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/FederationTestUtils.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/FederationTestUtils.java
index d92edac35b6..54342240f15 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/FederationTestUtils.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/FederationTestUtils.java
@@ -48,6 +48,7 @@ import org.apache.hadoop.fs.UnsupportedFileSystemException;
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.server.federation.MiniRouterDFSCluster.NamenodeContext;
 import org.apache.hadoop.hdfs.server.federation.resolver.ActiveNamenodeResolver;
 import org.apache.hadoop.hdfs.server.federation.resolver.FederationNamenodeContext;
 import org.apache.hadoop.hdfs.server.federation.resolver.FederationNamenodeServiceState;
@@ -374,4 +375,41 @@ public final class FederationTestUtils {
     Whitebox.setInternalState(rpcClient, "connectionManager",
         spyConnectionManager);
   }
+
+  /**
+   * Switch the namenodes of all HDFS nameservices to standby.
+   * @param cluster A federated HDFS cluster.
+   */
+  public static void transitionClusterNSToStandby(
+      StateStoreDFSCluster cluster) {
+    // Name services of the cluster
+    List<String> nameServiceList = cluster.getNameservices();
+
+    // Change the namenodes of each name service to standby
+    for (String nameService : nameServiceList) {
+      List<NamenodeContext> nnList = cluster.getNamenodes(nameService);
+      for (NamenodeContext namenodeContext : nnList) {
+        cluster.switchToStandby(nameService, namenodeContext.getNamenodeId());
+      }
+    }
+  }
+
+  /**
+   * Switch the namenode at the given index of every HDFS nameservice to
+   * active.
+   * @param cluster A federated HDFS cluster.
+   * @param index The index of the namenode to activate in each nameservice.
+   */
+  public static void transitionClusterNSToActive(
+      StateStoreDFSCluster cluster, int index) {
+    // Name services of the cluster
+    List<String> nameServiceList = cluster.getNameservices();
+
+    // Change the namenode at the given index of each name service to active
+    for (String nameService : nameServiceList) {
+      List<NamenodeContext> listNamenodeContext =
+          cluster.getNamenodes(nameService);
+      cluster.switchToActive(nameService,
+          listNamenodeContext.get(index).getNamenodeId());
+    }
+  }
 }
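Taken together, the two helpers above let a test drive a full outage-and-recovery cycle. A sketch of the intended usage, assuming a StateStoreDFSCluster started in HA mode the way setupCluster() does in the test below:

```java
import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.transitionClusterNSToActive;
import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.transitionClusterNSToStandby;

import org.apache.hadoop.hdfs.server.federation.StateStoreDFSCluster;

class TransitionHelpersSketch {
  static void outageAndRecovery(StateStoreDFSCluster cluster) {
    // Every namenode in every nameservice -> standby: any proxied call now
    // hits the NoNamenodesAvailableException / RetriableException path.
    transitionClusterNSToStandby(cluster);
    // ... exercise the "no namenodes available" behavior here ...
    // Bring the first namenode of each nameservice back to active.
    transitionClusterNSToActive(cluster, 0);
  }
}
```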
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterClientRejectOverload.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterClientRejectOverload.java
index 06641593519..14bd7b0a400 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterClientRejectOverload.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterClientRejectOverload.java
@@ -19,6 +19,8 @@ package org.apache.hadoop.hdfs.server.federation.router;
 
 import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.simulateSlowNamenode;
 import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.simulateThrowExceptionRouterRpcServer;
+import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.transitionClusterNSToStandby;
+import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.transitionClusterNSToActive;
 import static org.apache.hadoop.test.GenericTestUtils.assertExceptionContains;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
@@ -27,6 +29,7 @@ import static org.junit.Assert.fail;
 
 import java.io.IOException;
 import java.net.URI;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
@@ -46,7 +49,9 @@ import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.ipc.RemoteException;
 import org.apache.hadoop.ipc.StandbyException;
 import org.junit.After;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.ExpectedException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -71,14 +76,19 @@ public class TestRouterClientRejectOverload {
     }
   }
 
-  private void setupCluster(boolean overloadControl) throws Exception {
+  @Rule
+  public ExpectedException exceptionRule = ExpectedException.none();
+
+  private void setupCluster(boolean overloadControl, boolean ha)
+      throws Exception {
     // Build and start a federated cluster
-    cluster = new StateStoreDFSCluster(false, 2);
+    cluster = new StateStoreDFSCluster(ha, 2);
     Configuration routerConf = new RouterConfigBuilder()
         .stateStore()
         .metrics()
         .admin()
         .rpc()
+        .heartbeat()
         .build();
 
     // Reduce the number of RPC client threads to overload the Router easily
@@ -98,7 +108,7 @@ public class TestRouterClientRejectOverload {
 
   @Test
   public void testWithoutOverloadControl() throws Exception {
-    setupCluster(false);
+    setupCluster(false, false);
 
     // Nobody should get overloaded
     testOverloaded(0);
@@ -121,7 +131,7 @@ public class TestRouterClientRejectOverload {
 
   @Test
   public void testOverloadControl() throws Exception {
-    setupCluster(true);
+    setupCluster(true, false);
 
     List<RouterContext> routers = cluster.getRouters();
     FederationRPCMetrics rpcMetrics0 =
@@ -244,7 +254,7 @@ public class TestRouterClientRejectOverload {
 
   @Test
   public void testConnectionNullException() throws Exception {
-    setupCluster(false);
+    setupCluster(false, false);
 
     // Choose the 1st router
    RouterContext routerContext = cluster.getRouters().get(0);
@@ -280,4 +290,70 @@ public class TestRouterClientRejectOverload {
     assertEquals(originalRouter1Failures,
         rpcMetrics1.getProxyOpFailureCommunicate());
   }
+
+  /**
+   * When failover occurs, no namenodes are available within a short time.
+   * The client will succeed after some retries.
+   */
+  @Test
+  public void testNoNamenodesAvailable() throws Exception {
+    setupCluster(false, true);
+
+    transitionClusterNSToStandby(cluster);
+
+    Configuration conf = cluster.getRouterClientConf();
+    // Set dfs.client.failover.random.order to false, to pick the 1st router first
+    conf.setBoolean("dfs.client.failover.random.order", false);
+
+    // Retries is 3 (see FailoverOnNetworkExceptionRetry#shouldRetry, which
+    // fails when retries > max.attempts), so there are 4 accesses in total.
+    conf.setInt("dfs.client.retry.max.attempts", 2);
+    DFSClient routerClient = new DFSClient(new URI("hdfs://fed"), conf);
+
+    // Get router0 metrics
+    FederationRPCMetrics rpcMetrics0 = cluster.getRouters().get(0)
+        .getRouter().getRpcServer().getRPCMetrics();
+    // Get router1 metrics
+    FederationRPCMetrics rpcMetrics1 = cluster.getRouters().get(1)
+        .getRouter().getRpcServer().getRPCMetrics();
+
+    // Original failures
+    long originalRouter0Failures = rpcMetrics0.getProxyOpNoNamenodes();
+    long originalRouter1Failures = rpcMetrics1.getProxyOpNoNamenodes();
+
+    // getFileInfo will throw an exception
+    String exceptionMessage = "org.apache.hadoop.hdfs.server.federation."
+ + "router.NoNamenodesAvailableException: No namenodes available " + + "under nameservice ns0"; + exceptionRule.expect(RemoteException.class); + exceptionRule.expectMessage(exceptionMessage); + routerClient.getFileInfo("/"); + + // Router 0 failures will increase + assertEquals(originalRouter0Failures + 4, + rpcMetrics0.getProxyOpNoNamenodes()); + // Router 1 failures do not change + assertEquals(originalRouter1Failures, + rpcMetrics1.getProxyOpNoNamenodes()); + + // Make name services available + transitionClusterNSToActive(cluster, 0); + for (RouterContext routerContext : cluster.getRouters()) { + // Manually trigger the heartbeat + Collection heartbeatServices = routerContext + .getRouter().getNamenodeHearbeatServices(); + for (NamenodeHeartbeatService service : heartbeatServices) { + service.periodicInvoke(); + } + // Update service cache + routerContext.getRouter().getStateStore().refreshCaches(true); + } + + originalRouter0Failures = rpcMetrics0.getProxyOpNoNamenodes(); + + // RPC call must be successful + routerClient.getFileInfo("/"); + // Router 0 failures do not change + assertEquals(originalRouter0Failures, rpcMetrics0.getProxyOpNoNamenodes()); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRPCClientRetries.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRPCClientRetries.java index f84e9a03ee8..8772e2fe22e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRPCClientRetries.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRPCClientRetries.java @@ -133,7 +133,7 @@ public class TestRouterRPCClientRetries { } catch (RemoteException e) { String ns0 = cluster.getNameservices().get(0); assertExceptionContains( - "No namenode available under nameservice " + ns0, e); + "No namenodes available under nameservice " + ns0, e); } // Verify the retry times, it should only retry one time.