diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientRatis.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientRatis.java index f0db7b5f418..946abfbba7e 100644 --- a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientRatis.java +++ b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientRatis.java @@ -208,6 +208,10 @@ public final class XceiverClientRatis extends XceiverClientSpi { public ContainerCommandResponseProto sendCommand( ContainerCommandRequestProto request) throws IOException { final RaftClientReply reply = sendRequest(request); + if (reply == null) { + throw new IOException( + String.format("Could not execute the request %s", request)); + } Preconditions.checkState(reply.isSuccess()); return ContainerCommandResponseProto.parseFrom( reply.getMessage().getContent()); diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java index 5b257790068..63f59168288 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java @@ -75,6 +75,19 @@ public final class ScmConfigKeys { public static final TimeDuration DFS_RATIS_CLIENT_REQUEST_TIMEOUT_DURATION_DEFAULT = TimeDuration.valueOf(3000, TimeUnit.MILLISECONDS); + public static final String DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY = + "dfs.ratis.client.request.max.retries"; + public static final int DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_DEFAULT = 180; + public static final String DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_KEY = + "dfs.ratis.client.request.retry.interval"; + public static final TimeDuration + DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_DEFAULT = + TimeDuration.valueOf(100, TimeUnit.MILLISECONDS); + public static final String DFS_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DURATION_KEY = + "dfs.ratis.server.retry-cache.timeout.duration"; + public static final TimeDuration + DFS_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DURATION_DEFAULT = + TimeDuration.valueOf(600000, TimeUnit.MILLISECONDS); public static final String DFS_RATIS_SERVER_REQUEST_TIMEOUT_DURATION_KEY = "dfs.ratis.server.request.timeout.duration"; public static final TimeDuration diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java index 54ec1392251..599b4e80bf2 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java @@ -237,6 +237,20 @@ public final class OzoneConfigKeys { public static final TimeDuration DFS_RATIS_CLIENT_REQUEST_TIMEOUT_DURATION_DEFAULT = ScmConfigKeys.DFS_RATIS_CLIENT_REQUEST_TIMEOUT_DURATION_DEFAULT; + public static final String DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY = + ScmConfigKeys.DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY; + public static final int DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_DEFAULT = + ScmConfigKeys.DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_DEFAULT; + public static final String DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_KEY = + ScmConfigKeys.DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_KEY; + public static final TimeDuration + DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_DEFAULT = + ScmConfigKeys.DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_DEFAULT; + public static final String DFS_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DURATION_KEY = + ScmConfigKeys.DFS_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DURATION_KEY; + public static final TimeDuration + DFS_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DURATION_DEFAULT = + ScmConfigKeys.DFS_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DURATION_DEFAULT; public static final String DFS_RATIS_SERVER_REQUEST_TIMEOUT_DURATION_KEY = ScmConfigKeys.DFS_RATIS_SERVER_REQUEST_TIMEOUT_DURATION_KEY; public static final TimeDuration diff --git a/hadoop-hdds/common/src/main/java/org/apache/ratis/RatisHelper.java b/hadoop-hdds/common/src/main/java/org/apache/ratis/RatisHelper.java index d851992c424..04bfeb2e848 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/ratis/RatisHelper.java +++ b/hadoop-hdds/common/src/main/java/org/apache/ratis/RatisHelper.java @@ -34,6 +34,7 @@ import org.apache.ratis.retry.RetryPolicy; import org.apache.ratis.rpc.RpcType; import org.apache.ratis.shaded.com.google.protobuf.ByteString; import org.apache.ratis.shaded.proto.RaftProtos; +import org.apache.ratis.util.Preconditions; import org.apache.ratis.util.SizeInBytes; import org.apache.ratis.util.TimeDuration; import org.slf4j.Logger; @@ -48,6 +49,9 @@ import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; +import static org.apache.hadoop.ozone.OzoneConfigKeys.DFS_RATIS_LEADER_ELECTION_MINIMUM_TIMEOUT_DURATION_DEFAULT; +import static org.apache.hadoop.ozone.OzoneConfigKeys.DFS_RATIS_LEADER_ELECTION_MINIMUM_TIMEOUT_DURATION_KEY; + /** * Ratis helper methods. */ @@ -162,12 +166,38 @@ public interface RatisHelper { static RetryPolicy createRetryPolicy(Configuration conf) { int maxRetryCount = - conf.getInt(OzoneConfigKeys.OZONE_CLIENT_MAX_RETRIES, OzoneConfigKeys. - OZONE_CLIENT_MAX_RETRIES_DEFAULT); + conf.getInt(OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY, + OzoneConfigKeys. + DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_DEFAULT); long retryInterval = conf.getTimeDuration(OzoneConfigKeys. - OZONE_CLIENT_RETRY_INTERVAL, OzoneConfigKeys. - OZONE_CLIENT_RETRY_INTERVAL_DEFAULT, - TimeUnit.MILLISECONDS.MILLISECONDS); + DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_KEY, OzoneConfigKeys. + DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_DEFAULT + .toInt(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS); + long leaderElectionTimeout = conf.getTimeDuration( + DFS_RATIS_LEADER_ELECTION_MINIMUM_TIMEOUT_DURATION_KEY, + DFS_RATIS_LEADER_ELECTION_MINIMUM_TIMEOUT_DURATION_DEFAULT + .toInt(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS); + long clientRequestTimeout = conf.getTimeDuration( + OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_TIMEOUT_DURATION_KEY, + OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_TIMEOUT_DURATION_DEFAULT + .toInt(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS); + long retryCacheTimeout = conf.getTimeDuration( + OzoneConfigKeys.DFS_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DURATION_KEY, + OzoneConfigKeys.DFS_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DURATION_DEFAULT + .toInt(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS); + Preconditions + .assertTrue(maxRetryCount * retryInterval > 5 * leaderElectionTimeout, + "Please make sure dfs.ratis.client.request.max.retries * " + + "dfs.ratis.client.request.retry.interval > " + + "5 * dfs.ratis.leader.election.minimum.timeout.duration"); + Preconditions.assertTrue( + maxRetryCount * (retryInterval + clientRequestTimeout) + < retryCacheTimeout, + "Please make sure " + + "(dfs.ratis.client.request.max.retries * " + + "(dfs.ratis.client.request.retry.interval + " + + "dfs.ratis.client.request.timeout.duration)) " + + "< dfs.ratis.server.retry-cache.timeout.duration"); TimeDuration sleepDuration = TimeDuration.valueOf(retryInterval, TimeUnit.MILLISECONDS); RetryPolicy retryPolicy = RetryPolicies diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml index e160f257200..a74124e30e0 100644 --- a/hadoop-hdds/common/src/main/resources/ozone-default.xml +++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml @@ -157,6 +157,25 @@ OZONE, RATIS, MANAGEMENT The timeout duration for ratis client request. + + dfs.ratis.client.request.max.retries + 180 + OZONE, RATIS, MANAGEMENT + Number of retries for ratis client request. + + + dfs.ratis.client.request.retry.interval + 100ms + OZONE, RATIS, MANAGEMENT + Interval between successive retries for a ratis client request. + + + + dfs.ratis.server.retry-cache.timeout.duration + 600000ms + OZONE, RATIS, MANAGEMENT + Retry Cache entry timeout for ratis server. + dfs.ratis.server.request.timeout.duration 3s diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java index a57997d189c..24ea0b9a0db 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java @@ -182,6 +182,19 @@ public final class XceiverServerRatis implements XceiverServerSpi { RaftServerConfigKeys.Rpc .setRequestTimeout(properties, serverRequestTimeout); + // set timeout for a retry cache entry + timeUnit = + OzoneConfigKeys.DFS_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DURATION_DEFAULT + .getUnit(); + duration = conf.getTimeDuration( + OzoneConfigKeys.DFS_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DURATION_KEY, + OzoneConfigKeys.DFS_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DURATION_DEFAULT + .getDuration(), timeUnit); + final TimeDuration retryCacheTimeout = + TimeDuration.valueOf(duration, timeUnit); + RaftServerConfigKeys.RetryCache + .setExpiryTime(properties, retryCacheTimeout); + // Set the ratis leader election timeout TimeUnit leaderElectionMinTimeoutUnit = OzoneConfigKeys. diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestCloseContainerHandlingByClient.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestCloseContainerHandlingByClient.java index cf38982a6b6..83421b25f83 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestCloseContainerHandlingByClient.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestCloseContainerHandlingByClient.java @@ -55,7 +55,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.UUID; -import java.util.Random; /** * Tests Close Container Exception handling by Ozone Client. @@ -83,9 +82,9 @@ public class TestCloseContainerHandlingByClient { @BeforeClass public static void init() throws Exception { conf = new OzoneConfiguration(); - // generate a no between 1 to 10 - maxRetries = new Random().nextInt(10); + maxRetries = 100; conf.setInt(OzoneConfigKeys.OZONE_CLIENT_MAX_RETRIES, maxRetries); + conf.set(OzoneConfigKeys.OZONE_CLIENT_RETRY_INTERVAL, "200ms"); chunkSize = (int) OzoneConsts.MB; blockSize = 4 * chunkSize; conf.setInt(ScmConfigKeys.OZONE_SCM_CHUNK_SIZE_KEY, chunkSize);