From 96d7ceb39a14dd1baa7d72f7620186addbb756b1 Mon Sep 17 00:00:00 2001 From: Mukund Thakur Date: Fri, 17 Apr 2020 14:15:38 +0100 Subject: [PATCH] HADOOP-13873. log DNS addresses on s3a initialization. Contributed by Mukund Thakur. If you set the logger org.apache.hadoop.fs.s3a.impl.NetworkBinding to DEBUG, then, when the S3A bucket probe is made, the DNS address of the S3 endpoint is calculated and printed. This is useful to see if a large set of processes are all using the same IP address from the pool of load balancers to which AWS directs clients when an AWS S3 endpoint is resolved. This can have implications for performance: if all clients access the same load balancer, performance may be suboptimal. Note: if bucket probes are disabled, fs.s3a.bucket.probe = 0, the DNS logging does not take place. Change-Id: I21b3ac429dc0b543f03e357fdeb94c2d2a328dd8 --- .../org/apache/hadoop/fs/s3a/Constants.java | 6 ++++ .../apache/hadoop/fs/s3a/S3AFileSystem.java | 5 +++ .../hadoop/fs/s3a/impl/NetworkBinding.java | 31 +++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java index 0ca4aa01a7e..430a6bc325a 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java @@ -163,6 +163,12 @@ public final class Constants { //use a custom endpoint? public static final String ENDPOINT = "fs.s3a.endpoint"; + /** + * Default value of s3 endpoint. If not set explicitly using + * {@code AmazonS3#setEndpoint()}, this is used. + */ + public static final String DEFAULT_ENDPOINT = "s3.amazonaws.com"; + //Enable path style access? 
Overrides default virtual hosting public static final String PATH_STYLE_ACCESS = "fs.s3a.path.style.access"; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index 9630a9eff74..6d2b3a84ca7 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -176,6 +176,7 @@ import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.waitForCompletionIg import static org.apache.hadoop.fs.s3a.impl.ErrorTranslation.isUnknownBucket; import static org.apache.hadoop.fs.s3a.impl.InternalConstants.SC_404; import static org.apache.hadoop.fs.s3a.impl.NetworkBinding.fixBucketRegion; +import static org.apache.hadoop.fs.s3a.impl.NetworkBinding.logDnsLookup; import static org.apache.hadoop.io.IOUtils.cleanupWithLogger; /** @@ -469,6 +470,8 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities, * S3AFileSystem initialization. When set to 1 or 2, bucket existence check * will be performed which is potentially slow. * If 3 or higher: warn and use the v2 check. + * The DNS address of the s3 endpoint is also logged if the bucket probe + * value is greater than 0; otherwise the lookup is skipped for performance. 
* @throws UnknownStoreException the bucket is absent * @throws IOException any other problem talking to S3 */ @@ -483,9 +486,11 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities, LOG.debug("skipping check for bucket existence"); break; case 1: + logDnsLookup(getConf()); verifyBucketExists(); break; case 2: + logDnsLookup(getConf()); verifyBucketExistsV2(); break; default: diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/NetworkBinding.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/NetworkBinding.java index 7ff44510011..8b34376a255 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/NetworkBinding.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/NetworkBinding.java @@ -21,6 +21,8 @@ package org.apache.hadoop.fs.s3a.impl; import java.io.IOException; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; +import java.net.URI; +import java.net.URISyntaxException; import javax.net.ssl.HostnameVerifier; import javax.net.ssl.SSLSocketFactory; @@ -30,9 +32,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.security.ssl.DelegatingSSLSocketFactory; +import static org.apache.hadoop.fs.s3a.Constants.DEFAULT_ENDPOINT; import static org.apache.hadoop.fs.s3a.Constants.DEFAULT_SSL_CHANNEL_MODE; +import static org.apache.hadoop.fs.s3a.Constants.ENDPOINT; import static org.apache.hadoop.fs.s3a.Constants.SSL_CHANNEL_MODE; /** @@ -121,4 +126,30 @@ public class NetworkBinding { ? "us-east-1" : region; } + + /** + * Log the dns address associated with s3 endpoint. If endpoint is + * not set in the configuration, the {@code Constants#DEFAULT_ENDPOINT} + * will be used. + * @param conf input configuration. 
+   */
+  public static void logDnsLookup(Configuration conf) {
+    String endPoint = conf.getTrimmed(ENDPOINT, DEFAULT_ENDPOINT);
+    if (endPoint.isEmpty() || !LOG.isDebugEnabled()) {
+      return;
+    }
+    String hostName = endPoint;
+    // Strip any scheme so that only the host part is looked up.
+    if (endPoint.contains("://")) {
+      try {
+        // getHost() is null for non-server authorities; keep the endpoint.
+        String uriHost = new URI(endPoint).getHost();
+        hostName = uriHost != null ? uriHost : endPoint;
+      } catch (URISyntaxException e) {
+        LOG.debug("Ignoring invalid endpoint URI {}", endPoint, e);
+      }
+    }
+    LOG.debug("Bucket endpoint : {}, Hostname : {}, DNSAddress : {}",
+        endPoint, hostName, NetUtils.normalizeHostName(hostName));
+  }
 }