From ede909144da6fe8bde2f6561fa44a458674f96e2 Mon Sep 17 00:00:00 2001 From: Xiao Chen Date: Wed, 9 Nov 2016 09:08:04 -0800 Subject: [PATCH] HADOOP-13590. Retry until TGT expires even if the UGI renewal thread encountered exception. (cherry picked from commit 367c3d41217728c2e61252c5a5235e5bc1f9822f) --- .../hadoop/security/UserGroupInformation.java | 70 +++++++++++++- .../security/TestUserGroupInformation.java | 94 +++++++++++++++++++ 2 files changed, 160 insertions(+), 4 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java index 79e56e7cd61..64c8460a01f 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java @@ -43,6 +43,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.TimeUnit; import javax.security.auth.Subject; import javax.security.auth.callback.CallbackHandler; @@ -54,14 +55,18 @@ import javax.security.auth.login.LoginContext; import javax.security.auth.login.LoginException; import javax.security.auth.spi.LoginModule; +import org.apache.hadoop.io.retry.RetryPolicies; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.metrics2.annotation.Metric; import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.metrics2.lib.MetricsRegistry; +import org.apache.hadoop.metrics2.lib.MutableGaugeInt; +import org.apache.hadoop.metrics2.lib.MutableGaugeLong; import org.apache.hadoop.metrics2.lib.MutableQuantiles; import org.apache.hadoop.metrics2.lib.MutableRate; import org.apache.hadoop.security.SaslRpcServer.AuthMethod; @@ -85,7 +90,8 @@ import org.slf4j.LoggerFactory; @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce", "HBase", "Hive", "Oozie"}) @InterfaceStability.Evolving public class UserGroupInformation { - private static final Logger LOG = LoggerFactory.getLogger( + @VisibleForTesting + static final Logger LOG = LoggerFactory.getLogger( UserGroupInformation.class); /** @@ -121,6 +127,10 @@ public class UserGroupInformation { MutableRate loginFailure; @Metric("GetGroups") MutableRate getGroups; MutableQuantiles[] getGroupsQuantiles; + @Metric("Renewal failures since startup") + private MutableGaugeLong renewalFailuresTotal; + @Metric("Renewal failures since last successful login") + private MutableGaugeInt renewalFailures; static UgiMetrics create() { return DefaultMetricsSystem.instance().register(new UgiMetrics()); @@ -138,6 +148,10 @@ public class UserGroupInformation { } } } + + MutableGaugeInt getRenewalFailures() { + return renewalFailures; + } } /** @@ -961,6 +975,7 @@ public class UserGroupInformation { return; } long nextRefresh = getRefreshTime(tgt); + RetryPolicy rp = null; while (true) { try { long now = Time.now(); @@ -984,13 +999,40 @@ public class UserGroupInformation { } nextRefresh = Math.max(getRefreshTime(tgt), now + kerberosMinSecondsBeforeRelogin); + metrics.renewalFailures.set(0); + rp = null; } catch (InterruptedException ie) { LOG.warn("Terminating renewal thread"); return; } catch (IOException ie) { - LOG.warn("Exception encountered while running the" + - " renewal command. Aborting renew thread. " + ie); - return; + metrics.renewalFailuresTotal.incr(); + final long tgtEndTime = tgt.getEndTime().getTime(); + LOG.warn("Exception encountered while running the renewal " + + "command for {}. (TGT end time:{}, renewalFailures: {}," + + "renewalFailuresTotal: {})", getUserName(), tgtEndTime, + metrics.renewalFailures, metrics.renewalFailuresTotal, ie); + final long now = Time.now(); + if (rp == null) { + // Use a dummy maxRetries to create the policy. The policy will + // only be used to get next retry time with exponential back-off. + // The final retry time will be later limited within the + // tgt endTime in getNextTgtRenewalTime. + rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2, + kerberosMinSecondsBeforeRelogin, TimeUnit.MILLISECONDS); + } + try { + nextRefresh = getNextTgtRenewalTime(tgtEndTime, now, rp); + } catch (Exception e) { + LOG.error("Exception when calculating next tgt renewal time", e); + return; + } + metrics.renewalFailures.incr(); + // retry until close enough to tgt endTime. + if (now > nextRefresh) { + LOG.error("TGT is expired. Aborting renew thread for {}.", + getUserName()); + return; + } } } } @@ -999,6 +1041,26 @@ public class UserGroupInformation { t.setName("TGT Renewer for " + getUserName()); t.start(); } + + /** + * Get time for next login retry. This will allow the thread to retry with + * exponential back-off, until tgt endtime. + * Last retry is {@link #kerberosMinSecondsBeforeRelogin} before endtime. + * + * @param tgtEndTime EndTime of the tgt. + * @param now Current time. + * @param rp The retry policy. + * @return Time for next login retry. + */ + @VisibleForTesting + static long getNextTgtRenewalTime(final long tgtEndTime, final long now, + final RetryPolicy rp) throws Exception { + final long lastRetryTime = tgtEndTime - kerberosMinSecondsBeforeRelogin; + final RetryPolicy.RetryAction ra = rp.shouldRetry(null, + metrics.renewalFailures.value(), 0, false); + return Math.min(lastRetryTime, now + ra.delayMillis); + } + /** * Log a user in from a keytab file. Loads a user identity from a keytab * file and logs them in. They become the currently logged-in user. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java index 1cdc4c426a6..fff0ae8664a 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java @@ -20,15 +20,22 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.retry.RetryPolicies; +import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.metrics2.MetricsRecordBuilder; import org.apache.hadoop.security.SaslRpcServer.AuthMethod; import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod; import org.apache.hadoop.security.authentication.util.KerberosName; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Time; +import org.apache.log4j.Level; import org.junit.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.security.auth.Subject; import javax.security.auth.kerberos.KerberosPrincipal; @@ -44,10 +51,13 @@ import java.lang.reflect.Method; import java.security.PrivilegedExceptionAction; import java.util.Collection; import java.util.ConcurrentModificationException; +import java.util.Date; import java.util.LinkedHashSet; import java.util.Set; +import java.util.concurrent.TimeUnit; import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTH_TO_LOCAL; import static org.apache.hadoop.ipc.TestSaslRPC.*; import static org.apache.hadoop.test.MetricsAsserts.*; @@ -56,6 +66,9 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; public class TestUserGroupInformation { + + static final Logger LOG = LoggerFactory.getLogger( + TestUserGroupInformation.class); final private static String USER_NAME = "user1@HADOOP.APACHE.ORG"; final private static String GROUP1_NAME = "group1"; final private static String GROUP2_NAME = "group2"; @@ -1011,4 +1024,85 @@ public class TestUserGroupInformation { // Restore hasSufficientTimElapsed back to private method.setAccessible(false); } + + @Test + public void testGetNextRetryTime() throws Exception { + GenericTestUtils.setLogLevel(UserGroupInformation.LOG, Level.DEBUG); + final long reloginInterval = 1; + final long reloginIntervalMs = reloginInterval * 1000; + // Relogin happens every 1 second. + conf.setLong(HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN, reloginInterval); + SecurityUtil.setAuthenticationMethod(AuthenticationMethod.KERBEROS, conf); + UserGroupInformation.setConfiguration(conf); + + // Suppose tgt start time is now, end time is 20 seconds from now. + final long now = Time.now(); + final Date endDate = new Date(now + 20000); + + // Explicitly test the exponential back-off logic. + // Suppose some time (10 seconds) passed. + // Verify exponential backoff and max=(login interval before endTime). + final long currentTime = now + 10000; + final long endTime = endDate.getTime(); + + assertEquals(0, UserGroupInformation.metrics.getRenewalFailures().value()); + RetryPolicy rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2, + 1000, TimeUnit.MILLISECONDS); + long lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + assertWithinBounds( + UserGroupInformation.metrics.getRenewalFailures().value(), + lastRetry, reloginIntervalMs, currentTime); + + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + assertWithinBounds( + UserGroupInformation.metrics.getRenewalFailures().value(), + lastRetry, reloginIntervalMs, currentTime); + + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + assertWithinBounds( + UserGroupInformation.metrics.getRenewalFailures().value(), + lastRetry, reloginIntervalMs, currentTime); + + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + assertWithinBounds( + UserGroupInformation.metrics.getRenewalFailures().value(), + lastRetry, reloginIntervalMs, currentTime); + + // last try should be right before expiry. + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + String str = + "5th retry, now:" + currentTime + ", retry:" + lastRetry; + LOG.info(str); + assertEquals(str, endTime - reloginIntervalMs, lastRetry); + + // make sure no more retries after (tgt endTime - login interval). + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + str = "overflow retry, now:" + currentTime + ", retry:" + lastRetry; + LOG.info(str); + assertEquals(str, endTime - reloginIntervalMs, lastRetry); + } + + private void assertWithinBounds(final int numFailures, final long lastRetry, + final long reloginIntervalMs, long now) { + // shift is 2 to the power of (numFailure). + int shift = numFailures + 1; + final long lower = now + reloginIntervalMs * (long)((1 << shift) * 0.5); + final long upper = now + reloginIntervalMs * (long)((1 << shift) * 1.5); + final String str = new String("Retry#" + (numFailures + 1) + ", now:" + now + + ", lower bound:" + lower + ", upper bound:" + upper + + ", retry:" + lastRetry); + LOG.info(str); + assertTrue(str, lower <= lastRetry && lastRetry < upper); + } }