diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java index e1f77ece530..b224b07eeee 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java @@ -41,6 +41,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.TimeUnit; import javax.security.auth.Subject; import javax.security.auth.callback.CallbackHandler; @@ -52,14 +53,18 @@ import javax.security.auth.login.LoginContext; import javax.security.auth.login.LoginException; import javax.security.auth.spi.LoginModule; +import org.apache.hadoop.io.retry.RetryPolicies; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.metrics2.annotation.Metric; import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.metrics2.lib.MetricsRegistry; +import org.apache.hadoop.metrics2.lib.MutableGaugeInt; +import org.apache.hadoop.metrics2.lib.MutableGaugeLong; import org.apache.hadoop.metrics2.lib.MutableQuantiles; import org.apache.hadoop.metrics2.lib.MutableRate; import org.apache.hadoop.security.SaslRpcServer.AuthMethod; @@ -83,7 +88,8 @@ import org.slf4j.LoggerFactory; @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce", "HBase", "Hive", "Oozie"}) @InterfaceStability.Evolving public class UserGroupInformation { - private static final Logger LOG = LoggerFactory.getLogger( + @VisibleForTesting + static final Logger LOG = LoggerFactory.getLogger( UserGroupInformation.class); /** @@ -119,6 +125,10 @@ public class UserGroupInformation { MutableRate loginFailure; @Metric("GetGroups") MutableRate getGroups; MutableQuantiles[] getGroupsQuantiles; + @Metric("Renewal failures since startup") + private MutableGaugeLong renewalFailuresTotal; + @Metric("Renewal failures since last successful login") + private MutableGaugeInt renewalFailures; static UgiMetrics create() { return DefaultMetricsSystem.instance().register(new UgiMetrics()); @@ -132,6 +142,10 @@ public class UserGroupInformation { } } } + + MutableGaugeInt getRenewalFailures() { + return renewalFailures; + } } /** @@ -924,6 +938,7 @@ public class UserGroupInformation { return; } long nextRefresh = getRefreshTime(tgt); + RetryPolicy rp = null; while (true) { try { long now = Time.now(); @@ -947,13 +962,40 @@ public class UserGroupInformation { } nextRefresh = Math.max(getRefreshTime(tgt), now + kerberosMinSecondsBeforeRelogin); + metrics.renewalFailures.set(0); + rp = null; } catch (InterruptedException ie) { LOG.warn("Terminating renewal thread"); return; } catch (IOException ie) { - LOG.warn("Exception encountered while running the" + - " renewal command. Aborting renew thread. " + ie); - return; + metrics.renewalFailuresTotal.incr(); + final long tgtEndTime = tgt.getEndTime().getTime(); + LOG.warn("Exception encountered while running the renewal " + + "command for {}. (TGT end time:{}, renewalFailures: {}," + + "renewalFailuresTotal: {})", getUserName(), tgtEndTime, + metrics.renewalFailures, metrics.renewalFailuresTotal, ie); + final long now = Time.now(); + if (rp == null) { + // Use a dummy maxRetries to create the policy. The policy will + // only be used to get next retry time with exponential back-off. + // The final retry time will be later limited within the + // tgt endTime in getNextTgtRenewalTime. + rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2, + kerberosMinSecondsBeforeRelogin, TimeUnit.MILLISECONDS); + } + try { + nextRefresh = getNextTgtRenewalTime(tgtEndTime, now, rp); + } catch (Exception e) { + LOG.error("Exception when calculating next tgt renewal time", e); + return; + } + metrics.renewalFailures.incr(); + // retry until close enough to tgt endTime. + if (now > nextRefresh) { + LOG.error("TGT is expired. Aborting renew thread for {}.", + getUserName()); + return; + } } } } @@ -962,6 +1004,26 @@ public class UserGroupInformation { t.setName("TGT Renewer for " + getUserName()); t.start(); } + + /** + * Get time for next login retry. This will allow the thread to retry with + * exponential back-off, until tgt endtime. + * Last retry is {@link #kerberosMinSecondsBeforeRelogin} before endtime. + * + * @param tgtEndTime EndTime of the tgt. + * @param now Current time. + * @param rp The retry policy. + * @return Time for next login retry. + */ + @VisibleForTesting + static long getNextTgtRenewalTime(final long tgtEndTime, final long now, + final RetryPolicy rp) throws Exception { + final long lastRetryTime = tgtEndTime - kerberosMinSecondsBeforeRelogin; + final RetryPolicy.RetryAction ra = rp.shouldRetry(null, + metrics.renewalFailures.value(), 0, false); + return Math.min(lastRetryTime, now + ra.delayMillis); + } + /** * Log a user in from a keytab file. Loads a user identity from a keytab * file and logs them in. They become the currently logged-in user. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java index 521ab4abe45..4fe753ef1b4 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java @@ -19,15 +19,22 @@ package org.apache.hadoop.security; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.retry.RetryPolicies; +import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.metrics2.MetricsRecordBuilder; import org.apache.hadoop.security.SaslRpcServer.AuthMethod; import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod; import org.apache.hadoop.security.authentication.util.KerberosName; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Time; +import org.apache.log4j.Level; import org.junit.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.security.auth.Subject; import javax.security.auth.kerberos.KerberosPrincipal; @@ -42,10 +49,13 @@ import java.lang.reflect.Method; import java.security.PrivilegedExceptionAction; import java.util.Collection; import java.util.ConcurrentModificationException; +import java.util.Date; import java.util.LinkedHashSet; import java.util.Set; +import java.util.concurrent.TimeUnit; import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTH_TO_LOCAL; import static org.apache.hadoop.ipc.TestSaslRPC.*; import static org.apache.hadoop.test.MetricsAsserts.*; @@ -54,6 +64,9 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; public class TestUserGroupInformation { + + static final Logger LOG = LoggerFactory.getLogger( + TestUserGroupInformation.class); final private static String USER_NAME = "user1@HADOOP.APACHE.ORG"; final private static String GROUP1_NAME = "group1"; final private static String GROUP2_NAME = "group2"; @@ -963,4 +976,85 @@ public class TestUserGroupInformation { // Restore hasSufficientTimElapsed back to private method.setAccessible(false); } + + @Test + public void testGetNextRetryTime() throws Exception { + GenericTestUtils.setLogLevel(UserGroupInformation.LOG, Level.DEBUG); + final long reloginInterval = 1; + final long reloginIntervalMs = reloginInterval * 1000; + // Relogin happens every 1 second. + conf.setLong(HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN, reloginInterval); + SecurityUtil.setAuthenticationMethod(AuthenticationMethod.KERBEROS, conf); + UserGroupInformation.setConfiguration(conf); + + // Suppose tgt start time is now, end time is 20 seconds from now. + final long now = Time.now(); + final Date endDate = new Date(now + 20000); + + // Explicitly test the exponential back-off logic. + // Suppose some time (10 seconds) passed. + // Verify exponential backoff and max=(login interval before endTime). + final long currentTime = now + 10000; + final long endTime = endDate.getTime(); + + assertEquals(0, UserGroupInformation.metrics.getRenewalFailures().value()); + RetryPolicy rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2, + 1000, TimeUnit.MILLISECONDS); + long lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + assertWithinBounds( + UserGroupInformation.metrics.getRenewalFailures().value(), + lastRetry, reloginIntervalMs, currentTime); + + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + assertWithinBounds( + UserGroupInformation.metrics.getRenewalFailures().value(), + lastRetry, reloginIntervalMs, currentTime); + + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + assertWithinBounds( + UserGroupInformation.metrics.getRenewalFailures().value(), + lastRetry, reloginIntervalMs, currentTime); + + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + assertWithinBounds( + UserGroupInformation.metrics.getRenewalFailures().value(), + lastRetry, reloginIntervalMs, currentTime); + + // last try should be right before expiry. + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + String str = + "5th retry, now:" + currentTime + ", retry:" + lastRetry; + LOG.info(str); + assertEquals(str, endTime - reloginIntervalMs, lastRetry); + + // make sure no more retries after (tgt endTime - login interval). + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + str = "overflow retry, now:" + currentTime + ", retry:" + lastRetry; + LOG.info(str); + assertEquals(str, endTime - reloginIntervalMs, lastRetry); + } + + private void assertWithinBounds(final int numFailures, final long lastRetry, + final long reloginIntervalMs, long now) { + // shift is 2 to the power of (numFailure). + int shift = numFailures + 1; + final long lower = now + reloginIntervalMs * (long)((1 << shift) * 0.5); + final long upper = now + reloginIntervalMs * (long)((1 << shift) * 1.5); + final String str = new String("Retry#" + (numFailures + 1) + ", now:" + now + + ", lower bound:" + lower + ", upper bound:" + upper + + ", retry:" + lastRetry); + LOG.info(str); + assertTrue(str, lower <= lastRetry && lastRetry < upper); + } }