HADOOP-13590. Retry until TGT expires even if the UGI renewal thread encountered exception.

(cherry picked from commit 367c3d4121)
(cherry picked from commit ede909144d)
This commit is contained in:
Xiao Chen 2016-11-09 09:08:04 -08:00
parent ad992e05a6
commit b1b609c003
2 changed files with 160 additions and 4 deletions

View File

@ -41,6 +41,7 @@ import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.concurrent.TimeUnit;
import javax.security.auth.Subject; import javax.security.auth.Subject;
import javax.security.auth.callback.CallbackHandler; import javax.security.auth.callback.CallbackHandler;
@ -52,14 +53,18 @@ import javax.security.auth.login.LoginContext;
import javax.security.auth.login.LoginException; import javax.security.auth.login.LoginException;
import javax.security.auth.spi.LoginModule; import javax.security.auth.spi.LoginModule;
import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.metrics2.annotation.Metric; import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.MetricsRegistry; import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
import org.apache.hadoop.metrics2.lib.MutableQuantiles; import org.apache.hadoop.metrics2.lib.MutableQuantiles;
import org.apache.hadoop.metrics2.lib.MutableRate; import org.apache.hadoop.metrics2.lib.MutableRate;
import org.apache.hadoop.security.SaslRpcServer.AuthMethod; import org.apache.hadoop.security.SaslRpcServer.AuthMethod;
@ -83,7 +88,8 @@ import org.slf4j.LoggerFactory;
@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce", "HBase", "Hive", "Oozie"}) @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce", "HBase", "Hive", "Oozie"})
@InterfaceStability.Evolving @InterfaceStability.Evolving
public class UserGroupInformation { public class UserGroupInformation {
private static final Logger LOG = LoggerFactory.getLogger( @VisibleForTesting
static final Logger LOG = LoggerFactory.getLogger(
UserGroupInformation.class); UserGroupInformation.class);
/** /**
@ -119,6 +125,10 @@ public class UserGroupInformation {
MutableRate loginFailure; MutableRate loginFailure;
@Metric("GetGroups") MutableRate getGroups; @Metric("GetGroups") MutableRate getGroups;
MutableQuantiles[] getGroupsQuantiles; MutableQuantiles[] getGroupsQuantiles;
@Metric("Renewal failures since startup")
private MutableGaugeLong renewalFailuresTotal;
@Metric("Renewal failures since last successful login")
private MutableGaugeInt renewalFailures;
static UgiMetrics create() { static UgiMetrics create() {
return DefaultMetricsSystem.instance().register(new UgiMetrics()); return DefaultMetricsSystem.instance().register(new UgiMetrics());
@ -132,6 +142,10 @@ public class UserGroupInformation {
} }
} }
} }
MutableGaugeInt getRenewalFailures() {
return renewalFailures;
}
} }
/** /**
@ -924,6 +938,7 @@ public class UserGroupInformation {
return; return;
} }
long nextRefresh = getRefreshTime(tgt); long nextRefresh = getRefreshTime(tgt);
RetryPolicy rp = null;
while (true) { while (true) {
try { try {
long now = Time.now(); long now = Time.now();
@ -947,13 +962,40 @@ public class UserGroupInformation {
} }
nextRefresh = Math.max(getRefreshTime(tgt), nextRefresh = Math.max(getRefreshTime(tgt),
now + kerberosMinSecondsBeforeRelogin); now + kerberosMinSecondsBeforeRelogin);
metrics.renewalFailures.set(0);
rp = null;
} catch (InterruptedException ie) { } catch (InterruptedException ie) {
LOG.warn("Terminating renewal thread"); LOG.warn("Terminating renewal thread");
return; return;
} catch (IOException ie) { } catch (IOException ie) {
LOG.warn("Exception encountered while running the" + metrics.renewalFailuresTotal.incr();
" renewal command. Aborting renew thread. " + ie); final long tgtEndTime = tgt.getEndTime().getTime();
return; LOG.warn("Exception encountered while running the renewal "
+ "command for {}. (TGT end time:{}, renewalFailures: {},"
+ "renewalFailuresTotal: {})", getUserName(), tgtEndTime,
metrics.renewalFailures, metrics.renewalFailuresTotal, ie);
final long now = Time.now();
if (rp == null) {
// Use a dummy maxRetries to create the policy. The policy will
// only be used to get next retry time with exponential back-off.
// The final retry time will be later limited within the
// tgt endTime in getNextTgtRenewalTime.
rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2,
kerberosMinSecondsBeforeRelogin, TimeUnit.MILLISECONDS);
}
try {
nextRefresh = getNextTgtRenewalTime(tgtEndTime, now, rp);
} catch (Exception e) {
LOG.error("Exception when calculating next tgt renewal time", e);
return;
}
metrics.renewalFailures.incr();
// retry until close enough to tgt endTime.
if (now > nextRefresh) {
LOG.error("TGT is expired. Aborting renew thread for {}.",
getUserName());
return;
}
} }
} }
} }
@ -962,6 +1004,26 @@ public class UserGroupInformation {
t.setName("TGT Renewer for " + getUserName()); t.setName("TGT Renewer for " + getUserName());
t.start(); t.start();
} }
/**
* Get time for next login retry. This will allow the thread to retry with
* exponential back-off, until tgt endtime.
* Last retry is {@link #kerberosMinSecondsBeforeRelogin} before endtime.
*
* @param tgtEndTime EndTime of the tgt.
* @param now Current time.
* @param rp The retry policy.
* @return Time for next login retry.
*/
@VisibleForTesting
static long getNextTgtRenewalTime(final long tgtEndTime, final long now,
final RetryPolicy rp) throws Exception {
final long lastRetryTime = tgtEndTime - kerberosMinSecondsBeforeRelogin;
final RetryPolicy.RetryAction ra = rp.shouldRetry(null,
metrics.renewalFailures.value(), 0, false);
return Math.min(lastRetryTime, now + ra.delayMillis);
}
/** /**
* Log a user in from a keytab file. Loads a user identity from a keytab * Log a user in from a keytab file. Loads a user identity from a keytab
* file and logs them in. They become the currently logged-in user. * file and logs them in. They become the currently logged-in user.

View File

@ -19,15 +19,22 @@ package org.apache.hadoop.security;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.metrics2.MetricsRecordBuilder; import org.apache.hadoop.metrics2.MetricsRecordBuilder;
import org.apache.hadoop.security.SaslRpcServer.AuthMethod; import org.apache.hadoop.security.SaslRpcServer.AuthMethod;
import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod; import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
import org.apache.hadoop.security.authentication.util.KerberosName; import org.apache.hadoop.security.authentication.util.KerberosName;
import org.apache.hadoop.security.token.Token; import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.TokenIdentifier; import org.apache.hadoop.security.token.TokenIdentifier;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.log4j.Level;
import org.junit.*; import org.junit.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.security.auth.Subject; import javax.security.auth.Subject;
import javax.security.auth.kerberos.KerberosPrincipal; import javax.security.auth.kerberos.KerberosPrincipal;
@ -42,10 +49,13 @@ import java.lang.reflect.Method;
import java.security.PrivilegedExceptionAction; import java.security.PrivilegedExceptionAction;
import java.util.Collection; import java.util.Collection;
import java.util.ConcurrentModificationException; import java.util.ConcurrentModificationException;
import java.util.Date;
import java.util.LinkedHashSet; import java.util.LinkedHashSet;
import java.util.Set; import java.util.Set;
import java.util.concurrent.TimeUnit;
import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS; import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTH_TO_LOCAL; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTH_TO_LOCAL;
import static org.apache.hadoop.ipc.TestSaslRPC.*; import static org.apache.hadoop.ipc.TestSaslRPC.*;
import static org.apache.hadoop.test.MetricsAsserts.*; import static org.apache.hadoop.test.MetricsAsserts.*;
@ -54,6 +64,9 @@ import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when; import static org.mockito.Mockito.when;
public class TestUserGroupInformation { public class TestUserGroupInformation {
static final Logger LOG = LoggerFactory.getLogger(
TestUserGroupInformation.class);
final private static String USER_NAME = "user1@HADOOP.APACHE.ORG"; final private static String USER_NAME = "user1@HADOOP.APACHE.ORG";
final private static String GROUP1_NAME = "group1"; final private static String GROUP1_NAME = "group1";
final private static String GROUP2_NAME = "group2"; final private static String GROUP2_NAME = "group2";
@ -963,4 +976,85 @@ public class TestUserGroupInformation {
// Restore hasSufficientTimElapsed back to private // Restore hasSufficientTimElapsed back to private
method.setAccessible(false); method.setAccessible(false);
} }
@Test
public void testGetNextRetryTime() throws Exception {
GenericTestUtils.setLogLevel(UserGroupInformation.LOG, Level.DEBUG);
final long reloginInterval = 1;
final long reloginIntervalMs = reloginInterval * 1000;
// Relogin happens every 1 second.
conf.setLong(HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN, reloginInterval);
SecurityUtil.setAuthenticationMethod(AuthenticationMethod.KERBEROS, conf);
UserGroupInformation.setConfiguration(conf);
// Suppose tgt start time is now, end time is 20 seconds from now.
final long now = Time.now();
final Date endDate = new Date(now + 20000);
// Explicitly test the exponential back-off logic.
// Suppose some time (10 seconds) passed.
// Verify exponential backoff and max=(login interval before endTime).
final long currentTime = now + 10000;
final long endTime = endDate.getTime();
assertEquals(0, UserGroupInformation.metrics.getRenewalFailures().value());
RetryPolicy rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2,
1000, TimeUnit.MILLISECONDS);
long lastRetry =
UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
assertWithinBounds(
UserGroupInformation.metrics.getRenewalFailures().value(),
lastRetry, reloginIntervalMs, currentTime);
UserGroupInformation.metrics.getRenewalFailures().incr();
lastRetry =
UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
assertWithinBounds(
UserGroupInformation.metrics.getRenewalFailures().value(),
lastRetry, reloginIntervalMs, currentTime);
UserGroupInformation.metrics.getRenewalFailures().incr();
lastRetry =
UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
assertWithinBounds(
UserGroupInformation.metrics.getRenewalFailures().value(),
lastRetry, reloginIntervalMs, currentTime);
UserGroupInformation.metrics.getRenewalFailures().incr();
lastRetry =
UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
assertWithinBounds(
UserGroupInformation.metrics.getRenewalFailures().value(),
lastRetry, reloginIntervalMs, currentTime);
// last try should be right before expiry.
UserGroupInformation.metrics.getRenewalFailures().incr();
lastRetry =
UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
String str =
"5th retry, now:" + currentTime + ", retry:" + lastRetry;
LOG.info(str);
assertEquals(str, endTime - reloginIntervalMs, lastRetry);
// make sure no more retries after (tgt endTime - login interval).
UserGroupInformation.metrics.getRenewalFailures().incr();
lastRetry =
UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
str = "overflow retry, now:" + currentTime + ", retry:" + lastRetry;
LOG.info(str);
assertEquals(str, endTime - reloginIntervalMs, lastRetry);
}
private void assertWithinBounds(final int numFailures, final long lastRetry,
final long reloginIntervalMs, long now) {
// shift is 2 to the power of (numFailure).
int shift = numFailures + 1;
final long lower = now + reloginIntervalMs * (long)((1 << shift) * 0.5);
final long upper = now + reloginIntervalMs * (long)((1 << shift) * 1.5);
final String str = new String("Retry#" + (numFailures + 1) + ", now:" + now
+ ", lower bound:" + lower + ", upper bound:" + upper
+ ", retry:" + lastRetry);
LOG.info(str);
assertTrue(str, lower <= lastRetry && lastRetry < upper);
}
} }