YARN-4041. Slow delegation token renewal can severely prolong RM recovery. Contributed by Sunil G
(cherry picked from commit d3a34a4f38)
Conflicts:
hadoop-yarn-project/CHANGES.txt
This commit is contained in:
parent
49a7d70f53
commit
2b6ce20e1c
|
@ -91,6 +91,9 @@ Release 2.7.2 - UNRELEASED
|
||||||
YARN-4000. RM crashes with NPE if leaf queue becomes parent queue during restart.
|
YARN-4000. RM crashes with NPE if leaf queue becomes parent queue during restart.
|
||||||
(Varun Saxena via jianhe)
|
(Varun Saxena via jianhe)
|
||||||
|
|
||||||
|
YARN-4041. Slow delegation token renewal can severely prolong RM recovery
|
||||||
|
(Sunil G via jlowe)
|
||||||
|
|
||||||
Release 2.7.1 - 2015-07-06
|
Release 2.7.1 - 2015-07-06
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -854,14 +854,16 @@ public class RMAppImpl implements RMApp, Recoverable {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (UserGroupInformation.isSecurityEnabled()) {
|
if (UserGroupInformation.isSecurityEnabled()) {
|
||||||
// synchronously renew delegation token on recovery.
|
// asynchronously renew delegation token on recovery.
|
||||||
try {
|
try {
|
||||||
app.rmContext.getDelegationTokenRenewer().addApplicationSync(
|
app.rmContext.getDelegationTokenRenewer()
|
||||||
app.getApplicationId(), app.parseCredentials(),
|
.addApplicationAsyncDuringRecovery(app.getApplicationId(),
|
||||||
app.submissionContext.getCancelTokensWhenComplete(), app.getUser());
|
app.parseCredentials(),
|
||||||
|
app.submissionContext.getCancelTokensWhenComplete(),
|
||||||
|
app.getUser());
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
String msg = "Failed to renew token for " + app.applicationId
|
String msg = "Failed to fetch user credentials from application:"
|
||||||
+ " on recovery : " + e.getMessage();
|
+ e.getMessage();
|
||||||
app.diagnostics.append(msg);
|
app.diagnostics.append(msg);
|
||||||
LOG.error(msg, e);
|
LOG.error(msg, e);
|
||||||
}
|
}
|
||||||
|
|
|
@ -385,6 +385,25 @@ public class DelegationTokenRenewer extends AbstractService {
|
||||||
applicationId, ts, shouldCancelAtEnd, user));
|
applicationId, ts, shouldCancelAtEnd, user));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Asynchronously add application tokens for renewal.
|
||||||
|
*
|
||||||
|
* @param applicationId
|
||||||
|
* added application
|
||||||
|
* @param ts
|
||||||
|
* tokens
|
||||||
|
* @param shouldCancelAtEnd
|
||||||
|
* true if tokens should be canceled when the app is done else false.
|
||||||
|
* @param user
|
||||||
|
* user
|
||||||
|
*/
|
||||||
|
public void addApplicationAsyncDuringRecovery(ApplicationId applicationId,
|
||||||
|
Credentials ts, boolean shouldCancelAtEnd, String user) {
|
||||||
|
processDelegationTokenRenewerEvent(
|
||||||
|
new DelegationTokenRenewerAppRecoverEvent(applicationId, ts,
|
||||||
|
shouldCancelAtEnd, user));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Synchronously renew delegation tokens.
|
* Synchronously renew delegation tokens.
|
||||||
* @param user user
|
* @param user user
|
||||||
|
@ -396,7 +415,7 @@ public class DelegationTokenRenewer extends AbstractService {
|
||||||
applicationId, ts, shouldCancelAtEnd, user));
|
applicationId, ts, shouldCancelAtEnd, user));
|
||||||
}
|
}
|
||||||
|
|
||||||
private void handleAppSubmitEvent(DelegationTokenRenewerAppSubmitEvent evt)
|
private void handleAppSubmitEvent(AbstractDelegationTokenRenewerAppEvent evt)
|
||||||
throws IOException, InterruptedException {
|
throws IOException, InterruptedException {
|
||||||
ApplicationId applicationId = evt.getApplicationId();
|
ApplicationId applicationId = evt.getApplicationId();
|
||||||
Credentials ts = evt.getCredentials();
|
Credentials ts = evt.getCredentials();
|
||||||
|
@ -825,6 +844,10 @@ public class DelegationTokenRenewer extends AbstractService {
|
||||||
DelegationTokenRenewerAppSubmitEvent appSubmitEvt =
|
DelegationTokenRenewerAppSubmitEvent appSubmitEvt =
|
||||||
(DelegationTokenRenewerAppSubmitEvent) evt;
|
(DelegationTokenRenewerAppSubmitEvent) evt;
|
||||||
handleDTRenewerAppSubmitEvent(appSubmitEvt);
|
handleDTRenewerAppSubmitEvent(appSubmitEvt);
|
||||||
|
} else if (evt instanceof DelegationTokenRenewerAppRecoverEvent) {
|
||||||
|
DelegationTokenRenewerAppRecoverEvent appRecoverEvt =
|
||||||
|
(DelegationTokenRenewerAppRecoverEvent) evt;
|
||||||
|
handleDTRenewerAppRecoverEvent(appRecoverEvt);
|
||||||
} else if (evt.getType().equals(
|
} else if (evt.getType().equals(
|
||||||
DelegationTokenRenewerEventType.FINISH_APPLICATION)) {
|
DelegationTokenRenewerEventType.FINISH_APPLICATION)) {
|
||||||
DelegationTokenRenewer.this.handleAppFinishEvent(evt);
|
DelegationTokenRenewer.this.handleAppFinishEvent(evt);
|
||||||
|
@ -860,16 +883,49 @@ public class DelegationTokenRenewer extends AbstractService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static class DelegationTokenRenewerAppSubmitEvent extends
|
@SuppressWarnings("unchecked")
|
||||||
|
private void handleDTRenewerAppRecoverEvent(
|
||||||
|
DelegationTokenRenewerAppRecoverEvent event) {
|
||||||
|
try {
|
||||||
|
// Setup tokens for renewal during recovery
|
||||||
|
DelegationTokenRenewer.this.handleAppSubmitEvent(event);
|
||||||
|
} catch (Throwable t) {
|
||||||
|
LOG.warn(
|
||||||
|
"Unable to add the application to the delegation token renewer.", t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DelegationTokenRenewerAppSubmitEvent
|
||||||
|
extends
|
||||||
|
AbstractDelegationTokenRenewerAppEvent {
|
||||||
|
public DelegationTokenRenewerAppSubmitEvent(ApplicationId appId,
|
||||||
|
Credentials credentails, boolean shouldCancelAtEnd, String user) {
|
||||||
|
super(appId, credentails, shouldCancelAtEnd, user,
|
||||||
|
DelegationTokenRenewerEventType.VERIFY_AND_START_APPLICATION);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DelegationTokenRenewerAppRecoverEvent
|
||||||
|
extends
|
||||||
|
AbstractDelegationTokenRenewerAppEvent {
|
||||||
|
public DelegationTokenRenewerAppRecoverEvent(ApplicationId appId,
|
||||||
|
Credentials credentails, boolean shouldCancelAtEnd, String user) {
|
||||||
|
super(appId, credentails, shouldCancelAtEnd, user,
|
||||||
|
DelegationTokenRenewerEventType.RECOVER_APPLICATION);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class AbstractDelegationTokenRenewerAppEvent extends
|
||||||
DelegationTokenRenewerEvent {
|
DelegationTokenRenewerEvent {
|
||||||
|
|
||||||
private Credentials credentials;
|
private Credentials credentials;
|
||||||
private boolean shouldCancelAtEnd;
|
private boolean shouldCancelAtEnd;
|
||||||
private String user;
|
private String user;
|
||||||
|
|
||||||
public DelegationTokenRenewerAppSubmitEvent(ApplicationId appId,
|
public AbstractDelegationTokenRenewerAppEvent(ApplicationId appId,
|
||||||
Credentials credentails, boolean shouldCancelAtEnd, String user) {
|
Credentials credentails, boolean shouldCancelAtEnd, String user,
|
||||||
super(appId, DelegationTokenRenewerEventType.VERIFY_AND_START_APPLICATION);
|
DelegationTokenRenewerEventType type) {
|
||||||
|
super(appId, type);
|
||||||
this.credentials = credentails;
|
this.credentials = credentails;
|
||||||
this.shouldCancelAtEnd = shouldCancelAtEnd;
|
this.shouldCancelAtEnd = shouldCancelAtEnd;
|
||||||
this.user = user;
|
this.user = user;
|
||||||
|
@ -890,6 +946,7 @@ public class DelegationTokenRenewer extends AbstractService {
|
||||||
|
|
||||||
enum DelegationTokenRenewerEventType {
|
enum DelegationTokenRenewerEventType {
|
||||||
VERIFY_AND_START_APPLICATION,
|
VERIFY_AND_START_APPLICATION,
|
||||||
|
RECOVER_APPLICATION,
|
||||||
FINISH_APPLICATION
|
FINISH_APPLICATION
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1166,24 +1166,24 @@ public class TestRMRestart extends ParameterizedSchedulerTestBase {
|
||||||
|
|
||||||
// Need to wait for a while as now token renewal happens on another thread
|
// Need to wait for a while as now token renewal happens on another thread
|
||||||
// and is asynchronous in nature.
|
// and is asynchronous in nature.
|
||||||
waitForTokensToBeRenewed(rm2);
|
waitForTokensToBeRenewed(rm2, tokenSet);
|
||||||
|
|
||||||
// verify tokens are properly populated back to rm2 DelegationTokenRenewer
|
// verify tokens are properly populated back to rm2 DelegationTokenRenewer
|
||||||
Assert.assertEquals(tokenSet, rm2.getRMContext()
|
Assert.assertEquals(tokenSet, rm2.getRMContext()
|
||||||
.getDelegationTokenRenewer().getDelegationTokens());
|
.getDelegationTokenRenewer().getDelegationTokens());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void waitForTokensToBeRenewed(MockRM rm2) throws Exception {
|
private void waitForTokensToBeRenewed(MockRM rm2,
|
||||||
int waitCnt = 20;
|
HashSet<Token<RMDelegationTokenIdentifier>> tokenSet) throws Exception {
|
||||||
boolean atleastOneAppInNEWState = true;
|
// Max wait time to get the token renewal can be kept as 1sec (100 * 10ms)
|
||||||
while (waitCnt-- > 0 && atleastOneAppInNEWState) {
|
int waitCnt = 100;
|
||||||
atleastOneAppInNEWState = false;
|
while (waitCnt-- > 0) {
|
||||||
for (RMApp rmApp : rm2.getRMContext().getRMApps().values()) {
|
if (tokenSet.equals(rm2.getRMContext().getDelegationTokenRenewer()
|
||||||
if (rmApp.getState() == RMAppState.NEW) {
|
.getDelegationTokens())) {
|
||||||
Thread.sleep(1000);
|
// Stop waiting as tokens are populated to DelegationTokenRenewer.
|
||||||
atleastOneAppInNEWState = true;
|
|
||||||
break;
|
break;
|
||||||
}
|
} else {
|
||||||
|
Thread.sleep(10);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue