YARN-11217. [Federation] Add dumpSchedulerLogs REST APIs for Router. (#5272)

This commit is contained in:
slfan1989 2023-02-09 03:48:38 +08:00 committed by GitHub
parent 08f58ecf07
commit af20841fb1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 180 additions and 2 deletions

View File

@ -135,6 +135,8 @@ public final class RouterMetrics {
private MutableGaugeInt numRenewDelegationTokenFailedRetrieved; private MutableGaugeInt numRenewDelegationTokenFailedRetrieved;
@Metric("# of renewDelegationToken failed to be retrieved") @Metric("# of renewDelegationToken failed to be retrieved")
private MutableGaugeInt numCancelDelegationTokenFailedRetrieved; private MutableGaugeInt numCancelDelegationTokenFailedRetrieved;
@Metric("# of dumpSchedulerLogs failed to be retrieved")
private MutableGaugeInt numDumpSchedulerLogsFailedRetrieved;
@Metric("# of getActivities failed to be retrieved") @Metric("# of getActivities failed to be retrieved")
private MutableGaugeInt numGetActivitiesFailedRetrieved; private MutableGaugeInt numGetActivitiesFailedRetrieved;
@Metric("# of getBulkActivities failed to be retrieved") @Metric("# of getBulkActivities failed to be retrieved")
@ -241,6 +243,8 @@ public final class RouterMetrics {
private MutableRate totalSucceededRenewDelegationTokenRetrieved; private MutableRate totalSucceededRenewDelegationTokenRetrieved;
@Metric("Total number of successful Retrieved CancelDelegationToken and latency(ms)") @Metric("Total number of successful Retrieved CancelDelegationToken and latency(ms)")
private MutableRate totalSucceededCancelDelegationTokenRetrieved; private MutableRate totalSucceededCancelDelegationTokenRetrieved;
@Metric("Total number of successful Retrieved DumpSchedulerLogs and latency(ms)")
private MutableRate totalSucceededDumpSchedulerLogsRetrieved;
@Metric("Total number of successful Retrieved GetActivities and latency(ms)") @Metric("Total number of successful Retrieved GetActivities and latency(ms)")
private MutableRate totalSucceededGetActivitiesRetrieved; private MutableRate totalSucceededGetActivitiesRetrieved;
@Metric("Total number of successful Retrieved GetBulkActivities and latency(ms)") @Metric("Total number of successful Retrieved GetBulkActivities and latency(ms)")
@ -303,6 +307,7 @@ public final class RouterMetrics {
private MutableQuantiles getDelegationTokenLatency; private MutableQuantiles getDelegationTokenLatency;
private MutableQuantiles renewDelegationTokenLatency; private MutableQuantiles renewDelegationTokenLatency;
private MutableQuantiles cancelDelegationTokenLatency; private MutableQuantiles cancelDelegationTokenLatency;
private MutableQuantiles dumpSchedulerLogsLatency;
private MutableQuantiles getActivitiesLatency; private MutableQuantiles getActivitiesLatency;
private MutableQuantiles getBulkActivitiesLatency; private MutableQuantiles getBulkActivitiesLatency;
private MutableQuantiles getSchedulerInfoRetrievedLatency; private MutableQuantiles getSchedulerInfoRetrievedLatency;
@ -482,6 +487,9 @@ public final class RouterMetrics {
cancelDelegationTokenLatency = registry.newQuantiles("cancelDelegationTokenLatency", cancelDelegationTokenLatency = registry.newQuantiles("cancelDelegationTokenLatency",
"latency of cancel delegation token timeouts", "ops", "latency", 10); "latency of cancel delegation token timeouts", "ops", "latency", 10);
dumpSchedulerLogsLatency = registry.newQuantiles("dumpSchedulerLogsLatency",
"latency of dump scheduler logs timeouts", "ops", "latency", 10);
getActivitiesLatency = registry.newQuantiles("getActivitiesLatency", getActivitiesLatency = registry.newQuantiles("getActivitiesLatency",
"latency of get activities timeouts", "ops", "latency", 10); "latency of get activities timeouts", "ops", "latency", 10);
@ -752,6 +760,11 @@ public final class RouterMetrics {
return totalSucceededCancelDelegationTokenRetrieved.lastStat().numSamples(); return totalSucceededCancelDelegationTokenRetrieved.lastStat().numSamples();
} }
@VisibleForTesting
public long getNumSucceededDumpSchedulerLogsRetrieved() {
return totalSucceededDumpSchedulerLogsRetrieved.lastStat().numSamples();
}
@VisibleForTesting @VisibleForTesting
public long getNumSucceededGetActivitiesRetrieved() { public long getNumSucceededGetActivitiesRetrieved() {
return totalSucceededGetActivitiesRetrieved.lastStat().numSamples(); return totalSucceededGetActivitiesRetrieved.lastStat().numSamples();
@ -1007,6 +1020,11 @@ public final class RouterMetrics {
return totalSucceededCancelDelegationTokenRetrieved.lastStat().mean(); return totalSucceededCancelDelegationTokenRetrieved.lastStat().mean();
} }
@VisibleForTesting
public double getLatencySucceededDumpSchedulerLogsRetrieved() {
return totalSucceededDumpSchedulerLogsRetrieved.lastStat().mean();
}
@VisibleForTesting @VisibleForTesting
public double getLatencySucceededGetActivitiesRetrieved() { public double getLatencySucceededGetActivitiesRetrieved() {
return totalSucceededGetActivitiesRetrieved.lastStat().mean(); return totalSucceededGetActivitiesRetrieved.lastStat().mean();
@ -1245,6 +1263,10 @@ public final class RouterMetrics {
return numCancelDelegationTokenFailedRetrieved.value(); return numCancelDelegationTokenFailedRetrieved.value();
} }
public int getDumpSchedulerLogsFailedRetrieved() {
return numDumpSchedulerLogsFailedRetrieved.value();
}
public int getActivitiesFailedRetrieved() { public int getActivitiesFailedRetrieved() {
return numGetActivitiesFailedRetrieved.value(); return numGetActivitiesFailedRetrieved.value();
} }
@ -1492,6 +1514,11 @@ public final class RouterMetrics {
cancelDelegationTokenLatency.add(duration); cancelDelegationTokenLatency.add(duration);
} }
public void succeededDumpSchedulerLogsRetrieved(long duration) {
totalSucceededDumpSchedulerLogsRetrieved.add(duration);
dumpSchedulerLogsLatency.add(duration);
}
public void succeededGetActivitiesLatencyRetrieved(long duration) { public void succeededGetActivitiesLatencyRetrieved(long duration) {
totalSucceededGetActivitiesRetrieved.add(duration); totalSucceededGetActivitiesRetrieved.add(duration);
getActivitiesLatency.add(duration); getActivitiesLatency.add(duration);
@ -1713,6 +1740,10 @@ public final class RouterMetrics {
numCancelDelegationTokenFailedRetrieved.incr(); numCancelDelegationTokenFailedRetrieved.incr();
} }
public void incrDumpSchedulerLogsFailedRetrieved() {
numDumpSchedulerLogsFailedRetrieved.incr();
}
public void incrGetActivitiesFailedRetrieved() { public void incrGetActivitiesFailedRetrieved() {
numGetActivitiesFailedRetrieved.incr(); numGetActivitiesFailedRetrieved.incr();
} }

View File

@ -1183,10 +1183,70 @@ public class FederationInterceptorREST extends AbstractRESTRequestInterceptor {
throw new RuntimeException("getSchedulerInfo error."); throw new RuntimeException("getSchedulerInfo error.");
} }
/**
* This method dumps the scheduler logs for the time got in input, and it is
* reachable by using {@link RMWSConsts#SCHEDULER_LOGS}.
*
* @param time the period of time. It is a FormParam.
* @param hsr the servlet request
* @return the result of the operation
* @throws IOException when it cannot create dump log file
*/
@Override @Override
public String dumpSchedulerLogs(String time, HttpServletRequest hsr) public String dumpSchedulerLogs(String time, HttpServletRequest hsr)
throws IOException { throws IOException {
throw new NotImplementedException("Code is not implemented");
// Step1. We will check the time parameter to
// ensure that the time parameter is not empty and greater than 0.
if (StringUtils.isBlank(time)) {
routerMetrics.incrDumpSchedulerLogsFailedRetrieved();
throw new IllegalArgumentException("Parameter error, the time is empty or null.");
}
try {
int period = Integer.parseInt(time);
if (period <= 0) {
throw new IllegalArgumentException("time must be greater than 0.");
}
} catch (NumberFormatException e) {
routerMetrics.incrDumpSchedulerLogsFailedRetrieved();
throw new IllegalArgumentException("time must be a number.");
} catch (IllegalArgumentException e) {
routerMetrics.incrDumpSchedulerLogsFailedRetrieved();
throw e;
}
// Step2. Call dumpSchedulerLogs of each subcluster.
try {
long startTime = clock.getTime();
Map<SubClusterId, SubClusterInfo> subClustersActive = getActiveSubclusters();
final HttpServletRequest hsrCopy = clone(hsr);
Class[] argsClasses = new Class[]{String.class, HttpServletRequest.class};
Object[] args = new Object[]{time, hsrCopy};
ClientMethod remoteMethod = new ClientMethod("dumpSchedulerLogs", argsClasses, args);
Map<SubClusterInfo, String> dumpSchedulerLogsMap = invokeConcurrent(
subClustersActive.values(), remoteMethod, String.class);
StringBuilder stringBuilder = new StringBuilder();
dumpSchedulerLogsMap.forEach((subClusterInfo, msg) -> {
SubClusterId subClusterId = subClusterInfo.getSubClusterId();
stringBuilder.append("subClusterId" + subClusterId + " : " + msg + "; ");
});
long stopTime = clock.getTime();
routerMetrics.succeededDumpSchedulerLogsRetrieved(stopTime - startTime);
return stringBuilder.toString();
} catch (IllegalArgumentException e) {
routerMetrics.incrDumpSchedulerLogsFailedRetrieved();
RouterServerUtil.logAndThrowRunTimeException(e,
"Unable to dump SchedulerLogs by time: %s.", time);
} catch (YarnException e) {
routerMetrics.incrDumpSchedulerLogsFailedRetrieved();
RouterServerUtil.logAndThrowRunTimeException(e,
"dumpSchedulerLogs by time = %s error .", time);
}
routerMetrics.incrDumpSchedulerLogsFailedRetrieved();
throw new RuntimeException("dumpSchedulerLogs Failed.");
} }
/** /**

View File

@ -534,6 +534,11 @@ public class TestRouterMetrics {
metrics.incrRenewDelegationTokenFailedRetrieved(); metrics.incrRenewDelegationTokenFailedRetrieved();
} }
public void getDumpSchedulerLogsFailed() {
LOG.info("Mocked: failed DumpSchedulerLogs call");
metrics.incrDumpSchedulerLogsFailedRetrieved();
}
public void getActivitiesFailed() { public void getActivitiesFailed() {
LOG.info("Mocked: failed getBulkActivitie call"); LOG.info("Mocked: failed getBulkActivitie call");
metrics.incrGetActivitiesFailedRetrieved(); metrics.incrGetActivitiesFailedRetrieved();
@ -774,6 +779,11 @@ public class TestRouterMetrics {
metrics.succeededRenewDelegationTokenRetrieved(duration); metrics.succeededRenewDelegationTokenRetrieved(duration);
} }
public void getDumpSchedulerLogsRetrieved(long duration) {
LOG.info("Mocked: successful DumpSchedulerLogs call with duration {}", duration);
metrics.succeededDumpSchedulerLogsRetrieved(duration);
}
public void getActivitiesRetrieved(long duration) { public void getActivitiesRetrieved(long duration) {
LOG.info("Mocked: successful GetActivities call with duration {}", duration); LOG.info("Mocked: successful GetActivities call with duration {}", duration);
metrics.succeededGetActivitiesLatencyRetrieved(duration); metrics.succeededGetActivitiesLatencyRetrieved(duration);
@ -1618,6 +1628,29 @@ public class TestRouterMetrics {
metrics.getRenewDelegationTokenFailedRetrieved()); metrics.getRenewDelegationTokenFailedRetrieved());
} }
@Test
public void testDumpSchedulerLogsRetrieved() {
long totalGoodBefore = metrics.getNumSucceededDumpSchedulerLogsRetrieved();
goodSubCluster.getDumpSchedulerLogsRetrieved(150);
Assert.assertEquals(totalGoodBefore + 1,
metrics.getNumSucceededDumpSchedulerLogsRetrieved());
Assert.assertEquals(150,
metrics.getLatencySucceededDumpSchedulerLogsRetrieved(), ASSERT_DOUBLE_DELTA);
goodSubCluster.getDumpSchedulerLogsRetrieved(300);
Assert.assertEquals(totalGoodBefore + 2,
metrics.getNumSucceededDumpSchedulerLogsRetrieved());
Assert.assertEquals(225,
metrics.getLatencySucceededDumpSchedulerLogsRetrieved(), ASSERT_DOUBLE_DELTA);
}
@Test
public void testDumpSchedulerLogsRetrievedFailed() {
long totalBadBefore = metrics.getDumpSchedulerLogsFailedRetrieved();
badSubCluster.getDumpSchedulerLogsFailed();
Assert.assertEquals(totalBadBefore + 1,
metrics.getDumpSchedulerLogsFailedRetrieved());
}
@Test @Test
public void testGetActivitiesRetrieved() { public void testGetActivitiesRetrieved() {
long totalGoodBefore = metrics.getNumSucceededGetActivitiesRetrieved(); long totalGoodBefore = metrics.getNumSucceededGetActivitiesRetrieved();

View File

@ -1214,6 +1214,26 @@ public class MockDefaultRequestInterceptorREST
return new RMQueueAclInfo(true, user.getUserName(), ""); return new RMQueueAclInfo(true, user.getUserName(), "");
} }
public String dumpSchedulerLogs(String time, HttpServletRequest hsr)
throws IOException {
int period = Integer.parseInt(time);
if (period <= 0) {
throw new BadRequestException("Period must be greater than 0");
}
return "Capacity scheduler logs are being created.";
}
}
@Override
public String dumpSchedulerLogs(String time, HttpServletRequest hsr) throws IOException {
ResourceManager mockResourceManager = mock(ResourceManager.class);
Configuration conf = new YarnConfiguration();
MockRMWebServices webSvc = new MockRMWebServices(mockResourceManager, conf,
mock(HttpServletResponse.class));
return webSvc.dumpSchedulerLogs(time, hsr);
} }
@Override @Override

View File

@ -134,7 +134,6 @@ import org.junit.Test;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import static org.apache.hadoop.yarn.conf.YarnConfiguration.RM_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT; import static org.apache.hadoop.yarn.conf.YarnConfiguration.RM_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT;
import static org.apache.hadoop.yarn.conf.YarnConfiguration.RM_DELEGATION_KEY_UPDATE_INTERVAL_KEY; import static org.apache.hadoop.yarn.conf.YarnConfiguration.RM_DELEGATION_KEY_UPDATE_INTERVAL_KEY;
import static org.apache.hadoop.yarn.conf.YarnConfiguration.RM_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT; import static org.apache.hadoop.yarn.conf.YarnConfiguration.RM_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT;
@ -1785,6 +1784,41 @@ public class TestFederationInterceptorREST extends BaseRouterWebServicesTest {
Assert.assertEquals(response.getStatus(), Status.OK.getStatusCode()); Assert.assertEquals(response.getStatus(), Status.OK.getStatusCode());
} }
@Test
public void testDumpSchedulerLogs() throws Exception {
HttpServletRequest mockHsr = mockHttpServletRequestByUserName("admin");
String dumpSchedulerLogsMsg = interceptor.dumpSchedulerLogs("1", mockHsr);
// We cannot guarantee the calling order of the sub-clusters,
// We guarantee that the returned result contains the information of each subCluster.
Assert.assertNotNull(dumpSchedulerLogsMsg);
subClusters.stream().forEach(subClusterId -> {
String subClusterMsg =
"subClusterId" + subClusterId + " : Capacity scheduler logs are being created.; ";
Assert.assertTrue(dumpSchedulerLogsMsg.contains(subClusterMsg));
});
}
@Test
public void testDumpSchedulerLogsError() throws Exception {
HttpServletRequest mockHsr = mockHttpServletRequestByUserName("admin");
// time is empty
LambdaTestUtils.intercept(IllegalArgumentException.class,
"Parameter error, the time is empty or null.",
() -> interceptor.dumpSchedulerLogs(null, mockHsr));
// time is negative
LambdaTestUtils.intercept(IllegalArgumentException.class,
"time must be greater than 0.",
() -> interceptor.dumpSchedulerLogs("-1", mockHsr));
// time is non-numeric
LambdaTestUtils.intercept(IllegalArgumentException.class,
"time must be a number.",
() -> interceptor.dumpSchedulerLogs("abc", mockHsr));
}
@Test @Test
public void testGetActivitiesNormal() { public void testGetActivitiesNormal() {
ActivitiesInfo activitiesInfo = interceptor.getActivities(null, "1", "DIAGNOSTIC"); ActivitiesInfo activitiesInfo = interceptor.getActivities(null, "1", "DIAGNOSTIC");