MAPREDUCE-6776. yarn.app.mapreduce.client.job.max-retries should have a more useful default (miklos.szegedi@cloudera.com via rkanter)

(cherry picked from commit f3f37e6fb8)
This commit is contained in:
Robert Kanter 2016-10-07 14:47:06 -07:00
parent b9761f2fc9
commit 48ca6be76e
3 changed files with 35 additions and 13 deletions

View File

@ -502,7 +502,7 @@ public interface MRJobConfig {
*/ */
public static final String MR_CLIENT_JOB_MAX_RETRIES = public static final String MR_CLIENT_JOB_MAX_RETRIES =
MR_PREFIX + "client.job.max-retries"; MR_PREFIX + "client.job.max-retries";
public static final int DEFAULT_MR_CLIENT_JOB_MAX_RETRIES = 0; public static final int DEFAULT_MR_CLIENT_JOB_MAX_RETRIES = 3;
/** /**
* How long to wait between jobclient retries on failure * How long to wait between jobclient retries on failure

View File

@ -1502,12 +1502,12 @@
<property> <property>
<name>yarn.app.mapreduce.client.job.max-retries</name> <name>yarn.app.mapreduce.client.job.max-retries</name>
<value>0</value> <value>3</value>
<description>The number of retries the client will make for getJob and <description>The number of retries the client will make for getJob and
dependent calls. The default is 0 as this is generally only needed for dependent calls.
non-HDFS DFS where additional, high level retries are required to avoid This is needed for non-HDFS DFS where additional, high level
spurious failures during the getJob call. 30 is a good value for retries are required to avoid spurious failures during the getJob call.
WASB</description> 30 is a good value for WASB</description>
</property> </property>
<property> <property>

View File

@ -225,7 +225,7 @@ public class JobClientUnitTest {
//To prevent the test from running for a very long time, lower the retry //To prevent the test from running for a very long time, lower the retry
JobConf conf = new JobConf(); JobConf conf = new JobConf();
conf.set(MRJobConfig.MR_CLIENT_JOB_MAX_RETRIES, "3"); conf.setInt(MRJobConfig.MR_CLIENT_JOB_MAX_RETRIES, 2);
TestJobClientGetJob client = new TestJobClientGetJob(conf); TestJobClientGetJob client = new TestJobClientGetJob(conf);
JobID id = new JobID("ajob", 1); JobID id = new JobID("ajob", 1);
@ -236,13 +236,35 @@ public class JobClientUnitTest {
assertNotNull(client.getJob(id)); assertNotNull(client.getJob(id));
assertEquals(client.getLastGetJobRetriesCounter(), 0); assertEquals(client.getLastGetJobRetriesCounter(), 0);
//3 retry //2 retries
client.setGetJobRetries(3); client.setGetJobRetries(2);
assertNotNull(client.getJob(id)); assertNotNull(client.getJob(id));
assertEquals(client.getLastGetJobRetriesCounter(), 3); assertEquals(client.getLastGetJobRetriesCounter(), 2);
//beyond MAPREDUCE_JOBCLIENT_GETJOB_MAX_RETRY_KEY, will get null //beyond yarn.app.mapreduce.client.job.max-retries, will get null
client.setGetJobRetries(5); client.setGetJobRetries(3);
assertNull(client.getJob(id));
}
@Test
public void testGetJobRetryDefault() throws Exception {
//Use an unmodified JobConf so the default retry limit is what gets exercised
JobConf conf = new JobConf();
TestJobClientGetJob client = new TestJobClientGetJob(conf);
JobID id = new JobID("ajob", 1);
RunningJob rj = mock(RunningJob.class);
client.setRunningJob(rj);
//3 retries (default)
client.setGetJobRetries(MRJobConfig.DEFAULT_MR_CLIENT_JOB_MAX_RETRIES);
assertNotNull(client.getJob(id));
assertEquals(client.getLastGetJobRetriesCounter(),
MRJobConfig.DEFAULT_MR_CLIENT_JOB_MAX_RETRIES);
//beyond yarn.app.mapreduce.client.job.max-retries, will get null
client.setGetJobRetries(MRJobConfig.DEFAULT_MR_CLIENT_JOB_MAX_RETRIES + 1);
assertNull(client.getJob(id)); assertNull(client.getJob(id));
} }