From c9ba6fa9ba3d07b7c7a2569c6d2d05f435cc7dde Mon Sep 17 00:00:00 2001 From: Robert Kanter Date: Thu, 3 Mar 2016 16:38:07 -0800 Subject: [PATCH] MAPREDUCE-4785. TestMRApp occasionally fails (haibochen via rkanter) (cherry picked from commit ff0ee84d77d9438f0954ae4e1497d63997bb7347) (cherry picked from commit f9f57265cf0ad2685d0d96ec59b8b78ac8bed175) Conflicts: hadoop-mapreduce-project/CHANGES.txt --- hadoop-mapreduce-project/CHANGES.txt | 2 + .../hadoop/mapreduce/v2/app/TestMRApp.java | 104 +++++++++++++++--- 2 files changed, 88 insertions(+), 18 deletions(-) diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index 2ad465971b4..31ec4a1aaac 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -20,6 +20,8 @@ Release 2.7.3 - UNRELEASED BUG FIXES + MAPREDUCE-4785. TestMRApp occasionally fails (haibochen via rkanter) + MAPREDUCE-6540. TestMRTimelineEventHandling fails (sjlee) MAPREDUCE-5485. Allow repeating job commit by extending OutputCommitter API. diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestMRApp.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestMRApp.java index b03d58d01a3..eb6b93292b8 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestMRApp.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestMRApp.java @@ -25,7 +25,10 @@ import static org.mockito.Mockito.verify; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; +import java.util.concurrent.TimeoutException; +import com.google.common.base.Supplier; +import org.apache.hadoop.test.GenericTestUtils; import org.junit.Assert; import org.apache.hadoop.conf.Configuration; @@ -205,10 +208,10 @@ public class TestMRApp { conf.setFloat(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 0.5f); // uberization forces full slowstart (1.0), so disable that conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false); - Job job = app.submit(conf); - app.waitForState(job, JobState.RUNNING); - Assert.assertEquals("Num tasks not correct", 4, job.getTasks().size()); - Iterator it = job.getTasks().values().iterator(); + final Job job1 = app.submit(conf); + app.waitForState(job1, JobState.RUNNING); + Assert.assertEquals("Num tasks not correct", 4, job1.getTasks().size()); + Iterator it = job1.getTasks().values().iterator(); Task mapTask1 = it.next(); Task mapTask2 = it.next(); @@ -240,8 +243,20 @@ public class TestMRApp { app.waitForState(mapTask1, TaskState.SUCCEEDED); app.waitForState(mapTask2, TaskState.SUCCEEDED); - TaskAttemptCompletionEvent[] events = job.getTaskAttemptCompletionEvents(0, - 100); + final int checkIntervalMillis = 100; + final int waitForMillis = 800; + + waitFor(new Supplier() { + @Override + public Boolean get() { + TaskAttemptCompletionEvent[] events = job1 + .getTaskAttemptCompletionEvents(0, 100); + return events.length == 2; + } + }, checkIntervalMillis, waitForMillis); + + TaskAttemptCompletionEvent[] events = job1.getTaskAttemptCompletionEvents + (0, 100); Assert.assertEquals("Expecting 2 completion events for success", 2, events.length); @@ -253,12 +268,21 @@ public class TestMRApp { nr.setNodeState(NodeState.UNHEALTHY); updatedNodes.add(nr); app.getContext().getEventHandler() - .handle(new JobUpdatedNodesEvent(job.getID(), updatedNodes)); + .handle(new JobUpdatedNodesEvent(job1.getID(), updatedNodes)); app.waitForState(task1Attempt, TaskAttemptState.KILLED); app.waitForState(task2Attempt, TaskAttemptState.KILLED); - events = job.getTaskAttemptCompletionEvents(0, 100); + waitFor(new Supplier() { + @Override + public Boolean get() { + TaskAttemptCompletionEvent[] events = job1 + .getTaskAttemptCompletionEvents(0, 100); + return events.length == 4; + } + }, checkIntervalMillis, waitForMillis); + + events = job1.getTaskAttemptCompletionEvents(0, 100); Assert.assertEquals("Expecting 2 more completion events for killed", 4, events.length); @@ -281,7 +305,16 @@ public class TestMRApp { app.waitForState(mapTask1, TaskState.SUCCEEDED); app.waitForState(mapTask2, TaskState.RUNNING); - events = job.getTaskAttemptCompletionEvents(0, 100); + waitFor(new Supplier() { + @Override + public Boolean get() { + TaskAttemptCompletionEvent[] events = job1 + .getTaskAttemptCompletionEvents(0, 100); + return events.length == 5; + } + }, checkIntervalMillis, waitForMillis); + + events = job1.getTaskAttemptCompletionEvents(0, 100); Assert.assertEquals("Expecting 1 more completion events for success", 5, events.length); @@ -295,10 +328,11 @@ public class TestMRApp { conf = new Configuration(); conf.setBoolean(MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE, true); conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false); - job = app.submit(conf); - app.waitForState(job, JobState.RUNNING); - Assert.assertEquals("No of tasks not correct", 4, job.getTasks().size()); - it = job.getTasks().values().iterator(); + + final Job job2 = app.submit(conf); + app.waitForState(job2, JobState.RUNNING); + Assert.assertEquals("No of tasks not correct", 4, job2.getTasks().size()); + it = job2.getTasks().values().iterator(); mapTask1 = it.next(); mapTask2 = it.next(); Task reduceTask1 = it.next(); @@ -308,7 +342,16 @@ public class TestMRApp { app.waitForState(mapTask1, TaskState.SUCCEEDED); app.waitForState(mapTask2, TaskState.RUNNING); - events = job.getTaskAttemptCompletionEvents(0, 100); + waitFor(new Supplier() { + @Override + public Boolean get() { + TaskAttemptCompletionEvent[] events = job2 + .getTaskAttemptCompletionEvents(0, 100); + return events.length == 2; + } + }, checkIntervalMillis, waitForMillis); + + events = job2.getTaskAttemptCompletionEvents(0, 100); Assert.assertEquals( "Expecting 2 completion events for killed & success of map1", 2, events.length); @@ -321,7 +364,16 @@ public class TestMRApp { TaskAttemptEventType.TA_DONE)); app.waitForState(mapTask2, TaskState.SUCCEEDED); - events = job.getTaskAttemptCompletionEvents(0, 100); + waitFor(new Supplier() { + @Override + public Boolean get() { + TaskAttemptCompletionEvent[] events = job2 + .getTaskAttemptCompletionEvents(0, 100); + return events.length == 3; + } + }, checkIntervalMillis, waitForMillis); + + events = job2.getTaskAttemptCompletionEvents(0, 100); Assert.assertEquals("Expecting 1 more completion events for success", 3, events.length); @@ -350,14 +402,30 @@ public class TestMRApp { .handle( new TaskAttemptEvent(task4Attempt.getID(), TaskAttemptEventType.TA_DONE)); - app.waitForState(reduceTask2, TaskState.SUCCEEDED); + app.waitForState(reduceTask2, TaskState.SUCCEEDED); - events = job.getTaskAttemptCompletionEvents(0, 100); + waitFor(new Supplier() { + @Override + public Boolean get() { + TaskAttemptCompletionEvent[] events = job2 + .getTaskAttemptCompletionEvents(0, 100); + return events.length == 5; + } + }, checkIntervalMillis, waitForMillis); + events = job2.getTaskAttemptCompletionEvents(0, 100); Assert.assertEquals("Expecting 2 more completion events for reduce success", 5, events.length); // job succeeds - app.waitForState(job, JobState.SUCCEEDED); + app.waitForState(job2, JobState.SUCCEEDED); + } + + private static void waitFor(Supplier predicate, int + checkIntervalMillis, int checkTotalMillis) throws InterruptedException { + try { + GenericTestUtils.waitFor(predicate, checkIntervalMillis, checkTotalMillis); + } catch (TimeoutException ex) { + } } @Test