MAPREDUCE-6439. AM may fail instead of retrying if RM shuts down during the allocate call. (Anubhav Dhoot via kasha)

This commit is contained in:
Karthik Kambatla 2015-08-15 00:52:11 -07:00
parent dc7a061668
commit 8dfec7a197
6 changed files with 202 additions and 34 deletions

View File

@ -573,6 +573,9 @@ Release 2.7.2 - UNRELEASED
MAPREDUCE-6426. TestShuffleHandler#testGetMapOutputInfo is failing.
(zhihai xu via devaraj)
MAPREDUCE-6439. AM may fail instead of retrying if RM shuts down during the
allocate call. (Anubhav Dhoot via kasha)
Release 2.7.1 - 2015-07-06
INCOMPATIBLE CHANGES

View File

@ -270,35 +270,38 @@ public abstract class RMCommunicator extends AbstractService
super.serviceStop();
}
protected void startAllocatorThread() {
allocatorThread = new Thread(new Runnable() {
@Override
public void run() {
while (!stopped.get() && !Thread.currentThread().isInterrupted()) {
@VisibleForTesting
public class AllocatorRunnable implements Runnable {
@Override
public void run() {
while (!stopped.get() && !Thread.currentThread().isInterrupted()) {
try {
Thread.sleep(rmPollInterval);
try {
Thread.sleep(rmPollInterval);
try {
heartbeat();
} catch (YarnRuntimeException e) {
LOG.error("Error communicating with RM: " + e.getMessage() , e);
return;
} catch (Exception e) {
LOG.error("ERROR IN CONTACTING RM. ", e);
continue;
// TODO: for other exceptions
}
lastHeartbeatTime = context.getClock().getTime();
executeHeartbeatCallbacks();
} catch (InterruptedException e) {
if (!stopped.get()) {
LOG.warn("Allocated thread interrupted. Returning.");
}
heartbeat();
} catch (RMContainerAllocationException e) {
LOG.error("Error communicating with RM: " + e.getMessage() , e);
return;
} catch (Exception e) {
LOG.error("ERROR IN CONTACTING RM. ", e);
continue;
// TODO: for other exceptions
}
lastHeartbeatTime = context.getClock().getTime();
executeHeartbeatCallbacks();
} catch (InterruptedException e) {
if (!stopped.get()) {
LOG.warn("Allocated thread interrupted. Returning.");
}
return;
}
}
});
}
}
protected void startAllocatorThread() {
allocatorThread = new Thread(new AllocatorRunnable());
allocatorThread.setName("RMCommunicator Allocator");
allocatorThread.start();
}

View File

@ -0,0 +1,31 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapreduce.v2.app.rm;
/**
* Exception to denote fatal failure in allocating containers from RM.
*/
public class RMContainerAllocationException extends Exception {
private static final long serialVersionUID = 1L;
public RMContainerAllocationException(Throwable cause) { super(cause); }
public RMContainerAllocationException(String message) { super(message); }
public RMContainerAllocationException(String message, Throwable cause) {
super(message, cause);
}
}

View File

@ -699,7 +699,7 @@ public class RMContainerAllocator extends RMContainerRequestor
// this application must clean itself up.
eventHandler.handle(new JobEvent(this.getJob().getID(),
JobEventType.JOB_AM_REBOOT));
throw new YarnRuntimeException(
throw new RMContainerAllocationException(
"Resource Manager doesn't recognize AttemptId: "
+ this.getContext().getApplicationAttemptId(), e);
} catch (ApplicationMasterNotRegisteredException e) {
@ -717,7 +717,7 @@ public class RMContainerAllocator extends RMContainerRequestor
LOG.error("Could not contact RM after " + retryInterval + " milliseconds.");
eventHandler.handle(new JobEvent(this.getJob().getID(),
JobEventType.JOB_AM_REBOOT));
throw new YarnRuntimeException("Could not contact RM after " +
throw new RMContainerAllocationException("Could not contact RM after " +
retryInterval + " milliseconds.");
}
// Throw this up to the caller, which may decide to ignore it and

View File

@ -0,0 +1,99 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapreduce.v2.app.rm;
import org.apache.hadoop.mapreduce.v2.app.AppContext;
import org.apache.hadoop.mapreduce.v2.app.client.ClientService;
import org.apache.hadoop.mapreduce.v2.app.rm.RMCommunicator.AllocatorRunnable;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.util.Clock;
import org.junit.Test;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
public class TestRMCommunicator {
class MockRMCommunicator extends RMCommunicator {
public MockRMCommunicator(ClientService clientService, AppContext context) {
super(clientService, context);
}
@Override
protected void heartbeat() throws Exception {
}
}
@Test(timeout = 2000)
public void testRMContainerAllocatorExceptionIsHandled() throws Exception {
ClientService mockClientService = mock(ClientService.class);
AppContext mockContext = mock(AppContext.class);
MockRMCommunicator mockRMCommunicator =
new MockRMCommunicator(mockClientService, mockContext);
RMCommunicator communicator = spy(mockRMCommunicator);
Clock mockClock = mock(Clock.class);
when(mockContext.getClock()).thenReturn(mockClock);
doThrow(new RMContainerAllocationException("Test")).doNothing()
.when(communicator).heartbeat();
when(mockClock.getTime()).thenReturn(1L).thenThrow(new AssertionError(
"GetClock called second time, when it should not have since the " +
"thread should have quit"));
AllocatorRunnable testRunnable = communicator.new AllocatorRunnable();
testRunnable.run();
}
@Test(timeout = 2000)
public void testRMContainerAllocatorYarnRuntimeExceptionIsHandled()
throws Exception {
ClientService mockClientService = mock(ClientService.class);
AppContext mockContext = mock(AppContext.class);
MockRMCommunicator mockRMCommunicator =
new MockRMCommunicator(mockClientService, mockContext);
final RMCommunicator communicator = spy(mockRMCommunicator);
Clock mockClock = mock(Clock.class);
when(mockContext.getClock()).thenReturn(mockClock);
doThrow(new YarnRuntimeException("Test")).doNothing()
.when(communicator).heartbeat();
when(mockClock.getTime()).thenReturn(1L).thenAnswer(new Answer<Integer>() {
@Override
public Integer answer(InvocationOnMock invocation) throws Throwable {
communicator.stop();
return 2;
}
}).thenThrow(new AssertionError(
"GetClock called second time, when it should not have since the thread " +
"should have quit"));
AllocatorRunnable testRunnable = communicator.new AllocatorRunnable();
testRunnable.run();
verify(mockClock, times(2)).getTime();
}
}

View File

@ -1830,12 +1830,7 @@ public class TestRMContainerAllocator {
}
}, 100, 10000);
// run the scheduler
try {
super.heartbeat();
} catch (Exception e) {
LOG.error("error in heartbeat ", e);
throw new YarnRuntimeException(e);
}
super.heartbeat();
List<TaskAttemptContainerAssignedEvent> result
= new ArrayList<TaskAttemptContainerAssignedEvent>(events);
@ -1885,7 +1880,7 @@ public class TestRMContainerAllocator {
@Override
protected AllocateResponse makeRemoteRequest() throws IOException,
YarnException {
throw new YarnRuntimeException("for testing");
throw new IOException("for testing");
}
}
@ -2450,7 +2445,7 @@ public class TestRMContainerAllocator {
try {
allocator.schedule();
Assert.fail("Should Have Exception");
} catch (YarnRuntimeException e) {
} catch (RMContainerAllocationException e) {
Assert.assertTrue(e.getMessage().contains("Could not contact RM after"));
}
dispatcher.await();
@ -2671,6 +2666,43 @@ public class TestRMContainerAllocator {
allocator.close();
}
@Test(expected = RMContainerAllocationException.class)
public void testAttemptNotFoundCausesRMCommunicatorException()
throws Exception {
Configuration conf = new Configuration();
MyResourceManager rm = new MyResourceManager(conf);
rm.start();
DrainDispatcher dispatcher = (DrainDispatcher) rm.getRMContext()
.getDispatcher();
// Submit the application
RMApp app = rm.submitApp(1024);
dispatcher.await();
MockNM amNodeManager = rm.registerNode("amNM:1234", 2048);
amNodeManager.nodeHeartbeat(true);
dispatcher.await();
ApplicationAttemptId appAttemptId = app.getCurrentAppAttempt()
.getAppAttemptId();
rm.sendAMLaunched(appAttemptId);
dispatcher.await();
JobId jobId = MRBuilderUtils.newJobId(appAttemptId.getApplicationId(), 0);
Job mockJob = mock(Job.class);
when(mockJob.getReport()).thenReturn(
MRBuilderUtils.newJobReport(jobId, "job", "user", JobState.RUNNING, 0,
0, 0, 0, 0, 0, 0, "jobfile", null, false, ""));
MyContainerAllocator allocator = new MyContainerAllocator(rm, conf,
appAttemptId, mockJob);
// Now kill the application
rm.killApp(app.getApplicationId());
allocator.schedule();
}
private static class MockScheduler implements ApplicationMasterProtocol {
ApplicationAttemptId attemptId;
long nextContainerId = 10;