HADOOP-6762. Exception while doing RPC I/O closes channel. Contributed by Sam Rash and Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1419783 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Todd Lipcon 2012-12-10 21:25:38 +00:00
parent f8f4269cd7
commit ee7ca279a0
4 changed files with 225 additions and 31 deletions

View File

@ -175,6 +175,9 @@ Release 2.0.3-alpha - Unreleased
HADOOP-9070. Kerberos SASL server cannot find kerberos key. (daryn via atm) HADOOP-9070. Kerberos SASL server cannot find kerberos key. (daryn via atm)
HADOOP-6762. Exception while doing RPC I/O closes channel
(Sam Rash and todd via todd)
Release 2.0.2-alpha - 2012-09-07 Release 2.0.2-alpha - 2012-09-07
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -38,6 +38,11 @@ import java.util.Iterator;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.Random; import java.util.Random;
import java.util.Set; import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
@ -78,6 +83,8 @@ import org.apache.hadoop.util.ProtoUtil;
import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Time; import org.apache.hadoop.util.Time;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
/** A client for an IPC service. IPC calls take a single {@link Writable} as a /** A client for an IPC service. IPC calls take a single {@link Writable} as a
* parameter, and return a {@link Writable} as their value. A service runs on * parameter, and return a {@link Writable} as their value. A service runs on
* a port and is defined by a parameter class and a value class. * a port and is defined by a parameter class and a value class.
@ -101,6 +108,19 @@ public class Client {
final static int PING_CALL_ID = -1; final static int PING_CALL_ID = -1;
/**
* Executor on which IPC calls' parameters are sent. Deferring
* the sending of parameters to a separate thread isolates them
* from thread interruptions in the calling code.
*/
private static final ExecutorService SEND_PARAMS_EXECUTOR =
Executors.newCachedThreadPool(
new ThreadFactoryBuilder()
.setDaemon(true)
.setNameFormat("IPC Parameter Sending Thread #%d")
.build());
/** /**
* set the ping interval value in configuration * set the ping interval value in configuration
* *
@ -244,6 +264,8 @@ public class Client {
private AtomicBoolean shouldCloseConnection = new AtomicBoolean(); // indicate if the connection is closed private AtomicBoolean shouldCloseConnection = new AtomicBoolean(); // indicate if the connection is closed
private IOException closeException; // close reason private IOException closeException; // close reason
private final Object sendParamsLock = new Object();
public Connection(ConnectionId remoteId) throws IOException { public Connection(ConnectionId remoteId) throws IOException {
this.remoteId = remoteId; this.remoteId = remoteId;
this.server = remoteId.getAddress(); this.server = remoteId.getAddress();
@ -829,41 +851,74 @@ public class Client {
* Note: this is not called from the Connection thread, but by other * Note: this is not called from the Connection thread, but by other
* threads. * threads.
*/ */
public void sendParam(Call call) { public void sendParam(final Call call)
throws InterruptedException, IOException {
if (shouldCloseConnection.get()) { if (shouldCloseConnection.get()) {
return; return;
} }
DataOutputBuffer d=null; // Serialize the call to be sent. This is done from the actual
try { // caller thread, rather than the SEND_PARAMS_EXECUTOR thread,
synchronized (this.out) { // so that if the serialization throws an error, it is reported
if (LOG.isDebugEnabled()) // properly. This also parallelizes the serialization.
LOG.debug(getName() + " sending #" + call.id); //
// Format of a call on the wire:
// 0) Length of rest below (1 + 2)
// 1) PayloadHeader - is serialized Delimited hence contains length
// 2) the Payload - the RpcRequest
//
// Items '1' and '2' are prepared here.
final DataOutputBuffer d = new DataOutputBuffer();
RpcPayloadHeaderProto header = ProtoUtil.makeRpcPayloadHeader(
call.rpcKind, RpcPayloadOperationProto.RPC_FINAL_PAYLOAD, call.id);
header.writeDelimitedTo(d);
call.rpcRequest.write(d);
// Serializing the data to be written. synchronized (sendParamsLock) {
// Format: Future<?> senderFuture = SEND_PARAMS_EXECUTOR.submit(new Runnable() {
// 0) Length of rest below (1 + 2) @Override
// 1) PayloadHeader - is serialized Delimited hence contains length public void run() {
// 2) the Payload - the RpcRequest try {
// synchronized (Connection.this.out) {
d = new DataOutputBuffer(); if (shouldCloseConnection.get()) {
RpcPayloadHeaderProto header = ProtoUtil.makeRpcPayloadHeader( return;
call.rpcKind, RpcPayloadOperationProto.RPC_FINAL_PAYLOAD, call.id); }
header.writeDelimitedTo(d);
call.rpcRequest.write(d);
byte[] data = d.getData();
int totalLength = d.getLength(); if (LOG.isDebugEnabled())
out.writeInt(totalLength); // Total Length LOG.debug(getName() + " sending #" + call.id);
out.write(data, 0, totalLength);//PayloadHeader + RpcRequest
out.flush(); byte[] data = d.getData();
int totalLength = d.getLength();
out.writeInt(totalLength); // Total Length
out.write(data, 0, totalLength);//PayloadHeader + RpcRequest
out.flush();
}
} catch (IOException e) {
// exception at this point would leave the connection in an
// unrecoverable state (eg half a call left on the wire).
// So, close the connection, killing any outstanding calls
markClosed(e);
} finally {
//the buffer is just an in-memory buffer, but it is still polite to
// close early
IOUtils.closeStream(d);
}
}
});
try {
senderFuture.get();
} catch (ExecutionException e) {
Throwable cause = e.getCause();
// cause should only be a RuntimeException as the Runnable above
// catches IOException
if (cause instanceof RuntimeException) {
throw (RuntimeException) cause;
} else {
throw new RuntimeException("unexpected checked exception", cause);
}
} }
} catch(IOException e) {
markClosed(e);
} finally {
//the buffer is just an in-memory buffer, but it is still polite to
// close early
IOUtils.closeStream(d);
} }
} }
@ -1136,7 +1191,16 @@ public class Client {
ConnectionId remoteId) throws InterruptedException, IOException { ConnectionId remoteId) throws InterruptedException, IOException {
Call call = new Call(rpcKind, rpcRequest); Call call = new Call(rpcKind, rpcRequest);
Connection connection = getConnection(remoteId, call); Connection connection = getConnection(remoteId, call);
connection.sendParam(call); // send the parameter try {
connection.sendParam(call); // send the parameter
} catch (RejectedExecutionException e) {
throw new IOException("connection has been closed", e);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
LOG.warn("interrupted waiting to send params to server", e);
throw new IOException(e);
}
boolean interrupted = false; boolean interrupted = false;
synchronized (call) { synchronized (call) {
while (!call.done) { while (!call.done) {

View File

@ -68,6 +68,7 @@ public class TestIPC {
* of the various writables. * of the various writables.
**/ **/
static boolean WRITABLE_FAULTS_ENABLED = true; static boolean WRITABLE_FAULTS_ENABLED = true;
static int WRITABLE_FAULTS_SLEEP = 0;
static { static {
Client.setPingInterval(conf, PING_INTERVAL); Client.setPingInterval(conf, PING_INTERVAL);
@ -206,16 +207,27 @@ public class TestIPC {
static void maybeThrowIOE() throws IOException { static void maybeThrowIOE() throws IOException {
if (WRITABLE_FAULTS_ENABLED) { if (WRITABLE_FAULTS_ENABLED) {
maybeSleep();
throw new IOException("Injected fault"); throw new IOException("Injected fault");
} }
} }
static void maybeThrowRTE() { static void maybeThrowRTE() {
if (WRITABLE_FAULTS_ENABLED) { if (WRITABLE_FAULTS_ENABLED) {
maybeSleep();
throw new RuntimeException("Injected fault"); throw new RuntimeException("Injected fault");
} }
} }
private static void maybeSleep() {
if (WRITABLE_FAULTS_SLEEP > 0) {
try {
Thread.sleep(WRITABLE_FAULTS_SLEEP);
} catch (InterruptedException ie) {
}
}
}
@SuppressWarnings("unused") @SuppressWarnings("unused")
private static class IOEOnReadWritable extends LongWritable { private static class IOEOnReadWritable extends LongWritable {
public IOEOnReadWritable() {} public IOEOnReadWritable() {}
@ -370,6 +382,27 @@ public class TestIPC {
RTEOnReadWritable.class); RTEOnReadWritable.class);
} }
/**
* Test case that fails a write, but only after taking enough time
* that a ping should have been sent. This is a reproducer for a
* deadlock seen in one iteration of HADOOP-6762.
*/
@Test
public void testIOEOnWriteAfterPingClient() throws Exception {
// start server
Client.setPingInterval(conf, 100);
try {
WRITABLE_FAULTS_SLEEP = 1000;
doErrorTest(IOEOnWriteWritable.class,
LongWritable.class,
LongWritable.class,
LongWritable.class);
} finally {
WRITABLE_FAULTS_SLEEP = 0;
}
}
private static void assertExceptionContains( private static void assertExceptionContains(
Throwable t, String substring) { Throwable t, String substring) {
String msg = StringUtils.stringifyException(t); String msg = StringUtils.stringifyException(t);

View File

@ -38,6 +38,10 @@ import java.net.ConnectException;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.InetSocketAddress; import java.net.InetSocketAddress;
import java.util.Arrays; import java.util.Arrays;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import javax.net.SocketFactory; import javax.net.SocketFactory;
@ -823,6 +827,96 @@ public class TestRPC {
} }
} }
@Test(timeout=90000)
public void testRPCInterruptedSimple() throws Exception {
final Configuration conf = new Configuration();
Server server = RPC.getServer(
TestProtocol.class, new TestImpl(), ADDRESS, 0, 5, true, conf, null
);
server.start();
InetSocketAddress addr = NetUtils.getConnectAddress(server);
final TestProtocol proxy = (TestProtocol) RPC.getProxy(
TestProtocol.class, TestProtocol.versionID, addr, conf);
// Connect to the server
proxy.ping();
// Interrupt self, try another call
Thread.currentThread().interrupt();
try {
proxy.ping();
fail("Interruption did not cause IPC to fail");
} catch (IOException ioe) {
if (!ioe.toString().contains("InterruptedException")) {
throw ioe;
}
// clear interrupt status for future tests
Thread.interrupted();
}
}
@Test(timeout=30000)
public void testRPCInterrupted() throws IOException, InterruptedException {
final Configuration conf = new Configuration();
Server server = RPC.getServer(
TestProtocol.class, new TestImpl(), ADDRESS, 0, 5, true, conf, null
);
server.start();
int numConcurrentRPC = 200;
InetSocketAddress addr = NetUtils.getConnectAddress(server);
final CyclicBarrier barrier = new CyclicBarrier(numConcurrentRPC);
final CountDownLatch latch = new CountDownLatch(numConcurrentRPC);
final AtomicBoolean leaderRunning = new AtomicBoolean(true);
final AtomicReference<Throwable> error = new AtomicReference<Throwable>();
Thread leaderThread = null;
for (int i = 0; i < numConcurrentRPC; i++) {
final int num = i;
final TestProtocol proxy = (TestProtocol) RPC.getProxy(
TestProtocol.class, TestProtocol.versionID, addr, conf);
Thread rpcThread = new Thread(new Runnable() {
@Override
public void run() {
try {
barrier.await();
while (num == 0 || leaderRunning.get()) {
proxy.slowPing(false);
}
proxy.slowPing(false);
} catch (Exception e) {
if (num == 0) {
leaderRunning.set(false);
} else {
error.set(e);
}
LOG.error(e);
} finally {
latch.countDown();
}
}
});
rpcThread.start();
if (leaderThread == null) {
leaderThread = rpcThread;
}
}
// let threads get past the barrier
Thread.sleep(1000);
// stop a single thread
while (leaderRunning.get()) {
leaderThread.interrupt();
}
latch.await();
// should not cause any other thread to get an error
assertTrue("rpc got exception " + error.get(), error.get() == null);
}
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
new TestRPC().testCallsInternal(conf); new TestRPC().testCallsInternal(conf);