mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-02-25 22:36:20 +00:00
Transport: fix racing condition in timeout handling
If a request comes in at the same moment the timeout handler for it runs, we may leak a timeoutInfoHolder and erroneously log "Transport response handler not found of id" . The same issue could cause the request tracer to fire a traceUnresolvedResponse call instead of traceReceivedResponse , causing a failure of testTracerLog ( see #10187 ) . This commit makes sure timeoutInfoHolder is visible before removing the corresponding RequestHolder. It also unifies the TransportService.Adapter#remove(requestId) with TransportService.Adapter#onResponseReceived(requestId), as they are always called together to indicate a response was received. Closes #10187 Closes #10220
This commit is contained in:
parent
f5f9739117
commit
cde7d9af1c
@ -255,16 +255,22 @@ public class TransportService extends AbstractLifecycleComponent<TransportServic
|
|||||||
throw new ElasticsearchIllegalStateException("can't send request to a null node");
|
throw new ElasticsearchIllegalStateException("can't send request to a null node");
|
||||||
}
|
}
|
||||||
final long requestId = newRequestId();
|
final long requestId = newRequestId();
|
||||||
TimeoutHandler timeoutHandler = null;
|
final TimeoutHandler timeoutHandler;
|
||||||
try {
|
try {
|
||||||
|
|
||||||
|
if (options.timeout() == null) {
|
||||||
|
timeoutHandler = null;
|
||||||
|
} else {
|
||||||
|
timeoutHandler = new TimeoutHandler(requestId);
|
||||||
|
}
|
||||||
clientHandlers.put(requestId, new RequestHolder<>(handler, node, action, timeoutHandler));
|
clientHandlers.put(requestId, new RequestHolder<>(handler, node, action, timeoutHandler));
|
||||||
if (started.get() == false) {
|
if (started.get() == false) {
|
||||||
// if we are not started the exception handling will remove the RequestHolder again and calls the handler to notify the caller.
|
// if we are not started the exception handling will remove the RequestHolder again and calls the handler to notify the caller.
|
||||||
// it will only notify if the toStop code hasn't done the work yet.
|
// it will only notify if the toStop code hasn't done the work yet.
|
||||||
throw new TransportException("TransportService is closed stopped can't send request");
|
throw new TransportException("TransportService is closed stopped can't send request");
|
||||||
}
|
}
|
||||||
if (options.timeout() != null) {
|
if (timeoutHandler != null) {
|
||||||
timeoutHandler = new TimeoutHandler(requestId);
|
assert options.timeout() != null;
|
||||||
timeoutHandler.future = threadPool.schedule(options.timeout(), ThreadPool.Names.GENERIC, timeoutHandler);
|
timeoutHandler.future = threadPool.schedule(options.timeout(), ThreadPool.Names.GENERIC, timeoutHandler);
|
||||||
}
|
}
|
||||||
transport.sendRequest(node, requestId, action, request, options);
|
transport.sendRequest(node, requestId, action, request, options);
|
||||||
@ -272,13 +278,9 @@ public class TransportService extends AbstractLifecycleComponent<TransportServic
|
|||||||
// usually happen either because we failed to connect to the node
|
// usually happen either because we failed to connect to the node
|
||||||
// or because we failed serializing the message
|
// or because we failed serializing the message
|
||||||
final RequestHolder holderToNotify = clientHandlers.remove(requestId);
|
final RequestHolder holderToNotify = clientHandlers.remove(requestId);
|
||||||
// if the scheduler raise a EsRejectedExecutionException (due to shutdown), we may have a timeout handler, but no future
|
|
||||||
if (timeoutHandler != null) {
|
|
||||||
FutureUtils.cancel(timeoutHandler.future);
|
|
||||||
}
|
|
||||||
|
|
||||||
// If holderToNotify == null then handler has already been taken care of.
|
// If holderToNotify == null then handler has already been taken care of.
|
||||||
if (holderToNotify != null) {
|
if (holderToNotify != null) {
|
||||||
|
holderToNotify.cancelTimeout();
|
||||||
// callback that an exception happened, but on a different thread since we don't
|
// callback that an exception happened, but on a different thread since we don't
|
||||||
// want handlers to worry about stack overflows
|
// want handlers to worry about stack overflows
|
||||||
final SendRequestTransportException sendRequestException = new SendRequestTransportException(node, action, e);
|
final SendRequestTransportException sendRequestException = new SendRequestTransportException(node, action, e);
|
||||||
@ -376,32 +378,6 @@ public class TransportService extends AbstractLifecycleComponent<TransportServic
|
|||||||
tracerLog.trace("[{}][{}] sent error response (error: [{}])", requestId, action, t.getMessage());
|
tracerLog.trace("[{}][{}] sent error response (error: [{}])", requestId, action, t.getMessage());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void onResponseReceived(long requestId) {
|
|
||||||
if (traceEnabled()) {
|
|
||||||
// try to resolve the request
|
|
||||||
DiscoveryNode sourceNode = null;
|
|
||||||
String action = null;
|
|
||||||
RequestHolder holder = clientHandlers.get(requestId);
|
|
||||||
if (holder != null) {
|
|
||||||
action = holder.action();
|
|
||||||
sourceNode = holder.node();
|
|
||||||
} else {
|
|
||||||
// lets see if its in the timeout holder
|
|
||||||
TimeoutInfoHolder timeoutInfoHolder = timeoutInfoHandlers.get(requestId);
|
|
||||||
if (timeoutInfoHolder != null) {
|
|
||||||
action = timeoutInfoHolder.action();
|
|
||||||
sourceNode = timeoutInfoHolder.node();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (action == null) {
|
|
||||||
traceUnresolvedResponse(requestId);
|
|
||||||
} else if (shouldTraceAction(action)) {
|
|
||||||
traceReceivedResponse(requestId, sourceNode, action);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onRequestReceived(long requestId, String action) {
|
public void onRequestReceived(long requestId, String action) {
|
||||||
if (traceEnabled() && shouldTraceAction(action)) {
|
if (traceEnabled() && shouldTraceAction(action)) {
|
||||||
@ -415,23 +391,47 @@ public class TransportService extends AbstractLifecycleComponent<TransportServic
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TransportResponseHandler remove(long requestId) {
|
public TransportResponseHandler onResponseReceived(final long requestId) {
|
||||||
RequestHolder holder = clientHandlers.remove(requestId);
|
RequestHolder holder = clientHandlers.remove(requestId);
|
||||||
if (holder == null) {
|
if (holder == null) {
|
||||||
// lets see if its in the timeout holder
|
checkForTimeout(requestId);
|
||||||
TimeoutInfoHolder timeoutInfoHolder = timeoutInfoHandlers.remove(requestId);
|
|
||||||
if (timeoutInfoHolder != null) {
|
|
||||||
long time = System.currentTimeMillis();
|
|
||||||
logger.warn("Received response for a request that has timed out, sent [{}ms] ago, timed out [{}ms] ago, action [{}], node [{}], id [{}]", time - timeoutInfoHolder.sentTime(), time - timeoutInfoHolder.timeoutTime(), timeoutInfoHolder.action(), timeoutInfoHolder.node(), requestId);
|
|
||||||
} else {
|
|
||||||
logger.warn("Transport response handler not found of id [{}]", requestId);
|
|
||||||
}
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
holder.cancel();
|
holder.cancelTimeout();
|
||||||
|
if (traceEnabled() && shouldTraceAction(holder.action())) {
|
||||||
|
traceReceivedResponse(requestId, holder.node(), holder.action());
|
||||||
|
}
|
||||||
return holder.handler();
|
return holder.handler();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void checkForTimeout(long requestId) {
|
||||||
|
// lets see if its in the timeout holder, but sync on mutex to make sure any ongoing timeout handling has finished
|
||||||
|
final DiscoveryNode sourceNode;
|
||||||
|
final String action;
|
||||||
|
assert clientHandlers.get(requestId) == null;
|
||||||
|
TimeoutInfoHolder timeoutInfoHolder = timeoutInfoHandlers.remove(requestId);
|
||||||
|
if (timeoutInfoHolder != null) {
|
||||||
|
long time = System.currentTimeMillis();
|
||||||
|
logger.warn("Received response for a request that has timed out, sent [{}ms] ago, timed out [{}ms] ago, action [{}], node [{}], id [{}]", time - timeoutInfoHolder.sentTime(), time - timeoutInfoHolder.timeoutTime(), timeoutInfoHolder.action(), timeoutInfoHolder.node(), requestId);
|
||||||
|
action = timeoutInfoHolder.action();
|
||||||
|
sourceNode = timeoutInfoHolder.node();
|
||||||
|
} else {
|
||||||
|
logger.warn("Transport response handler not found of id [{}]", requestId);
|
||||||
|
action = null;
|
||||||
|
sourceNode = null;
|
||||||
|
}
|
||||||
|
// call tracer out of lock
|
||||||
|
if (traceEnabled() == false) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (action == null) {
|
||||||
|
assert sourceNode == null;
|
||||||
|
traceUnresolvedResponse(requestId);
|
||||||
|
} else if (shouldTraceAction(action)) {
|
||||||
|
traceReceivedResponse(requestId, sourceNode, action);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void raiseNodeConnected(final DiscoveryNode node) {
|
public void raiseNodeConnected(final DiscoveryNode node) {
|
||||||
threadPool.generic().execute(new Runnable() {
|
threadPool.generic().execute(new Runnable() {
|
||||||
@ -504,29 +504,40 @@ public class TransportService extends AbstractLifecycleComponent<TransportServic
|
|||||||
|
|
||||||
private final long sentTime = System.currentTimeMillis();
|
private final long sentTime = System.currentTimeMillis();
|
||||||
|
|
||||||
ScheduledFuture future;
|
volatile ScheduledFuture future;
|
||||||
|
|
||||||
TimeoutHandler(long requestId) {
|
TimeoutHandler(long requestId) {
|
||||||
this.requestId = requestId;
|
this.requestId = requestId;
|
||||||
}
|
}
|
||||||
|
|
||||||
public long sentTime() {
|
|
||||||
return sentTime;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
if (future.isCancelled()) {
|
// we get first to make sure we only add the TimeoutInfoHandler if needed.
|
||||||
return;
|
final RequestHolder holder = clientHandlers.get(requestId);
|
||||||
}
|
|
||||||
final RequestHolder holder = clientHandlers.remove(requestId);
|
|
||||||
if (holder != null) {
|
if (holder != null) {
|
||||||
// add it to the timeout information holder, in case we are going to get a response later
|
// add it to the timeout information holder, in case we are going to get a response later
|
||||||
long timeoutTime = System.currentTimeMillis();
|
long timeoutTime = System.currentTimeMillis();
|
||||||
timeoutInfoHandlers.put(requestId, new TimeoutInfoHolder(holder.node(), holder.action(), sentTime, timeoutTime));
|
timeoutInfoHandlers.put(requestId, new TimeoutInfoHolder(holder.node(), holder.action(), sentTime, timeoutTime));
|
||||||
holder.handler().handleException(new ReceiveTimeoutTransportException(holder.node(), holder.action(), "request_id [" + requestId + "] timed out after [" + (timeoutTime - sentTime) + "ms]"));
|
// now that we have the information visible via timeoutInfoHandlers, we try to remove the request id
|
||||||
|
final RequestHolder removedHolder = clientHandlers.remove(requestId);
|
||||||
|
if (removedHolder != null) {
|
||||||
|
assert removedHolder == holder : "two different holder instances for request [" + requestId + "]";
|
||||||
|
removedHolder.handler().handleException(new ReceiveTimeoutTransportException(holder.node(), holder.action(), "request_id [" + requestId + "] timed out after [" + (timeoutTime - sentTime) + "ms]"));
|
||||||
|
} else {
|
||||||
|
// response was processed, remove timeout info.
|
||||||
|
timeoutInfoHandlers.remove(requestId);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* cancels timeout handling. this is a best effort only to avoid running it. remove the requestId from {@link #clientHandlers}
|
||||||
|
* to make sure this doesn't run.
|
||||||
|
*/
|
||||||
|
public void cancel() {
|
||||||
|
assert clientHandlers.get(requestId) == null : "cancel must be called after the requestId [" + requestId + "] has been removed from clientHandlers";
|
||||||
|
FutureUtils.cancel(future);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static class TimeoutInfoHolder {
|
static class TimeoutInfoHolder {
|
||||||
@ -571,13 +582,13 @@ public class TransportService extends AbstractLifecycleComponent<TransportServic
|
|||||||
|
|
||||||
private final String action;
|
private final String action;
|
||||||
|
|
||||||
private final TimeoutHandler timeout;
|
private final TimeoutHandler timeoutHandler;
|
||||||
|
|
||||||
RequestHolder(TransportResponseHandler<T> handler, DiscoveryNode node, String action, TimeoutHandler timeout) {
|
RequestHolder(TransportResponseHandler<T> handler, DiscoveryNode node, String action, TimeoutHandler timeoutHandler) {
|
||||||
this.handler = handler;
|
this.handler = handler;
|
||||||
this.node = node;
|
this.node = node;
|
||||||
this.action = action;
|
this.action = action;
|
||||||
this.timeout = timeout;
|
this.timeoutHandler = timeoutHandler;
|
||||||
}
|
}
|
||||||
|
|
||||||
public TransportResponseHandler<T> handler() {
|
public TransportResponseHandler<T> handler() {
|
||||||
@ -592,9 +603,9 @@ public class TransportService extends AbstractLifecycleComponent<TransportServic
|
|||||||
return this.action;
|
return this.action;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void cancel() {
|
public void cancelTimeout() {
|
||||||
if (timeout != null) {
|
if (timeoutHandler != null) {
|
||||||
FutureUtils.cancel(timeout.future);
|
timeoutHandler.cancel();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -41,9 +41,10 @@ public interface TransportServiceAdapter {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* called by the {@link Transport) implementation when a response or an exception has been recieved for a previously
|
* called by the {@link Transport) implementation when a response or an exception has been recieved for a previously
|
||||||
* sent request (before any processing or deserialization was done
|
* sent request (before any processing or deserialization was done). Returns the appropriate response handler or null if not
|
||||||
|
* found.
|
||||||
*/
|
*/
|
||||||
void onResponseReceived(long requestId);
|
TransportResponseHandler onResponseReceived(long requestId);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* called by the {@link Transport) implementation when an incoming request arrives but before
|
* called by the {@link Transport) implementation when an incoming request arrives but before
|
||||||
@ -53,8 +54,6 @@ public interface TransportServiceAdapter {
|
|||||||
|
|
||||||
TransportRequestHandler handler(String action);
|
TransportRequestHandler handler(String action);
|
||||||
|
|
||||||
TransportResponseHandler remove(long requestId);
|
|
||||||
|
|
||||||
void raiseNodeConnected(DiscoveryNode node);
|
void raiseNodeConnected(DiscoveryNode node);
|
||||||
|
|
||||||
void raiseNodeDisconnected(DiscoveryNode node);
|
void raiseNodeDisconnected(DiscoveryNode node);
|
||||||
|
@ -235,9 +235,7 @@ public class LocalTransport extends AbstractLifecycleComponent<Transport> implem
|
|||||||
if (isRequest) {
|
if (isRequest) {
|
||||||
handleRequest(stream, requestId, sourceTransport, version);
|
handleRequest(stream, requestId, sourceTransport, version);
|
||||||
} else {
|
} else {
|
||||||
// notify with response before we process it and before we remove information about it.
|
final TransportResponseHandler handler = transportServiceAdapter.onResponseReceived(requestId);
|
||||||
transportServiceAdapter.onResponseReceived(requestId);
|
|
||||||
final TransportResponseHandler handler = transportServiceAdapter.remove(requestId);
|
|
||||||
// ignore if its null, the adapter logs it
|
// ignore if its null, the adapter logs it
|
||||||
if (handler != null) {
|
if (handler != null) {
|
||||||
if (TransportStatus.isError(status)) {
|
if (TransportStatus.isError(status)) {
|
||||||
@ -249,7 +247,7 @@ public class LocalTransport extends AbstractLifecycleComponent<Transport> implem
|
|||||||
}
|
}
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
if (sendRequestId != null) {
|
if (sendRequestId != null) {
|
||||||
TransportResponseHandler handler = transportServiceAdapter.remove(sendRequestId);
|
TransportResponseHandler handler = transportServiceAdapter.onResponseReceived(sendRequestId);
|
||||||
if (handler != null) {
|
if (handler != null) {
|
||||||
handleException(handler, new RemoteTransportException(nodeName(), localAddress, action, e));
|
handleException(handler, new RemoteTransportException(nodeName(), localAddress, action, e));
|
||||||
}
|
}
|
||||||
|
@ -120,9 +120,7 @@ public class MessageChannelHandler extends SimpleChannelUpstreamHandler {
|
|||||||
buffer.readerIndex(expectedIndexReader);
|
buffer.readerIndex(expectedIndexReader);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// notify with response before we process it and before we remove information about it.
|
TransportResponseHandler handler = transportServiceAdapter.onResponseReceived(requestId);
|
||||||
transportServiceAdapter.onResponseReceived(requestId);
|
|
||||||
TransportResponseHandler handler = transportServiceAdapter.remove(requestId);
|
|
||||||
// ignore if its null, the adapter logs it
|
// ignore if its null, the adapter logs it
|
||||||
if (handler != null) {
|
if (handler != null) {
|
||||||
if (TransportStatus.isError(status)) {
|
if (TransportStatus.isError(status)) {
|
||||||
|
@ -20,11 +20,7 @@
|
|||||||
package org.elasticsearch.client.transport;
|
package org.elasticsearch.client.transport;
|
||||||
|
|
||||||
import com.carrotsearch.randomizedtesting.generators.RandomInts;
|
import com.carrotsearch.randomizedtesting.generators.RandomInts;
|
||||||
import org.elasticsearch.Build;
|
|
||||||
import org.elasticsearch.ElasticsearchException;
|
import org.elasticsearch.ElasticsearchException;
|
||||||
import org.elasticsearch.Version;
|
|
||||||
import org.elasticsearch.action.admin.cluster.node.info.NodeInfo;
|
|
||||||
import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse;
|
|
||||||
import org.elasticsearch.action.admin.cluster.node.liveness.LivenessResponse;
|
import org.elasticsearch.action.admin.cluster.node.liveness.LivenessResponse;
|
||||||
import org.elasticsearch.cluster.ClusterName;
|
import org.elasticsearch.cluster.ClusterName;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNode;
|
import org.elasticsearch.cluster.node.DiscoveryNode;
|
||||||
@ -65,7 +61,7 @@ abstract class FailAndRetryMockTransport<Response extends TransportResponse> imp
|
|||||||
|
|
||||||
//we make sure that nodes get added to the connected ones when calling addTransportAddress, by returning proper nodes info
|
//we make sure that nodes get added to the connected ones when calling addTransportAddress, by returning proper nodes info
|
||||||
if (connectMode) {
|
if (connectMode) {
|
||||||
TransportResponseHandler transportResponseHandler = transportServiceAdapter.remove(requestId);
|
TransportResponseHandler transportResponseHandler = transportServiceAdapter.onResponseReceived(requestId);
|
||||||
transportResponseHandler.handleResponse(new LivenessResponse(ClusterName.DEFAULT, node));
|
transportResponseHandler.handleResponse(new LivenessResponse(ClusterName.DEFAULT, node));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -82,7 +78,7 @@ abstract class FailAndRetryMockTransport<Response extends TransportResponse> imp
|
|||||||
//throw whatever exception that is not a subclass of ConnectTransportException
|
//throw whatever exception that is not a subclass of ConnectTransportException
|
||||||
throw new IllegalStateException();
|
throw new IllegalStateException();
|
||||||
} else {
|
} else {
|
||||||
TransportResponseHandler transportResponseHandler = transportServiceAdapter.remove(requestId);
|
TransportResponseHandler transportResponseHandler = transportServiceAdapter.onResponseReceived(requestId);
|
||||||
if (random.nextBoolean()) {
|
if (random.nextBoolean()) {
|
||||||
successes.incrementAndGet();
|
successes.incrementAndGet();
|
||||||
transportResponseHandler.handleResponse(newResponse());
|
transportResponseHandler.handleResponse(newResponse());
|
||||||
|
Loading…
x
Reference in New Issue
Block a user