ARTEMIS-350 fix for potential race

It's possible for the latch used for flow control here to get out of sync. In
other words, multiple count-downs can occur between count-ups so that the latch
always has a count > 0. When this situation arises then every single packet
sent to the replica is delayed by 5 seconds.

The solution here essentially is to eliminate the latch completely and use a
condition/wait/notify pattern.
This commit is contained in:
jbertram 2016-02-16 10:45:25 -06:00
parent 2a639cbd86
commit 2fcd474bb3
3 changed files with 26 additions and 8 deletions

View File

@ -584,7 +584,12 @@ public class JournalStorageManager extends AbstractJournalStorageManager {
SequentialFile seqFile = largeMessagesFactory.createSequentialFile(fileName);
if (!seqFile.exists())
continue;
replicator.syncLargeMessageFile(seqFile, size, id);
if (replicator != null) {
replicator.syncLargeMessageFile(seqFile, size, id);
}
else {
throw ActiveMQMessageBundle.BUNDLE.replicatorIsNull();
}
}
}
@ -711,4 +716,4 @@ public class JournalStorageManager extends AbstractJournalStorageManager {
readUnLock();
}
}
}
}

View File

@ -25,7 +25,7 @@ import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.activemq.artemis.api.core.ActiveMQBuffer;
import org.apache.activemq.artemis.api.core.ActiveMQException;
@ -110,9 +110,10 @@ public final class ReplicationManager implements ActiveMQComponent, ReadyListene
private volatile boolean enabled;
private final AtomicBoolean writable = new AtomicBoolean(false);
private final Object replicationLock = new Object();
private final ReusableLatch latch = new ReusableLatch();
private final Queue<OperationContext> pendingTokens = new ConcurrentLinkedQueue<>();
private final ExecutorFactory executorFactory;
@ -271,10 +272,11 @@ public final class ReplicationManager implements ActiveMQComponent, ReadyListene
if (replicatingChannel != null) {
replicatingChannel.close();
replicatingChannel.getConnection().getTransportConnection().fireReady(true);
latch.setCount(0);
}
synchronized (replicationLock) {
writable.set(true);
replicationLock.notifyAll();
clearReplicationTokens();
}
@ -342,10 +344,15 @@ public final class ReplicationManager implements ActiveMQComponent, ReadyListene
if (enabled) {
pendingTokens.add(repliToken);
if (!replicatingChannel.getConnection().isWritable(this)) {
latch.countUp();
try {
//don't wait for ever as this may hang tests etc, we've probably been closed anyway
latch.await(5, TimeUnit.SECONDS);
long now = System.currentTimeMillis();
long deadline = now + 5000;
while (!writable.get() && now < deadline) {
replicationLock.wait(deadline - now);
now = System.currentTimeMillis();
}
writable.set(false);
}
catch (InterruptedException e) {
throw new ActiveMQInterruptedException(e);
@ -370,7 +377,10 @@ public final class ReplicationManager implements ActiveMQComponent, ReadyListene
@Override
public void readyForWriting() {
latch.countDown();
synchronized (replicationLock) {
writable.set(true);
replicationLock.notifyAll();
}
}
/**

View File

@ -368,4 +368,7 @@ public interface ActiveMQMessageBundle {
@Message(id = 119116, value = "Netty Acceptor unavailable", format = Message.Format.MESSAGE_FORMAT)
IllegalStateException acceptorUnavailable();
@Message(id = 119117, value = "Replicator is null. Replication was likely terminated.")
ActiveMQIllegalStateException replicatorIsNull();
}