ACTIVEMQ6-86 Backup announcement may fail
When a live and backup server are both started at or near the same moment there is a small window where the live server's acceptors have been started but the server's state != STARTED. During this window if the backup sends its announcement the announcement will fail and the backup will shutdown. This fix closes this small window by only starting the acceptors until the server is fully started.
This commit is contained in:
parent
242a5fac8a
commit
9a88941a51
|
@ -111,4 +111,10 @@ public final class BackupReplicationStartFailedMessage extends PacketImpl
|
||||||
result = 31 * result + (problem != null ? problem.hashCode() : 0);
|
result = 31 * result + (problem != null ? problem.hashCode() : 0);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString()
|
||||||
|
{
|
||||||
|
return getParentString() + ", problem=" + problem.name() + "]";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -58,6 +58,8 @@ public interface RemotingService
|
||||||
|
|
||||||
void start() throws Exception;
|
void start() throws Exception;
|
||||||
|
|
||||||
|
void startAcceptors() throws Exception;
|
||||||
|
|
||||||
boolean isStarted();
|
boolean isStarted();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -318,10 +318,10 @@ public class RemotingServiceImpl implements RemotingService, ConnectionLifeCycle
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (Acceptor a : acceptors.values())
|
/**
|
||||||
{
|
* Don't start the acceptors here. Only start the acceptors at the every end of the start-up process to avoid
|
||||||
a.start();
|
* race conditions. See {@link #startAcceptors()}.
|
||||||
}
|
*/
|
||||||
|
|
||||||
// This thread checks connections that need to be closed, and also flushes confirmations
|
// This thread checks connections that need to be closed, and also flushes confirmations
|
||||||
failureCheckAndFlushThread = new FailureCheckAndFlushThread(RemotingServiceImpl.CONNECTION_TTL_CHECK_INTERVAL);
|
failureCheckAndFlushThread = new FailureCheckAndFlushThread(RemotingServiceImpl.CONNECTION_TTL_CHECK_INTERVAL);
|
||||||
|
@ -331,6 +331,17 @@ public class RemotingServiceImpl implements RemotingService, ConnectionLifeCycle
|
||||||
started = true;
|
started = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public synchronized void startAcceptors() throws Exception
|
||||||
|
{
|
||||||
|
if (isStarted())
|
||||||
|
{
|
||||||
|
for (Acceptor a : acceptors.values())
|
||||||
|
{
|
||||||
|
a.start();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public synchronized void allowInvmSecurityOverride(ActiveMQPrincipal principal)
|
public synchronized void allowInvmSecurityOverride(ActiveMQPrincipal principal)
|
||||||
{
|
{
|
||||||
defaultInvmSecurityPrincipal = principal;
|
defaultInvmSecurityPrincipal = principal;
|
||||||
|
|
|
@ -431,7 +431,6 @@ public class ActiveMQServerImpl implements ActiveMQServer
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
state = SERVER_STATE.STARTED;
|
|
||||||
ActiveMQServerLogger.LOGGER.serverStarted(getVersion().getFullVersion(), nodeManager.getNodeId(),
|
ActiveMQServerLogger.LOGGER.serverStarted(getVersion().getFullVersion(), nodeManager.getNodeId(),
|
||||||
identity != null ? identity : "");
|
identity != null ? identity : "");
|
||||||
}
|
}
|
||||||
|
@ -1789,16 +1788,17 @@ public class ActiveMQServerImpl implements ActiveMQServer
|
||||||
{
|
{
|
||||||
throw ActiveMQMessageBundle.BUNDLE.nodeIdNull();
|
throw ActiveMQMessageBundle.BUNDLE.nodeIdNull();
|
||||||
}
|
}
|
||||||
activationLatch.countDown();
|
|
||||||
|
|
||||||
// We can only do this after everything is started otherwise we may get nasty races with expired messages
|
// We can only do this after everything is started otherwise we may get nasty races with expired messages
|
||||||
postOffice.startExpiryScanner();
|
postOffice.startExpiryScanner();
|
||||||
}
|
}
|
||||||
else
|
}
|
||||||
{
|
|
||||||
activationLatch.countDown();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
public void completeActivation() throws Exception
|
||||||
|
{
|
||||||
|
setState(ActiveMQServerImpl.SERVER_STATE.STARTED);
|
||||||
|
getRemotingService().startAcceptors();
|
||||||
|
activationLatch.countDown();
|
||||||
callActivationCompleteCallbacks();
|
callActivationCompleteCallbacks();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -62,6 +62,8 @@ public class LiveOnlyActivation extends Activation
|
||||||
|
|
||||||
activeMQServer.initialisePart2(false);
|
activeMQServer.initialisePart2(false);
|
||||||
|
|
||||||
|
activeMQServer.completeActivation();
|
||||||
|
|
||||||
if (activeMQServer.getIdentity() != null)
|
if (activeMQServer.getIdentity() != null)
|
||||||
{
|
{
|
||||||
ActiveMQServerLogger.LOGGER.serverIsLive(activeMQServer.getIdentity());
|
ActiveMQServerLogger.LOGGER.serverIsLive(activeMQServer.getIdentity());
|
||||||
|
|
|
@ -289,6 +289,8 @@ public final class SharedNothingBackupActivation extends Activation
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
activeMQServer.completeActivation();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (Exception e)
|
catch (Exception e)
|
||||||
|
|
|
@ -103,6 +103,8 @@ public class SharedNothingLiveActivation extends LiveActivation
|
||||||
|
|
||||||
activeMQServer.initialisePart2(false);
|
activeMQServer.initialisePart2(false);
|
||||||
|
|
||||||
|
activeMQServer.completeActivation();
|
||||||
|
|
||||||
if (activeMQServer.getIdentity() != null)
|
if (activeMQServer.getIdentity() != null)
|
||||||
{
|
{
|
||||||
ActiveMQServerLogger.LOGGER.serverIsLive(activeMQServer.getIdentity());
|
ActiveMQServerLogger.LOGGER.serverIsLive(activeMQServer.getIdentity());
|
||||||
|
@ -141,6 +143,7 @@ public class SharedNothingLiveActivation extends LiveActivation
|
||||||
}
|
}
|
||||||
catch (ActiveMQException e)
|
catch (ActiveMQException e)
|
||||||
{
|
{
|
||||||
|
ActiveMQServerLogger.LOGGER.debug("Failed to process backup registration packet", e);
|
||||||
channel.send(new BackupReplicationStartFailedMessage(BackupReplicationStartFailedMessage.BackupRegistrationProblem.EXCEPTION));
|
channel.send(new BackupReplicationStartFailedMessage(BackupReplicationStartFailedMessage.BackupRegistrationProblem.EXCEPTION));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -87,6 +87,8 @@ public final class SharedStoreBackupActivation extends Activation
|
||||||
|
|
||||||
activeMQServer.initialisePart2(scalingDown);
|
activeMQServer.initialisePart2(scalingDown);
|
||||||
|
|
||||||
|
activeMQServer.completeActivation();
|
||||||
|
|
||||||
if (scalingDown)
|
if (scalingDown)
|
||||||
{
|
{
|
||||||
ActiveMQServerLogger.LOGGER.backupServerScaledDown();
|
ActiveMQServerLogger.LOGGER.backupServerScaledDown();
|
||||||
|
|
|
@ -76,6 +76,8 @@ public final class SharedStoreLiveActivation extends LiveActivation
|
||||||
|
|
||||||
activeMQServer.initialisePart2(false);
|
activeMQServer.initialisePart2(false);
|
||||||
|
|
||||||
|
activeMQServer.completeActivation();
|
||||||
|
|
||||||
ActiveMQServerLogger.LOGGER.serverIsLive();
|
ActiveMQServerLogger.LOGGER.serverIsLive();
|
||||||
}
|
}
|
||||||
catch (Exception e)
|
catch (Exception e)
|
||||||
|
|
|
@ -0,0 +1,121 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.activemq.byteman.tests;
|
||||||
|
|
||||||
|
import java.util.concurrent.CountDownLatch;
|
||||||
|
|
||||||
|
import org.apache.activemq.api.config.ActiveMQDefaultConfiguration;
|
||||||
|
import org.apache.activemq.api.core.TransportConfiguration;
|
||||||
|
import org.apache.activemq.core.config.Configuration;
|
||||||
|
import org.apache.activemq.core.server.ActiveMQServer;
|
||||||
|
import org.apache.activemq.tests.util.ReplicatedBackupUtils;
|
||||||
|
import org.apache.activemq.tests.util.ServiceTestBase;
|
||||||
|
import org.apache.activemq.tests.util.TransportConfigurationUtils;
|
||||||
|
import org.jboss.byteman.contrib.bmunit.BMRule;
|
||||||
|
import org.jboss.byteman.contrib.bmunit.BMRules;
|
||||||
|
import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.junit.runner.RunWith;
|
||||||
|
|
||||||
|
@RunWith(BMUnitRunner.class)
|
||||||
|
public class ReplicationBackupTest extends ServiceTestBase
|
||||||
|
{
|
||||||
|
private static final CountDownLatch ruleFired = new CountDownLatch(1);
|
||||||
|
private ActiveMQServer backupServer;
|
||||||
|
private ActiveMQServer liveServer;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* simple test to induce a potential race condition where the server's acceptors are active, but the server's
|
||||||
|
* state != STARTED
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@BMRules
|
||||||
|
(
|
||||||
|
rules =
|
||||||
|
{
|
||||||
|
@BMRule
|
||||||
|
(
|
||||||
|
name = "prevent backup annoucement",
|
||||||
|
targetClass = "org.apache.activemq.core.server.impl.SharedNothingLiveActivation",
|
||||||
|
targetMethod = "run",
|
||||||
|
targetLocation = "AT EXIT",
|
||||||
|
action = "org.apache.activemq.byteman.tests.ReplicationBackupTest.breakIt();"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
public void testReplicatedBackupAnnouncement() throws Exception
|
||||||
|
{
|
||||||
|
TransportConfiguration liveConnector = TransportConfigurationUtils.getNettyConnector(true, 0);
|
||||||
|
TransportConfiguration liveAcceptor = TransportConfigurationUtils.getNettyAcceptor(true, 0);
|
||||||
|
TransportConfiguration backupConnector = TransportConfigurationUtils.getNettyConnector(false, 0);
|
||||||
|
TransportConfiguration backupAcceptor = TransportConfigurationUtils.getNettyAcceptor(false, 0);
|
||||||
|
|
||||||
|
final String suffix = "_backup";
|
||||||
|
|
||||||
|
Configuration backupConfig = createDefaultConfig()
|
||||||
|
.setBindingsDirectory(ActiveMQDefaultConfiguration.getDefaultBindingsDirectory() + suffix)
|
||||||
|
.setJournalDirectory(ActiveMQDefaultConfiguration.getDefaultJournalDir() + suffix)
|
||||||
|
.setPagingDirectory(ActiveMQDefaultConfiguration.getDefaultPagingDir() + suffix)
|
||||||
|
.setLargeMessagesDirectory(ActiveMQDefaultConfiguration.getDefaultLargeMessagesDir() + suffix);
|
||||||
|
|
||||||
|
Configuration liveConfig = createDefaultConfig();
|
||||||
|
|
||||||
|
ReplicatedBackupUtils.configureReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, liveAcceptor);
|
||||||
|
|
||||||
|
liveServer = createServer(liveConfig);
|
||||||
|
|
||||||
|
// start the live server in a new thread so we can start the backup simultaneously to induce a potential race
|
||||||
|
Thread startThread = new Thread(new Runnable()
|
||||||
|
{
|
||||||
|
@Override
|
||||||
|
public void run()
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
liveServer.start();
|
||||||
|
}
|
||||||
|
catch (Exception e)
|
||||||
|
{
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
startThread.start();
|
||||||
|
|
||||||
|
ruleFired.await();
|
||||||
|
|
||||||
|
backupServer = createServer(backupConfig);
|
||||||
|
backupServer.start();
|
||||||
|
ServiceTestBase.waitForRemoteBackup(null, 3, true, backupServer);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void breakIt()
|
||||||
|
{
|
||||||
|
ruleFired.countDown();
|
||||||
|
try
|
||||||
|
{
|
||||||
|
/* before the fix this sleep would put the "live" server into a state where the acceptors were started
|
||||||
|
* but the server's state != STARTED which would cause the backup to fail to announce
|
||||||
|
*/
|
||||||
|
Thread.sleep(2000);
|
||||||
|
}
|
||||||
|
catch (InterruptedException e)
|
||||||
|
{
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue