ACTIVEMQ6-86 Backup announcement may fail

When a live and backup server are both started at or near the same moment
there is a small window where the live server's acceptors have been
started but the server's state != STARTED. During this window if the
backup sends its announcement the announcement will fail and the backup
will shut down. This fix closes this small window by not starting the
acceptors until the server is fully started.
This commit is contained in:
jbertram 2015-03-04 09:44:43 -06:00
parent 242a5fac8a
commit 9a88941a51
10 changed files with 161 additions and 10 deletions

View File

@ -111,4 +111,10 @@ public final class BackupReplicationStartFailedMessage extends PacketImpl
result = 31 * result + (problem != null ? problem.hashCode() : 0); result = 31 * result + (problem != null ? problem.hashCode() : 0);
return result; return result;
} }
/**
 * Builds a human-readable description of this packet: the parent's string form followed by the
 * registration problem.
 * <p>
 * {@code problem} may be unset (the {@code hashCode()} of this class allows for that as well), in
 * which case {@code problem=null} is printed instead of throwing a {@link NullPointerException}.
 *
 * @return the textual representation of this message
 */
@Override
public String toString()
{
   // Guard against an unset problem so that logging a packet can never fail.
   return getParentString() + ", problem=" + (problem != null ? problem.name() : null) + "]";
}
} }

View File

@ -58,6 +58,8 @@ public interface RemotingService
void start() throws Exception; void start() throws Exception;
void startAcceptors() throws Exception;
boolean isStarted(); boolean isStarted();
/** /**

View File

@ -318,10 +318,10 @@ public class RemotingServiceImpl implements RemotingService, ConnectionLifeCycle
} }
} }
for (Acceptor a : acceptors.values()) /**
{ * Don't start the acceptors here. Only start the acceptors at the very end of the start-up process to avoid
a.start(); * race conditions. See {@link #startAcceptors()}.
} */
// This thread checks connections that need to be closed, and also flushes confirmations // This thread checks connections that need to be closed, and also flushes confirmations
failureCheckAndFlushThread = new FailureCheckAndFlushThread(RemotingServiceImpl.CONNECTION_TTL_CHECK_INTERVAL); failureCheckAndFlushThread = new FailureCheckAndFlushThread(RemotingServiceImpl.CONNECTION_TTL_CHECK_INTERVAL);
@ -331,6 +331,17 @@ public class RemotingServiceImpl implements RemotingService, ConnectionLifeCycle
started = true; started = true;
} }
/**
 * Starts every configured acceptor, provided this service has already been started.
 * <p>
 * When the service is not started the call does nothing.
 *
 * @throws Exception if any acceptor fails to start
 */
public synchronized void startAcceptors() throws Exception
{
   if (!isStarted())
   {
      // Nothing to do until the service itself is running.
      return;
   }

   for (Acceptor acceptor : acceptors.values())
   {
      acceptor.start();
   }
}
public synchronized void allowInvmSecurityOverride(ActiveMQPrincipal principal) public synchronized void allowInvmSecurityOverride(ActiveMQPrincipal principal)
{ {
defaultInvmSecurityPrincipal = principal; defaultInvmSecurityPrincipal = principal;

View File

@ -431,7 +431,6 @@ public class ActiveMQServerImpl implements ActiveMQServer
} }
else else
{ {
state = SERVER_STATE.STARTED;
ActiveMQServerLogger.LOGGER.serverStarted(getVersion().getFullVersion(), nodeManager.getNodeId(), ActiveMQServerLogger.LOGGER.serverStarted(getVersion().getFullVersion(), nodeManager.getNodeId(),
identity != null ? identity : ""); identity != null ? identity : "");
} }
@ -1789,16 +1788,17 @@ public class ActiveMQServerImpl implements ActiveMQServer
{ {
throw ActiveMQMessageBundle.BUNDLE.nodeIdNull(); throw ActiveMQMessageBundle.BUNDLE.nodeIdNull();
} }
activationLatch.countDown();
// We can only do this after everything is started otherwise we may get nasty races with expired messages // We can only do this after everything is started otherwise we may get nasty races with expired messages
postOffice.startExpiryScanner(); postOffice.startExpiryScanner();
} }
else }
{
activationLatch.countDown();
}
public void completeActivation() throws Exception
{
setState(ActiveMQServerImpl.SERVER_STATE.STARTED);
getRemotingService().startAcceptors();
activationLatch.countDown();
callActivationCompleteCallbacks(); callActivationCompleteCallbacks();
} }

View File

@ -62,6 +62,8 @@ public class LiveOnlyActivation extends Activation
activeMQServer.initialisePart2(false); activeMQServer.initialisePart2(false);
activeMQServer.completeActivation();
if (activeMQServer.getIdentity() != null) if (activeMQServer.getIdentity() != null)
{ {
ActiveMQServerLogger.LOGGER.serverIsLive(activeMQServer.getIdentity()); ActiveMQServerLogger.LOGGER.serverIsLive(activeMQServer.getIdentity());

View File

@ -289,6 +289,8 @@ public final class SharedNothingBackupActivation extends Activation
} }
} }
activeMQServer.completeActivation();
} }
} }
catch (Exception e) catch (Exception e)

View File

@ -103,6 +103,8 @@ public class SharedNothingLiveActivation extends LiveActivation
activeMQServer.initialisePart2(false); activeMQServer.initialisePart2(false);
activeMQServer.completeActivation();
if (activeMQServer.getIdentity() != null) if (activeMQServer.getIdentity() != null)
{ {
ActiveMQServerLogger.LOGGER.serverIsLive(activeMQServer.getIdentity()); ActiveMQServerLogger.LOGGER.serverIsLive(activeMQServer.getIdentity());
@ -141,6 +143,7 @@ public class SharedNothingLiveActivation extends LiveActivation
} }
catch (ActiveMQException e) catch (ActiveMQException e)
{ {
ActiveMQServerLogger.LOGGER.debug("Failed to process backup registration packet", e);
channel.send(new BackupReplicationStartFailedMessage(BackupReplicationStartFailedMessage.BackupRegistrationProblem.EXCEPTION)); channel.send(new BackupReplicationStartFailedMessage(BackupReplicationStartFailedMessage.BackupRegistrationProblem.EXCEPTION));
} }
} }

View File

@ -87,6 +87,8 @@ public final class SharedStoreBackupActivation extends Activation
activeMQServer.initialisePart2(scalingDown); activeMQServer.initialisePart2(scalingDown);
activeMQServer.completeActivation();
if (scalingDown) if (scalingDown)
{ {
ActiveMQServerLogger.LOGGER.backupServerScaledDown(); ActiveMQServerLogger.LOGGER.backupServerScaledDown();

View File

@ -76,6 +76,8 @@ public final class SharedStoreLiveActivation extends LiveActivation
activeMQServer.initialisePart2(false); activeMQServer.initialisePart2(false);
activeMQServer.completeActivation();
ActiveMQServerLogger.LOGGER.serverIsLive(); ActiveMQServerLogger.LOGGER.serverIsLive();
} }
catch (Exception e) catch (Exception e)

View File

@ -0,0 +1,121 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.activemq.byteman.tests;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

import org.apache.activemq.api.config.ActiveMQDefaultConfiguration;
import org.apache.activemq.api.core.TransportConfiguration;
import org.apache.activemq.core.config.Configuration;
import org.apache.activemq.core.server.ActiveMQServer;
import org.apache.activemq.tests.util.ReplicatedBackupUtils;
import org.apache.activemq.tests.util.ServiceTestBase;
import org.apache.activemq.tests.util.TransportConfigurationUtils;
import org.jboss.byteman.contrib.bmunit.BMRule;
import org.jboss.byteman.contrib.bmunit.BMRules;
import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
/**
 * Byteman-driven regression test for the backup announcement race: a replicating backup is started
 * while the live server is being held at the exit of its activation, the point at which (before the
 * fix) the live server's acceptors were active but its state was not yet STARTED.
 */
@RunWith(BMUnitRunner.class)
public class ReplicationBackupTest extends ServiceTestBase
{
   /**
    * Upper bound, in seconds, on how long the test waits for the Byteman rule to fire. Without a bound
    * the test would hang forever if the live server never reached the instrumented location.
    */
   private static final long RULE_FIRED_TIMEOUT_SECONDS = 60;

   /** Released by {@link #breakIt()} once the Byteman rule has been triggered in the live server. */
   private static final CountDownLatch ruleFired = new CountDownLatch(1);

   private ActiveMQServer backupServer;

   private ActiveMQServer liveServer;

   /*
    * simple test to induce a potential race condition where the server's acceptors are active, but the server's
    * state != STARTED
    */
   @Test
   @BMRules
      (
         rules =
            {
               @BMRule
                  (
                     name = "prevent backup annoucement",
                     targetClass = "org.apache.activemq.core.server.impl.SharedNothingLiveActivation",
                     targetMethod = "run",
                     targetLocation = "AT EXIT",
                     action = "org.apache.activemq.byteman.tests.ReplicationBackupTest.breakIt();"
                  )
            }
      )
   public void testReplicatedBackupAnnouncement() throws Exception
   {
      TransportConfiguration liveConnector = TransportConfigurationUtils.getNettyConnector(true, 0);
      TransportConfiguration liveAcceptor = TransportConfigurationUtils.getNettyAcceptor(true, 0);
      TransportConfiguration backupConnector = TransportConfigurationUtils.getNettyConnector(false, 0);
      TransportConfiguration backupAcceptor = TransportConfigurationUtils.getNettyAcceptor(false, 0);

      // give the backup its own set of directories by appending a suffix to the default locations
      final String suffix = "_backup";

      Configuration backupConfig = createDefaultConfig()
         .setBindingsDirectory(ActiveMQDefaultConfiguration.getDefaultBindingsDirectory() + suffix)
         .setJournalDirectory(ActiveMQDefaultConfiguration.getDefaultJournalDir() + suffix)
         .setPagingDirectory(ActiveMQDefaultConfiguration.getDefaultPagingDir() + suffix)
         .setLargeMessagesDirectory(ActiveMQDefaultConfiguration.getDefaultLargeMessagesDir() + suffix);

      Configuration liveConfig = createDefaultConfig();

      ReplicatedBackupUtils.configureReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, liveAcceptor);

      liveServer = createServer(liveConfig);

      // start the live server in a new thread so we can start the backup simultaneously to induce a potential race
      Thread startThread = new Thread(new Runnable()
      {
         @Override
         public void run()
         {
            try
            {
               liveServer.start();
            }
            catch (Exception e)
            {
               e.printStackTrace();
            }
         }
      });
      startThread.start();

      // Fail fast instead of blocking forever if the live server never reaches the instrumented location.
      Assert.assertTrue("Byteman rule did not fire within " + RULE_FIRED_TIMEOUT_SECONDS + " seconds",
                        ruleFired.await(RULE_FIRED_TIMEOUT_SECONDS, TimeUnit.SECONDS));

      backupServer = createServer(backupConfig);
      backupServer.start();
      ServiceTestBase.waitForRemoteBackup(null, 3, true, backupServer);
   }

   /**
    * Invoked by the Byteman rule at the exit of {@code SharedNothingLiveActivation.run}. Signals the
    * test thread that the rule has fired and then stalls the calling thread for two seconds.
    */
   public static void breakIt()
   {
      ruleFired.countDown();
      try
      {
         /* before the fix this sleep would put the "live" server into a state where the acceptors were started
          * but the server's state != STARTED which would cause the backup to fail to announce
          */
         Thread.sleep(2000);
      }
      catch (InterruptedException e)
      {
         // Restore the interrupt flag so the interrupted thread can still observe the request.
         Thread.currentThread().interrupt();
      }
   }
}