diff --git a/artemis-cli/src/main/java/org/apache/activemq/artemis/cli/commands/Run.java b/artemis-cli/src/main/java/org/apache/activemq/artemis/cli/commands/Run.java index 8dfae7871b..b594e37785 100644 --- a/artemis-cli/src/main/java/org/apache/activemq/artemis/cli/commands/Run.java +++ b/artemis-cli/src/main/java/org/apache/activemq/artemis/cli/commands/Run.java @@ -19,7 +19,7 @@ package org.apache.activemq.artemis.cli.commands; import java.io.File; import java.util.Timer; import java.util.TimerTask; -import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; import io.airlift.airline.Command; import io.airlift.airline.Option; @@ -71,6 +71,7 @@ public class Run extends LockAbstract { public Object execute(ActionContext context) throws Exception { super.execute(context); + AtomicReference serverActivationFailed = new AtomicReference<>(); try { BrokerDTO broker = getBrokerDTO(); ActiveMQSecurityManager securityManager = SecurityManagerFactory.create(broker.security); @@ -110,8 +111,7 @@ public class Run extends LockAbstract { server = BrokerFactory.createServer(broker.server, securityManager, activateCallback); server.createComponents(); - AtomicBoolean serverActivationFailed = new AtomicBoolean(false); - server.getServer().registerActivationFailureListener(exception -> serverActivationFailed.set(true)); + server.getServer().registerActivationFailureListener(exception -> serverActivationFailed.set(exception)); server.start(); server.getServer().addExternalComponent(managementContext, false); @@ -126,14 +126,16 @@ public class Run extends LockAbstract { server.getServer().addExternalComponent(component, true); assert component.isStarted(); } - - if (serverActivationFailed.get()) { - stop(); - } } catch (Throwable t) { t.printStackTrace(); - stop(); + serverActivationFailed.set(t); } + + if (serverActivationFailed.get() != null) { + stop(); + return serverActivationFailed.get(); + } + return new Pair<>(managementContext, 
server.getServer()); } diff --git a/artemis-commons/src/main/java/org/apache/activemq/artemis/logs/AuditLogger.java b/artemis-commons/src/main/java/org/apache/activemq/artemis/logs/AuditLogger.java index ff9c7804dd..e9a63421c9 100644 --- a/artemis-commons/src/main/java/org/apache/activemq/artemis/logs/AuditLogger.java +++ b/artemis-commons/src/main/java/org/apache/activemq/artemis/logs/AuditLogger.java @@ -2851,4 +2851,12 @@ public interface AuditLogger extends BasicLogger { @LogMessage(level = Logger.Level.INFO) @Message(id = 601748, value = "User {0} is getting max retry interval on target resource: {1} {2}", format = Message.Format.MESSAGE_FORMAT) void getMaxRetryInterval(String user, Object source, Object... args); + + static void getActivationSequence(Object source) { + BASE_LOGGER.getActivationSequence(getCaller(), source); + } + + @LogMessage(level = Logger.Level.INFO) + @Message(id = 601749, value = "User {0} is getting activation sequence on target resource: {1} {2}", format = Message.Format.MESSAGE_FORMAT) + void getActivationSequence(String user, Object source, Object... args); } diff --git a/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/core/management/ActiveMQServerControl.java b/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/core/management/ActiveMQServerControl.java index e4db945cc2..f040fcaa7b 100644 --- a/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/core/management/ActiveMQServerControl.java +++ b/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/core/management/ActiveMQServerControl.java @@ -321,6 +321,15 @@ public interface ActiveMQServerControl { @Attribute(desc = "Node ID of this server") String getNodeID(); + + /** + * Returns the current activation sequence number of this server. + *
+ * When replicated, peers may coordinate activation with this monotonic sequence + */ + @Attribute(desc = "Activation sequence of this server instance") + long getActivationSequence(); + /** * Returns the management notification address of this server. *
diff --git a/artemis-quorum-ri/pom.xml b/artemis-quorum-ri/pom.xml index 83c81678af..455c4d8b0c 100644 --- a/artemis-quorum-ri/pom.xml +++ b/artemis-quorum-ri/pom.xml @@ -44,6 +44,12 @@ org.apache.zookeeper zookeeper + + + org.slf4j + slf4j-log4j12 + + org.apache.curator @@ -59,6 +65,10 @@ org.jboss.logging jboss-logging + + org.jboss.slf4j + slf4j-jboss-logmanager + org.apache.activemq artemis-commons diff --git a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileBasedPrimitiveManager.java b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileBasedPrimitiveManager.java index 4764b7c614..2195d5a585 100644 --- a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileBasedPrimitiveManager.java +++ b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileBasedPrimitiveManager.java @@ -18,16 +18,18 @@ package org.apache.activemq.artemis.quorum.file; import java.io.File; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.HashMap; import java.util.Map; import java.util.Objects; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; import org.apache.activemq.artemis.quorum.DistributedLock; import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; import org.apache.activemq.artemis.quorum.MutableLong; +import org.apache.activemq.artemis.quorum.UnavailableStateException; /** * This is an implementation suitable to be used just on unit tests and it won't attempt @@ -127,8 +129,55 @@ public class FileBasedPrimitiveManager implements DistributedPrimitiveManager { } @Override - public MutableLong getMutableLong(String mutableLongId) throws InterruptedException, ExecutionException, TimeoutException { - // TODO - return null; + public MutableLong getMutableLong(final String mutableLongId) throws ExecutionException { + // use a lock file - but with a 
prefix + final FileDistributedLock fileDistributedLock = (FileDistributedLock) getDistributedLock("ML:" + mutableLongId); + return new MutableLong() { + @Override + public String getMutableLongId() { + return mutableLongId; + } + + @Override + public long get() throws UnavailableStateException { + try { + return readLong(fileDistributedLock); + } catch (IOException e) { + throw new UnavailableStateException(e); + } + } + + @Override + public void set(long value) throws UnavailableStateException { + try { + writeLong(fileDistributedLock, value); + } catch (IOException e) { + throw new UnavailableStateException(e); + } + } + + @Override + public void close() { + fileDistributedLock.close(); + } + }; + } + + private void writeLong(FileDistributedLock fileDistributedLock, long value) throws IOException { + ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN); + buffer.putLong(value); + buffer.flip(); + if (fileDistributedLock.getChannel().position(0).write(buffer) == Long.BYTES) { + fileDistributedLock.getChannel().force(false); + } + } + + private long readLong(FileDistributedLock fileDistributedLock) throws IOException { + ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN); + if (fileDistributedLock.getChannel().position(0).read(buffer, 0) != Long.BYTES) { + return 0; + } + buffer.flip(); + return buffer.getLong(); } } diff --git a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileDistributedLock.java b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileDistributedLock.java index 5e749698d6..13a9e78a9d 100644 --- a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileDistributedLock.java +++ b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileDistributedLock.java @@ -138,4 +138,8 @@ final class FileDistributedLock implements DistributedLock { public void close() { close(true); } + + public FileChannel getChannel() 
{ + return channel; + } } diff --git a/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/DistributedLockTest.java b/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/DistributedLockTest.java index e6c8cab1d7..50e4d0c513 100644 --- a/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/DistributedLockTest.java +++ b/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/DistributedLockTest.java @@ -178,6 +178,7 @@ public abstract class DistributedLockTest { ownerManager.getDistributedLock("a").unlock(); Assert.assertFalse(observerManager.getDistributedLock("a").isHeldByCaller()); Assert.assertFalse(ownerManager.getDistributedLock("a").isHeldByCaller()); + Assert.assertTrue(observerManager.getDistributedLock("a").tryLock()); } @Test diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationBackupPolicyConfiguration.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationBackupPolicyConfiguration.java index f1c97eb7ea..2328d91ee9 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationBackupPolicyConfiguration.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationBackupPolicyConfiguration.java @@ -34,13 +34,6 @@ public class ReplicationBackupPolicyConfiguration implements HAPolicyConfigurati private long initialReplicationSyncTimeout = ActiveMQDefaultConfiguration.getDefaultInitialReplicationSyncTimeout(); - private int voteRetries = ActiveMQDefaultConfiguration.getDefaultVoteRetries(); - - /** - * TODO: move if into {@link ActiveMQDefaultConfiguration} when the configuration is stable. 
- */ - private long voteRetryWait = 2000; - private long retryReplicationWait = ActiveMQDefaultConfiguration.getDefaultRetryReplicationWait(); private DistributedPrimitiveManagerConfiguration distributedManagerConfiguration = null; @@ -102,24 +95,6 @@ public class ReplicationBackupPolicyConfiguration implements HAPolicyConfigurati return this; } - public int getVoteRetries() { - return voteRetries; - } - - public ReplicationBackupPolicyConfiguration setVoteRetries(int voteRetries) { - this.voteRetries = voteRetries; - return this; - } - - public ReplicationBackupPolicyConfiguration setVoteRetryWait(long voteRetryWait) { - this.voteRetryWait = voteRetryWait; - return this; - } - - public long getVoteRetryWait() { - return voteRetryWait; - } - public long getRetryReplicationWait() { return retryReplicationWait; } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationPrimaryPolicyConfiguration.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationPrimaryPolicyConfiguration.java index 7bb53184dd..a2e071fa1e 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationPrimaryPolicyConfiguration.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationPrimaryPolicyConfiguration.java @@ -21,25 +21,18 @@ import org.apache.activemq.artemis.core.config.HAPolicyConfiguration; public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfiguration { - private boolean checkForLiveServer = ActiveMQDefaultConfiguration.isDefaultCheckForLiveServer(); - private String groupName = null; private String clusterName = null; private long initialReplicationSyncTimeout = ActiveMQDefaultConfiguration.getDefaultInitialReplicationSyncTimeout(); - private int voteRetries = ActiveMQDefaultConfiguration.getDefaultVoteRetries(); - - /** - * TODO: move if into {@link ActiveMQDefaultConfiguration} when the configuration is stable. 
- */ - private long voteRetryWait = 2000; - private Long retryReplicationWait = ActiveMQDefaultConfiguration.getDefaultRetryReplicationWait(); private DistributedPrimitiveManagerConfiguration distributedManagerConfiguration = null; + private String coordinationId = null; + public static ReplicationPrimaryPolicyConfiguration withDefault() { return new ReplicationPrimaryPolicyConfiguration(); } @@ -52,15 +45,6 @@ public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfigurat return TYPE.PRIMARY; } - public boolean isCheckForLiveServer() { - return checkForLiveServer; - } - - public ReplicationPrimaryPolicyConfiguration setCheckForLiveServer(boolean checkForLiveServer) { - this.checkForLiveServer = checkForLiveServer; - return this; - } - public String getGroupName() { return groupName; } @@ -88,24 +72,6 @@ public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfigurat return this; } - public int getVoteRetries() { - return voteRetries; - } - - public ReplicationPrimaryPolicyConfiguration setVoteRetries(int voteRetries) { - this.voteRetries = voteRetries; - return this; - } - - public ReplicationPrimaryPolicyConfiguration setVoteRetryWait(long voteRetryWait) { - this.voteRetryWait = voteRetryWait; - return this; - } - - public long getVoteRetryWait() { - return voteRetryWait; - } - public void setRetryReplicationWait(Long retryReplicationWait) { this.retryReplicationWait = retryReplicationWait; } @@ -122,4 +88,27 @@ public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfigurat public DistributedPrimitiveManagerConfiguration getDistributedManagerConfiguration() { return distributedManagerConfiguration; } + + public String getCoordinationId() { + return coordinationId; + } + + public void setCoordinationId(String newCoordinationId) { + if (newCoordinationId == null) { + return; + } + final int len = newCoordinationId.length(); + if (len >= 16) { + this.coordinationId = newCoordinationId.substring(0, 16); + } else if 
(len % 2 != 0) { + // must be even for conversion to uuid, extend to next even + this.coordinationId = newCoordinationId + "+"; + } else if (len > 0 ) { + // run with it + this.coordinationId = newCoordinationId; + } + if (this.coordinationId != null) { + this.coordinationId = this.coordinationId.replace('-', '.'); + } + } } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java index 4932e01439..93eb2dbb0e 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java @@ -1715,22 +1715,18 @@ public final class FileConfigurationParser extends XMLConfigurationUtil { private ReplicationPrimaryPolicyConfiguration createReplicationPrimaryHaPolicy(Element policyNode, Configuration config) { ReplicationPrimaryPolicyConfiguration configuration = ReplicationPrimaryPolicyConfiguration.withDefault(); - configuration.setCheckForLiveServer(getBoolean(policyNode, "check-for-live-server", configuration.isCheckForLiveServer())); - configuration.setGroupName(getString(policyNode, "group-name", configuration.getGroupName(), Validators.NO_CHECK)); configuration.setClusterName(getString(policyNode, "cluster-name", configuration.getClusterName(), Validators.NO_CHECK)); configuration.setInitialReplicationSyncTimeout(getLong(policyNode, "initial-replication-sync-timeout", configuration.getInitialReplicationSyncTimeout(), Validators.GT_ZERO)); - configuration.setVoteRetries(getInteger(policyNode, "vote-retries", configuration.getVoteRetries(), Validators.MINUS_ONE_OR_GE_ZERO)); - - configuration.setVoteRetryWait(getLong(policyNode, "vote-retry-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO)); - - 
configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO)); + configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getRetryReplicationWait(), Validators.GT_ZERO)); configuration.setDistributedManagerConfiguration(createDistributedPrimitiveManagerConfiguration(policyNode, config)); + configuration.setCoordinationId(getString(policyNode, "coordination-id", configuration.getCoordinationId(), Validators.NOT_NULL_OR_EMPTY)); + return configuration; } @@ -1748,11 +1744,7 @@ public final class FileConfigurationParser extends XMLConfigurationUtil { configuration.setMaxSavedReplicatedJournalsSize(getInteger(policyNode, "max-saved-replicated-journals-size", configuration.getMaxSavedReplicatedJournalsSize(), Validators.MINUS_ONE_OR_GE_ZERO)); - configuration.setVoteRetries(getInteger(policyNode, "vote-retries", configuration.getVoteRetries(), Validators.MINUS_ONE_OR_GE_ZERO)); - - configuration.setVoteRetryWait(getLong(policyNode, "vote-retry-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO)); - - configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO)); + configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getRetryReplicationWait(), Validators.GT_ZERO)); configuration.setDistributedManagerConfiguration(createDistributedPrimitiveManagerConfiguration(policyNode, config)); diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/management/impl/ActiveMQServerControlImpl.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/management/impl/ActiveMQServerControlImpl.java index 28ea231994..145a23ad75 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/management/impl/ActiveMQServerControlImpl.java +++ 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/management/impl/ActiveMQServerControlImpl.java @@ -4204,6 +4204,17 @@ public class ActiveMQServerControlImpl extends AbstractControl implements Active return server.getNodeID() == null ? null : server.getNodeID().toString(); } + @Override + public long getActivationSequence() { + if (AuditLogger.isBaseLoggingEnabled()) { + AuditLogger.getActivationSequence(this.server); + } + if (server.getNodeManager() != null) { + return server.getNodeManager().getNodeActivationSequence(); + } + return 0; + } + @Override public String getManagementNotificationAddress() { if (AuditLogger.isBaseLoggingEnabled()) { diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/protocol/core/impl/wireformat/ReplicationStartSyncMessage.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/protocol/core/impl/wireformat/ReplicationStartSyncMessage.java index 018535f426..ebd5d3603a 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/protocol/core/impl/wireformat/ReplicationStartSyncMessage.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/protocol/core/impl/wireformat/ReplicationStartSyncMessage.java @@ -43,7 +43,8 @@ public class ReplicationStartSyncMessage extends PacketImpl { public enum SyncDataType { JournalBindings(AbstractJournalStorageManager.JournalContent.BINDINGS.typeByte), JournalMessages(AbstractJournalStorageManager.JournalContent.MESSAGES.typeByte), - LargeMessages((byte) 2); + LargeMessages((byte) 2), + ActivationSequence((byte) 3); private byte code; @@ -62,6 +63,9 @@ public class ReplicationStartSyncMessage extends PacketImpl { return JournalMessages; if (code == LargeMessages.code) return LargeMessages; + if (code == ActivationSequence.code) + return ActivationSequence; + throw new InvalidParameterException("invalid byte: " + code); } } @@ -80,6 +84,14 @@ public class ReplicationStartSyncMessage extends PacketImpl { nodeID = ""; // 
this value will be ignored } + + public ReplicationStartSyncMessage(String nodeID, long nodeDataVersion) { + this(nodeID); + ids = new long[1]; + ids[0] = nodeDataVersion; + dataType = SyncDataType.ActivationSequence; + } + public ReplicationStartSyncMessage(String nodeID) { this(); synchronizationIsFinished = true; @@ -118,10 +130,6 @@ public class ReplicationStartSyncMessage extends PacketImpl { DataConstants.SIZE_BOOLEAN + // buffer.writeBoolean(allowsAutoFailBack); nodeID.length() * 3; // buffer.writeString(nodeID); -- an estimate - - if (synchronizationIsFinished) { - return size; - } size += DataConstants.SIZE_BYTE + // buffer.writeByte(dataType.code); DataConstants.SIZE_INT + // buffer.writeInt(ids.length); DataConstants.SIZE_LONG * ids.length; // the write loop @@ -135,8 +143,6 @@ public class ReplicationStartSyncMessage extends PacketImpl { buffer.writeBoolean(synchronizationIsFinished); buffer.writeBoolean(allowsAutoFailBack); buffer.writeString(nodeID); - if (synchronizationIsFinished) - return; buffer.writeByte(dataType.code); buffer.writeInt(ids.length); for (long id : ids) { @@ -149,9 +155,6 @@ public class ReplicationStartSyncMessage extends PacketImpl { synchronizationIsFinished = buffer.readBoolean(); allowsAutoFailBack = buffer.readBoolean(); nodeID = buffer.readString(); - if (synchronizationIsFinished) { - return; - } dataType = SyncDataType.getDataType(buffer.readByte()); int length = buffer.readInt(); ids = new long[length]; diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationEndpoint.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationEndpoint.java index 495b35d281..ad2093ad8b 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationEndpoint.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationEndpoint.java @@ -483,7 +483,7 @@ public final class ReplicationEndpoint implements 
ChannelHandler, ActiveMQCompon logger.trace("Backup is synchronized / BACKUP-SYNC-DONE"); } - ActiveMQServerLogger.LOGGER.backupServerSynched(server); + ActiveMQServerLogger.LOGGER.backupServerSynchronized(server, liveID); return; } @@ -560,6 +560,11 @@ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQCompon return replicationResponseMessage; if (packet.isSynchronizationFinished()) { + if (packet.getFileIds() != null && packet.getFileIds().length == 1) { + // this is the version sequence of the data we are replicating + // verified if we activate with this data + server.getNodeManager().writeNodeActivationSequence(packet.getFileIds()[0]); + } finishSynchronization(packet.getNodeID()); replicationResponseMessage.setSynchronizationIsFinishedAcknowledgement(true); return replicationResponseMessage; diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationManager.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationManager.java index ccebb6a3a0..7be1e17554 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationManager.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationManager.java @@ -821,7 +821,7 @@ public final class ReplicationManager implements ActiveMQComponent { } synchronizationIsFinishedAcknowledgement.countUp(); - sendReplicatePacket(new ReplicationStartSyncMessage(nodeID)); + sendReplicatePacket(new ReplicationStartSyncMessage(nodeID, server.getNodeManager().getNodeActivationSequence())); try { if (!synchronizationIsFinishedAcknowledgement.await(initialReplicationSyncTimeout)) { ActiveMQReplicationTimeooutException exception = ActiveMQMessageBundle.BUNDLE.replicationSynchronizationTimeout(initialReplicationSyncTimeout); diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQServerLogger.java 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQServerLogger.java index 1baf9ca49b..7d46a0d6a6 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQServerLogger.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQServerLogger.java @@ -190,8 +190,8 @@ public interface ActiveMQServerLogger extends BasicLogger { void errorStoppingConnectorService(@Cause Throwable e, String name); @LogMessage(level = Logger.Level.INFO) - @Message(id = 221024, value = "Backup server {0} is synchronized with live-server.", format = Message.Format.MESSAGE_FORMAT) - void backupServerSynched(ActiveMQServerImpl server); + @Message(id = 221024, value = "Backup server {0} is synchronized with live server, nodeID={1}.", format = Message.Format.MESSAGE_FORMAT) + void backupServerSynchronized(ActiveMQServerImpl server, String liveID); @LogMessage(level = Logger.Level.INFO) @Message(id = 221025, value = "Replication: sending {0} (size={1}) to replica.", format = Message.Format.MESSAGE_FORMAT) diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/NodeManager.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/NodeManager.java index d136402f69..b6dd08f19c 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/NodeManager.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/NodeManager.java @@ -39,6 +39,7 @@ public abstract class NodeManager implements ActiveMQComponent { private UUID uuid; private boolean isStarted = false; private final Set lockListeners; + protected long nodeActivationSequence; // local version of a coordinated sequence, tracking state transitions of ownership public NodeManager(final boolean replicatedBackup) { this.replicatedBackup = replicatedBackup; @@ -79,18 +80,30 @@ public abstract class NodeManager implements ActiveMQComponent { } } - public long readDataVersion() 
throws NodeManagerException { + public long readNodeActivationSequence() throws NodeManagerException { // TODO make it abstract throw new UnsupportedOperationException("TODO"); } - public void writeDataVersion(long version) throws NodeManagerException { + public void writeNodeActivationSequence(long version) throws NodeManagerException { // TODO make it abstract throw new UnsupportedOperationException("TODO"); } public abstract SimpleString readNodeId() throws NodeManagerException; + public long getNodeActivationSequence() { + synchronized (nodeIDGuard) { + return nodeActivationSequence; + } + } + + public void setNodeActivationSequence(long activationSequence) { + synchronized (nodeIDGuard) { + nodeActivationSequence = activationSequence; + } + } + public UUID getUUID() { synchronized (nodeIDGuard) { return uuid; diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationBackupPolicy.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationBackupPolicy.java index 5427360fcf..c3d0c87d59 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationBackupPolicy.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationBackupPolicy.java @@ -32,10 +32,8 @@ public class ReplicationBackupPolicy implements HAPolicy activationParams, IOCriticalErrorListener shutdownOnCriticalIO) throws Exception { - return new ReplicationBackupActivation(server, wasLive, DistributedPrimitiveManager.newInstanceOf( - distributedManagerConfiguration.getClassName(), - distributedManagerConfiguration.getProperties()), this); + return new ReplicationBackupActivation(server, DistributedPrimitiveManager.newInstanceOf( + managerConfiguration.getClassName(), managerConfiguration.getProperties()), this); } @Override @@ -157,14 +146,6 @@ public class ReplicationBackupPolicy implements HAPolicy - * This search isn't filtering the 
caller broker transport and is meant to be used - * when the broker acceptors aren't running yet. - */ - public static boolean searchActiveLiveNodeId(String clusterName, - String nodeId, - long timeout, - TimeUnit unit, - Configuration serverConfiguration) throws ActiveMQException { - if (serverConfiguration.getClusterConfigurations().isEmpty()) - return false; - final ClusterConnectionConfiguration clusterConnectionConfiguration = ConfigurationUtils.getReplicationClusterConfiguration(serverConfiguration, clusterName); - - final LiveNodeIdListener liveNodeIdListener = new LiveNodeIdListener(nodeId, serverConfiguration.getClusterUser(), serverConfiguration.getClusterPassword()); - - try (ServerLocatorInternal locator = createLocator(serverConfiguration, clusterConnectionConfiguration)) { - // if would like to filter out a transport configuration: - // locator.setClusterTransportConfiguration(callerBrokerTransportConfiguration) - locator.addClusterTopologyListener(liveNodeIdListener); - locator.setReconnectAttempts(0); - try (ClientSessionFactoryInternal ignored = locator.connectNoWarnings()) { - return liveNodeIdListener.awaitNodePresent(timeout, unit); - } catch (Exception notConnected) { - if (!(notConnected instanceof ActiveMQException) || ActiveMQExceptionType.INTERNAL_ERROR.equals(((ActiveMQException) notConnected).getType())) { - // report all exceptions that aren't ActiveMQException and all INTERNAL_ERRORs - ActiveMQServerLogger.LOGGER.failedConnectingToCluster(notConnected); - } - return false; - } - } - } - - private static final class LiveNodeIdListener implements ClusterTopologyListener { - - private static final Logger logger = Logger.getLogger(LiveNodeIdListener.class); - private final String nodeId; - private final String user; - private final String password; - private final CountDownLatch searchCompleted; - private boolean isNodePresent = false; - - LiveNodeIdListener(String nodeId, String user, String password) { - this.nodeId = nodeId; - this.user 
= user; - this.password = password; - this.searchCompleted = new CountDownLatch(1); - } - - @Override - public void nodeUP(TopologyMember topologyMember, boolean last) { - boolean isOurNodeId = nodeId != null && nodeId.equals(topologyMember.getNodeId()); - if (isOurNodeId && isActive(topologyMember.getLive())) { - isNodePresent = true; - } - if (isOurNodeId || last) { - searchCompleted.countDown(); - } - } - - public boolean awaitNodePresent(long timeout, TimeUnit unit) throws InterruptedException { - searchCompleted.await(timeout, unit); - return isNodePresent; - } - - /** - * In a cluster of replicated live/backup pairs if a backup crashes and then its live crashes the cluster will - * retain the topology information of the live such that when the live server restarts it will check the - * cluster to see if its nodeID is present (which it will be) and then it will activate as a backup rather than - * a live. To prevent this situation an additional check is necessary to see if the server with the matching - * nodeID is actually active or not which is done by attempting to make a connection to it. 
- * - * @param transportConfiguration - * @return - */ - private boolean isActive(TransportConfiguration transportConfiguration) { - try (ServerLocator serverLocator = ActiveMQClient.createServerLocator(false, transportConfiguration); - ClientSessionFactory clientSessionFactory = serverLocator.createSessionFactory(); - ClientSession clientSession = clientSessionFactory.createSession(user, password, false, false, false, false, 0)) { - return true; - } catch (Exception e) { - logger.debug("isActive check failed", e); - return false; - } - } - - @Override - public void nodeDown(long eventUID, String nodeID) { - // no-op - } - } - - private static ServerLocatorInternal createLocator(Configuration configuration, - ClusterConnectionConfiguration config) throws ActiveMQException { - final ServerLocatorInternal locator; - if (config.getDiscoveryGroupName() != null) { - DiscoveryGroupConfiguration dg = configuration.getDiscoveryGroupConfigurations().get(config.getDiscoveryGroupName()); - - if (dg == null) { - throw ActiveMQMessageBundle.BUNDLE.noDiscoveryGroupFound(null); - } - locator = (ServerLocatorInternal) ActiveMQClient.createServerLocatorWithHA(dg); - } else { - TransportConfiguration[] tcConfigs = config.getStaticConnectors() != null ? 
configuration.getTransportConfigurations(config.getStaticConnectors()) : null; - - locator = (ServerLocatorInternal) ActiveMQClient.createServerLocatorWithHA(tcConfigs); - } - return locator; - } - -} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileBasedNodeManager.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileBasedNodeManager.java index 4a8b59f27d..56b59252a8 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileBasedNodeManager.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileBasedNodeManager.java @@ -28,7 +28,7 @@ import org.apache.activemq.artemis.core.server.NodeManager; import org.apache.activemq.artemis.utils.UUID; import org.apache.activemq.artemis.utils.UUIDGenerator; -import static java.nio.file.StandardOpenOption.CREATE_NEW; +import static java.nio.file.StandardOpenOption.CREATE; import static java.nio.file.StandardOpenOption.READ; import static java.nio.file.StandardOpenOption.WRITE; @@ -36,33 +36,36 @@ public abstract class FileBasedNodeManager extends NodeManager { protected static final byte FIRST_TIME_START = '0'; public static final String SERVER_LOCK_NAME = "server.lock"; - public static final String DATA_VERSION_NAME = "server.data.version"; + public static final String SERVER_ACTIVATION_SEQUENCE_NAME = "server.activation.sequence"; private static final String ACCESS_MODE = "rw"; private final File directory; protected FileChannel channel; - protected FileChannel dataVersionChannel; + protected FileChannel activationSequenceChannel; public FileBasedNodeManager(boolean replicatedBackup, File directory) { super(replicatedBackup); this.directory = directory; + if (directory != null) { + directory.mkdirs(); + } } - protected void useDataVersionChannel() throws IOException { - if (dataVersionChannel != null) { + protected void useActivationSequenceChannel() throws IOException { + if 
(activationSequenceChannel != null) { return; } - dataVersionChannel = FileChannel.open(newFile(DATA_VERSION_NAME).toPath(), READ, WRITE, CREATE_NEW); + activationSequenceChannel = FileChannel.open(newFile(SERVER_ACTIVATION_SEQUENCE_NAME).toPath(), READ, WRITE, CREATE); } @Override - public long readDataVersion() throws NodeManagerException { + public long readNodeActivationSequence() throws NodeManagerException { if (!isStarted()) { throw new NodeManagerException(new IllegalStateException("node manager must be started first")); } try { - useDataVersionChannel(); + useActivationSequenceChannel(); ByteBuffer tmpBuffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN); - if (dataVersionChannel.read(tmpBuffer, 0) != Long.BYTES) { + if (activationSequenceChannel.read(tmpBuffer, 0) != Long.BYTES) { return 0; } tmpBuffer.flip(); @@ -73,16 +76,17 @@ public abstract class FileBasedNodeManager extends NodeManager { } @Override - public void writeDataVersion(long version) throws NodeManagerException { + public void writeNodeActivationSequence(long version) throws NodeManagerException { if (!isStarted()) { throw new NodeManagerException(new IllegalStateException("node manager must be started first")); } try { - useDataVersionChannel(); + useActivationSequenceChannel(); ByteBuffer tmpBuffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN); tmpBuffer.putLong(0, version); - dataVersionChannel.write(tmpBuffer, 0); - dataVersionChannel.force(false); + activationSequenceChannel.write(tmpBuffer, 0); + activationSequenceChannel.force(false); + setNodeActivationSequence(version); } catch (IOException ie) { throw new NodeManagerException(ie); } @@ -149,12 +153,8 @@ public abstract class FileBasedNodeManager extends NodeManager { createNodeId(); } - /** - * @return - */ protected final File newFile(final String fileName) { - File file = new File(directory, fileName); - return file; + return new File(directory, fileName); } protected final synchronized void 
createNodeId() throws IOException { @@ -190,8 +190,8 @@ public abstract class FileBasedNodeManager extends NodeManager { channelCopy.close(); } finally { try { - FileChannel dataVersionChannel = this.dataVersionChannel; - this.dataVersionChannel = null; + FileChannel dataVersionChannel = this.activationSequenceChannel; + this.activationSequenceChannel = null; if (dataVersionChannel != null) { dataVersionChannel.close(); } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationBackupActivation.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationBackupActivation.java index 4441bf0018..d925113b38 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationBackupActivation.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationBackupActivation.java @@ -19,9 +19,6 @@ package org.apache.activemq.artemis.core.server.impl; import javax.annotation.concurrent.GuardedBy; import java.util.Objects; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Consumer; @@ -35,7 +32,7 @@ import org.apache.activemq.artemis.core.replication.ReplicationEndpoint; import org.apache.activemq.artemis.core.server.ActiveMQServer; import org.apache.activemq.artemis.core.server.ActiveMQServerLogger; import org.apache.activemq.artemis.core.server.LiveNodeLocator; -import org.apache.activemq.artemis.core.server.NodeManager;; +import org.apache.activemq.artemis.core.server.NodeManager; import org.apache.activemq.artemis.core.server.cluster.ClusterControl; import org.apache.activemq.artemis.core.server.cluster.ClusterController; import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationBackupPolicy; @@ -45,6 +42,8 @@ import 
org.apache.activemq.artemis.quorum.UnavailableStateException; import org.jboss.logging.Logger; import static org.apache.activemq.artemis.core.server.impl.ReplicationObserver.ReplicationFailure; +import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.ensureSequentialAccessToNodeData; +import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.tryActivate; /** * This activation can be used by a primary while trying to fail-back ie {@code failback == true} or @@ -54,7 +53,6 @@ public final class ReplicationBackupActivation extends Activation implements Dis private static final Logger LOGGER = Logger.getLogger(ReplicationBackupActivation.class); - private final boolean wasLive; private final ReplicationBackupPolicy policy; private final ActiveMQServerImpl activeMQServer; // This field is != null iff this node is a primary during a fail-back ie acting as a backup in order to become live again. @@ -72,10 +70,8 @@ public final class ReplicationBackupActivation extends Activation implements Dis private final AtomicBoolean stopping; public ReplicationBackupActivation(final ActiveMQServerImpl activeMQServer, - final boolean wasLive, final DistributedPrimitiveManager distributedManager, final ReplicationBackupPolicy policy) { - this.wasLive = wasLive; this.activeMQServer = activeMQServer; if (policy.isTryFailback()) { final SimpleString serverNodeID = activeMQServer.getNodeID(); @@ -146,14 +142,35 @@ public final class ReplicationBackupActivation extends Activation implements Dis } } try { - LOGGER.info("Trying to reach majority of quorum service nodes"); distributedManager.start(); - LOGGER.info("Quorum service available: starting broker"); + final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence(); + // only a backup with positive local activation sequence could contain valuable data + if (nodeActivationSequence > 0) { + final String nodeId = 
activeMQServer.getNodeManager().getNodeId().toString(); + DistributedLock liveLockWithInSyncReplica; + while (true) { + distributedManager.start(); + try { + liveLockWithInSyncReplica = tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER); + break; + } catch (UnavailableStateException canRecoverEx) { + distributedManager.stop(); + } + } + if (liveLockWithInSyncReplica != null) { + // retain state and start as live + if (!activeMQServer.initialisePart1(false)) { + return; + } + activeMQServer.setState(ActiveMQServerImpl.SERVER_STATE.STARTED); + startAsLive(liveLockWithInSyncReplica); + return; + } + } distributedManager.addUnavailableManagerListener(this); // Stop the previous node manager and create a new one with NodeManager::replicatedBackup == true: // NodeManager::start skip setup lock file with NodeID, until NodeManager::stopBackup is called. activeMQServer.resetNodeManager(); - activeMQServer.getNodeManager().stop(); // A primary need to preserve NodeID across runs activeMQServer.moveServerData(policy.getMaxSavedReplicatedJournalsSize(), policy.isTryFailback()); activeMQServer.getNodeManager().start(); @@ -164,11 +181,15 @@ public final class ReplicationBackupActivation extends Activation implements Dis if (closed) return; } + + final ClusterController clusterController = activeMQServer.getClusterManager().getClusterController(); + + LOGGER.infof("Apache ActiveMQ Artemis Backup Server version %s [%s] started, awaiting connection to a live cluster member to start replication", activeMQServer.getVersion().getFullVersion(), + activeMQServer.toString()); + clusterController.awaitConnectionToReplicationCluster(); activeMQServer.getBackupManager().start(); - ActiveMQServerLogger.LOGGER.backupServerStarted(activeMQServer.getVersion().getFullVersion(), - activeMQServer.getNodeManager().getNodeId()); activeMQServer.setState(ActiveMQServerImpl.SERVER_STATE.STARTED); final DistributedLock liveLock = replicateAndFailover(clusterController); if 
(liveLock == null) { @@ -192,6 +213,14 @@ public final class ReplicationBackupActivation extends Activation implements Dis liveLock.close(); return; } + try { + ensureSequentialAccessToNodeData(activeMQServer, distributedManager, LOGGER); + } catch (Throwable fatal) { + LOGGER.warn(fatal); + // policy is already live one, but there's no activation yet: we can just stop + asyncRestartServer(activeMQServer, false, false); + throw new ActiveMQIllegalStateException("This server cannot ensure sequential access to broker data: activation is failed"); + } ActiveMQServerLogger.LOGGER.becomingLive(activeMQServer); // stopBackup is going to write the NodeID previously set on the NodeManager, // because activeMQServer.resetNodeManager() has created a NodeManager with replicatedBackup == true. @@ -260,28 +289,38 @@ public final class ReplicationBackupActivation extends Activation implements Dis return null; } LOGGER.debugf("ReplicationFailure = %s", failure); - boolean voluntaryFailOver = false; switch (failure) { case VoluntaryFailOver: - voluntaryFailOver = true; case NonVoluntaryFailover: - final DistributedLock liveLock = tryAcquireLiveLock(); // from now on we're meant to stop: // - due to failover // - due to restart/stop - assert stopping.get(); - if (liveLock != null) { - return liveLock; + if (!stopping.compareAndSet(false, true)) { + return null; } - boolean restart = true; - if (voluntaryFailOver && isFirstFailbackAttempt()) { - restart = false; - LOGGER.error("Failed to fail-back: stopping broker based on quorum results"); + // no more interested into these events: handling it manually from here + distributedManager.removeUnavailableManagerListener(this); + final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence(); + final String nodeId = activeMQServer.getNodeManager().getNodeId().toString(); + DistributedLock liveLockWithInSyncReplica = null; + if (nodeActivationSequence > 0) { + try { + liveLockWithInSyncReplica = 
tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER); + } catch (Throwable error) { + // no need to retry here, can just restart as backup that will handle a more resilient tryActivate + LOGGER.warn("Errored while attempting failover", error); + liveLockWithInSyncReplica = null; + } } else { - ActiveMQServerLogger.LOGGER.restartingAsBackupBasedOnQuorumVoteResults(); + LOGGER.warnf("We expect local activation sequence for NodeID = %s to be > 0 on a fail-over, while is %d", nodeId, nodeActivationSequence); } - // let's ignore the stopping flag here, we're in control of it - asyncRestartServer(activeMQServer, restart, false); + assert stopping.get(); + if (liveLockWithInSyncReplica != null) { + return liveLockWithInSyncReplica; + } + ActiveMQServerLogger.LOGGER.restartingAsBackupBasedOnQuorumVoteResults(); + // let's ignore the stopping flag here, we're already in control of it + asyncRestartServer(activeMQServer, true, false); return null; case RegistrationError: LOGGER.error("Stopping broker because of critical registration error"); @@ -307,73 +346,7 @@ public final class ReplicationBackupActivation extends Activation implements Dis } } } finally { - silentExecution("Errored on cluster topology listener for replication cleanup", () -> clusterController.removeClusterTopologyListenerForReplication(nodeLocator)); - } - } - - /** - * {@code wasLive} is {code true} only while transitioning from primary to backup.
- * If a natural born backup become live and allows failback, while transitioning to back again - * {@code wasLive} is still {@code false}.
- * The check on {@link ReplicationBackupPolicy#isTryFailback()} is redundant but still useful for correctness. - *

- * In case of fail-back, any event that's going to restart this broker as backup (eg quorum service unavailable - * or some replication failures) will cause {@code wasLive} to be {@code false}, because the HA policy set isn't - * a primary anymore. - */ - private boolean isFirstFailbackAttempt() { - return wasLive && policy.isTryFailback(); - } - - private DistributedLock tryAcquireLiveLock() throws InterruptedException { - // disable quorum service unavailability handling and just treat this imperatively - if (!stopping.compareAndSet(false, true)) { - // already unavailable quorum service: fail fast - return null; - } - distributedManager.removeUnavailableManagerListener(this); - assert activeMQServer.getNodeManager().getNodeId() != null; - final String liveID = activeMQServer.getNodeManager().getNodeId().toString(); - final int voteRetries = policy.getVoteRetries(); - final long maxAttempts = voteRetries >= 0 ? (voteRetries + 1) : -1; - if (maxAttempts == -1) { - LOGGER.error("It's not safe to retry an infinite amount of time to acquire a live lock: please consider setting a vote-retries value"); - } - final long voteRetryWait = policy.getVoteRetryWait(); - final DistributedLock liveLock = getLock(distributedManager, liveID); - if (liveLock == null) { - return null; - } - for (long attempt = 0; maxAttempts >= 0 ? 
(attempt < maxAttempts) : true; attempt++) { - try { - if (liveLock.tryLock(voteRetryWait, TimeUnit.MILLISECONDS)) { - LOGGER.debugf("%s live lock acquired after %d attempts.", liveID, (attempt + 1)); - return liveLock; - } - } catch (UnavailableStateException e) { - LOGGER.warnf(e, "Failed to acquire live lock %s because of unavailable quorum service: stop trying", liveID); - distributedManager.stop(); - return null; - } - } - LOGGER.warnf("Failed to acquire live lock %s after %d tries", liveID, maxAttempts); - distributedManager.stop(); - return null; - } - - private DistributedLock getLock(final DistributedPrimitiveManager manager, - final String lockId) throws InterruptedException { - if (!manager.isStarted()) { - return null; - } - try { - return manager.getDistributedLock(lockId); - } catch (ExecutionException e) { - LOGGER.warnf(e, "Errored while getting lock %s", lockId); - return null; - } catch (TimeoutException te) { - LOGGER.warnf(te, "Timeout while getting lock %s", lockId); - return null; + silentExecution("Error on cluster topology listener for replication cleanup", () -> clusterController.removeClusterTopologyListenerForReplication(nodeLocator)); } } @@ -405,7 +378,6 @@ public final class ReplicationBackupActivation extends Activation implements Dis return ReplicationFailure.RegistrationError; } this.replicationEndpoint = replicationEndpoint; - assert replicationEndpoint != null; try { return replicationObserver.awaitReplicationFailure(); } finally { @@ -414,11 +386,11 @@ public final class ReplicationBackupActivation extends Activation implements Dis closeChannelOf(replicationEndpoint); } } finally { - silentExecution("Errored on live control close", liveControl::close); + silentExecution("Error on live control close", liveControl::close); } } finally { - silentExecution("Errored on cluster topology listener cleanup", () -> clusterController.removeClusterTopologyListener(replicationObserver)); - silentExecution("Errored while removing incoming 
interceptor for replication", () -> clusterController.removeIncomingInterceptorForReplication(replicationError)); + silentExecution("Error on cluster topology listener cleanup", () -> clusterController.removeClusterTopologyListener(replicationObserver)); + silentExecution("Error while removing incoming interceptor for replication", () -> clusterController.removeIncomingInterceptorForReplication(replicationError)); } } finally { this.replicationObserver = null; @@ -438,7 +410,7 @@ public final class ReplicationBackupActivation extends Activation implements Dis return; } if (replicationEndpoint.getChannel() != null) { - silentExecution("Errored while closing replication endpoint channel", () -> replicationEndpoint.getChannel().close()); + silentExecution("Error while closing replication endpoint channel", () -> replicationEndpoint.getChannel().close()); replicationEndpoint.setChannel(null); } } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationPrimaryActivation.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationPrimaryActivation.java index 285f516220..abf92226d7 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationPrimaryActivation.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationPrimaryActivation.java @@ -19,12 +19,14 @@ package org.apache.activemq.artemis.core.server.impl; import javax.annotation.concurrent.GuardedBy; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.activemq.artemis.api.core.ActiveMQAlreadyReplicatingException; import org.apache.activemq.artemis.api.core.ActiveMQException; import org.apache.activemq.artemis.api.core.ActiveMQIllegalStateException; import org.apache.activemq.artemis.api.core.Pair; import org.apache.activemq.artemis.api.core.TransportConfiguration; +import 
org.apache.activemq.artemis.core.persistence.StorageManager; import org.apache.activemq.artemis.core.protocol.core.Channel; import org.apache.activemq.artemis.core.protocol.core.ChannelHandler; import org.apache.activemq.artemis.core.protocol.core.CoreRemotingConnection; @@ -46,7 +48,10 @@ import org.apache.activemq.artemis.quorum.UnavailableStateException; import org.apache.activemq.artemis.spi.core.remoting.Acceptor; import org.jboss.logging.Logger; -import static org.apache.activemq.artemis.core.server.impl.ClusterTopologySearch.searchActiveLiveNodeId; +import static org.apache.activemq.artemis.core.server.ActiveMQServer.SERVER_STATE.STARTED; +import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.awaitNextCommittedActivationSequence; +import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.ensureSequentialAccessToNodeData; +import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.tryActivate; /** * This is going to be {@link #run()} just by natural born primary, at the first start. 
@@ -55,8 +60,9 @@ import static org.apache.activemq.artemis.core.server.impl.ClusterTopologySearch public class ReplicationPrimaryActivation extends LiveActivation implements DistributedLock.UnavailableLockListener { private static final Logger LOGGER = Logger.getLogger(ReplicationPrimaryActivation.class); - private static final long DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS = 20_000; - private static final long BLOCKING_CALLS_TIMEOUT_MILLIS = 5_000; + // This is the time we expect a replica to become a live from the quorum pov + // ie time to execute tryActivate and ensureSequentialAccessToNodeData + private static final long FAILBACK_TIMEOUT_MILLIS = 4_000; private final ReplicationPrimaryPolicy policy; @@ -69,7 +75,7 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist private final DistributedPrimitiveManager distributedManager; - private volatile boolean stoppingServer; + private final AtomicBoolean stoppingServer; public ReplicationPrimaryActivation(final ActiveMQServerImpl activeMQServer, final DistributedPrimitiveManager distributedManager, @@ -78,6 +84,7 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist this.policy = policy; this.replicationLock = new Object(); this.distributedManager = distributedManager; + this.stoppingServer = new AtomicBoolean(); } /** @@ -101,19 +108,36 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist @Override public void run() { try { - - final NodeManager nodeManager = activeMQServer.getNodeManager(); - - final String nodeId = nodeManager.readNodeId().toString(); - - final long dataVersion = nodeManager.readDataVersion(); - - final DistributedLock liveLock = searchLiveOrAcquireLiveLock(nodeId, BLOCKING_CALLS_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS); - + // we have a common nodeId that we can share and coordinate with between peers + if (policy.getCoordinationId() != null) { + LOGGER.infof("Applying shared peer NodeID=%s to enable 
coordinated live activation", policy.getCoordinationId()); + // REVISIT: this is quite clunky, also in backup activation, we just need new nodeID persisted! + activeMQServer.resetNodeManager(); + activeMQServer.getNodeManager().start(); + activeMQServer.getNodeManager().setNodeID(policy.getCoordinationId()); + activeMQServer.getNodeManager().stopBackup(); + } + final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence(); + final String nodeId = activeMQServer.getNodeManager().readNodeId().toString(); + DistributedLock liveLock; + while (true) { + distributedManager.start(); + try { + liveLock = tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER); + break; + } catch (UnavailableStateException canRecoverEx) { + distributedManager.stop(); + } + } if (liveLock == null) { + distributedManager.stop(); + LOGGER.infof("This broker cannot become a live server with NodeID = %s: restarting as backup", nodeId); + activeMQServer.setHAPolicy(policy.getBackupPolicy()); return; } + ensureSequentialAccessToNodeData(activeMQServer, distributedManager, LOGGER); + activeMQServer.initialisePart1(false); activeMQServer.initialisePart2(false); @@ -142,73 +166,9 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist } } - private DistributedLock searchLiveOrAcquireLiveLock(final String nodeId, - final long blockingCallTimeout, - final TimeUnit unit) throws ActiveMQException, InterruptedException { - if (policy.isCheckForLiveServer()) { - LOGGER.infof("Searching a live server with NodeID = %s", nodeId); - if (searchActiveLiveNodeId(policy.getClusterName(), nodeId, blockingCallTimeout, unit, activeMQServer.getConfiguration())) { - LOGGER.infof("Found a live server with NodeID = %s: restarting as backup", nodeId); - activeMQServer.setHAPolicy(policy.getBackupPolicy()); - return null; - } - } - startDistributedPrimitiveManager(); - return acquireDistributeLock(getDistributeLock(nodeId), blockingCallTimeout, 
unit); - } - - private void startDistributedPrimitiveManager() throws InterruptedException, ActiveMQException { - LOGGER.infof("Trying to reach the majority of quorum nodes in %d ms.", DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS); - try { - if (distributedManager.start(DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS)) { - return; - } - } catch (InterruptedException ie) { - throw ie; - } catch (Throwable t) { - LOGGER.debug(t); - } - assert !distributedManager.isStarted(); - throw new ActiveMQException("Cannot reach the majority of quorum nodes"); - } - - private DistributedLock getDistributeLock(final String nodeId) throws InterruptedException, ActiveMQException { - try { - return distributedManager.getDistributedLock(nodeId); - } catch (Throwable t) { - try { - distributedManager.stop(); - } catch (Throwable ignore) { - // don't care - } - if (t instanceof InterruptedException) { - throw (InterruptedException) t; - } - throw new ActiveMQException("Cannot obtain a live lock instance"); - } - } - - private DistributedLock acquireDistributeLock(final DistributedLock liveLock, - final long acquireLockTimeout, - final TimeUnit unit) throws InterruptedException, ActiveMQException { - try { - if (liveLock.tryLock(acquireLockTimeout, unit)) { - return liveLock; - } - } catch (UnavailableStateException e) { - LOGGER.debug(e); - } - try { - distributedManager.stop(); - } catch (Throwable ignore) { - // don't care - } - throw new ActiveMQException("Failed to become live"); - } - @Override public ChannelHandler getActivationChannelHandler(final Channel channel, final Acceptor acceptorUsed) { - if (stoppingServer) { + if (stoppingServer.get()) { return null; } return packet -> { @@ -268,7 +228,7 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist awaitBackupAnnouncementOnFailbackRequest(clusterConnection); } } catch (Exception e) { - if (activeMQServer.getState() == ActiveMQServerImpl.SERVER_STATE.STARTED) { + if 
(activeMQServer.getState() == STARTED) { /* * The reasoning here is that the exception was either caused by (1) the * (interaction with) the backup, or (2) by an IO Error at the storage. If (1), we @@ -283,7 +243,9 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist ActiveMQServerLogger.LOGGER.errorStoppingReplication(amqe); } finally { synchronized (replicationLock) { - this.replicationManager = null; + if (this.replicationManager == replicationManager) { + this.replicationManager = null; + } } } } @@ -308,35 +270,55 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist } } - /** - * If {@link #asyncStopServer()} happens before this call, the restart just won't happen. - * If {@link #asyncStopServer()} happens after this call, will make the server to stop right after being restarted. - */ private void restartAsBackupAfterFailback() throws Exception { - if (stoppingServer) { + if (stoppingServer.get()) { return; } - synchronized (this) { - if (stoppingServer) { + final String coordinatedLockAndNodeId; + final long inSyncReplicaActivation; + synchronized (replicationLock) { + if (stoppingServer.get()) { return; } - distributedManager.stop(); + final ReplicationManager replicationManager = this.replicationManager; + if (replicationManager == null) { + LOGGER.warnf("Failback interrupted"); + // we got a disconnection from the replica *before* stopping acceptors: better not failback! + return; + } + // IMPORTANT: this is going to save server::fail to issue a replica connection failure (with failed == false) + // because onReplicationConnectionClose fail-fast on stopping == true. 
+ if (!stoppingServer.compareAndSet(false, true)) { + LOGGER.infof("Failback interrupted: server is already stopping"); + return; + } + coordinatedLockAndNodeId = activeMQServer.getNodeManager().getNodeId().toString(); + inSyncReplicaActivation = activeMQServer.getNodeManager().getNodeActivationSequence(); + // none can notice a concurrent drop of replica connection here: awaitNextCommittedActivationSequence defensively + // wait FAILBACK_TIMEOUT_MILLIS, proceed as backup and compete to become live again activeMQServer.fail(true); - ActiveMQServerLogger.LOGGER.restartingReplicatedBackupAfterFailback(); - activeMQServer.setHAPolicy(policy.getBackupPolicy()); - activeMQServer.start(); } + try { + distributedManager.start(); + if (!awaitNextCommittedActivationSequence(distributedManager, coordinatedLockAndNodeId, inSyncReplicaActivation, FAILBACK_TIMEOUT_MILLIS, LOGGER)) { + LOGGER.warnf("Timed out waiting for failback server activation with NodeID = %s: and sequence > %d: after %dms", + coordinatedLockAndNodeId, inSyncReplicaActivation, FAILBACK_TIMEOUT_MILLIS); + } + } catch (UnavailableStateException ignored) { + LOGGER.debug("Unavailable distributed manager while awaiting failback activation sequence: ignored", ignored); + } finally { + distributedManager.stop(); + } + ActiveMQServerLogger.LOGGER.restartingReplicatedBackupAfterFailback(); + activeMQServer.setHAPolicy(policy.getBackupPolicy()); + activeMQServer.start(); } private void asyncStopServer() { - if (stoppingServer) { + if (stoppingServer.get()) { return; } - synchronized (this) { - if (stoppingServer) { - return; - } - stoppingServer = true; + if (stoppingServer.compareAndSet(false, true)) { new Thread(() -> { try { activeMQServer.stop(); @@ -374,19 +356,28 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist private void onReplicationConnectionClose() { ExecutorService executorService = activeMQServer.getThreadPool(); if (executorService != null) { - synchronized 
(replicationLock) { - if (replicationManager == null) { - return; - } + if (stoppingServer.get()) { + return; } executorService.execute(() -> { synchronized (replicationLock) { if (replicationManager == null) { return; } + // we increment only if we are staying alive + if (!stoppingServer.get() && STARTED.equals(activeMQServer.getState())) { + try { + ensureSequentialAccessToNodeData(activeMQServer, distributedManager, LOGGER); + } catch (Throwable fatal) { + LOGGER.errorf(fatal, "Unexpected exception: %s on attempted activation sequence increment; stopping server async", fatal.getLocalizedMessage()); + asyncStopServer(); + } + } // this is going to stop the replication manager - activeMQServer.getStorageManager().stopReplication(); - assert !replicationManager.isStarted(); + final StorageManager storageManager = activeMQServer.getStorageManager(); + if (storageManager != null) { + storageManager.stopReplication(); + } replicationManager = null; } }); diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/quorum/ActivationSequenceStateMachine.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/quorum/ActivationSequenceStateMachine.java new file mode 100644 index 0000000000..96083634df --- /dev/null +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/quorum/ActivationSequenceStateMachine.java @@ -0,0 +1,312 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.activemq.artemis.core.server.impl.quorum; + +import java.util.Objects; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import org.apache.activemq.artemis.api.core.ActiveMQException; +import org.apache.activemq.artemis.core.server.ActiveMQServer; +import org.apache.activemq.artemis.core.server.NodeManager; +import org.apache.activemq.artemis.quorum.DistributedLock; +import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; +import org.apache.activemq.artemis.quorum.MutableLong; +import org.apache.activemq.artemis.quorum.UnavailableStateException; +import org.jboss.logging.Logger; + +/** + * This class contains the activation sequence logic of the pluggable quorum vote: + * it should be used by {@link org.apache.activemq.artemis.core.server.impl.ReplicationBackupActivation} + * and {@link org.apache.activemq.artemis.core.server.impl.ReplicationPrimaryActivation} to coordinate + * for replication. + */ +public final class ActivationSequenceStateMachine { + + private static final long CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS = 200; + private static final long CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS = 2000; + private static final long LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS = 2000; + + private ActivationSequenceStateMachine() { + + } + + /** + * It loops if the data of the broker is still valuable, but cannot become live. + * It loops (temporarly) if data is in sync or can self-heal, but cannot yet acquire the live lock. + *

+ * It stops loop and return: + *

    + *
  • {@code null}: if data is stale (and there are no rights to become live) + *
  • {@code !=null}: if data is in sync and the {@link DistributedLock} is correctly acquired + *

+ *

+ * After successfully returning from this method ie not null return value, a broker should use + * {@link #ensureSequentialAccessToNodeData(ActiveMQServer, DistributedPrimitiveManager, Logger)} to complete + * the activation and guarantee the initial not-replicated ownership of data. + */ + public static DistributedLock tryActivate(final String nodeId, + final long nodeActivationSequence, + final DistributedPrimitiveManager distributedManager, + final Logger logger) throws InterruptedException, ExecutionException, TimeoutException, UnavailableStateException { + final DistributedLock activationLock = distributedManager.getDistributedLock(nodeId); + try (MutableLong coordinatedNodeSequence = distributedManager.getMutableLong(nodeId)) { + while (true) { + // dirty read is sufficient to know if we are *not* an in sync replica + // typically the lock owner will increment to signal our data is stale and we are happy without any + // further coordination at this point + switch (validateActivationSequence(coordinatedNodeSequence, activationLock, nodeId, nodeActivationSequence, logger)) { + + case Stale: + activationLock.close(); + return null; + case SelfRepair: + case InSync: + break; + case MaybeInSync: + if (activationLock.tryLock()) { + // BAD: where's the broker that should commit it? 
+ activationLock.unlock(); + logger.warnf("Cannot assume live role for NodeID = %s: claimed activation sequence need to be repaired", + nodeId); + TimeUnit.MILLISECONDS.sleep(CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS); + continue; + } + // quick path while data is still valuable: wait until something change (commit/repair) + TimeUnit.MILLISECONDS.sleep(CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS); + continue; + } + // SelfRepair, InSync + if (!activationLock.tryLock(LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS)) { + logger.debugf("Candidate for Node ID = %s, with local activation sequence: %d, cannot acquire live lock within %dms; retrying", + nodeId, nodeActivationSequence, LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS); + continue; + } + switch (validateActivationSequence(coordinatedNodeSequence, activationLock, nodeId, nodeActivationSequence, logger)) { + + case Stale: + activationLock.close(); + return null; + case SelfRepair: + // Self-repair sequence ie we were the only one with the most up to date data. + // NOTE: We cannot move the sequence now, let's delay it on ensureSequentialAccessToNodeData + logger.infof("Assuming live role for NodeID = %s: local activation sequence %d matches claimed coordinated activation sequence %d. 
Repairing sequence", nodeId, nodeActivationSequence, nodeActivationSequence); + return activationLock; + case InSync: + // we are an in_sync_replica, good to go live as UNREPLICATED + logger.infof("Assuming live role for NodeID = %s, local activation sequence %d matches current coordinated activation sequence %d", nodeId, nodeActivationSequence, nodeActivationSequence); + return activationLock; + case MaybeInSync: + activationLock.unlock(); + logger.warnf("Cannot assume live role for NodeID = %s: claimed activation sequence need to be repaired", nodeId); + TimeUnit.MILLISECONDS.sleep(CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS); + continue; + } + } + } + } + + private enum ValidationResult { + /** + * coordinated activation sequence (claimed/committed) is far beyond the local one: data is not valuable anymore + **/ + Stale, + /** + * coordinated activation sequence is the same as local one: data is in sync + **/ + InSync, + /** + * next coordinated activation sequence is not committed yet: maybe data is in sync + **/ + MaybeInSync, + /** + * next coordinated activation sequence is not committed yet, but this broker can self-repair: data is in sync + **/ + SelfRepair + } + + private static ValidationResult validateActivationSequence(final MutableLong coordinatedNodeSequence, + final DistributedLock activationLock, + final String lockAndLongId, + final long nodeActivationSequence, + final Logger logger) throws UnavailableStateException { + assert coordinatedNodeSequence.getMutableLongId().equals(lockAndLongId); + assert activationLock.getLockId().equals(lockAndLongId); + final long currentCoordinatedNodeSequence = coordinatedNodeSequence.get(); + if (nodeActivationSequence == currentCoordinatedNodeSequence) { + return ValidationResult.InSync; + } + if (currentCoordinatedNodeSequence > 0) { + logger.infof("Not a candidate for NodeID = %s activation, local activation sequence %d does not match coordinated activation sequence %d", + lockAndLongId, 
nodeActivationSequence, currentCoordinatedNodeSequence); + return ValidationResult.Stale; + } + // claimed activation sequence + final long claimedCoordinatedNodeSequence = -currentCoordinatedNodeSequence; + final long sequenceGap = claimedCoordinatedNodeSequence - nodeActivationSequence; + if (sequenceGap == 0) { + return ValidationResult.SelfRepair; + } + if (sequenceGap == 1) { + // maybe data is still valuable + return ValidationResult.MaybeInSync; + } + assert sequenceGap > 1; + // sequence is moved so much that data is no longer valuable + logger.infof("Not a candidate for NodeID = %s activation, local activation sequence %d does not match coordinated activation sequence %d", + lockAndLongId, nodeActivationSequence, claimedCoordinatedNodeSequence); + return ValidationResult.Stale; + } + + /** + * It wait until {@code timeoutMillis ms} has passed or the coordinated activation sequence has progressed enough + */ + public static boolean awaitNextCommittedActivationSequence(final DistributedPrimitiveManager distributedManager, + final String coordinatedLockAndNodeId, + final long activationSequence, + final long timeoutMills, + final Logger logger) + throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + Objects.requireNonNull(distributedManager); + Objects.requireNonNull(logger); + Objects.requireNonNull(coordinatedLockAndNodeId); + if (activationSequence < 0) { + throw new IllegalArgumentException("activationSequence must be >= 0, while is " + activationSequence); + } + if (!distributedManager.isStarted()) { + throw new IllegalStateException("manager must be started"); + } + final MutableLong coordinatedActivationSequence = distributedManager.getMutableLong(coordinatedLockAndNodeId); + // wait for the live to activate and run un replicated with a sequence > inSyncReplicaActivation + // this read can be dirty b/c we are just looking for an increment. 
+ boolean anyNext = false; + final long timeoutNs = TimeUnit.MILLISECONDS.toNanos(timeoutMills); + final long started = System.nanoTime(); + long elapsedNs; + do { + final long coordinatedValue = coordinatedActivationSequence.get(); + if (coordinatedValue > activationSequence) { + // all good, some activation has gone ahead + logger.infof("Detected a new activation sequence with NodeID = %s: and sequence: %d", coordinatedLockAndNodeId, coordinatedValue); + anyNext = true; + break; + } + if (coordinatedValue < 0) { + // commit claim + final long claimedSequence = -coordinatedValue; + final long activationsGap = claimedSequence - activationSequence; + if (activationsGap > 1) { + // all good, some activation has gone ahead + logger.infof("Detected furthers sequential server activations from sequence %d, with NodeID = %s: and claimed sequence: %d", activationSequence, coordinatedLockAndNodeId, claimedSequence); + anyNext = true; + break; + } + // activation is still in progress + logger.debugf("Detected claiming of activation sequence = %d for NodeID = %s", claimedSequence, coordinatedLockAndNodeId); + } + try { + TimeUnit.MILLISECONDS.sleep(CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS); + } catch (InterruptedException ignored) { + } + elapsedNs = System.nanoTime() - started; + } + while (elapsedNs < timeoutNs); + return anyNext; + } + + /** + * This is going to increment the coordinated activation sequence while holding the live lock, failing with some exception otherwise.
+ *

+ * The acceptable states are {@link ValidationResult#InSync} and {@link ValidationResult#SelfRepair}; any other state causes an exception to be thrown. + *

+ * This must be used while holding a live lock to ensure not-exclusive ownership of data ie can be both used + * while loosing connectivity with a replica or after successfully {@link #tryActivate(String, long, DistributedPrimitiveManager, Logger)}. + */ + public static void ensureSequentialAccessToNodeData(ActiveMQServer activeMQServer, + DistributedPrimitiveManager distributedPrimitiveManager, + final Logger logger) throws ActiveMQException, InterruptedException, UnavailableStateException, ExecutionException, TimeoutException { + + final NodeManager nodeManager = activeMQServer.getNodeManager(); + final String lockAndLongId = nodeManager.getNodeId().toString(); + final DistributedLock liveLock = distributedPrimitiveManager.getDistributedLock(lockAndLongId); + if (!liveLock.isHeldByCaller()) { + final String message = String.format("Server [%s], live lock for NodeID = %s, not held, activation sequence cannot be safely changed", + activeMQServer, lockAndLongId); + logger.info(message); + throw new UnavailableStateException(message); + } + final long nodeActivationSequence = nodeManager.readNodeActivationSequence(); + final MutableLong coordinatedNodeActivationSequence = distributedPrimitiveManager.getMutableLong(lockAndLongId); + final long currentCoordinatedActivationSequence = coordinatedNodeActivationSequence.get(); + final long nextActivationSequence; + if (currentCoordinatedActivationSequence < 0) { + // Check Self-Repair + if (nodeActivationSequence != -currentCoordinatedActivationSequence) { + final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, local activation sequence %d does not match current claimed coordinated sequence %d: need repair", + activeMQServer, lockAndLongId, nodeActivationSequence, -currentCoordinatedActivationSequence); + logger.info(message); + throw new ActiveMQException(message); + } + // auto-repair: this is the same server that failed to commit its claimed sequence + nextActivationSequence = 
nodeActivationSequence; + } else { + // Check InSync + if (nodeActivationSequence != currentCoordinatedActivationSequence) { + final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, local activation sequence %d does not match current coordinated sequence %d", + activeMQServer, lockAndLongId, nodeActivationSequence, currentCoordinatedActivationSequence); + logger.info(message); + throw new ActiveMQException(message); + } + nextActivationSequence = nodeActivationSequence + 1; + } + // UN_REPLICATED STATE ENTER: auto-repair doesn't need to claim and write locally + if (nodeActivationSequence != nextActivationSequence) { + // claim + if (!coordinatedNodeActivationSequence.compareAndSet(nodeActivationSequence, -nextActivationSequence)) { + final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, activation sequence claim failed, local activation sequence %d no longer matches current coordinated sequence %d", + activeMQServer, lockAndLongId, nodeActivationSequence, coordinatedNodeActivationSequence.get()); + logger.infof(message); + throw new ActiveMQException(message); + } + // claim success: write locally + try { + nodeManager.writeNodeActivationSequence(nextActivationSequence); + } catch (NodeManager.NodeManagerException fatal) { + logger.errorf("Server [%s] failed to set local activation sequence to: %d for NodeId =%s. 
Cannot continue committing coordinated activation sequence: REQUIRES ADMIN INTERVENTION", + activeMQServer, nextActivationSequence, lockAndLongId); + throw new UnavailableStateException(fatal); + } + logger.infof("Server [%s], incremented local activation sequence to: %d for NodeId = %s", + activeMQServer, nextActivationSequence, lockAndLongId); + } else { + // self-heal need to update the in-memory sequence, because no writes will do it + nodeManager.setNodeActivationSequence(nextActivationSequence); + } + // commit + if (!coordinatedNodeActivationSequence.compareAndSet(-nextActivationSequence, nextActivationSequence)) { + final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, activation sequence commit failed, local activation sequence %d no longer matches current coordinated sequence %d", + activeMQServer, lockAndLongId, nodeActivationSequence, coordinatedNodeActivationSequence.get()); + logger.infof(message); + throw new ActiveMQException(message); + } + logger.infof("Server [%s], incremented coordinated activation sequence to: %d for NodeId = %s", + activeMQServer, nextActivationSequence, lockAndLongId); + } + +} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/management/impl/ManagementServiceImpl.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/management/impl/ManagementServiceImpl.java index c5a49fa30d..5d9115b040 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/management/impl/ManagementServiceImpl.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/management/impl/ManagementServiceImpl.java @@ -244,7 +244,9 @@ public class ManagementServiceImpl implements ManagementService { ObjectName objectName = objectNameBuilder.getActiveMQServerObjectName(); unregisterFromJMX(objectName); unregisterFromRegistry(ResourceNames.BROKER); - unregisterMeters(ResourceNames.BROKER + "." 
+ messagingServer.getConfiguration().getName()); + if (messagingServer != null) { + unregisterMeters(ResourceNames.BROKER + "." + messagingServer.getConfiguration().getName()); + } } @Override diff --git a/artemis-server/src/main/resources/schema/artemis-configuration.xsd b/artemis-server/src/main/resources/schema/artemis-configuration.xsd index 9e79a54b7f..69e06cdcc2 100644 --- a/artemis-server/src/main/resources/schema/artemis-configuration.xsd +++ b/artemis-server/src/main/resources/schema/artemis-configuration.xsd @@ -3189,13 +3189,12 @@ - + - Whether to check the cluster for a (live) server using our own server ID when starting - up. This option is only necessary for performing 'fail-back' on replicating - servers. Strictly speaking this setting only applies to live servers and not to - backups. + The common identity to use for coordination that is shared across instances that will replicate. + The value will be used as the internal server nodeId and as the identity of entities in the + distributed-primitive-manager. 
@@ -3208,21 +3207,6 @@ - - - - If we start as a replica and lose connection to the master, how many times should we attempt to vote - for quorum before restarting - - - - - - - How long to wait (in milliseconds) between each vote - - - @@ -3288,20 +3272,6 @@ - - - - If we lose connection to the master, how many times should we attempt to vote for quorum before restarting - - - - - - - How long to wait (in milliseconds) between each vote - - - diff --git a/artemis-server/src/test/java/org/apache/activemq/artemis/core/config/impl/HAPolicyConfigurationTest.java b/artemis-server/src/test/java/org/apache/activemq/artemis/core/config/impl/HAPolicyConfigurationTest.java index 4b37a76125..4dc63779b0 100644 --- a/artemis-server/src/test/java/org/apache/activemq/artemis/core/config/impl/HAPolicyConfigurationTest.java +++ b/artemis-server/src/test/java/org/apache/activemq/artemis/core/config/impl/HAPolicyConfigurationTest.java @@ -283,7 +283,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase { assertFalse(policy.canScaleDown()); assertFalse(policy.isBackup()); assertFalse(policy.isSharedStore()); - assertTrue(policy.isCheckForLiveServer()); assertTrue(policy.isWaitForActivation()); assertEquals("purple", policy.getGroupName()); assertEquals("purple", policy.getBackupGroupName()); @@ -297,8 +296,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase { assertEquals(policy.getBackupGroupName(), failbackPolicy.getBackupGroupName()); assertEquals(policy.getClusterName(), failbackPolicy.getClusterName()); assertEquals(failbackPolicy.getMaxSavedReplicatedJournalsSize(), ActiveMQDefaultConfiguration.getDefaultMaxSavedReplicatedJournalsSize()); - assertEquals(1, failbackPolicy.getVoteRetries()); - assertEquals(1000, failbackPolicy.getVoteRetryWait()); assertTrue(failbackPolicy.isTryFailback()); assertTrue(failbackPolicy.isBackup()); assertFalse(failbackPolicy.isSharedStore()); @@ -337,8 +334,6 @@ public class HAPolicyConfigurationTest extends 
ActiveMQTestBase { assertEquals("tiddles", policy.getBackupGroupName()); assertEquals("33rrrrr", policy.getClusterName()); assertEquals(22, policy.getMaxSavedReplicatedJournalsSize()); - assertEquals(1, policy.getVoteRetries()); - assertEquals(1000, policy.getVoteRetryWait()); assertFalse(policy.isTryFailback()); assertTrue(policy.isBackup()); assertFalse(policy.isSharedStore()); @@ -358,7 +353,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase { assertFalse(failoverLivePolicy.canScaleDown()); assertFalse(failoverLivePolicy.isBackup()); assertFalse(failoverLivePolicy.isSharedStore()); - assertFalse(failoverLivePolicy.isCheckForLiveServer()); assertTrue(failoverLivePolicy.isWaitForActivation()); assertEquals(policy.getGroupName(), failoverLivePolicy.getGroupName()); assertEquals(policy.getClusterName(), failoverLivePolicy.getClusterName()); diff --git a/artemis-server/src/test/resources/backup-hapolicy-config.xml b/artemis-server/src/test/resources/backup-hapolicy-config.xml index be55236676..070c5672b4 100644 --- a/artemis-server/src/test/resources/backup-hapolicy-config.xml +++ b/artemis-server/src/test/resources/backup-hapolicy-config.xml @@ -30,8 +30,6 @@ 33rrrrr 9876 12345 - 1 - 1000 false diff --git a/artemis-server/src/test/resources/primary-hapolicy-config.xml b/artemis-server/src/test/resources/primary-hapolicy-config.xml index 5b88bcd80f..8e0ea49406 100644 --- a/artemis-server/src/test/resources/primary-hapolicy-config.xml +++ b/artemis-server/src/test/resources/primary-hapolicy-config.xml @@ -27,9 +27,6 @@ abcdefg 9876 12345 - true - 1 - 1000 org.apache.activemq.artemis.core.config.impl.HAPolicyConfigurationTest$FakeDistributedPrimitiveManager diff --git a/docs/user-manual/en/ha.md b/docs/user-manual/en/ha.md index 96045e95e4..0395fcb63e 100644 --- a/docs/user-manual/en/ha.md +++ b/docs/user-manual/en/ha.md @@ -65,8 +65,8 @@ which we will cover in a later chapter. > message data will not be available after failover. 
The `ha-policy` type configures which strategy a cluster should use to -provide the backing up of a servers data. Within this configuration -element is configured how a server should behave within the cluster, +provide the backing up of a server's data. Within this configuration +element we configure how a server should behave within the cluster, either as a master (live), slave (backup) or colocated (both live and backup). This would look something like: @@ -98,7 +98,7 @@ or ``` -*Replication* allows too to configure 2 new roles to enable *pluggable quorum* provider configuration, by using: +*Replication* allows the configuration of two new roles to enable *pluggable quorum* provider configuration, by using: ```xml @@ -116,17 +116,14 @@ to configure the classic *master* role, and ``` for the classic *slave* one. -If *replication* is configured using such new roles some additional element is required to complete configuration, detailed later. +If *replication* is configured using such new roles some additional element are required to complete configuration as detailed later. ### IMPORTANT NOTE ON PLUGGABLE QUORUM VOTE FEATURE -This feature is still **EXPERIMENTAL** and not meant to be run in production yet. +This feature is still **EXPERIMENTAL**. Extra testing should be done before running this feature into production. Please raise issues eventually found to the ActiveMQ Artemis Mail Lists. It means: -- its configuration can change until declared as **officially stable** -- it has to solve yet an inherent data misalignment issue with replication (it can happen with `classic` replication as well) - -More info about this issue are on [ARTEMIS-3340](https://issues.apache.org/jira/browse/ARTEMIS-3340). +- it's configuration can change until declared as **officially stable** ### Data Replication @@ -226,12 +223,12 @@ changes and repeats the process. > live server by changing `slave` to `master`. 
Much like in the shared-store case, when the live server stops or -crashes, its replicating backup will become active and take over its +crashes, it's replicating backup will become active and take over its duties. Specifically, the backup will become active when it loses -connection to its live server. This can be problematic because this can -also happen because of a temporary network problem. +connection to its live server. This can be problematic because it can +also happen as the result of temporary network problem. -This issue is solved in 2 different ways depending on which replication roles are configured: +The issue can be solved in two different ways, depending on which replication roles are configured: - **classic replication** (`master`/`slave` roles): backup will try to determine whether it still can connect to the other servers in the cluster. If it can connect to more than half the servers, it will become active, if more than half the @@ -275,7 +272,7 @@ The backup server must be similarly configured but as a `slave` ``` -To configure a pluggable quorum replication's primary and backup instead: +To configure a pluggable quorum replication's primary and backup use: ```xml @@ -391,7 +388,6 @@ For `primary`: - true @@ -418,16 +414,16 @@ The configuration of `class-name` as follows ``` isn't really needed, because Apache Curator is the default provider, but has been shown for completeness. -The `properties` element, instead +The `properties` element: ```xml ``` -Can specify a list of `property` elements in the form of key-value pairs, depending the ones -accepted by the specified `class-name` provider. +can specify a list of `property` elements in the form of key-value pairs, appropriate to what is +supported by the specified `class-name` provider. 
-Apache Curator's provider allow to configure these properties: +Apache Curator's provider allows the following properties: - [`connect-string`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#connectString(java.lang.String)): (no default) - [`session-ms`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#sessionTimeoutMs(int)): (default is 18000 ms) @@ -438,37 +434,26 @@ Apache Curator's provider allow to configure these properties: - [`retries-ms`](https://curator.apache.org/apidocs/org/apache/curator/retry/RetryNTimes.html#%3Cinit%3E(int,int)): (default is 1000 ms) - [`namespace`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#namespace(java.lang.String)): (no default) -Configuration of the [Apache Zookeeper](https://zookeeper.apache.org/) nodes is left to the user, but there are few +Configuration of the [Apache Zookeeper](https://zookeeper.apache.org/) ensemble is the responsibility of the user, but there are few **suggestions to improve the reliability of the quorum service**: - broker `session_ms` must be `>= 2 * server tick time` and `<= 20 * server tick time` as by [Zookeeper 3.6.3 admin guide](https://zookeeper.apache.org/doc/r3.6.3/zookeeperAdmin.html): it directly impacts how fast a backup can failover to an isolated/killed/unresponsive live; the higher, the slower. - GC on broker machine should allow keeping GC pauses within 1/3 of `session_ms` in order to let the Zookeeper heartbeat protocol -to work reliably: if it's not possible, better increase `session_ms` accepting a slower failover + work reliably. If that is not possible, it is better to increase `session_ms`, accepting a slower failover. 
- Zookeeper must have enough resources to keep GC (and OS) pauses much smaller than server tick time: please consider carefully if broker and Zookeeper node should share the same physical machine, depending on the expected load of the broker - network isolation protection requires configuring >=3 Zookeeper nodes #### *Important*: Notes on pluggable quorum replication configuration -The first `classic` replication configuration that won't apply to the pluggable quorum replication -is `vote-on-replication-failure` and configure it produces a startup error: pluggable quorum replication -always behave like `vote-on-replication-failure` `true` ie shutting down a live broker (and its JVM) in case of quorum loss. +There are some no longer needed `classic` replication configurations: + - `vote-on-replication-failure` + - `quorum-vote-wait` + - `vote-retries` + - `vote-retries-wait` + - `check-for-live-server` -The second deprecated `classic` replication configuration is `quorum-vote-wait`: given that the pluggable quorum vote replication -requires backup to have an always-on reliable quorum service, there's no need to specify the timeout to reach -the majority of quorum nodes. A backup remains inactive (ie JVM still up, console too, unable to sync with live, to failover etc etc) -until the majority of quorum nodes is reachable again, re-activating if happens. - -The only exception is with primary failing-back to an existing live backup using `true`: -if the quorum service isn't immediately available the primary (and its JVM) just stop, allowing fail-fast failing-back. - -There are few *semantic differences* of other existing properties: -- `vote-retry-wait`: in `classic` replication means how long to wait between each quorum vote try, while with pluggable quorum replication - means how long request to failover for each attempt -- `vote-retries`: differently from `classic`, the amount of vote attempt is `1 + vote-retries` (with classic is just `vote-retries`). 
- Setting `0` means no retries, leaving backup to still perform an initial attempt. - **Notes on replication configuration with [Apache curator](https://curator.apache.org/) quorum provider** As said some paragraphs above, `session-ms` affect the failover duration: a backup can @@ -479,34 +464,34 @@ For the former case (session expiration with live no longer present), the backup 1. cluster connection PINGs (affected by [connection-ttl](connection-ttl.md) tuning) 2. closed TCP connection notification (depends by TCP configuration and networking stack/topology) -These 2 cases have 2 different failover duration depending on different factors: -1. `connection-ttl` affect how much time of the expiring `session-ms` is used to just detect a missing live broker: the higher `connection-tt`, - the slower it reacts; backup can attempt to failover for the remaining `session-ms - connection-ttl` -2. `session-ms` expiration is immediately detected: backup must try to failover for >=`session-ms` to be sure to catch - the session expiration and complete failover - -The previous comments are meant to suggest to the careful reader that the minimum time to attempt to failover -cannot be below the full `session-ms` expires. -In short, it means -``` - total failover attempt time > session-ms -``` -with -``` - total failover attempt time = vote-retry-wait * (vote-retries + 1) -``` -and by consequence: -``` - vote-retry-wait * (vote-retries + 1) > session-ms -``` -For example with `session-ms = 18000 ms`, safe values for failover timeout are: -```xml - 11 - 2000 -``` -Because `11 * 2000 = 22000 ms` that's bigger then `18000 ms`. +The suggestion is to tune `connection-ttl` low enough to attempt failover as soon as possible, while taking in consideration that +the whole fail-over duration cannot last less than the configured `session-ms`. -There's no risk that a backup broker will early stop attempting to failover, losing its chance to become live. 
+##### Peer or Multi Primary +With coordination delegated to the quorum service, roles are less important. It is possible to have two peer servers compete +for activation; the winner activating as live, the looser taking up a backup role. On restart, 'any' peer server +with the most up to date journal can activate. +The instances need to know in advance, what identity they will coordinate on. +In the replication 'primary' ha policy we can explicitly set the 'coordination-id' to a common value for all peers in a cluster. + +For `multi primary`: +```xml + + + + + org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager + + + + + peer-journal-001 + + + +``` +Note: the string value provided will be converted internally into a 16 byte UUID, so it may not be immediately recognisable or human-readable, +however it will ensure that all 'peers' coordinate. ### Shared Store @@ -637,10 +622,10 @@ another server using its nodeID. If it finds one, it will contact this server and try to "fail-back". Since this is a remote replication scenario, the "starting live" will have to synchronize its data with the server running with its ID, once they are in sync, it will request the -other server (which it assumes it is a back that has assumed its duties) -to shutdown for it to take over. This is necessary because otherwise the +other server (which it assumes it is a backup that has assumed its duties) +to shutdown, for it to take over. This is necessary because otherwise the live server has no means to know whether there was a fail-over or not, -and if there was if the server that took its duties is still running or +and if there was, if the server that took its duties is still running or not. 
To configure this option at your `broker.xml` configuration file as follows, for classic replication: @@ -663,21 +648,21 @@ And pluggable quorum replication: - true + ``` The key difference from classic replication is that if `master` cannot reach any -live server with its same nodeID, it's going straight to become live, while `primary` -request it to the quorum provider, searching again for any existing live if -the quorum provider is not available (eg connectivity loss, consensus absence) or -if there's another live broker with the same nodeID alive, in an endless loop. +live server with its nodeID, it activates unilaterally. +With `primary`, the responsibilities of coordination are delegated to the quorum provider, +there are no unilateral decisions. The `primary` will only activate when +it knows that it has the most up to date version of the journal identified by its nodeID. In short: a started `primary` cannot become live without consensus. -> **Warning** +> **Warning for classic replication** > > Be aware that if you restart a live server while after failover has > occurred then `check-for-live-server` must be set to `true`. If not the live server diff --git a/examples/features/ha/pom.xml b/examples/features/ha/pom.xml index 36eb4403ab..92e2198842 100644 --- a/examples/features/ha/pom.xml +++ b/examples/features/ha/pom.xml @@ -62,6 +62,7 @@ under the License. scale-down stop-server-failover transaction-failover + zookeeper-single-pair-failback @@ -81,6 +82,7 @@ under the License. 
replicated-transaction-failover scale-down transaction-failover + zookeeper-single-pair-failback diff --git a/examples/features/ha/zookeeper-single-pair-failback/pom.xml b/examples/features/ha/zookeeper-single-pair-failback/pom.xml new file mode 100644 index 0000000000..51044d7145 --- /dev/null +++ b/examples/features/ha/zookeeper-single-pair-failback/pom.xml @@ -0,0 +1,110 @@ + + + + + 4.0.0 + + + org.apache.activemq.examples.failover + broker-failover + 2.18.0-SNAPSHOT + + + zookeeper-single-pair-ordered-failback + jar + ActiveMQ Artemis Zookeeper Single Pair Ordered Failback Example + + + ${project.basedir}/../../../.. + + + + + org.apache.activemq + artemis-cli + ${project.version} + + + jakarta.jms + jakarta.jms-api + + + + + + + org.apache.activemq + artemis-maven-plugin + + + create0 + + create + + + + -Djava.net.preferIPv4Stack=true + ${basedir}/target/server0 + ${basedir}/target/classes/activemq/server0 + -Dudp-address=${udp-address} + + + + create1 + + create + + + + -Djava.net.preferIPv4Stack=true + ${basedir}/target/server1 + ${basedir}/target/classes/activemq/server1 + -Dudp-address=${udp-address} + + + + runClient + + runClient + + + org.apache.activemq.artemis.jms.example.ZookeeperSinglePairFailback + + ${basedir}/target/server0 + ${basedir}/target/server1 + + + + + + + org.apache.activemq.examples.failover + zookeeper-single-pair-ordered-failback + 2.18.0-SNAPSHOT + + + + + org.apache.maven.plugins + maven-clean-plugin + + + + diff --git a/examples/features/ha/zookeeper-single-pair-failback/readme.md b/examples/features/ha/zookeeper-single-pair-failback/readme.md new file mode 100644 index 0000000000..f9fc00f5b1 --- /dev/null +++ b/examples/features/ha/zookeeper-single-pair-failback/readme.md @@ -0,0 +1,94 @@ +# Zookeeper Single Pair Failback Example + +This example demonstrates two servers coupled as a primary-backup pair for high availability (HA) using +pluggable quorum vote replication Reference Implementation based on [Apache 
Curator](https://curator.apache.org/) to use +[Apache Zookeeper](https://zookeeper.apache.org/) as external quorum service. + +The example shows a client connection failing over from live to backup when the live broker is crashed and +then back to the original live when it is restarted (i.e. "failback"). + +To run the example, simply type **mvn verify** from this directory after running a Zookeeper node at `localhost:2181`. + +If no Zookeeper node is configured, can use the commands below (see [Official Zookeeper Docker Image Site](https://hub.docker.com/_/zookeeper) +for more details on how configure it). + +Run Zookeeper `3.6.3` with: +``` +$ docker run --name artemis-zk --network host --restart always -d zookeeper:3.6.3 +``` +By default, the official docker image exposes `2181 2888 3888 8080` as client, follower, election and AdminServer ports. + +Verify Zookeeper server is correctly started by running: +``` +$ docker logs --follow artemis-zk +``` +It should print the Zookeeper welcome ASCII logs: +``` +ZooKeeper JMX enabled by default +Using config: /conf/zoo.cfg +2021-08-05 14:29:29,431 [myid:] - INFO [main:QuorumPeerConfig@174] - Reading configuration from: /conf/zoo.cfg +2021-08-05 14:29:29,434 [myid:] - INFO [main:QuorumPeerConfig@451] - clientPort is not set +2021-08-05 14:29:29,434 [myid:] - INFO [main:QuorumPeerConfig@464] - secureClientPort is not set +2021-08-05 14:29:29,434 [myid:] - INFO [main:QuorumPeerConfig@480] - observerMasterPort is not set +2021-08-05 14:29:29,435 [myid:] - INFO [main:QuorumPeerConfig@497] - metricsProvider.className is org.apache.zookeeper.metrics.impl.DefaultMetricsProvider +2021-08-05 14:29:29,438 [myid:] - ERROR [main:QuorumPeerConfig@722] - Invalid configuration, only one server specified (ignoring) +2021-08-05 14:29:29,441 [myid:1] - INFO [main:DatadirCleanupManager@78] - autopurge.snapRetainCount set to 3 +2021-08-05 14:29:29,441 [myid:1] - INFO [main:DatadirCleanupManager@79] - autopurge.purgeInterval set to 0 
+2021-08-05 14:29:29,441 [myid:1] - INFO [main:DatadirCleanupManager@101] - Purge task is not scheduled. +2021-08-05 14:29:29,441 [myid:1] - WARN [main:QuorumPeerMain@138] - Either no config or no quorum defined in config, running in standalone mode +2021-08-05 14:29:29,444 [myid:1] - INFO [main:ManagedUtil@44] - Log4j 1.2 jmx support found and enabled. +2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@174] - Reading configuration from: /conf/zoo.cfg +2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@451] - clientPort is not set +2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@464] - secureClientPort is not set +2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@480] - observerMasterPort is not set +2021-08-05 14:29:29,450 [myid:1] - INFO [main:QuorumPeerConfig@497] - metricsProvider.className is org.apache.zookeeper.metrics.impl.DefaultMetricsProvider +2021-08-05 14:29:29,450 [myid:1] - ERROR [main:QuorumPeerConfig@722] - Invalid configuration, only one server specified (ignoring) +2021-08-05 14:29:29,451 [myid:1] - INFO [main:ZooKeeperServerMain@122] - Starting server +2021-08-05 14:29:29,459 [myid:1] - INFO [main:ServerMetrics@62] - ServerMetrics initialized with provider org.apache.zookeeper.metrics.impl.DefaultMetricsProvider@525f1e4e +2021-08-05 14:29:29,461 [myid:1] - INFO [main:FileTxnSnapLog@124] - zookeeper.snapshot.trust.empty : false +2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - +2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - ______ _ +2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - |___ / | | +2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - / / ___ ___ | | __ ___ ___ _ __ ___ _ __ +2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - / / / _ \ / _ \ | |/ / / _ \ / _ \ | '_ \ / _ \ | '__| +2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - / /__ | (_) | | (_) | | < | __/ | __/ | |_) | | 
__/ | | +2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - /_____| \___/ \___/ |_|\_\ \___| \___| | .__/ \___| |_| +2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - | | +2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - |_| +2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - +``` +Alternatively, can run +``` +$ docker run -it --rm --network host zookeeper:3.6.3 zkCli.sh -server localhost:2181 +``` +Zookeeper server can be reached using localhost:2181 if it output something like: +``` +2021-08-05 14:56:03,739 [myid:localhost:2181] - INFO [main-SendThread(localhost:2181):ClientCnxn$SendThread@1448] - Session establishment complete on server localhost/0:0:0:0:0:0:0:1:2181, session id = 0x100078b8cfc0002, negotiated timeout = 30000 + +``` +Type +``` +[zk: localhost:2181(CONNECTED) 0] quit +``` +to quit the client instance. + +The 2 brokers of this example are already configured to connect to a single Zookeeper node at the mentioned address, thanks to the XML configuration of their `manager`: +```xml + + + + + + + +``` +**NOTE** the `namespace` parameter is used to separate the pair information from others if the Zookeeper node is shared with other applications. + +**WARNING** As already recommended on the [High Availability section](https://activemq.apache.org/components/artemis/documentation/latest/ha.html), a production environment needs >= 3 nodes to protect against network partitions. + +This example can be run with +``` +$ mvn verify +``` + +For more information on ActiveMQ Artemis failover and HA, and clustering in general, please see the clustering section of the user manual. 
\ No newline at end of file diff --git a/examples/features/ha/zookeeper-single-pair-failback/src/main/java/org/apache/activemq/artemis/jms/example/ZookeeperSinglePairFailback.java b/examples/features/ha/zookeeper-single-pair-failback/src/main/java/org/apache/activemq/artemis/jms/example/ZookeeperSinglePairFailback.java new file mode 100644 index 0000000000..fa1172c572 --- /dev/null +++ b/examples/features/ha/zookeeper-single-pair-failback/src/main/java/org/apache/activemq/artemis/jms/example/ZookeeperSinglePairFailback.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.jms.example; + +import javax.jms.Connection; +import javax.jms.ConnectionFactory; +import javax.jms.JMSException; +import javax.jms.MessageConsumer; +import javax.jms.MessageProducer; +import javax.jms.Queue; +import javax.jms.Session; +import javax.jms.TextMessage; +import javax.naming.InitialContext; + +import org.apache.activemq.artemis.util.ServerUtil; + +/** + * Example of live and replicating backup pair using Zookeeper as the quorum provider. + *

+ * After both servers are started, the live server is killed and the backup becomes active ("fails-over"). + *

+ * Later the live server is restarted and takes back its position by asking the backup to stop ("fail-back"). + */ +public class ZookeeperSinglePairFailback { + + private static Process server0; + + private static Process server1; + + public static void main(final String[] args) throws Exception { + // Step 0. Prepare Zookeeper Evironment as shown on readme.md + + final int numMessages = 30; + + Connection connection = null; + + InitialContext initialContext = null; + + try { + server0 = ServerUtil.startServer(args[0], ZookeeperSinglePairFailback.class.getSimpleName() + "-primary", 0, 30000); + server1 = ServerUtil.startServer(args[1], ZookeeperSinglePairFailback.class.getSimpleName() + "-backup", 1, 10000); + + // Step 2. Get an initial context for looking up JNDI from the server #1 + initialContext = new InitialContext(); + + // Step 3. Look up the JMS resources from JNDI + Queue queue = (Queue) initialContext.lookup("queue/exampleQueue"); + ConnectionFactory connectionFactory = (ConnectionFactory) initialContext.lookup("ConnectionFactory"); + + // Step 4. Create a JMS Connection + connection = connectionFactory.createConnection(); + + // Step 5. Create a *non-transacted* JMS Session with client acknowledgement + Session session = connection.createSession(false, Session.CLIENT_ACKNOWLEDGE); + + // Step 6. Start the connection to ensure delivery occurs + connection.start(); + + // Step 7. Create a JMS MessageProducer and a MessageConsumer + MessageProducer producer = session.createProducer(queue); + MessageConsumer consumer = session.createConsumer(queue); + + // Step 8. Send some messages to server #1, the live server + for (int i = 0; i < numMessages; i++) { + TextMessage message = session.createTextMessage("This is text message " + i); + producer.send(message); + System.out.println("Sent message: " + message.getText()); + } + + // Step 9. 
Receive and acknowledge a third of the sent messages + TextMessage message0 = null; + for (int i = 0; i < numMessages / 3; i++) { + message0 = (TextMessage) consumer.receive(5000); + System.out.println("Got message: " + message0.getText()); + } + message0.acknowledge(); + System.out.println("Received and acknowledged a third of the sent messages"); + + // Step 10. Receive the rest third of the sent messages but *do not* acknowledge them yet + for (int i = numMessages / 3; i < numMessages; i++) { + message0 = (TextMessage) consumer.receive(5000); + System.out.println("Got message: " + message0.getText()); + } + System.out.println("Received without acknowledged the rest of the sent messages"); + + Thread.sleep(2000); + // Step 11. Crash server #0, the live server, and wait a little while to make sure + // it has really crashed + ServerUtil.killServer(server0); + System.out.println("Killed primary"); + + Thread.sleep(2000); + + // Step 12. Acknowledging the received messages will fail as failover to the backup server has occurred + try { + message0.acknowledge(); + } catch (JMSException e) { + System.out.println("Got (the expected) exception while acknowledging message: " + e.getMessage()); + } + + // Step 13. Consume again the 2nd third of the messages again. Note that they are not considered as redelivered. + for (int i = numMessages / 3; i < (numMessages / 3) * 2; i++) { + message0 = (TextMessage) consumer.receive(5000); + System.out.printf("Got message: %s (redelivered?: %s)\n", message0.getText(), message0.getJMSRedelivered()); + } + + // Step 14. Acknowledging them on the failed-over broker works fine + message0.acknowledge(); + System.out.println("Acknowledged 2n third of messages"); + + // Step 15. Restarting primary + server0 = ServerUtil.startServer(args[0], ZookeeperSinglePairFailback.class.getSimpleName() + "-primary", 0, 10000); + System.out.println("Started primary"); + + // await fail-back to complete + Thread.sleep(4000); + + // Step 16. 
Consuming the 3rd third of the messages. Note that they are not considered as redelivered. + for (int i = (numMessages / 3) * 2; i < numMessages; i++) { + message0 = (TextMessage) consumer.receive(5000); + System.out.printf("Got message: %s (redelivered?: %s)\n", message0.getText(), message0.getJMSRedelivered()); + } + message0.acknowledge(); + System.out.println("Acknowledged 3d third of messages"); + + } finally { + // Step 17. Be sure to close our resources! + + if (connection != null) { + connection.close(); + } + + if (initialContext != null) { + initialContext.close(); + } + + ServerUtil.killServer(server0); + ServerUtil.killServer(server1); + + // Step 18. stop the ZK server + } + } +} diff --git a/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server0/broker.xml b/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server0/broker.xml new file mode 100644 index 0000000000..1223fca0e3 --- /dev/null +++ b/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server0/broker.xml @@ -0,0 +1,90 @@ + + + + + + ./data/bindings + + ./data/journal + + ./data/largemessages + + ./data/paging + + exampleUser + + secret + + + + + + + + + + + + + + + + + tcp://localhost:61616 + tcp://localhost:61617 + + + + + tcp://localhost:61616 + + + + + netty-connector + + netty-backup-connector + + + + + + + + + + + + + + + + + + + +

+ + + +
+ + + diff --git a/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server1/broker.xml b/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server1/broker.xml new file mode 100644 index 0000000000..bfe63dacda --- /dev/null +++ b/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server1/broker.xml @@ -0,0 +1,91 @@ + + + + + + ./data/bindings + + ./data/journal + + ./data/largemessages + + ./data/paging + + exampleUser + + secret + + + + + + + + + + + + true + + + + + + + tcp://localhost:61616 + tcp://localhost:61617 + + + + + tcp://localhost:61617 + + + + + netty-connector + + netty-live-connector + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
diff --git a/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/jndi.properties b/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/jndi.properties new file mode 100644 index 0000000000..7f7a19f280 --- /dev/null +++ b/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/jndi.properties @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +java.naming.factory.initial=org.apache.activemq.artemis.jndi.ActiveMQInitialContextFactory +connectionFactory.ConnectionFactory=tcp://localhost:61616?ha=true&retryInterval=1000&retryIntervalMultiplier=1.0&reconnectAttempts=-1 +queue.queue/exampleQueue=exampleQueue diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/InfiniteRedeliveryTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/InfiniteRedeliveryTest.java index a69c44ec18..50c33a9ec7 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/InfiniteRedeliveryTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/InfiniteRedeliveryTest.java @@ -77,7 +77,6 @@ public class InfiniteRedeliveryTest extends ActiveMQTestBase { Configuration backupConfig; Configuration liveConfig; - NodeManager nodeManager; protected TestableServer createTestableServer(Configuration config, NodeManager nodeManager) throws Exception { boolean isBackup = config.getHAPolicyConfiguration() instanceof ReplicaPolicyConfiguration || config.getHAPolicyConfiguration() instanceof SharedStoreSlavePolicyConfiguration; @@ -99,13 +98,11 @@ public class InfiniteRedeliveryTest extends ActiveMQTestBase { - nodeManager = new InVMNodeManager(true, backupConfig.getJournalLocation()); - - backupServer = createTestableServer(backupConfig, nodeManager); + backupServer = createTestableServer(backupConfig, new InVMNodeManager(true, backupConfig.getJournalLocation())); liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(TransportConfigurationUtils.getNettyAcceptor(true, 0)); - liveServer = createTestableServer(liveConfig, nodeManager); + liveServer = createTestableServer(liveConfig, new InVMNodeManager(false, liveConfig.getJournalLocation())); } protected void configureReplicationPair(TransportConfiguration backupConnector, diff --git 
a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTest.java index a13ef3dbbb..3dcf9a9d22 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTest.java @@ -659,6 +659,8 @@ public class FailoverTest extends FailoverTestBase { backupServer.getServer().fail(true); + decrementActivationSequenceForForceRestartOf(liveServer); + liveServer.start(); consumer.close(); @@ -823,6 +825,7 @@ public class FailoverTest extends FailoverTestBase { Assert.assertFalse("must NOT be a backup", isBackup); adaptLiveConfigForReplicatedFailBack(liveServer); beforeRestart(liveServer); + decrementActivationSequenceForForceRestartOf(liveServer); liveServer.start(); Assert.assertTrue("live initialized...", liveServer.getServer().waitForActivation(15, TimeUnit.SECONDS)); @@ -931,12 +934,13 @@ public class FailoverTest extends FailoverTestBase { while (!backupServer.isStarted() && i++ < 100) { Thread.sleep(100); } - liveServer.getServer().waitForActivation(5, TimeUnit.SECONDS); + backupServer.getServer().waitForActivation(5, TimeUnit.SECONDS); Assert.assertTrue(backupServer.isStarted()); if (isReplicated) { FileMoveManager moveManager = new FileMoveManager(backupServer.getServer().getConfiguration().getJournalLocation(), 0); - Assert.assertEquals(1, moveManager.getNumberOfFolders()); + // backup has not had a chance to restart as a backup and cleanup + Wait.assertTrue(() -> moveManager.getNumberOfFolders() <= 2); } } else { backupServer.stop(); @@ -2427,6 +2431,10 @@ public class FailoverTest extends FailoverTestBase { // no-op } + protected void decrementActivationSequenceForForceRestartOf(TestableServer liveServer) throws 
Exception { + // no-op + } + protected ClientSession sendAndConsume(final ClientSessionFactory sf1, final boolean createQueue) throws Exception { ClientSession session = createSession(sf1, false, true, true); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java index 1bcd820a1b..d863cf59fb 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java @@ -46,7 +46,6 @@ import org.apache.activemq.artemis.core.remoting.impl.invm.InVMRegistry; import org.apache.activemq.artemis.core.server.NodeManager; import org.apache.activemq.artemis.core.server.cluster.ha.HAPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.ReplicatedPolicy; -import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationPrimaryPolicy; import org.apache.activemq.artemis.core.server.impl.ActiveMQServerImpl; import org.apache.activemq.artemis.core.server.impl.InVMNodeManager; import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; @@ -88,6 +87,10 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { protected NodeManager nodeManager; + protected NodeManager backupNodeManager; + + protected DistributedPrimitiveManagerConfiguration managerConfiguration; + protected boolean startBackupServer = true; @Override @@ -173,6 +176,10 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { return new InVMNodeManager(false); } + protected NodeManager createNodeManager(Configuration configuration) throws Exception { + return new InVMNodeManager(false, configuration.getNodeManagerLockLocation()); + } + protected void createConfigs() throws 
Exception { nodeManager = createNodeManager(); TransportConfiguration liveConnector = getConnectorTransportConfiguration(true); @@ -211,13 +218,14 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { backupConfig.setBindingsDirectory(getBindingsDir(0, true)).setJournalDirectory(getJournalDir(0, true)).setPagingDirectory(getPageDir(0, true)).setLargeMessagesDirectory(getLargeMessagesDir(0, true)).setSecurityEnabled(false); setupHAPolicyConfiguration(); - nodeManager = createReplicatedBackupNodeManager(backupConfig); + backupNodeManager = createReplicatedBackupNodeManager(backupConfig); - backupServer = createTestableServer(backupConfig); + backupServer = createTestableServer(backupConfig, backupNodeManager); liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true)); - liveServer = createTestableServer(liveConfig); + nodeManager = createNodeManager(liveConfig); + liveServer = createTestableServer(liveConfig, nodeManager); if (supportsRetention()) { liveServer.getServer().getConfiguration().setJournalRetentionDirectory(getJournalDir(0, false) + "_retention"); @@ -233,7 +241,7 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { backupConfig = createDefaultInVMConfig(); liveConfig = createDefaultInVMConfig(); - DistributedPrimitiveManagerConfiguration managerConfiguration = + managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); @@ -242,13 +250,14 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { backupConfig.setBindingsDirectory(getBindingsDir(0, true)).setJournalDirectory(getJournalDir(0, true)).setPagingDirectory(getPageDir(0, true)).setLargeMessagesDirectory(getLargeMessagesDir(0, true)).setSecurityEnabled(false); setupHAPolicyConfiguration(); - nodeManager = createReplicatedBackupNodeManager(backupConfig); + backupNodeManager 
= createReplicatedBackupNodeManager(backupConfig); - backupServer = createTestableServer(backupConfig); + backupServer = createTestableServer(backupConfig, backupNodeManager); liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true)); - liveServer = createTestableServer(liveConfig); + nodeManager = createNodeManager(liveConfig); + liveServer = createTestableServer(liveConfig, nodeManager); } protected void setupHAPolicyConfiguration() { @@ -272,8 +281,6 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { HAPolicy policy = server.getServer().getHAPolicy(); if (policy instanceof ReplicatedPolicy) { ((ReplicatedPolicy) policy).setCheckForLiveServer(true); - } else if (policy instanceof ReplicationPrimaryPolicy) { - Assert.assertTrue("Adapting won't work for the current configuration", ((ReplicationPrimaryPolicy) policy).isCheckForLiveServer()); } } @@ -294,6 +301,7 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { nodeManager = null; + backupNodeManager = null; try { ServerSocket serverSocket = new ServerSocket(61616); serverSocket.close(); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java index a3f19cf412..49bd396b05 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java @@ -182,8 +182,6 @@ public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase { break; case PluggableQuorumReplication: haPolicyConfiguration = ReplicationBackupPolicyConfiguration.withDefault() - .setVoteRetries(1) - 
.setVoteRetryWait(1000) .setDistributedManagerConfiguration(getOrCreatePluggableQuorumConfiguration()) .setGroupName(getNodeGroupName() != null ? (getNodeGroupName() + "-" + i) : null); break; @@ -278,12 +276,14 @@ public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase { return addClientSession(sf.createSession(xa, autoCommitSends, autoCommitAcks)); } - protected void waitForDistribution(SimpleString address, ActiveMQServer server, int messageCount) throws Exception { + protected boolean waitForDistribution(SimpleString address, ActiveMQServer server, int messageCount) throws Exception { ActiveMQServerLogger.LOGGER.debug("waiting for distribution of messages on server " + server); Queue q = (Queue) server.getPostOffice().getBinding(address).getBindable(); - Wait.waitFor(() -> getMessageCount(q) >= messageCount); + return Wait.waitFor(() -> { + return getMessageCount(q) >= messageCount; + }); } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicaTimeoutTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicaTimeoutTest.java index d4a9c3d1e1..ce775368ee 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicaTimeoutTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicaTimeoutTest.java @@ -156,12 +156,13 @@ public class ReplicaTimeoutTest extends ActiveMQTestBase { liveConfig.setBindingsDirectory(getBindingsDir(0, false)).setJournalDirectory(getJournalDir(0, false)). 
setPagingDirectory(getPageDir(0, false)).setLargeMessagesDirectory(getLargeMessagesDir(0, false)).setSecurityEnabled(false); - NodeManager nodeManager = createReplicatedBackupNodeManager(backupConfig); + NodeManager replicatedBackupNodeManager = createReplicatedBackupNodeManager(backupConfig); - backupServer = createTestableServer(backupConfig, nodeManager); + backupServer = createTestableServer(backupConfig, replicatedBackupNodeManager); liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true)); + NodeManager nodeManager = createReplicatedBackupNodeManager(liveConfig); liveServer = createTestableServer(liveConfig, nodeManager); final TestableServer theBackup = backupServer; diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverExtraBackupsTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverExtraBackupsTest.java index 6712c74692..15cdd621f1 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverExtraBackupsTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverExtraBackupsTest.java @@ -75,7 +75,10 @@ public class ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated sendCrashReceive(); Wait.assertTrue(backupServers.get(0)::isActive, 5000, 10); + Wait.assertTrue(backupServers.get(1)::isActive, 5000, 10); waitForTopology(backupServers.get(0).getServer(), liveServers.size(), 2); + waitForTopology(backupServers.get(1).getServer(), liveServers.size(), 2); + sendCrashBackupReceive(); } @@ -115,6 +118,14 @@ public class ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated } protected void sendCrashBackupReceive() throws Exception { 
+ + //make sure bindings are ready before sending messages b/c we verify strict load balancing in waitForDistribution + this.waitForBindings( backupServers.get(0).getServer(), ADDRESS.toString(), false, 1, 0, 2000); + this.waitForBindings( backupServers.get(0).getServer(), ADDRESS.toString(), false, 1, 0, 2000); + + this.waitForBindings( backupServers.get(1).getServer(), ADDRESS.toString(), false, 1, 0, 2000); + this.waitForBindings( backupServers.get(1).getServer(), ADDRESS.toString(), false, 1, 0, 2000); + ServerLocator locator0 = getBackupServerLocator(0); ServerLocator locator1 = getBackupServerLocator(1); @@ -138,8 +149,8 @@ public class ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated producer.close(); - waitForDistribution(ADDRESS, backupServers.get(0).getServer(), 100); - waitForDistribution(ADDRESS, backupServers.get(1).getServer(), 100); + assertTrue(waitForDistribution(ADDRESS, backupServers.get(0).getServer(), 100)); + assertTrue(waitForDistribution(ADDRESS, backupServers.get(1).getServer(), 100)); List toCrash = new ArrayList<>(); for (TestableServer backupServer : backupServers) { diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupAuthenticationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupAuthenticationTest.java index b1e6a94dc2..95a69262ed 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupAuthenticationTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupAuthenticationTest.java @@ -24,7 +24,6 @@ import org.apache.activemq.artemis.api.core.ActiveMQException; import org.apache.activemq.artemis.api.core.Interceptor; import 
org.apache.activemq.artemis.api.core.TransportConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.core.protocol.core.Packet; import org.apache.activemq.artemis.core.protocol.core.impl.PacketImpl; import org.apache.activemq.artemis.spi.core.protocol.RemotingConnection; @@ -90,7 +89,6 @@ public class PluggableQuorumBackupAuthenticationTest extends FailoverTestBase { @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupSyncJournalTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupSyncJournalTest.java index 6f620a64f2..0f10c44934 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupSyncJournalTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupSyncJournalTest.java @@ -17,8 +17,7 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; -import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;; import org.apache.activemq.artemis.tests.integration.cluster.failover.BackupSyncJournalTest; public class 
PluggableQuorumBackupSyncJournalTest extends BackupSyncJournalTest { @@ -30,8 +29,6 @@ public class PluggableQuorumBackupSyncJournalTest extends BackupSyncJournalTest @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()) - .setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()) .setMaxSavedReplicatedJournalsSize(2) .setAllowFailBack(true); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.java index eb2f1dc2a0..274fb5acb8 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.java @@ -29,18 +29,24 @@ import org.apache.activemq.artemis.api.core.QueueConfiguration; import org.apache.activemq.artemis.api.core.TransportConfiguration; import org.apache.activemq.artemis.api.core.client.ClientSession; import org.apache.activemq.artemis.component.WebServerComponent; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.core.server.ActiveMQServer; +import org.apache.activemq.artemis.core.server.NodeManager; import org.apache.activemq.artemis.core.server.ServiceComponent; import 
org.apache.activemq.artemis.dto.AppDTO; import org.apache.activemq.artemis.dto.WebServerDTO; +import org.apache.activemq.artemis.quorum.MutableLong; +import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTest; +import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer; import org.apache.activemq.artemis.tests.util.Wait; +import org.jboss.logging.Logger; import org.junit.Assert; import org.junit.Test; public class PluggableQuorumNettyNoGroupNameReplicatedFailoverTest extends FailoverTest { + private static final Logger log = Logger.getLogger(PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.class); protected void beforeWaitForRemoteBackupSynchronization() { } @@ -170,8 +176,6 @@ public class PluggableQuorumNettyNoGroupNameReplicatedFailoverTest extends Failo @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()) - .setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()) .setMaxSavedReplicatedJournalsSize(2) .setAllowFailBack(true); @@ -210,4 +214,30 @@ public class PluggableQuorumNettyNoGroupNameReplicatedFailoverTest extends Failo } super.crash(sessions); } + + @Override + protected void decrementActivationSequenceForForceRestartOf(TestableServer testableServer) throws Exception { + doDecrementActivationSequenceForForceRestartOf(log, nodeManager, managerConfiguration); + } + + public static void doDecrementActivationSequenceForForceRestartOf(Logger log, NodeManager nodeManager, DistributedPrimitiveManagerConfiguration distributedPrimitiveManagerConfiguration) throws Exception { + nodeManager.start(); + long localActivation = nodeManager.readNodeActivationSequence(); + // file based + FileBasedPrimitiveManager fileBasedPrimitiveManager = new 
FileBasedPrimitiveManager(distributedPrimitiveManagerConfiguration.getProperties()); + fileBasedPrimitiveManager.start(); + try { + MutableLong mutableLong = fileBasedPrimitiveManager.getMutableLong(nodeManager.getNodeId().toString()); + + if (!mutableLong.compareAndSet(localActivation + 1, localActivation)) { + throw new Exception("Failed to decrement coordinated activation sequence to:" + localActivation + ", not +1 : " + mutableLong.get()); + } + log.warn("Intentionally decrementing coordinated activation sequence for test, may result in lost data"); + + } finally { + fileBasedPrimitiveManager.stop(); + nodeManager.stop(); + } + } + } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyReplicationStopTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyReplicationStopTest.java index 1381cad3c4..a42547c6e7 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyReplicationStopTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyReplicationStopTest.java @@ -17,7 +17,6 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.tests.integration.cluster.failover.NettyReplicationStopTest; public class PluggableQuorumNettyReplicationStopTest extends NettyReplicationStopTest { @@ -29,7 +28,6 @@ public class PluggableQuorumNettyReplicationStopTest extends NettyReplicationSto @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) 
liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumPageCleanupWhileReplicaCatchupTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumPageCleanupWhileReplicaCatchupTest.java index b5b8b3af02..b351706e43 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumPageCleanupWhileReplicaCatchupTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumPageCleanupWhileReplicaCatchupTest.java @@ -17,7 +17,6 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.tests.integration.cluster.failover.PageCleanupWhileReplicaCatchupTest; public class PluggableQuorumPageCleanupWhileReplicaCatchupTest extends PageCleanupWhileReplicaCatchupTest { @@ -29,7 +28,6 @@ public class PluggableQuorumPageCleanupWhileReplicaCatchupTest extends PageClean @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java 
b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java index 4c15d7bd3b..0718962e10 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java @@ -48,7 +48,6 @@ public class PluggableQuorumReplicaTimeoutTest extends ReplicaTimeoutTest { managerConfiguration, managerConfiguration); ReplicationPrimaryPolicyConfiguration primaryConfiguration = ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()); primaryConfiguration.setInitialReplicationSyncTimeout(1000); - primaryConfiguration.setCheckForLiveServer(true); ReplicationBackupPolicyConfiguration backupConfiguration = ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()); backupConfiguration.setInitialReplicationSyncTimeout(1000); backupConfiguration.setMaxSavedReplicatedJournalsSize(2) diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageFailoverTest.java index 9d7cb4d3b2..a675200508 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageFailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageFailoverTest.java @@ -18,11 +18,15 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; import 
org.apache.activemq.artemis.api.core.client.ClientSession; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.tests.integration.cluster.failover.LargeMessageFailoverTest; +import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer; +import org.jboss.logging.Logger; + +import static org.apache.activemq.artemis.tests.integration.cluster.failover.quorum.PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.doDecrementActivationSequenceForForceRestartOf; public class PluggableQuorumReplicatedLargeMessageFailoverTest extends LargeMessageFailoverTest { + private static final Logger log = Logger.getLogger(PluggableQuorumReplicatedLargeMessageFailoverTest.class); @Override protected void createConfigs() throws Exception { createPluggableReplicatedConfigs(); @@ -30,7 +34,6 @@ public class PluggableQuorumReplicatedLargeMessageFailoverTest extends LargeMess @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } @@ -58,4 +61,8 @@ public class PluggableQuorumReplicatedLargeMessageFailoverTest extends LargeMess super.crash(sessions); } + @Override + protected void decrementActivationSequenceForForceRestartOf(TestableServer liveServer) throws Exception { + doDecrementActivationSequenceForForceRestartOf(log, nodeManager, managerConfiguration); + } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest.java 
b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest.java index 5954d8992d..a1e152e813 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest.java @@ -18,7 +18,6 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; import org.apache.activemq.artemis.api.core.client.ClientSession; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.tests.integration.cluster.util.BackupSyncDelay; import org.junit.After; import org.junit.Before; @@ -66,7 +65,6 @@ public class PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest extends @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()) .setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedPagingFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedPagingFailoverTest.java index d80460855b..9c4d60004f 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedPagingFailoverTest.java +++ 
b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedPagingFailoverTest.java @@ -17,7 +17,6 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.tests.integration.cluster.failover.PagingFailoverTest; public class PluggableQuorumReplicatedPagingFailoverTest extends PagingFailoverTest { @@ -29,7 +28,6 @@ public class PluggableQuorumReplicatedPagingFailoverTest extends PagingFailoverT @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/management/ActiveMQServerControlUsingCoreTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/management/ActiveMQServerControlUsingCoreTest.java index 7eb69530c5..ab060fac52 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/management/ActiveMQServerControlUsingCoreTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/management/ActiveMQServerControlUsingCoreTest.java @@ -520,6 +520,11 @@ public class ActiveMQServerControlUsingCoreTest extends ActiveMQServerControlTes return (String) proxy.retrieveAttributeValue("nodeID"); } + @Override + public long getActivationSequence() { + return (Long) proxy.retrieveAttributeValue("activationSequence"); + } + @Override public String getManagementAddress() { return (String) 
proxy.retrieveAttributeValue("managementAddress"); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java index 11e4380246..1a84de67f5 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java @@ -45,8 +45,6 @@ public class PluggableQuorumReplicationFlowControlTest extends SharedNothingRepl ReplicationBackupPolicyConfiguration haPolicy = ReplicationBackupPolicyConfiguration.withDefault(); haPolicy.setDistributedManagerConfiguration(managerConfiguration); haPolicy.setClusterName("cluster"); - // fail-fast in order to let the backup to quickly retry syncing with primary - haPolicy.setVoteRetries(0); return haPolicy; } @@ -54,7 +52,6 @@ public class PluggableQuorumReplicationFlowControlTest extends SharedNothingRepl protected HAPolicyConfiguration createReplicationLiveConfiguration() { ReplicationPrimaryPolicyConfiguration haPolicy = ReplicationPrimaryPolicyConfiguration.withDefault(); haPolicy.setDistributedManagerConfiguration(managerConfiguration); - haPolicy.setCheckForLiveServer(false); return haPolicy; } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationOrderTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationOrderTest.java index f5ed720792..8d71a57e4e 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationOrderTest.java +++ 
b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationOrderTest.java @@ -18,7 +18,6 @@ package org.apache.activemq.artemis.tests.integration.replication; import org.apache.activemq.artemis.api.core.TransportConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; @@ -30,7 +29,7 @@ public class PluggableQuorumReplicationOrderTest extends ReplicationOrderTest { @Parameterized.Parameter public boolean useNetty; - @Parameterized.Parameters(name = "useNetty={1}") + @Parameterized.Parameters(name = "useNetty={0}") public static Iterable getParams() { return asList(new Object[][]{{false}, {true}}); } @@ -42,8 +41,6 @@ public class PluggableQuorumReplicationOrderTest extends ReplicationOrderTest { @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()) - .setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()) .setMaxSavedReplicatedJournalsSize(2) .setAllowFailBack(true); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java index 1d702b5b88..714b2982db 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java @@ -18,14 +18,37 @@ package org.apache.activemq.artemis.tests.integration.replication; import java.io.IOException; 
import java.util.Collections; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import org.apache.activemq.artemis.api.core.Message; +import org.apache.activemq.artemis.api.core.QueueConfiguration; +import org.apache.activemq.artemis.api.core.RoutingType; +import org.apache.activemq.artemis.api.core.client.ClientConsumer; +import org.apache.activemq.artemis.api.core.client.ClientMessage; +import org.apache.activemq.artemis.api.core.client.ClientProducer; +import org.apache.activemq.artemis.api.core.client.ClientSession; +import org.apache.activemq.artemis.api.core.client.ClientSessionFactory; +import org.apache.activemq.artemis.api.core.client.ServerLocator; +import org.apache.activemq.artemis.core.client.impl.ServerLocatorImpl; +import org.apache.activemq.artemis.core.config.Configuration; import org.apache.activemq.artemis.core.config.HAPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.core.server.ActivateCallback; +import org.apache.activemq.artemis.core.server.ActiveMQServer; +import org.apache.activemq.artemis.core.server.ActiveMQServers; +import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager; +import org.apache.activemq.artemis.quorum.DistributedLock; +import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; +import org.apache.activemq.artemis.quorum.MutableLong; import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; +import org.apache.activemq.artemis.tests.util.Wait; +import org.junit.Assert; import org.junit.Before; import org.junit.Rule; +import org.junit.Test; import org.junit.rules.TemporaryFolder; public class PluggableQuorumReplicationTest extends SharedNothingReplicationTest { @@ -43,7 +66,6 @@ 
public class PluggableQuorumReplicationTest extends SharedNothingReplicationTest protected HAPolicyConfiguration createReplicationLiveConfiguration() { ReplicationPrimaryPolicyConfiguration haPolicy = ReplicationPrimaryPolicyConfiguration.withDefault(); haPolicy.setDistributedManagerConfiguration(managerConfiguration); - haPolicy.setCheckForLiveServer(false); return haPolicy; } @@ -55,4 +77,616 @@ public class PluggableQuorumReplicationTest extends SharedNothingReplicationTest return haPolicy; } + @Test + public void testUnReplicatedOrderedTransition() throws Exception { + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + ServerLocator locator = ServerLocatorImpl.newLocator("(tcp://localhost:61616,tcp://localhost:61617)?ha=true"); + locator.setCallTimeout(60_000L); + locator.setConnectionTTL(60_000L); + + ClientSessionFactory csf = locator.createSessionFactory(); + ClientSession clientSession = csf.createSession(); + clientSession.createQueue(new QueueConfiguration("slow").setRoutingType(RoutingType.ANYCAST)); + clientSession.close(); + + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + Wait.waitFor(backupServer::isStarted); + + waitForTopology(liveServer, 1, 1, 30000); + waitForTopology(backupServer, 1, 1, 30000); + + liveServer.stop(); + + // backup will take over and run un replicated + + csf = locator.createSessionFactory(); + clientSession = csf.createSession(); + clientSession.createQueue(new QueueConfiguration("slow_un_replicated").setRoutingType(RoutingType.ANYCAST)); + clientSession.close(); + + waitForTopology(backupServer, 1, 0, 
30000); + assertTrue(Wait.waitFor(() -> 2L == backupServer.getNodeManager().getNodeActivationSequence())); + + backupServer.stop(false); + + // now only backup should be able to start as it has run un_replicated + liveServer.start(); + Wait.assertFalse(liveServer::isActive); + liveServer.stop(); + + // restart backup + backupServer.start(); + + Wait.waitFor(backupServer::isStarted); + assertEquals(3L, backupServer.getNodeManager().getNodeActivationSequence()); + + csf = locator.createSessionFactory(); + clientSession = csf.createSession(); + clientSession.createQueue(new QueueConfiguration("backup_as_un_replicated").setRoutingType(RoutingType.ANYCAST)); + clientSession.close(); + + // verify the live restart as a backup to the restarted backupServer that has taken on the live role, no failback + liveServer.start(); + + csf = locator.createSessionFactory(); + clientSession = csf.createSession(); + clientSession.createQueue(new QueueConfiguration("backup_as_replicated").setRoutingType(RoutingType.ANYCAST)); + clientSession.close(); + + assertTrue(Wait.waitFor(liveServer::isReplicaSync)); + assertTrue(Wait.waitFor(() -> 3L == liveServer.getNodeManager().getNodeActivationSequence())); + + backupServer.stop(true); + + waitForTopology(liveServer, 1, 0, 30000); + assertTrue(Wait.waitFor(() -> 4L == liveServer.getNodeManager().getNodeActivationSequence())); + + liveServer.stop(true); + clientSession.close(); + locator.close(); + } + + @Test + public void testBackupFailoverAndPrimaryFailback() throws Exception { + final int timeout = (int) TimeUnit.SECONDS.toMillis(30); + + // start live + Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer primaryInstance = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + primaryInstance.setIdentity("PRIMARY"); + primaryInstance.start(); + + // primary initially UN REPLICATED + Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence()); + + // start backup + 
Configuration backupConfiguration = createBackupConfiguration(); + ((ReplicationBackupPolicyConfiguration)backupConfiguration.getHAPolicyConfiguration()).setAllowFailBack(true); + + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + Wait.waitFor(backupServer::isStarted); + + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout); + + // primary REPLICATED, backup matches (has replicated) activation sequence + Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence()); + Assert.assertEquals(1L, backupServer.getNodeManager().getNodeActivationSequence()); + + primaryInstance.stop(); + + // backup UN REPLICATED (new version) + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> 2L == backupServer.getNodeManager().getNodeActivationSequence(), timeout); + + // just to let the console logging breath! + TimeUnit.MILLISECONDS.sleep(100); + + // restart primary that will request failback + ActiveMQServer restartedPrimaryForFailBack = primaryInstance; //addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + restartedPrimaryForFailBack.start(); + + // first step is backup getting replicated + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout); + + // restarted primary will run un replicated (increment sequence) while backup restarts to revert to backup role. 
+ org.apache.activemq.artemis.utils.Wait.assertTrue(() -> { + try { + return 3L == restartedPrimaryForFailBack.getNodeManager().getNodeActivationSequence(); + } catch (NullPointerException ok) { + return false; + } + }, timeout); + + // the backup should then resume with an insync replica view of that version + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> restartedPrimaryForFailBack.isReplicaSync(), timeout); + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout); + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> 3L == backupServer.getNodeManager().getNodeActivationSequence(), timeout); + + // just to let the console logging breath! + TimeUnit.MILLISECONDS.sleep(100); + + // stop backup to verify primary goes on with new sequence as un replicated + backupServer.stop(); + + // just to let the console logging breath! + TimeUnit.MILLISECONDS.sleep(100); + + // live goes un replicated + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> { + try { + return 4L == restartedPrimaryForFailBack.getNodeManager().getNodeActivationSequence(); + } catch (NullPointerException ok) { + return false; + } + }, timeout); + + restartedPrimaryForFailBack.stop(); + } + + + @Test + public void testPrimaryIncrementActivationSequenceOnUnReplicated() throws Exception { + final int timeout = (int) TimeUnit.SECONDS.toMillis(30); + + // start live + Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer primaryInstance = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + primaryInstance.setIdentity("PRIMARY"); + primaryInstance.start(); + + // primary UN REPLICATED + Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence()); + + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + 
backupServer.start(); + + Wait.waitFor(backupServer::isStarted); + + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout); + + // primary REPLICATED, backup matches (has replicated) activation sequence + Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence()); + Assert.assertEquals(1L, backupServer.getNodeManager().getNodeActivationSequence()); + + // transition to un replicated once backup goes away + backupServer.stop(); + + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> 2L == primaryInstance.getNodeManager().getNodeActivationSequence(), timeout); + + // done + primaryInstance.stop(); + } + + + @Test + public void testBackupStartsFirst() throws Exception { + + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isStarted)); + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + } + + @Test + public void testBackupOutOfSequenceReleasesLock() throws Exception { + + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + 
liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isStarted)); + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + + backupServer.stop(); + + TimeUnit.SECONDS.sleep(1); + + liveServer.stop(); + // backup can get lock but does not have the sequence to start, will try and be a backup + + backupServer.start(); + + // live server should be active + liveServer.start(); + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isStarted)); + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + } + + + @Test + public void testBackupOutOfSequenceCheckActivationSequence() throws Exception { + + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isStarted)); + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + + backupServer.stop(); + + TimeUnit.SECONDS.sleep(1); + + final String coordinatedId = liveServer.getNodeID().toString(); + liveServer.stop(); + + // backup can get lock but does not have the sequence to start, will try and be a backup + // to verify it can short circuit with a dirty read we grab the lock for a little while + DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager.newInstanceOf( + managerConfiguration.getClassName(), + managerConfiguration.getProperties()); + distributedPrimitiveManager.start(); + 
final DistributedLock lock = distributedPrimitiveManager.getDistributedLock(coordinatedId); + assertTrue(lock.tryLock()); + CountDownLatch preActivate = new CountDownLatch(1); + backupServer.registerActivateCallback(new ActivateCallback() { + @Override + public void preActivate() { + ActivateCallback.super.preActivate(); + preActivate.countDown(); + } + }); + backupServer.start(); + + // it should be able to do a dirty read of the sequence id and not have to wait to get a lock + assertTrue(preActivate.await(1, TimeUnit.SECONDS)); + + // release the lock + distributedPrimitiveManager.stop(); + + // live server should be active + liveServer.start(); + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + } + + @Test + public void testSelfRepairPrimary() throws Exception { + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + + liveServer.start(); + final String coordinatedId = liveServer.getNodeID().toString(); + Wait.waitFor(liveServer::isStarted); + liveServer.stop(); + + liveServer.start(); + Wait.waitFor(liveServer::isStarted); + Assert.assertEquals(2, liveServer.getNodeManager().getNodeActivationSequence()); + liveServer.stop(); + + // backup can get lock but does not have the sequence to start, will try and be a backup + // to verify it can short circuit with a dirty read we grab the lock for a little while + DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager + .newInstanceOf(managerConfiguration.getClassName(), managerConfiguration.getProperties()); + distributedPrimitiveManager.start(); + try (DistributedLock lock = distributedPrimitiveManager.getDistributedLock(coordinatedId)) { + assertTrue(lock.tryLock()); + 
distributedPrimitiveManager.getMutableLong(coordinatedId).compareAndSet(2, -2); + } + liveServer.start(); + Wait.waitFor(liveServer::isStarted); + Assert.assertEquals(2, liveServer.getNodeManager().getNodeActivationSequence()); + Assert.assertEquals(2, distributedPrimitiveManager.getMutableLong(coordinatedId).get()); + + distributedPrimitiveManager.stop(); + + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + Wait.waitFor(backupServer::isReplicaSync); + Assert.assertEquals(2, backupServer.getNodeManager().getNodeActivationSequence()); + backupServer.stop(); + } + + @Test + public void testPrimaryPeers() throws Exception { + final String PEER_NODE_ID = "some-shared-id-001"; + + final Configuration liveConfiguration = createLiveConfiguration(); + ((ReplicationPrimaryPolicyConfiguration)liveConfiguration.getHAPolicyConfiguration()).setCoordinationId(PEER_NODE_ID); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + ServerLocator locator = ServerLocatorImpl.newLocator("(tcp://localhost:61616,tcp://localhost:61617)?ha=true"); + locator.setCallTimeout(60_000L); + locator.setConnectionTTL(60_000L); + final ClientSessionFactory keepLocatorAliveSLF = locator.createSessionFactory(); + + ClientSessionFactory csf = locator.createSessionFactory(); + sendTo(csf, "live_un_replicated"); + csf.close(); + + // start peer, will backup + Configuration peerLiveConfiguration = createBackupConfiguration(); // to get acceptors and locators ports that won't clash + peerLiveConfiguration.setHAPolicyConfiguration(createReplicationLiveConfiguration()); + ((ReplicationPrimaryPolicyConfiguration)peerLiveConfiguration.getHAPolicyConfiguration()).setCoordinationId(PEER_NODE_ID); 
+ peerLiveConfiguration.setName("localhost::live-peer"); + + ActiveMQServer livePeerServer = addServer(ActiveMQServers.newActiveMQServer(peerLiveConfiguration)); + livePeerServer.setIdentity("LIVE-PEER"); + livePeerServer.start(); + + Wait.waitFor(livePeerServer::isStarted); + + waitForTopology(liveServer, 1, 1, 30000); + waitForTopology(livePeerServer, 1, 1, 30000); + + liveServer.stop(); + + // livePeerServer will take over and run un replicated + + csf = locator.createSessionFactory(); + receiveFrom(csf, "live_un_replicated"); + sendTo(csf, "peer_un_replicated"); + csf.close(); + + waitForTopology(livePeerServer, 1, 0, 30000); + + assertTrue(Wait.waitFor(() -> 2L == livePeerServer.getNodeManager().getNodeActivationSequence())); + + livePeerServer.stop(false); + + liveServer.start(); + + Wait.assertTrue(() -> !liveServer.isActive()); + + // restart backup + livePeerServer.start(); + + Wait.waitFor(livePeerServer::isStarted); + + assertEquals(3L, livePeerServer.getNodeManager().getNodeActivationSequence()); + + csf = locator.createSessionFactory(); + receiveFrom(csf, "peer_un_replicated"); + sendTo(csf, "backup_as_un_replicated"); + csf.close(); + + // verify the live restart as a backup to the restarted PeerLiveServer that has taken on the live role + liveServer.start(); + + csf = locator.createSessionFactory(); + receiveFrom(csf, "backup_as_un_replicated"); + sendTo(csf, "backup_as_replicated"); + csf.close(); + + assertTrue(Wait.waitFor(liveServer::isReplicaSync)); + assertTrue(Wait.waitFor(() -> 3L == liveServer.getNodeManager().getNodeActivationSequence())); + + waitForTopology(liveServer, 1, 1, 30000); + waitForTopology(livePeerServer, 1, 1, 30000); + + livePeerServer.stop(true); + + assertTrue(Wait.waitFor(() -> 4L == liveServer.getNodeManager().getNodeActivationSequence())); + + csf = locator.createSessionFactory(); + receiveFrom(csf, "backup_as_replicated"); + csf.close(); + + waitForTopology(liveServer, 1, 0, 30000); + + liveServer.stop(true); + 
keepLocatorAliveSLF.close(); + locator.close(); + } + + @Test + public void testUnavailableSelfHeal() throws Exception { + + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isStarted)); + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + + final String coordinatedId = liveServer.getNodeID().toString(); + + backupServer.stop(); + TimeUnit.MILLISECONDS.sleep(500); + liveServer.stop(); + + // some manual intervention to force an unavailable + // simulate live failing in activation local sequence update on un replicated run when backup stops. 
+ + DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager.newInstanceOf(managerConfiguration.getClassName(), managerConfiguration.getProperties()); + distributedPrimitiveManager.start(); + final MutableLong activationSequence = distributedPrimitiveManager.getMutableLong(coordinatedId); + Assert.assertTrue(activationSequence.compareAndSet(2, -2)); + + // case: 1, the fail to write locally 2 but the write actually succeeding + // should delay pending resolution of the uncommitted claim + backupServer.start(); + + // live server should activate after self healing its outstanding claim + liveServer.start(); + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + } + + @Test + public void testUnavailableAdminIntervention() throws Exception { + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isStarted)); + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + + final String coordinatedId = liveServer.getNodeID().toString(); + + System.err.println("coodr id: " + coordinatedId); + backupServer.stop(); + TimeUnit.MILLISECONDS.sleep(500); + liveServer.stop(); + + // some manual intervention to force an unavailable + // simulate live failing in activation local sequence update on un replicated run when backup stops. 
+ + DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager.newInstanceOf( + managerConfiguration.getClassName(), + managerConfiguration.getProperties()); + distributedPrimitiveManager.start(); + final MutableLong coordinatedActivationSequence = distributedPrimitiveManager.getMutableLong(coordinatedId); + Assert.assertTrue(coordinatedActivationSequence.compareAndSet(2, -2)); + + // case: 2, the fail to write locally 2 but the write actually failing + // need to put 1 in the local activation sequence of the live + FileLockNodeManager fileLockNodeManager = new FileLockNodeManager(liveConfiguration.getNodeManagerLockLocation().getAbsoluteFile(), true); + fileLockNodeManager.start(); + assertEquals(2, fileLockNodeManager.readNodeActivationSequence()); + fileLockNodeManager.writeNodeActivationSequence(1); + fileLockNodeManager.stop(); + + // should delay pending resolution of the uncommitted claim + backupServer.start(); + CountDownLatch liveStarting = new CountDownLatch(1); + // should delay pending resolution of the uncommitted claim + // IMPORTANT: primary activation run on the start caller thread!! 
We need another thread here + final Thread liveServerStarterThread = new Thread(() -> { + liveStarting.countDown(); + try { + liveServer.start(); + } catch (Throwable e) { + e.printStackTrace(); + } + }); + liveServerStarterThread.start(); + liveStarting.await(); + TimeUnit.MILLISECONDS.sleep(500); + // both are candidates and one of them failed to commit the claim + // let them compete on retry + Assert.assertTrue(coordinatedActivationSequence.compareAndSet(-2, 1)); + // one of the two can activate + Wait.waitFor(() -> liveServer.isStarted() || backupServer.isStarted()); + + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + + assertEquals(2, backupServer.getNodeManager().getNodeActivationSequence()); + assertEquals(2, liveServer.getNodeManager().getNodeActivationSequence()); + + } + + private void sendTo(ClientSessionFactory clientSessionFactory, String addr) throws Exception { + ClientSession clientSession = clientSessionFactory.createSession(true, true); + clientSession.createQueue(new QueueConfiguration(addr).setRoutingType(RoutingType.ANYCAST).setDurable(true)); + ClientProducer producer = clientSession.createProducer(addr); + ClientMessage message = clientSession.createMessage(true); + message.putStringProperty("K", addr); + message.putLongProperty("delay", 0L); // so slow interceptor does not get us + producer.send(message); + producer.close(); + clientSession.close(); + } + + private void receiveFrom(ClientSessionFactory clientSessionFactory, String addr) throws Exception { + ClientSession clientSession = clientSessionFactory.createSession(true, true); + clientSession.start(); + ClientConsumer consumer = clientSession.createConsumer(addr); + Message message = consumer.receive(4000); + assertNotNull(message); + assertTrue(message.getStringProperty("K").equals(addr)); + consumer.close(); + clientSession.close(); + } } diff --git 
a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationTest.java index 1d3317c40e..fde20f4a87 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationTest.java @@ -244,7 +244,7 @@ public class SharedNothingReplicationTest extends ActiveMQTestBase { .setCheckForLiveServer(false); } - private Configuration createLiveConfiguration() throws Exception { + protected Configuration createLiveConfiguration() throws Exception { Configuration conf = new ConfigurationImpl(); conf.setName("localhost::live"); @@ -275,7 +275,7 @@ public class SharedNothingReplicationTest extends ActiveMQTestBase { return new ReplicaPolicyConfiguration().setClusterName("cluster"); } - private Configuration createBackupConfiguration() throws Exception { + protected Configuration createBackupConfiguration() throws Exception { Configuration conf = new ConfigurationImpl(); conf.setName("localhost::backup"); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java index 60f27a13c2..267f553211 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java @@ -24,7 +24,6 @@ import 
org.apache.activemq.artemis.api.core.client.ClientSession; import org.apache.activemq.artemis.api.core.client.ClientSessionFactory; import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; import org.apache.activemq.artemis.spi.core.security.ActiveMQBasicSecurityManager; import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTestBase; @@ -66,9 +65,9 @@ public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTes .setLargeMessagesDirectory(getLargeMessagesDir(0, true)); setupHAPolicyConfiguration(); - nodeManager = createReplicatedBackupNodeManager(backupConfig); + backupNodeManager = createReplicatedBackupNodeManager(backupConfig); - backupServer = createTestableServer(backupConfig); + backupServer = createTestableServer(backupConfig, backupNodeManager); backupServer.getServer().setSecurityManager(new ActiveMQBasicSecurityManager()); @@ -77,7 +76,8 @@ public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTes .clearAcceptorConfigurations() .addAcceptorConfiguration(getAcceptorTransportConfiguration(true)); - liveServer = createTestableServer(liveConfig); + nodeManager = createNodeManager(liveConfig); + liveServer = createTestableServer(liveConfig, nodeManager); liveServer.getServer().setSecurityManager(new ActiveMQBasicSecurityManager()); } @@ -94,7 +94,6 @@ public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTes @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) 
backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } diff --git a/tests/smoke-tests/pom.xml b/tests/smoke-tests/pom.xml index 764fa0a396..9684932e8c 100644 --- a/tests/smoke-tests/pom.xml +++ b/tests/smoke-tests/pom.xml @@ -677,6 +677,44 @@ + + test-compile + create-zk-replication-primary-peer-a + + create + + + ${basedir}/target/classes/servers/zkReplicationPrimaryPeerA + true + admin + admin + ${basedir}/target/zkReplicationPrimaryPeerA + + + --java-options + -Djava.rmi.server.hostname=localhost + + + + + test-compile + create-zk-replication-primary-peer-b + + create + + + ${basedir}/target/classes/servers/zkReplicationPrimaryPeerB + true + admin + admin + ${basedir}/target/zkReplicationPrimaryPeerB + + + --java-options + -Djava.rmi.server.hostname=localhost + + + test-compile create-zk-replication-backup diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/broker.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/broker.xml index 2095d20830..75b9e7405f 100644 --- a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/broker.xml +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/broker.xml @@ -39,7 +39,6 @@ under the License. 
- true diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/broker.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/broker.xml new file mode 100644 index 0000000000..9c2567d509 --- /dev/null +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/broker.xml @@ -0,0 +1,138 @@ + + + + + + primary-peer-a + + ./data/bindings + + ./data/journal + + ./data/largemessages + + ./data/paging + + + + + + org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager + + + + + peer-journal-001 + + + + + + + tcp://localhost:61616 + tcp://localhost:61716 + + + + + + tcp://localhost:61616 + + + admin + + password + + + + artemis + OFF + 1 + + peer + + + + + + + + + + + + + + + + + + + + + + + + + + + DLQ + ExpiryQueue + 0 + + -1 + 10 + PAGE + true + true + true + true + + + + DLQ + ExpiryQueue + 0 + + 10MB + 1MB + + 10 + PAGE + true + true + true + true + + + + +
+ + +
+
+ + + +
+
+
+
diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/management.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/management.xml new file mode 100644 index 0000000000..576f1e5995 --- /dev/null +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/management.xml @@ -0,0 +1,20 @@ + + + + + \ No newline at end of file diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/broker.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/broker.xml new file mode 100644 index 0000000000..3db6ead7a1 --- /dev/null +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/broker.xml @@ -0,0 +1,138 @@ + + + + + + primary-peer-b + + ./data/bindings + + ./data/journal + + ./data/largemessages + + ./data/paging + + + + + + org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager + + + + + peer-journal-001 + + + + + + + tcp://localhost:61716 + tcp://localhost:61616 + + + + + + tcp://localhost:61716 + + + admin + + password + + + + artemis + OFF + 1 + + peer + + + + + + + + + + + + + + + + + + + + + + + + + + + DLQ + ExpiryQueue + 0 + + -1 + 10 + PAGE + true + true + true + true + + + + DLQ + ExpiryQueue + 0 + + 10MB + 1MB + + 10 + PAGE + true + true + true + true + + + + +
+ + +
+
+ + + +
+
+
+
diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/management.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/management.xml new file mode 100644 index 0000000000..14bbaf2218 --- /dev/null +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/management.xml @@ -0,0 +1,20 @@ + + + + + \ No newline at end of file diff --git a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/PluggableQuorumSinglePairTest.java b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/PluggableQuorumSinglePairTest.java index d5948f57a1..ab6ee9237e 100644 --- a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/PluggableQuorumSinglePairTest.java +++ b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/PluggableQuorumSinglePairTest.java @@ -20,8 +20,7 @@ package org.apache.activemq.artemis.tests.smoke.quorum; import javax.management.remote.JMXServiceURL; import java.net.MalformedURLException; import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; +import java.util.LinkedList; import java.util.Objects; import java.util.Optional; import java.util.concurrent.TimeUnit; @@ -55,15 +54,15 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { private static final Logger LOGGER = Logger.getLogger(PluggableQuorumSinglePairTest.class); - private static final String JMX_SERVER_HOSTNAME = "localhost"; - private static final int JMX_PORT_PRIMARY = 10099; - private static final int JMX_PORT_BACKUP = 10199; + static final String JMX_SERVER_HOSTNAME = "localhost"; + static final int JMX_PORT_PRIMARY = 10099; + static final int JMX_PORT_BACKUP = 10199; - private static final String PRIMARY_DATA_FOLDER = "ReplicationPrimary";; - private static final String BACKUP_DATA_FOLDER = "ReplicationBackup"; + static final String PRIMARY_DATA_FOLDER = "ReplicationPrimary"; 
+ static final String BACKUP_DATA_FOLDER = "ReplicationBackup"; - private static final int PRIMARY_PORT_OFFSET = 0; - private static final int BACKUP_PORT_OFFSET = PRIMARY_PORT_OFFSET + 100; + static final int PRIMARY_PORT_OFFSET = 0; + static final int BACKUP_PORT_OFFSET = PRIMARY_PORT_OFFSET + 100; public static class BrokerControl { @@ -73,7 +72,7 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { final JMXServiceURL jmxServiceURL; final int portID; - private BrokerControl(final String name, int jmxPort, String dataFolder, int portID) { + BrokerControl(final String name, int jmxPort, String dataFolder, int portID) { this.portID = portID; this.dataFolder = dataFolder; try { @@ -108,6 +107,14 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { public Optional listNetworkTopology() throws Exception { return Jmx.listNetworkTopology(jmxServiceURL, objectNameBuilder); } + + public Optional getActivationSequence() throws Exception { + return Jmx.getActivationSequence(jmxServiceURL, objectNameBuilder); + } + + public Optional isActive() throws Exception { + return Jmx.isActive(jmxServiceURL, objectNameBuilder); + } } @Parameterized.Parameter @@ -118,14 +125,14 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { return Arrays.asList(new Object[][]{{false}, {true}}); } - private final BrokerControl primary; - private final BrokerControl backup; - private final Collection brokers; + protected BrokerControl primary; + protected BrokerControl backup; + protected LinkedList brokers; public PluggableQuorumSinglePairTest(String brokerFolderPrefix) { primary = new BrokerControl("primary", JMX_PORT_PRIMARY, brokerFolderPrefix + PRIMARY_DATA_FOLDER, PRIMARY_PORT_OFFSET); backup = new BrokerControl("backup", JMX_PORT_BACKUP, brokerFolderPrefix + BACKUP_DATA_FOLDER, BACKUP_PORT_OFFSET); - brokers = Collections.unmodifiableList(Arrays.asList(primary, backup)); + brokers = new 
LinkedList(Arrays.asList(primary, backup)); } protected abstract boolean awaitAsyncSetupCompleted(long timeout, TimeUnit unit) throws InterruptedException; @@ -150,6 +157,10 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { Process primaryInstance = primary.startServer(this, timeout); Assert.assertTrue(awaitAsyncSetupCompleted(timeout, TimeUnit.MILLISECONDS)); Wait.assertTrue(() -> !primary.isBackup().orElse(true), timeout); + + // primary UN REPLICATED + Assert.assertEquals(1L, primary.getActivationSequence().get().longValue()); + LOGGER.info("started primary"); LOGGER.info("starting backup"); Process backupInstance = backup.startServer(this, 0); @@ -176,6 +187,11 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { Assert.assertNotNull(urlPrimary); LOGGER.infof("primary: %s", urlPrimary); Assert.assertNotEquals(urlPrimary, urlBackup); + + // primary REPLICATED, backup matches (has replicated) activation sequence + Assert.assertEquals(1L, primary.getActivationSequence().get().longValue()); + Assert.assertEquals(1L, backup.getActivationSequence().get().longValue()); + LOGGER.info("killing primary"); ServerUtil.killServer(primaryInstance, forceKill); LOGGER.info("killed primary"); @@ -188,11 +204,15 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { .and(withNodes(1))), timeout); LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get()); Assert.assertEquals(nodeID, backup.getNodeID().get()); + + // backup UN REPLICATED (new version) + Assert.assertEquals(2L, backup.getActivationSequence().get().longValue()); + // wait a bit before restarting primary LOGGER.info("waiting before starting primary"); TimeUnit.SECONDS.sleep(4); LOGGER.info("starting primary"); - primary.startServer(this, 0); + primaryInstance = primary.startServer(this, 0); LOGGER.info("started primary"); Wait.assertTrue(() -> backup.isBackup().orElse(false), timeout); 
Assert.assertTrue(!primary.isBackup().get()); @@ -209,6 +229,14 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { Assert.assertTrue(backup.isReplicaSync().get()); LOGGER.infof("backup is synchronized with live"); Assert.assertEquals(nodeID, primary.getNodeID().get()); + + // primary ran un replicated for a short while after failback, before backup was in sync + Assert.assertEquals(3L, primary.getActivationSequence().get().longValue()); + Assert.assertEquals(3L, backup.getActivationSequence().get().longValue()); + + LOGGER.infof("Done, killing both"); + ServerUtil.killServer(primaryInstance); + ServerUtil.killServer(backupInstance); } @Test @@ -272,5 +300,87 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { Wait.waitFor(()-> !backupInstance.isAlive(), timeout); } + + @Test + public void testOnlyLastUnreplicatedCanStart() throws Exception { + final int timeout = (int) TimeUnit.SECONDS.toMillis(30); + LOGGER.info("starting primary"); + Process primaryInstance = primary.startServer(this, timeout); + Assert.assertTrue(awaitAsyncSetupCompleted(timeout, TimeUnit.MILLISECONDS)); + Wait.assertTrue(() -> !primary.isBackup().orElse(true), timeout); + LOGGER.info("started primary"); + LOGGER.info("starting backup"); + Process backupInstance = backup.startServer(this, 0); + Wait.assertTrue(() -> backup.isBackup().orElse(false), timeout); + final String nodeID = primary.getNodeID().get(); + Assert.assertNotNull(nodeID); + LOGGER.infof("NodeID: %s", nodeID); + for (BrokerControl broker : brokers) { + Wait.assertTrue(() -> validateNetworkTopology(broker.listNetworkTopology().orElse(""), + containsExactNodeIds(nodeID) + .and(withLive(nodeID, Objects::nonNull)) + .and(withBackup(nodeID, Objects::nonNull)) + .and(withMembers(1)) + .and(withNodes(2))), timeout); + } + LOGGER.infof("primary topology is: %s", primary.listNetworkTopology().get()); + LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get()); + 
Assert.assertTrue(backup.isReplicaSync().get()); + LOGGER.infof("backup is synchronized with live"); + final String urlBackup = backupOf(nodeID, decodeNetworkTopologyJson(backup.listNetworkTopology().get())); + Assert.assertNotNull(urlBackup); + LOGGER.infof("backup: %s", urlBackup); + final String urlPrimary = liveOf(nodeID, decodeNetworkTopologyJson(primary.listNetworkTopology().get())); + Assert.assertNotNull(urlPrimary); + LOGGER.infof("primary: %s", urlPrimary); + Assert.assertNotEquals(urlPrimary, urlBackup); + + + // verify sequence id's in sync + Assert.assertEquals(1L, primary.getActivationSequence().get().longValue()); + Assert.assertEquals(1L, backup.getActivationSequence().get().longValue()); + + LOGGER.info("killing primary"); + ServerUtil.killServer(primaryInstance, forceKill); + LOGGER.info("killed primary"); + Wait.assertTrue(() -> !backup.isBackup().orElse(true), timeout); + Wait.assertTrue(() -> validateNetworkTopology(backup.listNetworkTopology().orElse(""), + containsExactNodeIds(nodeID) + .and(withLive(nodeID, urlBackup::equals)) + .and(withBackup(nodeID, Objects::isNull)) + .and(withMembers(1)) + .and(withNodes(1))), timeout); + LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get()); + Assert.assertEquals(nodeID, backup.getNodeID().get()); + + + // backup now UNREPLICATED, it is the only node that can continue + Assert.assertEquals(2L, backup.getActivationSequence().get().longValue()); + + LOGGER.info("killing backup"); + ServerUtil.killServer(backupInstance, forceKill); + + // wait a bit before restarting primary + LOGGER.info("waiting before starting primary"); + TimeUnit.SECONDS.sleep(4); + LOGGER.info("restarting primary"); + + Process restartedPrimary = primary.startServer(this, 0); + LOGGER.info("restarted primary, " + restartedPrimary); + + Wait.assertFalse("Primary shouldn't activate", () -> primary.isActive().orElse(false), 5000); + + ServerUtil.killServer(restartedPrimary); + + LOGGER.info("restarting backup"); + 
+ // backup can resume with data seq 3 + final Process restartedBackupInstance = backup.startServer(this, 5000); + Wait.waitFor(() -> backup.isActive().orElse(false), 5000); + assertTrue(Wait.waitFor(() -> nodeID.equals(backup.getNodeID().orElse("not set yet")))); + LOGGER.info("restarted backup"); + + Assert.assertEquals(3L, backup.getActivationSequence().get().longValue()); + } } diff --git a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumPeerTest.java b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumPeerTest.java new file mode 100644 index 0000000000..14fabd8292 --- /dev/null +++ b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumPeerTest.java @@ -0,0 +1,109 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.activemq.artemis.tests.smoke.quorum; + +import java.util.Arrays; +import java.util.LinkedList; +import java.util.Objects; +import java.util.concurrent.TimeUnit; + +import org.apache.activemq.artemis.util.ServerUtil; +import org.apache.activemq.artemis.utils.Wait; +import org.jboss.logging.Logger; +import org.junit.Assert; +import org.junit.Test; + +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.containsExactNodeIds; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.validateNetworkTopology; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withBackup; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withLive; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withMembers; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withNodes; + +public class ZookeeperPluggableQuorumPeerTest extends ZookeeperPluggableQuorumSinglePairTest { + + private static final Logger LOGGER = Logger.getLogger(ZookeeperPluggableQuorumPeerTest.class); + + public ZookeeperPluggableQuorumPeerTest() { + super(); + // accepting the primary/backup vars to reuse the test, for peers, these are interchangeable as either can take + // both roles as both wish to be primary but will revert to backup + primary = new BrokerControl("primary-peer-a", JMX_PORT_PRIMARY, "zkReplicationPrimaryPeerA", PRIMARY_PORT_OFFSET); + backup = new BrokerControl("primary-peer-b", JMX_PORT_BACKUP, "zkReplicationPrimaryPeerB", BACKUP_PORT_OFFSET); + brokers = new LinkedList(Arrays.asList(primary, backup)); + } + + @Test + @Override + public void testBackupFailoverAndPrimaryFailback() throws Exception { + // peers don't request fail back by default + // just wait for setup to avoid partial stop of zk via fast tear down with async setup + Wait.waitFor(this::ensembleHasLeader); + } + + @Test + public void testMultiPrimary_Peer() throws Exception { + + final int timeout = (int) TimeUnit.SECONDS.toMillis(30); + 
LOGGER.info("starting peer b primary"); + + Process backupInstance = backup.startServer(this, timeout); + + // alive as unreplicated, it has configured node id + assertTrue(Wait.waitFor(() -> 1L == backup.getActivationSequence().orElse(Long.MAX_VALUE).longValue())); + + final String nodeID = backup.getNodeID().get(); + Assert.assertNotNull(nodeID); + LOGGER.infof("NodeID: %s", nodeID); + + LOGGER.info("starting peer a primary"); + primary.startServer(this, 0); + Wait.assertTrue(() -> primary.isBackup().orElse(false), timeout); + + Wait.assertTrue(() -> !backup.isBackup().orElse(true), timeout); + + for (BrokerControl broker : brokers) { + Wait.assertTrue(() -> validateNetworkTopology(broker.listNetworkTopology().orElse(""), + containsExactNodeIds(nodeID) + .and(withLive(nodeID, Objects::nonNull)) + .and(withBackup(nodeID, Objects::nonNull)) + .and(withMembers(1)) + .and(withNodes(2))), timeout); + } + + LOGGER.infof("primary topology is: %s", primary.listNetworkTopology().get()); + LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get()); + Assert.assertTrue(backup.isReplicaSync().get()); + Assert.assertTrue(primary.isReplicaSync().get()); + + + LOGGER.info("killing peer-b"); + ServerUtil.killServer(backupInstance, forceKill); + + // peer-a now UNREPLICATED + Wait.assertTrue(() -> 2L == primary.getActivationSequence().get().longValue()); + + LOGGER.info("restarting peer-b"); + backup.startServer(this, 0); + + assertTrue(Wait.waitFor(() -> nodeID.equals(backup.getNodeID().orElse("not set yet")))); + // peer-b now a REPLICA + Wait.waitFor(() -> backup.isReplicaSync().get()); + Wait.assertTrue(() -> 2L == backup.getActivationSequence().get().longValue()); + } +} diff --git a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java index df2123f7d6..57f1cce866 
100644 --- a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java +++ b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java @@ -1,13 +1,13 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - *

- * http://www.apache.org/licenses/LICENSE-2.0 - *

+ * + * http://www.apache.org/licenses/LICENSE-2.0 + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,6 +19,8 @@ package org.apache.activemq.artemis.tests.smoke.quorum; import java.util.List; import java.util.concurrent.TimeUnit; + +import org.apache.activemq.artemis.utils.ThreadLeakCheckRule; import org.apache.curator.test.InstanceSpec; import org.apache.curator.test.TestingCluster; import org.apache.curator.test.TestingZooKeeperServer; @@ -38,7 +40,7 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl @Rule public TemporaryFolder tmpFolder = new TemporaryFolder(); - private TestingCluster testingServer; + protected TestingCluster testingServer; private InstanceSpec[] clusterSpecs; private int nodes; @@ -60,6 +62,8 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl @Override @After public void after() throws Exception { + // zk bits that leak from servers + ThreadLeakCheckRule.addKownThread("ListenerHandler-"); try { super.after(); } finally { @@ -76,6 +80,16 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl return true; } + protected boolean ensembleHasLeader() { + return testingServer.getServers().stream().filter(ZookeeperPluggableQuorumSinglePairTest::isLeader).count() != 0; + } + + private static boolean isLeader(TestingZooKeeperServer server) { + long leaderId = server.getQuorumPeer().getLeaderId(); + long id = server.getQuorumPeer().getId(); + return id == leaderId; + } + @Override protected void stopMajority() throws Exception { List followers = testingServer.getServers(); diff --git a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/utils/Jmx.java b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/utils/Jmx.java index 079e2f75b0..5c5f044382 
100644 --- a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/utils/Jmx.java +++ b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/utils/Jmx.java @@ -77,6 +77,16 @@ public class Jmx { return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::getNodeID, ActiveMQServerControl.class, throwable -> null); } + public static Optional getActivationSequence(JMXServiceURL serviceURI, ObjectNameBuilder builder) throws Exception { + return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::getActivationSequence, ActiveMQServerControl.class, throwable -> null); + + } + + public static Optional isActive(JMXServiceURL serviceURI, ObjectNameBuilder builder) throws Exception { + return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::isActive, ActiveMQServerControl.class, throwable -> null); + + } + public static Optional listNetworkTopology(JMXServiceURL serviceURI, ObjectNameBuilder builder) throws Exception { return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::listNetworkTopology, ActiveMQServerControl.class, throwable -> null); diff --git a/tests/unit-tests/src/test/java/org/apache/activemq/artemis/tests/unit/core/server/impl/FileLockTest.java b/tests/unit-tests/src/test/java/org/apache/activemq/artemis/tests/unit/core/server/impl/FileLockTest.java index 88127937bc..bfac547824 100644 --- a/tests/unit-tests/src/test/java/org/apache/activemq/artemis/tests/unit/core/server/impl/FileLockTest.java +++ b/tests/unit-tests/src/test/java/org/apache/activemq/artemis/tests/unit/core/server/impl/FileLockTest.java @@ -18,6 +18,7 @@ package org.apache.activemq.artemis.tests.unit.core.server.impl; import java.io.File; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager; import 
org.apache.activemq.artemis.tests.util.ActiveMQTestBase; import org.junit.Before; @@ -33,6 +34,25 @@ public class FileLockTest extends ActiveMQTestBase { file.mkdirs(); } + @Test + public void testSetNodeID() throws Exception { + FileLockNodeManager underTest = new FileLockNodeManager(getTestDirfile(), false); + ReplicationPrimaryPolicyConfiguration replicationPrimaryPolicyConfiguration = ReplicationPrimaryPolicyConfiguration.withDefault(); + String seed = ""; + for (int i = 0; i < 20; i++) { + replicationPrimaryPolicyConfiguration.setCoordinationId(seed); + if (replicationPrimaryPolicyConfiguration.getCoordinationId() != null) { + underTest.setNodeID(replicationPrimaryPolicyConfiguration.getCoordinationId()); + } + seed += String.valueOf(i); + } + + replicationPrimaryPolicyConfiguration.setCoordinationId("somme-dash-and-odd"); + if (replicationPrimaryPolicyConfiguration.getCoordinationId() != null) { + underTest.setNodeID(replicationPrimaryPolicyConfiguration.getCoordinationId()); + } + } + @Test public void testNIOLock() throws Exception { doTestLock(new FileLockNodeManager(getTestDirfile(), false), new FileLockNodeManager(getTestDirfile(), false));