From 536271485f1b19df9c1c71089fe1e0814a309e0e Mon Sep 17 00:00:00 2001
From: Francesco Nigro
Date: Tue, 14 Apr 2020 17:11:32 +0200
Subject: [PATCH 1/4] ARTEMIS-2716 Pluggable Quorum Vote

---
 .../artemis/utils/ThreadLeakCheckRule.java | 12 +
 .../config/ActiveMQDefaultConfiguration.java | 16 +
 artemis-distribution/pom.xml | 11 +
 .../src/main/assembly/dep.xml | 9 +
 .../src/main/resources/features.xml | 1 +
 artemis-quorum-api/pom.xml | 41 ++
 .../artemis/quorum/DistributedLock.java | 87 +++
 .../quorum/DistributedPrimitiveManager.java | 56 ++
 .../activemq/artemis/quorum/MutableLong.java | 51 ++
 .../quorum/UnavailableStateException.java | 36 ++
 artemis-quorum-ri/pom.xml | 124 ++++
 .../file/FileBasedPrimitiveManager.java | 134 ++++
 .../quorum/file/FileDistributedLock.java | 141 +++++
 .../zookeeper/CuratorDistributedLock.java | 171 +++++
 .../CuratorDistributedPrimitive.java | 172 +++++
 .../CuratorDistributedPrimitiveManager.java | 367 +++++++++++
 .../quorum/zookeeper/CuratorMutableLong.java | 67 ++
 .../artemis/quorum/DistributedLockTest.java | 297 +++++++++
 .../quorum/file/FileDistributedLockTest.java | 70 ++
 .../zookeeper/CuratorDistributedLockTest.java | 364 +++++++++++
 ...uratorDistributedPrimitiveManagerTest.java | 140 ++++
 artemis-server/pom.xml | 5 +
 .../core/config/ConfigurationUtils.java | 9 +
 .../core/config/HAPolicyConfiguration.java | 4 +-
 ...tributedPrimitiveManagerConfiguration.java | 39 ++
 .../ReplicationBackupPolicyConfiguration.java | 140 ++++
 ...ReplicationPrimaryPolicyConfiguration.java | 125 ++++
 .../impl/FileConfigurationParser.java | 82 +++
 .../core/replication/ReplicationEndpoint.java | 42 +-
 .../artemis/core/server/ActiveMQServer.java | 6 -
 .../artemis/core/server/LiveNodeLocator.java | 22 +-
 .../artemis/core/server/NodeManager.java | 10 +
 .../server/cluster/ClusterController.java | 68 +-
 .../core/server/cluster/ClusterManager.java | 4 +-
 .../core/server/cluster/ha/HAPolicy.java | 4 +
 .../cluster/ha/ReplicationBackupPolicy.java | 176 +++++
 .../cluster/ha/ReplicationPrimaryPolicy.java | 166 +++++
 .../qourum/SharedNothingBackupQuorum.java | 13 +-
 .../core/server/files/FileMoveManager.java | 45 +-
 .../artemis/core/server/impl/Activation.java | 4 +
 .../core/server/impl/ActiveMQServerImpl.java | 35 +-
 .../AnyLiveNodeLocatorForReplication.java | 6 +-
 .../server/impl/ClusterTopologySearch.java | 160 +++++
 .../server/impl/FileBasedNodeManager.java | 65 +-
 .../NamedLiveNodeIdLocatorForReplication.java | 127 ++++
 .../NamedLiveNodeLocatorForReplication.java | 7 +-
 .../impl/ReplicationBackupActivation.java | 599 ++++++++++++++++++
 .../core/server/impl/ReplicationObserver.java | 332 ++++++++++
 .../impl/ReplicationPrimaryActivation.java | 439 +++++++++++++
 .../impl/SharedNothingBackupActivation.java | 24 +-
 .../impl/SharedNothingLiveActivation.java | 9 +
 .../schema/artemis-configuration.xsd | 197 +++++-
 .../impl/HAPolicyConfigurationTest.java | 257 ++++++++
 .../artemis/tests/util/ActiveMQTestBase.java | 15 +
 .../test/resources/backup-hapolicy-config.xml | 54 ++
 .../resources/primary-hapolicy-config.xml | 52 ++
 docs/user-manual/en/ha.md | 259 +++++++-
 pom.xml | 31 +
 .../extras/byteman/ScaleDownFailoverTest.java | 6 +-
 .../extras/byteman/ScaleDownFailureTest.java | 4 +-
 tests/integration-tests/pom.xml | 6 +
 .../client/InfiniteRedeliveryTest.java | 13 +-
 ...PluggableQuorumInfiniteRedeliveryTest.java | 60 ++
 .../cluster/distribution/ClusterTestBase.java | 89 ++-
 .../distribution/ClusterWithBackupTest.java | 12 +-
 .../SimpleSymmetricClusterTest.java | 12 +-
.../SymmetricClusterWithBackupTest.java | 20 +- .../cluster/failover/FailoverTest.java | 13 +- .../cluster/failover/FailoverTestBase.java | 45 +- .../GroupingFailoverReplicationTest.java | 4 +- .../GroupingFailoverSharedServerTest.java | 4 +- .../failover/GroupingFailoverTestBase.java | 62 +- .../LiveVoteOnBackupFailureClusterTest.java | 18 +- .../MultipleServerFailoverTestBase.java | 84 ++- .../failover/NettyReplicationStopTest.java | 2 +- .../failover/NetworkIsolationTest.java | 4 +- .../cluster/failover/ReplicaTimeoutTest.java | 74 ++- .../failover/ReplicatedDistributionTest.java | 10 +- ...ultipleServerFailoverExtraBackupsTest.java | 26 +- .../ReplicatedMultipleServerFailoverTest.java | 18 +- .../failover/SharedStoreDistributionTest.java | 4 +- .../SharedStoreDontWaitForActivationTest.java | 4 +- .../failover/SharedStoreMetricsLeakTest.java | 4 +- .../SharedStoreScaleDownBackupTest.java | 6 +- .../StaticClusterWithBackupFailoverTest.java | 12 +- ...uggableQuorumBackupAuthenticationTest.java | 121 ++++ .../PluggableQuorumBackupSyncJournalTest.java | 39 ++ ...orumExtraBackupReplicatedFailoverTest.java | 104 +++ ...QuorumGroupingFailoverReplicationTest.java | 29 + ...ettyNoGroupNameReplicatedFailoverTest.java | 213 +++++++ ...uggableQuorumNettyReplicationStopTest.java | 36 ++ ...rumPageCleanupWhileReplicaCatchupTest.java | 36 ++ .../PluggableQuorumReplicaTimeoutTest.java | 62 ++ ...gableQuorumReplicatedDistributionTest.java | 27 + ...umReplicatedLargeMessageFailoverTest.java} | 21 +- ...tedLargeMessageWithDelayFailoverTest.java} | 22 +- ...bleQuorumReplicatedPagingFailoverTest.java | 35 + .../cluster/util/BackupSyncDelay.java | 16 +- .../openwire/OpenWireProtocolManagerTest.java | 2 +- ...gableQuorumReplicationFlowControlTest.java | 60 ++ .../PluggableQuorumReplicationOrderTest.java | 63 ++ .../PluggableQuorumReplicationTest.java | 58 ++ .../replication/ReplicationTest.java | 31 +- ...aredNothingReplicationFlowControlTest.java | 21 +- .../SharedNothingReplicationTest.java | 20 +- ...uorumBasicSecurityManagerFailoverTest.java | 128 ++++ .../server/ScaleDown3NodeTest.java | 6 +- .../tests/util/ReplicatedBackupUtils.java | 29 + tests/smoke-tests/pom.xml | 74 +++ .../servers/zkReplicationBackup/broker.xml | 138 ++++ .../zkReplicationBackup/management.xml | 20 + .../servers/zkReplicationPrimary/broker.xml | 137 ++++ .../zkReplicationPrimary/management.xml | 20 + .../quorum/PluggableQuorumSinglePairTest.java | 276 ++++++++ ...ookeeperPluggableQuorumSinglePairTest.java | 87 +++ .../artemis/tests/smoke/utils/Jmx.java | 157 +++++ 116 files changed, 8405 insertions(+), 298 deletions(-) create mode 100644 artemis-quorum-api/pom.xml create mode 100644 artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/DistributedLock.java create mode 100644 artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/DistributedPrimitiveManager.java create mode 100644 artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/MutableLong.java create mode 100644 artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/UnavailableStateException.java create mode 100644 artemis-quorum-ri/pom.xml create mode 100644 artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileBasedPrimitiveManager.java create mode 100644 artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileDistributedLock.java create mode 100644 artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedLock.java create mode 100644 
artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedPrimitive.java create mode 100644 artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedPrimitiveManager.java create mode 100644 artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorMutableLong.java create mode 100644 artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/DistributedLockTest.java create mode 100644 artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/file/FileDistributedLockTest.java create mode 100644 artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedLockTest.java create mode 100644 artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedPrimitiveManagerTest.java create mode 100644 artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/DistributedPrimitiveManagerConfiguration.java create mode 100644 artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationBackupPolicyConfiguration.java create mode 100644 artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationPrimaryPolicyConfiguration.java create mode 100644 artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationBackupPolicy.java create mode 100644 artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationPrimaryPolicy.java create mode 100644 artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ClusterTopologySearch.java create mode 100644 artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/NamedLiveNodeIdLocatorForReplication.java create mode 100644 artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationBackupActivation.java create mode 100644 artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationObserver.java create mode 100644 artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationPrimaryActivation.java create mode 100644 artemis-server/src/test/resources/backup-hapolicy-config.xml create mode 100644 artemis-server/src/test/resources/primary-hapolicy-config.xml create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/PluggableQuorumInfiniteRedeliveryTest.java create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupAuthenticationTest.java create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupSyncJournalTest.java create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumExtraBackupReplicatedFailoverTest.java create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumGroupingFailoverReplicationTest.java create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.java create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyReplicationStopTest.java create mode 100644 
tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumPageCleanupWhileReplicaCatchupTest.java create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedDistributionTest.java rename tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/{ReplicatedLargeMessageFailoverTest.java => quorum/PluggableQuorumReplicatedLargeMessageFailoverTest.java} (63%) rename tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/{ReplicatedLargeMessageWithDelayFailoverTest.java => quorum/PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest.java} (73%) create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedPagingFailoverTest.java create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationOrderTest.java create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java create mode 100644 tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java create mode 100644 tests/smoke-tests/src/main/resources/servers/zkReplicationBackup/broker.xml create mode 100644 tests/smoke-tests/src/main/resources/servers/zkReplicationBackup/management.xml create mode 100644 tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/broker.xml create mode 100644 tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/management.xml create mode 100644 tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/PluggableQuorumSinglePairTest.java create mode 100644 tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java create mode 100644 tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/utils/Jmx.java diff --git a/artemis-commons/src/test/java/org/apache/activemq/artemis/utils/ThreadLeakCheckRule.java b/artemis-commons/src/test/java/org/apache/activemq/artemis/utils/ThreadLeakCheckRule.java index 3d6082bbde..bd7f80866f 100644 --- a/artemis-commons/src/test/java/org/apache/activemq/artemis/utils/ThreadLeakCheckRule.java +++ b/artemis-commons/src/test/java/org/apache/activemq/artemis/utils/ThreadLeakCheckRule.java @@ -270,6 +270,18 @@ public class ThreadLeakCheckRule extends TestWatcher { } else if (threadName.contains("ObjectCleanerThread")) { // Required since upgrade to Netty 4.1.22 maybe because https://github.com/netty/netty/commit/739e70398ccb6b11ffa97c6b5f8d55e455a2165e return true; + } else if (threadName.contains("RMI TCP")) { + return true; + } else if (threadName.contains("RMI Scheduler")) { + return true; + } else if (threadName.contains("RMI RenewClean")) { + return true; + } else if (threadName.contains("Signal Dispatcher")) { + return true; + } else if 
(threadName.contains("ForkJoinPool.commonPool")) { + return true; + } else if (threadName.contains("GC Daemon")) { + return true; } else { for (StackTraceElement element : thread.getStackTrace()) { if (element.getClassName().contains("org.jboss.byteman.agent.TransformListener")) { diff --git a/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/config/ActiveMQDefaultConfiguration.java b/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/config/ActiveMQDefaultConfiguration.java index f5771398a3..41351a2ea3 100644 --- a/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/config/ActiveMQDefaultConfiguration.java +++ b/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/config/ActiveMQDefaultConfiguration.java @@ -264,6 +264,9 @@ public final class ActiveMQDefaultConfiguration { // the directory to store the journal files in private static String DEFAULT_JOURNAL_DIR = "data/journal"; + // the directory to store the data files in + private static String DEFAULT_DATA_DIR = "data"; + // true means that the journal directory will be created private static boolean DEFAULT_CREATE_JOURNAL_DIR = true; @@ -627,6 +630,8 @@ public final class ActiveMQDefaultConfiguration { public static final String DEFAULT_TEMPORARY_QUEUE_NAMESPACE = ""; + private static final String DEFAULT_DISTRIBUTED_PRIMITIVE_MANAGER_CLASS_NAME = "org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager"; + // Number of concurrent workers for a core bridge public static int DEFAULT_BRIDGE_CONCURRENCY = 1; @@ -938,6 +943,13 @@ public final class ActiveMQDefaultConfiguration { return DEFAULT_JOURNAL_DIR; } + /** + * the directory to store the journal files in + */ + public static String getDefaultDataDir() { + return DEFAULT_DATA_DIR; + } + /** * true means that the journal directory will be created */ @@ -1721,6 +1733,10 @@ public final class ActiveMQDefaultConfiguration { return DEFAULT_TEMPORARY_QUEUE_NAMESPACE; } + public static String getDefaultDistributedPrimitiveManagerClassName() { + return DEFAULT_DISTRIBUTED_PRIMITIVE_MANAGER_CLASS_NAME; + } + public static int getDefaultBridgeConcurrency() { return DEFAULT_BRIDGE_CONCURRENCY; } diff --git a/artemis-distribution/pom.xml b/artemis-distribution/pom.xml index eae42a6ca4..443362a98f 100644 --- a/artemis-distribution/pom.xml +++ b/artemis-distribution/pom.xml @@ -231,6 +231,17 @@ ${project.version} javadoc + + + org.apache.activemq + artemis-quorum-api + ${project.version} + + + org.apache.activemq + artemis-quorum-ri + ${project.version} + io.netty netty-all diff --git a/artemis-distribution/src/main/assembly/dep.xml b/artemis-distribution/src/main/assembly/dep.xml index 018266dc87..f4d240fe00 100644 --- a/artemis-distribution/src/main/assembly/dep.xml +++ b/artemis-distribution/src/main/assembly/dep.xml @@ -62,6 +62,9 @@ org.apache.activemq.rest:artemis-rest org.apache.qpid:qpid-jms-client io.micrometer:micrometer-core + + org.apache.activemq:artemis-quorum-api + org.apache.activemq:artemis-quorum-ri jakarta.jms:jakarta.jms-api @@ -97,6 +100,12 @@ com.sun.xml.bind:jaxb-impl jakarta.activation:jakarta.activation-api jakarta.security.auth.message:jakarta.security.auth.message-api + + org.apache.curator:curator-recipes + org.apache.curator:curator-client + org.apache.curator:curator-framework + org.apache.zookeeper:zookeeper + org.apache.zookeeper:zookeeper-jute mvn:org.apache.activemq/activemq-artemis-native/${activemq-artemis-native-version} + mvn:org.apache.activemq/artemis-quorum-api/${pom.version} 
mvn:org.apache.activemq/artemis-server-osgi/${pom.version} diff --git a/artemis-quorum-api/pom.xml b/artemis-quorum-api/pom.xml new file mode 100644 index 0000000000..febbe5e871 --- /dev/null +++ b/artemis-quorum-api/pom.xml @@ -0,0 +1,41 @@ + + + 4.0.0 + + + org.apache.activemq + artemis-pom + 2.18.0-SNAPSHOT + + + artemis-quorum-api + bundle + ActiveMQ Artemis Quorum API + + + ${project.basedir}/.. + + + + + com.google.errorprone + error_prone_core + + + \ No newline at end of file diff --git a/artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/DistributedLock.java b/artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/DistributedLock.java new file mode 100644 index 0000000000..ac83a2c859 --- /dev/null +++ b/artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/DistributedLock.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.quorum; + +import java.util.Objects; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.LockSupport; + +public interface DistributedLock extends AutoCloseable { + + String getLockId(); + + boolean isHeldByCaller() throws UnavailableStateException; + + boolean tryLock() throws UnavailableStateException, InterruptedException; + + default boolean tryLock(long timeout, TimeUnit unit) throws UnavailableStateException, InterruptedException { + // it doesn't make sense to be super fast + final long TARGET_FIRE_PERIOD_NS = TimeUnit.MILLISECONDS.toNanos(250); + if (timeout < 0) { + throw new IllegalArgumentException("timeout cannot be negative"); + } + Objects.requireNonNull(unit); + if (timeout == 0) { + return tryLock(); + } + final Thread currentThread = Thread.currentThread(); + final long timeoutNs = unit.toNanos(timeout); + final long start = System.nanoTime(); + final long deadline = start + timeoutNs; + long expectedNextFireTime = start; + while (!currentThread.isInterrupted()) { + long parkNs = expectedNextFireTime - System.nanoTime(); + while (parkNs > 0) { + LockSupport.parkNanos(parkNs); + if (currentThread.isInterrupted()) { + throw new InterruptedException(); + } + final long now = System.nanoTime(); + parkNs = expectedNextFireTime - now; + } + if (tryLock()) { + return true; + } + final long now = System.nanoTime(); + final long remainingTime = deadline - now; + if (remainingTime <= 0) { + return false; + } + if (remainingTime < TARGET_FIRE_PERIOD_NS) { + expectedNextFireTime = now; + } else { + expectedNextFireTime += TARGET_FIRE_PERIOD_NS; + } + } + throw new InterruptedException(); + } + + void unlock() throws UnavailableStateException; + + void addListener(UnavailableLockListener listener); + + void removeListener(UnavailableLockListener listener); + + @FunctionalInterface + 
interface UnavailableLockListener { + + void onUnavailableLockEvent(); + } + + @Override + void close(); +} diff --git a/artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/DistributedPrimitiveManager.java b/artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/DistributedPrimitiveManager.java new file mode 100644 index 0000000000..049971a881 --- /dev/null +++ b/artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/DistributedPrimitiveManager.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.quorum; + +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +public interface DistributedPrimitiveManager extends AutoCloseable { + + static DistributedPrimitiveManager newInstanceOf(String className, Map properties) throws Exception { + return (DistributedPrimitiveManager) Class.forName(className).getDeclaredConstructor(Map.class).newInstance(properties); + } + + @FunctionalInterface + interface UnavailableManagerListener { + + void onUnavailableManagerEvent(); + } + + void addUnavailableManagerListener(UnavailableManagerListener listener); + + void removeUnavailableManagerListener(UnavailableManagerListener listener); + + boolean start(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException; + + void start() throws InterruptedException, ExecutionException; + + boolean isStarted(); + + void stop(); + + DistributedLock getDistributedLock(String lockId) throws InterruptedException, ExecutionException, TimeoutException; + + MutableLong getMutableLong(String mutableLongId) throws InterruptedException, ExecutionException, TimeoutException; + + @Override + default void close() { + stop(); + } +} \ No newline at end of file diff --git a/artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/MutableLong.java b/artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/MutableLong.java new file mode 100644 index 0000000000..fe4b13bb3c --- /dev/null +++ b/artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/MutableLong.java @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.activemq.artemis.quorum; + +public interface MutableLong extends AutoCloseable { + + String getMutableLongId(); + + long get() throws UnavailableStateException; + + void set(long value) throws UnavailableStateException; + + /** + * This is not meant to be atomic; it's semantically equivalent to: + *
+    *    long oldValue = mutableLong.get();
+    *    if (oldValue != expectedValue) {
+    *       return false;
+    *    }
+    *    mutableLong.set(newValue);
+    *    return true;
+    * 
+ */ + default boolean compareAndSet(long expectedValue, long newValue) throws UnavailableStateException { + final long oldValue = get(); + if (oldValue != expectedValue) { + return false; + } + set(newValue); + return true; + } + + @Override + void close(); +} diff --git a/artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/UnavailableStateException.java b/artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/UnavailableStateException.java new file mode 100644 index 0000000000..2ae88c657f --- /dev/null +++ b/artemis-quorum-api/src/main/java/org/apache/activemq/artemis/quorum/UnavailableStateException.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.quorum; + +public final class UnavailableStateException extends Exception { + + public UnavailableStateException() { + super(); + } + + public UnavailableStateException(String message) { + super(message); + } + + public UnavailableStateException(String message, Throwable cause) { + super(message, cause); + } + + public UnavailableStateException(Throwable cause) { + super(cause); + } +} diff --git a/artemis-quorum-ri/pom.xml b/artemis-quorum-ri/pom.xml new file mode 100644 index 0000000000..83c81678af --- /dev/null +++ b/artemis-quorum-ri/pom.xml @@ -0,0 +1,124 @@ + + + 4.0.0 + + + org.apache.activemq + artemis-pom + 2.18.0-SNAPSHOT + + + artemis-quorum-ri + jar + ActiveMQ Artemis Quorum RI + + + ${project.basedir}/.. 
+ + + + + org.apache.curator + curator-recipes + + + org.apache.curator + curator-client + + + org.apache.zookeeper + zookeeper + + + org.apache.curator + curator-test + ${curator.version} + + + org.apache.activemq + artemis-quorum-api + ${project.version} + + + org.jboss.logging + jboss-logging + + + org.apache.activemq + artemis-commons + ${project.version} + + + com.google.errorprone + error_prone_core + + + + junit + junit + test + + + org.hamcrest + hamcrest + ${hamcrest.version} + test + + + + org.jboss.logging + jboss-logging-processor + provided + true + + + org.jboss.logmanager + jboss-logmanager + test + + + org.wildfly.common + wildfly-common + test + + + org.apache.activemq + artemis-commons + ${project.version} + test + test-jar + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + test + + test-jar + + + + + + + \ No newline at end of file diff --git a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileBasedPrimitiveManager.java b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileBasedPrimitiveManager.java new file mode 100644 index 0000000000..4764b7c614 --- /dev/null +++ b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileBasedPrimitiveManager.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.quorum.file; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import org.apache.activemq.artemis.quorum.DistributedLock; +import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; +import org.apache.activemq.artemis.quorum.MutableLong; + +/** + * This is an implementation suitable to be used just on unit tests and it won't attempt + * to manage nor purge existing stale locks files. It's part of the tests life-cycle to properly + * set-up and tear-down the environment. 
+ */ +public class FileBasedPrimitiveManager implements DistributedPrimitiveManager { + + private final File locksFolder; + private final Map locks; + private boolean started; + + public FileBasedPrimitiveManager(Map args) { + this(new File(args.get("locks-folder"))); + } + + public FileBasedPrimitiveManager(File locksFolder) { + Objects.requireNonNull(locksFolder); + if (!locksFolder.exists()) { + throw new IllegalStateException(locksFolder + " is supposed to already exists"); + } + if (!locksFolder.isDirectory()) { + throw new IllegalStateException(locksFolder + " is supposed to be a directory"); + } + this.locksFolder = locksFolder; + this.locks = new HashMap<>(); + } + + @Override + public boolean isStarted() { + return started; + } + + @Override + public void addUnavailableManagerListener(UnavailableManagerListener listener) { + // noop + } + + @Override + public void removeUnavailableManagerListener(UnavailableManagerListener listener) { + // noop + } + + @Override + public boolean start(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException { + if (timeout >= 0) { + Objects.requireNonNull(unit); + } + if (started) { + return true; + } + started = true; + return true; + } + + @Override + public void start() throws InterruptedException, ExecutionException { + start(-1, null); + } + + @Override + public void stop() { + if (!started) { + return; + } + try { + locks.forEach((lockId, lock) -> { + try { + lock.close(false); + } catch (Throwable t) { + // TODO no op for now: log would be better! + } + }); + locks.clear(); + } finally { + started = false; + } + } + + @Override + public DistributedLock getDistributedLock(String lockId) throws ExecutionException { + Objects.requireNonNull(lockId); + if (!started) { + throw new IllegalStateException("manager should be started first"); + } + final FileDistributedLock lock = locks.get(lockId); + if (lock != null && !lock.isClosed()) { + return lock; + } + try { + final FileDistributedLock newLock = new FileDistributedLock(locks::remove, locksFolder, lockId); + locks.put(lockId, newLock); + return newLock; + } catch (IOException ioEx) { + throw new ExecutionException(ioEx); + } + } + + @Override + public MutableLong getMutableLong(String mutableLongId) throws InterruptedException, ExecutionException, TimeoutException { + // TODO + return null; + } +} diff --git a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileDistributedLock.java b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileDistributedLock.java new file mode 100644 index 0000000000..5e749698d6 --- /dev/null +++ b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileDistributedLock.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.quorum.file; + +import java.io.File; +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.channels.FileLock; +import java.nio.channels.OverlappingFileLockException; +import java.nio.file.StandardOpenOption; +import java.util.function.Consumer; + +import org.apache.activemq.artemis.quorum.DistributedLock; + +final class FileDistributedLock implements DistributedLock { + + private final String lockId; + private final Consumer onClosedLock; + private boolean closed; + private FileLock fileLock; + private final FileChannel channel; + + FileDistributedLock(Consumer onClosedLock, File locksFolder, String lockId) throws IOException { + this.onClosedLock = onClosedLock; + this.lockId = lockId; + this.closed = false; + this.fileLock = null; + this.channel = FileChannel.open(new File(locksFolder, lockId).toPath(), StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE); + } + + private void checkNotClosed() { + if (closed) { + throw new IllegalStateException("This lock is closed"); + } + } + + @Override + public String getLockId() { + checkNotClosed(); + return lockId; + } + + @Override + public boolean isHeldByCaller() { + checkNotClosed(); + final FileLock fileLock = this.fileLock; + if (fileLock == null) { + return false; + } + return fileLock.isValid(); + } + + @Override + public boolean tryLock() { + checkNotClosed(); + final FileLock fileLock = this.fileLock; + if (fileLock != null) { + throw new IllegalStateException("unlock first"); + } + final FileLock lock; + try { + lock = channel.tryLock(); + } catch (OverlappingFileLockException o) { + // this process already hold this lock, but not this manager + return false; + } catch (Throwable t) { + throw new IllegalStateException(t); + } + if (lock == null) { + return false; + } + this.fileLock = lock; + return true; + } + + @Override + public void unlock() { + checkNotClosed(); + final FileLock fileLock = this.fileLock; + if (fileLock != null) { + this.fileLock = null; + try { + fileLock.close(); + } catch (IOException e) { + // noop + } + } + } + + @Override + public void addListener(UnavailableLockListener listener) { + checkNotClosed(); + // noop + } + + @Override + public void removeListener(UnavailableLockListener listener) { + checkNotClosed(); + // noop + } + + public boolean isClosed() { + return closed; + } + + public void close(boolean useCallback) { + if (closed) { + return; + } + try { + if (useCallback) { + onClosedLock.accept(lockId); + } + unlock(); + channel.close(); + } catch (IOException e) { + // ignore it + } finally { + closed = true; + } + } + + @Override + public void close() { + close(true); + } +} diff --git a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedLock.java b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedLock.java new file mode 100644 index 0000000000..206529862b --- /dev/null +++ b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedLock.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.quorum.zookeeper; + +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.UUID; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.TimeUnit; + +import org.apache.activemq.artemis.quorum.DistributedLock; +import org.apache.activemq.artemis.quorum.UnavailableStateException; +import org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager.PrimitiveId; +import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2; +import org.apache.curator.framework.recipes.locks.Lease; + +final class CuratorDistributedLock extends CuratorDistributedPrimitive implements DistributedLock { + + private final InterProcessSemaphoreV2 ipcSem; + private final CopyOnWriteArrayList listeners; + private Lease lease; + private byte[] leaseVersion; + + CuratorDistributedLock(PrimitiveId id, CuratorDistributedPrimitiveManager manager, InterProcessSemaphoreV2 ipcSem) { + super(id, manager); + this.ipcSem = ipcSem; + this.listeners = new CopyOnWriteArrayList<>(); + this.leaseVersion = null; + } + + @Override + protected void handleReconnected() { + super.handleReconnected(); + if (leaseVersion != null) { + assert lease != null; + try { + if (Arrays.equals(lease.getData(), leaseVersion)) { + return; + } + onLost(); + } catch (Exception e) { + onLost(); + } + } + } + + @Override + protected void handleLost() { + super.handleLost(); + lease = null; + leaseVersion = null; + for (UnavailableLockListener listener : listeners) { + listener.onUnavailableLockEvent(); + } + } + + @Override + public String getLockId() { + return getId().id; + } + + @Override + public boolean isHeldByCaller() throws UnavailableStateException { + return run(() -> { + checkUnavailable(); + if (lease == null) { + return false; + } + assert leaseVersion != null; + try { + return Arrays.equals(lease.getData(), leaseVersion); + } catch (Throwable t) { + throw new UnavailableStateException(t); + } + }); + } + + @Override + public boolean tryLock() throws UnavailableStateException, InterruptedException { + return tryRun(() -> { + if (lease != null) { + throw new IllegalStateException("unlock first"); + } + checkUnavailable(); + try { + final byte[] leaseVersion = UUID.randomUUID().toString().getBytes(StandardCharsets.UTF_8); + ipcSem.setNodeData(leaseVersion); + lease = ipcSem.acquire(0, TimeUnit.NANOSECONDS); + if (lease == null) { + ipcSem.setNodeData(null); + return false; + } + this.leaseVersion = leaseVersion; + assert Arrays.equals(lease.getData(), leaseVersion); + return true; + } catch (InterruptedException ie) { + throw ie; + } catch (Throwable e) { + throw new UnavailableStateException(e); + } + }); + } + + @Override + public void unlock() throws UnavailableStateException { + run(() -> { + checkUnavailable(); + final Lease lease = this.lease; + if (lease != null) { + this.lease = null; + this.leaseVersion = null; + try { + 
ipcSem.returnLease(lease); + } catch (Throwable e) { + throw new UnavailableStateException(e); + } + } + return null; + }); + } + + @Override + public void addListener(UnavailableLockListener listener) { + run(() -> { + listeners.add(listener); + fireUnavailableListener(listener::onUnavailableLockEvent); + return null; + }); + } + + @Override + public void removeListener(UnavailableLockListener listener) { + run(() -> { + listeners.remove(listener); + return null; + }); + } + + @Override + protected void handleClosed() { + super.handleClosed(); + listeners.clear(); + final Lease lease = this.lease; + if (lease == null) { + return; + } + this.lease = null; + if (isUnavailable()) { + return; + } + try { + ipcSem.returnLease(lease); + } catch (Throwable t) { + // TODO silent, but debug ;) + } + } +} diff --git a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedPrimitive.java b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedPrimitive.java new file mode 100644 index 0000000000..24fdd88648 --- /dev/null +++ b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedPrimitive.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.activemq.artemis.quorum.zookeeper; + +import org.apache.activemq.artemis.quorum.UnavailableStateException; +import org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager.PrimitiveId; + +import static org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager.PrimitiveType.validatePrimitiveInstance; + +public abstract class CuratorDistributedPrimitive implements AutoCloseable { + + // this is used to prevent deadlocks on close + private final CuratorDistributedPrimitiveManager manager; + private final PrimitiveId id; + + private boolean unavailable; + private boolean closed; + + protected CuratorDistributedPrimitive(PrimitiveId id, CuratorDistributedPrimitiveManager manager) { + this.id = id; + this.manager = manager; + this.closed = false; + this.unavailable = false; + validatePrimitiveInstance(this); + } + + final PrimitiveId getId() { + return id; + } + + final void onReconnected() { + synchronized (manager) { + if (closed || unavailable) { + return; + } + handleReconnected(); + } + } + + protected void handleReconnected() { + + } + + final void onLost() { + synchronized (manager) { + if (closed || unavailable) { + return; + } + unavailable = true; + handleLost(); + } + } + + protected void handleLost() { + + } + + final void onSuspended() { + synchronized (manager) { + if (closed || unavailable) { + return; + } + handleSuspended(); + } + } + + protected void handleSuspended() { + + } + + final void onRemoved() { + close(false); + } + + private void checkNotClosed() { + if (closed) { + throw new IllegalStateException("This lock is closed"); + } + } + + @FunctionalInterface + protected interface PrimitiveAction { + + R call() throws T; + } + + @FunctionalInterface + protected interface InterruptablePrimitiveAction { + + R call() throws InterruptedException, T; + } + + protected final void checkUnavailable() throws UnavailableStateException { + if (unavailable) { + throw new UnavailableStateException(id.type + " with id = " + id.id + " isn't available"); + } + } + + protected final void fireUnavailableListener(Runnable task) { + run(() -> { + if (!unavailable) { + return false; + } + manager.startHandlingEvents(); + try { + task.run(); + } finally { + manager.completeHandlingEvents(); + } + return true; + }); + } + + protected final R run(PrimitiveAction action) throws T { + synchronized (manager) { + manager.checkHandlingEvents(); + checkNotClosed(); + return action.call(); + } + } + + protected final R tryRun(InterruptablePrimitiveAction action) throws InterruptedException, T { + synchronized (manager) { + manager.checkHandlingEvents(); + checkNotClosed(); + return action.call(); + } + } + + private void close(boolean remove) { + synchronized (manager) { + manager.checkHandlingEvents(); + if (closed) { + return; + } + closed = true; + if (remove) { + manager.remove(this); + } + handleClosed(); + } + } + + protected void handleClosed() { + + } + + protected final boolean isUnavailable() { + synchronized (manager) { + return unavailable; + } + } + + @Override + public final void close() { + close(true); + } +} diff --git a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedPrimitiveManager.java b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedPrimitiveManager.java new file mode 100644 index 0000000000..fd316dc9a6 --- /dev/null +++ 
b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedPrimitiveManager.java @@ -0,0 +1,367 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.quorum.zookeeper; + +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.activemq.artemis.quorum.DistributedLock; +import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; +import org.apache.activemq.artemis.quorum.MutableLong; +import org.apache.curator.framework.CuratorFramework; +import org.apache.curator.framework.CuratorFrameworkFactory; +import org.apache.curator.framework.recipes.atomic.DistributedAtomicLong; +import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2; +import org.apache.curator.framework.state.ConnectionState; +import org.apache.curator.framework.state.ConnectionStateListener; +import org.apache.curator.retry.RetryForever; +import org.apache.curator.retry.RetryNTimes; + +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.joining; + +public class CuratorDistributedPrimitiveManager implements DistributedPrimitiveManager, ConnectionStateListener { + + enum PrimitiveType { + lock, mutableLong; + + static T validatePrimitiveInstance(T primitive) { + if (primitive == null) { + return null; + } + boolean valid = false; + switch (primitive.getId().type) { + + case lock: + valid = primitive instanceof CuratorDistributedLock; + break; + case mutableLong: + valid = primitive instanceof CuratorMutableLong; + break; + } + if (!valid) { + throw new AssertionError("Implementation error: " + primitive.getClass() + " is wrongly considered " + primitive.getId().type); + } + return primitive; + } + } + + static final class PrimitiveId { + + final String id; + final PrimitiveType type; + + private PrimitiveId(String id, PrimitiveType type) { + this.id = requireNonNull(id); + this.type = requireNonNull(type); + } + + static PrimitiveId of(String id, PrimitiveType type) { + return new PrimitiveId(id, type); + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + + PrimitiveId that = (PrimitiveId) o; + + if (!Objects.equals(id, that.id)) + return false; + return type == that.type; + } + + @Override + public int hashCode() { + int result = id != null ? id.hashCode() : 0; + result = 31 * result + (type != null ? 
type.hashCode() : 0); + return result; + } + } + + private static final String CONNECT_STRING_PARAM = "connect-string"; + private static final String NAMESPACE_PARAM = "namespace"; + private static final String SESSION_MS_PARAM = "session-ms"; + private static final String SESSION_PERCENT_PARAM = "session-percent"; + private static final String CONNECTION_MS_PARAM = "connection-ms"; + private static final String RETRIES_PARAM = "retries"; + private static final String RETRIES_MS_PARAM = "retries-ms"; + private static final Set VALID_PARAMS = Stream.of( + CONNECT_STRING_PARAM, + NAMESPACE_PARAM, + SESSION_MS_PARAM, + SESSION_PERCENT_PARAM, + CONNECTION_MS_PARAM, + RETRIES_PARAM, + RETRIES_MS_PARAM).collect(Collectors.toSet()); + private static final String VALID_PARAMS_ON_ERROR = VALID_PARAMS.stream().collect(joining(",")); + // It's 9 times the default ZK tick time ie 2000 ms + private static final String DEFAULT_SESSION_TIMEOUT_MS = Integer.toString(18_000); + private static final String DEFAULT_CONNECTION_TIMEOUT_MS = Integer.toString(8_000); + private static final String DEFAULT_RETRIES = Integer.toString(1); + private static final String DEFAULT_RETRIES_MS = Integer.toString(1000); + // why 1/3 of the session? https://cwiki.apache.org/confluence/display/CURATOR/TN14 + private static final String DEFAULT_SESSION_PERCENT = Integer.toString(33); + + private static Map validateParameters(Map config) { + config.forEach((parameterName, ignore) -> validateParameter(parameterName)); + return config; + } + + private static void validateParameter(String parameterName) { + if (!VALID_PARAMS.contains(parameterName)) { + throw new IllegalArgumentException("non existent parameter " + parameterName + ": accepted list is " + VALID_PARAMS_ON_ERROR); + } + } + + private CuratorFramework client; + private final Map primitives; + private CopyOnWriteArrayList listeners; + private boolean unavailable; + private boolean handlingEvents; + private final CuratorFrameworkFactory.Builder curatorBuilder; + + public CuratorDistributedPrimitiveManager(Map config) { + this(validateParameters(config), true); + } + + private CuratorDistributedPrimitiveManager(Map config, boolean ignore) { + this(config.get(CONNECT_STRING_PARAM), + config.get(NAMESPACE_PARAM), + Integer.parseInt(config.getOrDefault(SESSION_MS_PARAM, DEFAULT_SESSION_TIMEOUT_MS)), + Integer.parseInt(config.getOrDefault(SESSION_PERCENT_PARAM, DEFAULT_SESSION_PERCENT)), + Integer.parseInt(config.getOrDefault(CONNECTION_MS_PARAM, DEFAULT_CONNECTION_TIMEOUT_MS)), + Integer.parseInt(config.getOrDefault(RETRIES_PARAM, DEFAULT_RETRIES)), + Integer.parseInt(config.getOrDefault(RETRIES_MS_PARAM, DEFAULT_RETRIES_MS))); + } + + private CuratorDistributedPrimitiveManager(String connectString, + String namespace, + int sessionMs, + int sessionPercent, + int connectionMs, + int retries, + int retriesMs) { + curatorBuilder = CuratorFrameworkFactory.builder() + .connectString(connectString) + .namespace(namespace) + .sessionTimeoutMs(sessionMs) + .connectionTimeoutMs(connectionMs) + .retryPolicy(retries >= 0 ? 
new RetryNTimes(retries, retriesMs) : new RetryForever(retriesMs)) + .simulatedSessionExpirationPercent(sessionPercent); + this.primitives = new HashMap<>(); + this.listeners = null; + this.unavailable = false; + this.handlingEvents = false; + } + + @Override + public synchronized boolean isStarted() { + checkHandlingEvents(); + return client != null; + } + + @Override + public synchronized void addUnavailableManagerListener(UnavailableManagerListener listener) { + checkHandlingEvents(); + if (listeners == null) { + return; + } + listeners.add(listener); + if (unavailable) { + startHandlingEvents(); + try { + listener.onUnavailableManagerEvent(); + } finally { + completeHandlingEvents(); + } + } + } + + @Override + public synchronized void removeUnavailableManagerListener(UnavailableManagerListener listener) { + checkHandlingEvents(); + if (listeners == null) { + return; + } + listeners.remove(listener); + } + + @Override + public synchronized boolean start(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException { + checkHandlingEvents(); + if (timeout >= 0) { + if (timeout > Integer.MAX_VALUE) { + throw new IllegalArgumentException("curator manager won't support too long timeout ie >" + Integer.MAX_VALUE); + } + requireNonNull(unit); + } + if (client != null) { + return true; + } + final CuratorFramework client = curatorBuilder.build(); + try { + client.start(); + if (!client.blockUntilConnected((int) timeout, unit)) { + client.close(); + return false; + } + this.client = client; + this.listeners = new CopyOnWriteArrayList<>(); + client.getConnectionStateListenable().addListener(this); + return true; + } catch (InterruptedException e) { + client.close(); + throw e; + } + } + + @Override + public synchronized void start() throws InterruptedException, ExecutionException { + start(-1, null); + } + + @Override + public synchronized void stop() { + checkHandlingEvents(); + final CuratorFramework client = this.client; + if (client == null) { + return; + } + this.client = null; + unavailable = false; + listeners.clear(); + this.listeners = null; + client.getConnectionStateListenable().removeListener(this); + primitives.forEach((id, primitive) -> { + try { + primitive.onRemoved(); + } catch (Throwable t) { + // TODO log? 
+ } + }); + primitives.clear(); + client.close(); + } + + private synchronized T getPrimitive(PrimitiveId id, + Function primitiveFactory) { + checkHandlingEvents(); + requireNonNull(id); + if (client == null) { + throw new IllegalStateException("manager isn't started yet!"); + } + final CuratorDistributedPrimitive primitive = PrimitiveType.validatePrimitiveInstance(primitives.get(id)); + if (primitive != null) { + return (T) primitive; + } + final T newPrimitive = PrimitiveType.validatePrimitiveInstance(primitiveFactory.apply(id)); + primitives.put(id, newPrimitive); + if (unavailable) { + startHandlingEvents(); + try { + newPrimitive.onLost(); + } finally { + completeHandlingEvents(); + } + } + return newPrimitive; + } + + @Override + public DistributedLock getDistributedLock(String lockId) { + return getPrimitive(PrimitiveId.of(lockId, PrimitiveType.lock), + id -> new CuratorDistributedLock(id, this, + new InterProcessSemaphoreV2(client, "/" + id.id + "/locks", 1))); + } + + @Override + public MutableLong getMutableLong(String mutableLongId) { + return getPrimitive(PrimitiveId.of(mutableLongId, PrimitiveType.mutableLong), + id -> new CuratorMutableLong(id, this, + new DistributedAtomicLong(client, "/" + mutableLongId + "/activation-sequence", new RetryNTimes(0, 0)))); + } + + protected void startHandlingEvents() { + handlingEvents = true; + } + + protected void completeHandlingEvents() { + handlingEvents = false; + } + + protected void checkHandlingEvents() { + if (client == null) { + return; + } + if (handlingEvents) { + throw new IllegalStateException("UnavailableManagerListener isn't supposed to modify the manager or its primitives on event handling!"); + } + } + + @Override + public synchronized void stateChanged(CuratorFramework client, ConnectionState newState) { + if (this.client != client) { + return; + } + if (unavailable) { + return; + } + startHandlingEvents(); + try { + switch (newState) { + case LOST: + unavailable = true; + listeners.forEach(listener -> listener.onUnavailableManagerEvent()); + primitives.forEach((id, primitive) -> primitive.onLost()); + break; + case RECONNECTED: + primitives.forEach((id, primitive) -> primitive.onReconnected()); + break; + case SUSPENDED: + primitives.forEach((id, primitive) -> primitive.onSuspended()); + break; + } + } finally { + completeHandlingEvents(); + } + } + + /** + * Used for testing purposes + */ + public synchronized CuratorFramework getCurator() { + checkHandlingEvents(); + return client; + } + + public synchronized void remove(CuratorDistributedPrimitive primitive) { + checkHandlingEvents(); + primitives.remove(primitive.getId()); + } +} diff --git a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorMutableLong.java b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorMutableLong.java new file mode 100644 index 0000000000..83cceb26f2 --- /dev/null +++ b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorMutableLong.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.quorum.zookeeper; + +import org.apache.activemq.artemis.quorum.MutableLong; +import org.apache.activemq.artemis.quorum.UnavailableStateException; +import org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager.PrimitiveId; +import org.apache.curator.framework.recipes.atomic.AtomicValue; +import org.apache.curator.framework.recipes.atomic.DistributedAtomicLong; + +final class CuratorMutableLong extends CuratorDistributedPrimitive implements MutableLong { + + private final DistributedAtomicLong atomicLong; + + CuratorMutableLong(PrimitiveId id, CuratorDistributedPrimitiveManager manager, DistributedAtomicLong atomicLong) { + super(id, manager); + this.atomicLong = atomicLong; + } + + @Override + public String getMutableLongId() { + return getId().id; + } + + @Override + public long get() throws UnavailableStateException { + return run(() -> { + checkUnavailable(); + try { + AtomicValue atomicValue = atomicLong.get(); + if (!atomicValue.succeeded()) { + throw new UnavailableStateException("cannot query long " + getId()); + } + return atomicValue.postValue(); + } catch (Throwable e) { + throw new UnavailableStateException(e); + } + }); + } + + @Override + public void set(long value) throws UnavailableStateException { + run(() -> { + checkUnavailable(); + try { + atomicLong.forceSet(value); + return null; + } catch (Throwable e) { + throw new UnavailableStateException(e); + } + }); + } +} diff --git a/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/DistributedLockTest.java b/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/DistributedLockTest.java new file mode 100644 index 0000000000..e6c8cab1d7 --- /dev/null +++ b/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/DistributedLockTest.java @@ -0,0 +1,297 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.activemq.artemis.quorum; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.function.Consumer; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; + +public abstract class DistributedLockTest { + + private final ArrayList closeables = new ArrayList<>(); + + @Before + public void setupEnv() throws Throwable { + } + + protected abstract void configureManager(Map config); + + protected abstract String managerClassName(); + + @After + public void tearDownEnv() throws Throwable { + closeables.forEach(closeables -> { + try { + closeables.close(); + } catch (Throwable t) { + // silent here + } + }); + } + + protected DistributedPrimitiveManager createManagedDistributeManager() { + return createManagedDistributeManager(stringStringMap -> { + }); + } + + protected DistributedPrimitiveManager createManagedDistributeManager(Consumer> defaultConfiguration) { + try { + final HashMap config = new HashMap<>(); + configureManager(config); + defaultConfiguration.accept(config); + final DistributedPrimitiveManager manager = DistributedPrimitiveManager.newInstanceOf(managerClassName(), config); + closeables.add(manager); + return manager; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Test + public void managerReturnsSameLockIfNotClosed() throws ExecutionException, InterruptedException, TimeoutException { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + Assert.assertSame(manager.getDistributedLock("a"), manager.getDistributedLock("a")); + } + + @Test + public void managerReturnsDifferentLocksIfClosed() throws ExecutionException, InterruptedException, TimeoutException { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + DistributedLock closedLock = manager.getDistributedLock("a"); + closedLock.close(); + Assert.assertNotSame(closedLock, manager.getDistributedLock("a")); + } + + @Test + public void managerReturnsDifferentLocksOnRestart() throws ExecutionException, InterruptedException, TimeoutException { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + DistributedLock closedLock = manager.getDistributedLock("a"); + manager.stop(); + manager.start(); + Assert.assertNotSame(closedLock, manager.getDistributedLock("a")); + } + + @Test(expected = IllegalStateException.class) + public void managerCannotGetLockIfNotStarted() throws ExecutionException, InterruptedException, TimeoutException { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.getDistributedLock("a"); + } + + @Test(expected = NullPointerException.class) + public void managerCannotGetLockWithNullLockId() throws ExecutionException, InterruptedException, TimeoutException { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + manager.getDistributedLock(null); + } + + @Test + public void closingLockUnlockIt() throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + DistributedPrimitiveManager manager = 
createManagedDistributeManager(); + manager.start(); + DistributedLock closedLock = manager.getDistributedLock("a"); + Assert.assertTrue(closedLock.tryLock()); + closedLock.close(); + Assert.assertTrue(manager.getDistributedLock("a").tryLock()); + } + + @Test + public void managerStopUnlockLocks() throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + Assert.assertTrue(manager.getDistributedLock("a").tryLock()); + Assert.assertTrue(manager.getDistributedLock("b").tryLock()); + manager.stop(); + manager.start(); + Assert.assertFalse(manager.getDistributedLock("a").isHeldByCaller()); + Assert.assertFalse(manager.getDistributedLock("b").isHeldByCaller()); + } + + @Test + public void acquireAndReleaseLock() throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + DistributedLock lock = manager.getDistributedLock("a"); + Assert.assertFalse(lock.isHeldByCaller()); + Assert.assertTrue(lock.tryLock()); + Assert.assertTrue(lock.isHeldByCaller()); + lock.unlock(); + Assert.assertFalse(lock.isHeldByCaller()); + } + + @Test(expected = IllegalStateException.class) + public void cannotAcquireSameLockTwice() throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + DistributedLock lock = manager.getDistributedLock("a"); + Assert.assertTrue(lock.tryLock()); + lock.tryLock(); + } + + @Test + public void heldLockIsVisibleByDifferentManagers() throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + DistributedPrimitiveManager ownerManager = createManagedDistributeManager(); + DistributedPrimitiveManager observerManager = createManagedDistributeManager(); + ownerManager.start(); + observerManager.start(); + Assert.assertTrue(ownerManager.getDistributedLock("a").tryLock()); + Assert.assertTrue(ownerManager.getDistributedLock("a").isHeldByCaller()); + Assert.assertFalse(observerManager.getDistributedLock("a").isHeldByCaller()); + } + + @Test + public void unlockedLockIsVisibleByDifferentManagers() throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + DistributedPrimitiveManager ownerManager = createManagedDistributeManager(); + DistributedPrimitiveManager observerManager = createManagedDistributeManager(); + ownerManager.start(); + observerManager.start(); + Assert.assertTrue(ownerManager.getDistributedLock("a").tryLock()); + ownerManager.getDistributedLock("a").unlock(); + Assert.assertFalse(observerManager.getDistributedLock("a").isHeldByCaller()); + Assert.assertFalse(ownerManager.getDistributedLock("a").isHeldByCaller()); + } + + @Test + public void cannotAcquireSameLockFromDifferentManagers() throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + DistributedPrimitiveManager ownerManager = createManagedDistributeManager(); + DistributedPrimitiveManager notOwnerManager = createManagedDistributeManager(); + ownerManager.start(); + notOwnerManager.start(); + Assert.assertTrue(ownerManager.getDistributedLock("a").tryLock()); + Assert.assertFalse(notOwnerManager.getDistributedLock("a").tryLock()); + } + + @Test + public void cannotUnlockFromNotOwnerManager() throws ExecutionException, 
InterruptedException, TimeoutException, UnavailableStateException { + DistributedPrimitiveManager ownerManager = createManagedDistributeManager(); + DistributedPrimitiveManager notOwnerManager = createManagedDistributeManager(); + ownerManager.start(); + notOwnerManager.start(); + Assert.assertTrue(ownerManager.getDistributedLock("a").tryLock()); + notOwnerManager.getDistributedLock("a").unlock(); + Assert.assertFalse(notOwnerManager.getDistributedLock("a").isHeldByCaller()); + Assert.assertTrue(ownerManager.getDistributedLock("a").isHeldByCaller()); + } + + @Test + public void timedTryLockSucceedWithShortTimeout() throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + DistributedLock backgroundLock = manager.getDistributedLock("a"); + Assert.assertTrue(backgroundLock.tryLock(1, TimeUnit.NANOSECONDS)); + } + + @Test + public void timedTryLockFailAfterTimeout() throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + DistributedPrimitiveManager otherManager = createManagedDistributeManager(); + otherManager.start(); + Assert.assertTrue(otherManager.getDistributedLock("a").tryLock()); + final long start = System.nanoTime(); + final long timeoutSec = 1; + Assert.assertFalse(manager.getDistributedLock("a").tryLock(timeoutSec, TimeUnit.SECONDS)); + final long elapsed = TimeUnit.NANOSECONDS.toSeconds(System.nanoTime() - start); + assertThat(elapsed, greaterThanOrEqualTo(timeoutSec)); + } + + @Test + public void timedTryLockSuccess() throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + DistributedPrimitiveManager otherManager = createManagedDistributeManager(); + otherManager.start(); + Assert.assertTrue(otherManager.getDistributedLock("a").tryLock()); + DistributedLock backgroundLock = manager.getDistributedLock("a"); + CompletableFuture acquired = new CompletableFuture<>(); + CountDownLatch startedTry = new CountDownLatch(1); + Thread tryLockThread = new Thread(() -> { + startedTry.countDown(); + try { + if (!backgroundLock.tryLock(Long.MAX_VALUE, TimeUnit.DAYS)) { + acquired.complete(false); + } else { + acquired.complete(true); + } + } catch (Throwable e) { + acquired.complete(false); + } + }); + tryLockThread.start(); + Assert.assertTrue(startedTry.await(10, TimeUnit.SECONDS)); + otherManager.getDistributedLock("a").unlock(); + Assert.assertTrue(acquired.get(4, TimeUnit.SECONDS)); + } + + @Test + public void interruptStopTimedTryLock() throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + DistributedPrimitiveManager otherManager = createManagedDistributeManager(); + otherManager.start(); + Assert.assertTrue(otherManager.getDistributedLock("a").tryLock()); + DistributedLock backgroundLock = manager.getDistributedLock("a"); + CompletableFuture interrupted = new CompletableFuture<>(); + CountDownLatch startedTry = new CountDownLatch(1); + Thread tryLockThread = new Thread(() -> { + startedTry.countDown(); + try { + backgroundLock.tryLock(Long.MAX_VALUE, TimeUnit.DAYS); + interrupted.complete(false); + } catch (UnavailableStateException e) { + 
interrupted.complete(false); + } catch (InterruptedException e) { + interrupted.complete(true); + } + }); + tryLockThread.start(); + Assert.assertTrue(startedTry.await(10, TimeUnit.SECONDS)); + // let the background lock perform a few tries + TimeUnit.SECONDS.sleep(1); + tryLockThread.interrupt(); + Assert.assertTrue(interrupted.get(4, TimeUnit.SECONDS)); + } + + @Test + public void lockAndMutableLongWithSameIdCanExistsTogether() throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + final String id = "a"; + Assert.assertTrue(manager.getDistributedLock(id).tryLock()); + Assert.assertEquals(0, manager.getMutableLong(id).get()); + manager.getMutableLong(id).set(1); + Assert.assertTrue(manager.getDistributedLock(id).isHeldByCaller()); + Assert.assertEquals(1, manager.getMutableLong(id).get()); + } + +} + diff --git a/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/file/FileDistributedLockTest.java b/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/file/FileDistributedLockTest.java new file mode 100644 index 0000000000..c565c6b411 --- /dev/null +++ b/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/file/FileDistributedLockTest.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.activemq.artemis.quorum.file; + +import java.io.File; +import java.lang.reflect.InvocationTargetException; +import java.util.Collections; +import java.util.Map; + +import org.apache.activemq.artemis.quorum.DistributedLockTest; +import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class FileDistributedLockTest extends DistributedLockTest { + + @Rule + public TemporaryFolder tmpFolder = new TemporaryFolder(); + + private File locksFolder; + + @Before + @Override + public void setupEnv() throws Throwable { + locksFolder = tmpFolder.newFolder("locks-folder"); + super.setupEnv(); + } + + @Override + protected void configureManager(Map config) { + config.put("locks-folder", locksFolder.toString()); + } + + @Override + protected String managerClassName() { + return FileBasedPrimitiveManager.class.getName(); + } + + @Test + public void reflectiveManagerCreation() throws Exception { + DistributedPrimitiveManager.newInstanceOf(managerClassName(), Collections.singletonMap("locks-folder", locksFolder.toString())); + } + + @Test(expected = InvocationTargetException.class) + public void reflectiveManagerCreationFailWithoutLocksFolder() throws Exception { + DistributedPrimitiveManager.newInstanceOf(managerClassName(), Collections.emptyMap()); + } + + @Test(expected = InvocationTargetException.class) + public void reflectiveManagerCreationFailIfLocksFolderIsNotFolder() throws Exception { + DistributedPrimitiveManager.newInstanceOf(managerClassName(), Collections.singletonMap("locks-folder", tmpFolder.newFile().toString())); + } + +} diff --git a/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedLockTest.java b/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedLockTest.java new file mode 100644 index 0000000000..61502ae3e4 --- /dev/null +++ b/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedLockTest.java @@ -0,0 +1,364 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.activemq.artemis.quorum.zookeeper; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.Collectors; + +import com.google.common.base.Predicates; +import org.apache.activemq.artemis.quorum.DistributedLock; +import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; +import org.apache.activemq.artemis.quorum.UnavailableStateException; +import org.apache.activemq.artemis.utils.Wait; +import org.apache.curator.test.InstanceSpec; +import org.apache.curator.test.TestingCluster; + +import org.apache.activemq.artemis.quorum.DistributedLockTest; +import org.apache.curator.test.TestingZooKeeperServer; +import org.junit.Assert; +import org.junit.Assume; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import static java.lang.Boolean.TRUE; +import static org.hamcrest.Matchers.greaterThan; + +@RunWith(value = Parameterized.class) +public class CuratorDistributedLockTest extends DistributedLockTest { + + private static final int BASE_SERVER_PORT = 6666; + private static final int CONNECTION_MS = 2000; + // Beware: the server tick must be small enough that to let the session to be correctly expired + private static final int SESSION_MS = 6000; + private static final int SERVER_TICK_MS = 2000; + private static final int RETRIES_MS = 100; + private static final int RETRIES = 1; + + @Parameterized.Parameter + public int nodes; + @Rule + public TemporaryFolder tmpFolder = new TemporaryFolder(); + private TestingCluster testingServer; + private InstanceSpec[] clusterSpecs; + private String connectString; + + @Parameterized.Parameters(name = "nodes={0}") + public static Iterable getTestParameters() { + return Arrays.asList(new Object[][]{{3}, {5}}); + } + + @Override + public void setupEnv() throws Throwable { + clusterSpecs = new InstanceSpec[nodes]; + for (int i = 0; i < nodes; i++) { + clusterSpecs[i] = new InstanceSpec(tmpFolder.newFolder(), BASE_SERVER_PORT + i, -1, -1, true, -1, SERVER_TICK_MS, -1); + } + testingServer = new TestingCluster(clusterSpecs); + testingServer.start(); + // start waits for quorumPeer!=null but not that it has started... 
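+ // so wait explicitly for the ensemble to elect a leader before handing out the connect string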
+ Wait.waitFor(this::ensembleHasLeader); + connectString = testingServer.getConnectString(); + super.setupEnv(); + } + + @Override + public void tearDownEnv() throws Throwable { + super.tearDownEnv(); + testingServer.close(); + } + + @Override + protected void configureManager(Map config) { + config.put("connect-string", connectString); + config.put("session-ms", Integer.toString(SESSION_MS)); + config.put("connection-ms", Integer.toString(CONNECTION_MS)); + config.put("retries", Integer.toString(RETRIES)); + config.put("retries-ms", Integer.toString(RETRIES_MS)); + } + + @Override + protected String managerClassName() { + return CuratorDistributedPrimitiveManager.class.getName(); + } + + @Test(expected = RuntimeException.class) + public void cannotCreateManagerWithNotValidParameterNames() { + final DistributedPrimitiveManager manager = createManagedDistributeManager(config -> config.put("_", "_")); + } + + @Test + public void canAcquireLocksFromDifferentNamespace() throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + final DistributedPrimitiveManager manager1 = createManagedDistributeManager(config -> config.put("namespace", "1")); + manager1.start(); + final DistributedPrimitiveManager manager2 = createManagedDistributeManager(config -> config.put("namespace", "2")); + manager2.start(); + Assert.assertTrue(manager1.getDistributedLock("a").tryLock()); + Assert.assertTrue(manager2.getDistributedLock("a").tryLock()); + } + + @Test + public void cannotStartManagerWithDisconnectedServer() throws IOException, ExecutionException, InterruptedException { + final DistributedPrimitiveManager manager = createManagedDistributeManager(); + testingServer.close(); + Assert.assertFalse(manager.start(1, TimeUnit.SECONDS)); + } + + @Test(expected = UnavailableStateException.class) + public void cannotAcquireLockWithDisconnectedServer() throws IOException, ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + final DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + final DistributedLock lock = manager.getDistributedLock("a"); + final CountDownLatch notAvailable = new CountDownLatch(1); + final DistributedLock.UnavailableLockListener listener = notAvailable::countDown; + lock.addListener(listener); + testingServer.close(); + Assert.assertTrue(notAvailable.await(30, TimeUnit.SECONDS)); + lock.tryLock(); + } + + @Test(expected = UnavailableStateException.class) + public void cannotTryLockWithDisconnectedServer() throws IOException, ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + final DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + final DistributedLock lock = manager.getDistributedLock("a"); + testingServer.close(); + lock.tryLock(); + } + + @Test(expected = UnavailableStateException.class) + public void cannotCheckLockStatusWithDisconnectedServer() throws IOException, ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + final DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + final DistributedLock lock = manager.getDistributedLock("a"); + Assert.assertFalse(lock.isHeldByCaller()); + Assert.assertTrue(lock.tryLock()); + testingServer.close(); + lock.isHeldByCaller(); + } + + @Test(expected = UnavailableStateException.class) + public void looseLockAfterServerStop() throws ExecutionException, InterruptedException, 
TimeoutException, UnavailableStateException, IOException { + final DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + final DistributedLock lock = manager.getDistributedLock("a"); + Assert.assertTrue(lock.tryLock()); + Assert.assertTrue(lock.isHeldByCaller()); + final CountDownLatch notAvailable = new CountDownLatch(1); + final DistributedLock.UnavailableLockListener listener = notAvailable::countDown; + lock.addListener(listener); + Assert.assertEquals(1, notAvailable.getCount()); + testingServer.close(); + Assert.assertTrue(notAvailable.await(30, TimeUnit.SECONDS)); + lock.isHeldByCaller(); + } + + @Test + public void canAcquireLockOnMajorityRestart() throws Exception { + final DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + final DistributedLock lock = manager.getDistributedLock("a"); + Assert.assertTrue(lock.tryLock()); + Assert.assertTrue(lock.isHeldByCaller()); + final CountDownLatch notAvailable = new CountDownLatch(1); + final DistributedLock.UnavailableLockListener listener = notAvailable::countDown; + lock.addListener(listener); + Assert.assertEquals(1, notAvailable.getCount()); + testingServer.stop(); + notAvailable.await(); + manager.stop(); + restartMajorityNodes(true); + final DistributedPrimitiveManager otherManager = createManagedDistributeManager(); + otherManager.start(); + // await more then the expected value, that depends by how curator session expiration is configured + TimeUnit.MILLISECONDS.sleep(SESSION_MS + SERVER_TICK_MS); + Assert.assertTrue(otherManager.getDistributedLock("a").tryLock()); + } + + @Test + public void cannotStartManagerWithoutQuorum() throws Exception { + Assume.assumeThat(nodes, greaterThan(1)); + DistributedPrimitiveManager manager = createManagedDistributeManager(); + stopMajorityNotLeaderNodes(true); + Assert.assertFalse(manager.start(2, TimeUnit.SECONDS)); + Assert.assertFalse(manager.isStarted()); + } + + @Test(expected = UnavailableStateException.class) + public void cannotAcquireLockWithoutQuorum() throws Exception { + Assume.assumeThat(nodes, greaterThan(1)); + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + stopMajorityNotLeaderNodes(true); + DistributedLock lock = manager.getDistributedLock("a"); + lock.tryLock(); + } + + @Test + public void cannotCheckLockWithoutQuorum() throws Exception { + Assume.assumeThat(nodes, greaterThan(1)); + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + stopMajorityNotLeaderNodes(true); + DistributedLock lock = manager.getDistributedLock("a"); + final boolean held; + try { + held = lock.isHeldByCaller(); + } catch (UnavailableStateException expected) { + return; + } + Assert.assertFalse(held); + } + + @Test + public void canGetLockWithoutQuorum() throws Exception { + Assume.assumeThat(nodes, greaterThan(1)); + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + stopMajorityNotLeaderNodes(true); + DistributedLock lock = manager.getDistributedLock("a"); + Assert.assertNotNull(lock); + } + + @Test + public void notifiedAsUnavailableWhileLoosingQuorum() throws Exception { + Assume.assumeThat(nodes, greaterThan(1)); + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + DistributedLock lock = manager.getDistributedLock("a"); + CountDownLatch unavailable = new CountDownLatch(1); + lock.addListener(unavailable::countDown); + stopMajorityNotLeaderNodes(true); + 
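// the unavailable listener is expected to fire once the ZooKeeper session expires;
+ // SESSION_MS + SERVER_TICK_MS bounds that wait
+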
Assert.assertTrue(unavailable.await(SESSION_MS + SERVER_TICK_MS, TimeUnit.MILLISECONDS)); + } + + @Test + public void beNotifiedOnce() throws Exception { + Assume.assumeThat(nodes, greaterThan(1)); + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + DistributedLock lock = manager.getDistributedLock("a"); + final AtomicInteger unavailableManager = new AtomicInteger(0); + final AtomicInteger unavailableLock = new AtomicInteger(0); + manager.addUnavailableManagerListener(unavailableManager::incrementAndGet); + lock.addListener(unavailableLock::incrementAndGet); + stopMajorityNotLeaderNodes(true); + TimeUnit.MILLISECONDS.sleep(SESSION_MS + SERVER_TICK_MS + CONNECTION_MS); + Assert.assertEquals(1, unavailableLock.get()); + Assert.assertEquals(1, unavailableManager.get()); + } + + @Test + public void beNotifiedOfUnavailabilityWhileBlockedOnTimedLock() throws Exception { + Assume.assumeThat(nodes, greaterThan(1)); + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + DistributedLock lock = manager.getDistributedLock("a"); + final AtomicInteger unavailableManager = new AtomicInteger(0); + final AtomicInteger unavailableLock = new AtomicInteger(0); + manager.addUnavailableManagerListener(unavailableManager::incrementAndGet); + lock.addListener(unavailableLock::incrementAndGet); + final DistributedPrimitiveManager otherManager = createManagedDistributeManager(); + otherManager.start(); + Assert.assertTrue(otherManager.getDistributedLock("a").tryLock()); + final CountDownLatch startedTimedLock = new CountDownLatch(1); + final AtomicReference unavailableTimedLock = new AtomicReference<>(null); + Thread timedLock = new Thread(() -> { + startedTimedLock.countDown(); + try { + lock.tryLock(Long.MAX_VALUE, TimeUnit.DAYS); + unavailableTimedLock.set(false); + } catch (UnavailableStateException e) { + unavailableTimedLock.set(true); + } catch (InterruptedException e) { + unavailableTimedLock.set(false); + } + }); + timedLock.start(); + Assert.assertTrue(startedTimedLock.await(10, TimeUnit.SECONDS)); + TimeUnit.SECONDS.sleep(1); + stopMajorityNotLeaderNodes(true); + TimeUnit.MILLISECONDS.sleep(SESSION_MS + CONNECTION_MS); + Wait.waitFor(() -> unavailableLock.get() > 0, SERVER_TICK_MS); + Assert.assertEquals(1, unavailableManager.get()); + Assert.assertEquals(TRUE, unavailableTimedLock.get()); + } + + @Test + public void beNotifiedOfAlreadyUnavailableManagerAfterAddingListener() throws Exception { + DistributedPrimitiveManager manager = createManagedDistributeManager(); + manager.start(); + final AtomicBoolean unavailable = new AtomicBoolean(false); + DistributedPrimitiveManager.UnavailableManagerListener managerListener = () -> { + unavailable.set(true); + }; + manager.addUnavailableManagerListener(managerListener); + Assert.assertFalse(unavailable.get()); + stopMajorityNotLeaderNodes(true); + Wait.waitFor(unavailable::get); + manager.removeUnavailableManagerListener(managerListener); + final AtomicInteger unavailableOnRegister = new AtomicInteger(); + manager.addUnavailableManagerListener(unavailableOnRegister::incrementAndGet); + Assert.assertEquals(1, unavailableOnRegister.get()); + unavailableOnRegister.set(0); + try (DistributedLock lock = manager.getDistributedLock("a")) { + lock.addListener(unavailableOnRegister::incrementAndGet); + Assert.assertEquals(1, unavailableOnRegister.get()); + } + } + + private boolean ensembleHasLeader() { + return 
testingServer.getServers().stream().filter(CuratorDistributedLockTest::isLeader).count() != 0; + } + + private static boolean isLeader(TestingZooKeeperServer server) { + long leaderId = server.getQuorumPeer().getLeaderId(); + long id = server.getQuorumPeer().getId(); + return id == leaderId; + } + + private void stopMajorityNotLeaderNodes(boolean fromLast) throws Exception { + List followers = testingServer.getServers().stream().filter(Predicates.not(CuratorDistributedLockTest::isLeader)).collect(Collectors.toList()); + final int quorum = (nodes / 2) + 1; + for (int i = 0; i < quorum; i++) { + final int nodeIndex = fromLast ? (followers.size() - 1) - i : i; + followers.get(nodeIndex).stop(); + } + } + + private void restartMajorityNodes(boolean startFromLast) throws Exception { + final int quorum = (nodes / 2) + 1; + for (int i = 0; i < quorum; i++) { + final int nodeIndex = startFromLast ? (nodes - 1) - i : i; + if (!testingServer.restartServer(clusterSpecs[nodeIndex])) { + throw new IllegalStateException("errored while restarting " + clusterSpecs[nodeIndex]); + } + } + } +} diff --git a/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedPrimitiveManagerTest.java b/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedPrimitiveManagerTest.java new file mode 100644 index 0000000000..ecfa7f1906 --- /dev/null +++ b/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/zookeeper/CuratorDistributedPrimitiveManagerTest.java @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.activemq.artemis.quorum.zookeeper; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.function.Consumer; + +import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; +import org.apache.curator.framework.CuratorFramework; +import org.apache.curator.test.InstanceSpec; +import org.apache.curator.test.TestingCluster; +import org.apache.curator.utils.ZKPaths; +import org.apache.zookeeper.KeeperException; +import org.apache.zookeeper.ZooKeeper; +import org.apache.zookeeper.data.Stat; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class CuratorDistributedPrimitiveManagerTest { + + private final ArrayList autoCloseables = new ArrayList<>(); + + private static final int BASE_SERVER_PORT = 6666; + private static final int CONNECTION_MS = 2000; + // Beware: the server tick must be small enough that to let the session to be correctly expired + private static final int SESSION_MS = 6000; + private static final int SERVER_TICK_MS = 2000; + private static final int RETRIES_MS = 100; + private static final int RETRIES = 1; + + public int nodes = 1; + @Rule + public TemporaryFolder tmpFolder = new TemporaryFolder(); + private TestingCluster testingServer; + private String connectString; + + + @Before + public void setupEnv() throws Throwable { + InstanceSpec[] clusterSpecs = new InstanceSpec[nodes]; + for (int i = 0; i < nodes; i++) { + clusterSpecs[i] = new InstanceSpec(tmpFolder.newFolder(), BASE_SERVER_PORT + i, -1, -1, true, -1, SERVER_TICK_MS, -1); + } + testingServer = new TestingCluster(clusterSpecs); + testingServer.start(); + connectString = testingServer.getConnectString(); + } + + @After + public void tearDownEnv() throws Throwable { + autoCloseables.forEach(closeables -> { + try { + closeables.close(); + } catch (Throwable t) { + // silent here + } + }); + testingServer.close(); + } + + protected void configureManager(Map config) { + config.put("connect-string", connectString); + config.put("session-ms", Integer.toString(SESSION_MS)); + config.put("connection-ms", Integer.toString(CONNECTION_MS)); + config.put("retries", Integer.toString(RETRIES)); + config.put("retries-ms", Integer.toString(RETRIES_MS)); + } + + protected DistributedPrimitiveManager createManagedDistributeManager(Consumer> defaultConfiguration) { + try { + final HashMap config = new HashMap<>(); + configureManager(config); + defaultConfiguration.accept(config); + final DistributedPrimitiveManager manager = DistributedPrimitiveManager.newInstanceOf(managerClassName(), config); + autoCloseables.add(manager); + return manager; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + protected String managerClassName() { + return CuratorDistributedPrimitiveManager.class.getName(); + } + + + @Test + public void verifyLayoutInZK() throws Exception { + final DistributedPrimitiveManager manager = createManagedDistributeManager(config -> config.put("namespace", "activemq-artemis")); + manager.start(); + Assert.assertTrue(manager.getDistributedLock("journal-identity-000-111").tryLock()); + + Assert.assertTrue(manager.getMutableLong("journal-identity-000-111").compareAndSet(0, 1)); + + CuratorFramework curatorFramework = ((CuratorDistributedPrimitiveManager)manager).getCurator(); + List entries = new LinkedList<>(); + 
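// dump the whole znode tree: with namespace activemq-artemis and id journal-identity-000-111, the
+ // lock recipe is expected to live under /activemq-artemis/journal-identity-000-111/locks and the
+ // mutable long under /activemq-artemis/journal-identity-000-111/activation-sequence
+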
dumpZK(curatorFramework.getZookeeperClient().getZooKeeper(), "/", entries); + + Assert.assertTrue(entries.get(2).contains("activation-sequence")); + + for (String entry: entries) { + System.err.println("ZK: " + entry); + } + } + + private void dumpZK(ZooKeeper zooKeeper, String path, List entries) throws InterruptedException, KeeperException { + List children = ZKPaths.getSortedChildren(zooKeeper,path); + for (String s: children) { + if (!s.equals("zookeeper")) { + String qualifiedPath = (path.endsWith("/") ? path : path + "/") + s; + Stat stat = new Stat(); + zooKeeper.getData(qualifiedPath, null, stat); + entries.add(qualifiedPath + ", data-len:" + stat.getDataLength() + ", ephemeral: " + (stat.getEphemeralOwner() != 0)); + dumpZK(zooKeeper, qualifiedPath, entries); + } + } + } +} diff --git a/artemis-server/pom.xml b/artemis-server/pom.xml index 7360f488b4..6f2d4d4b5e 100644 --- a/artemis-server/pom.xml +++ b/artemis-server/pom.xml @@ -85,6 +85,11 @@ artemis-core-client ${project.version}
+ + org.apache.activemq + artemis-quorum-api + ${project.version} + org.apache.activemq activemq-artemis-native diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ConfigurationUtils.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ConfigurationUtils.java index a3149473a6..473edcdb5b 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ConfigurationUtils.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ConfigurationUtils.java @@ -22,6 +22,8 @@ import java.util.List; import org.apache.activemq.artemis.api.config.ActiveMQDefaultConfiguration; import org.apache.activemq.artemis.api.core.ActiveMQIllegalStateException; import org.apache.activemq.artemis.api.core.TransportConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ColocatedPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.LiveOnlyPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicaPolicyConfiguration; @@ -31,6 +33,8 @@ import org.apache.activemq.artemis.core.config.ha.SharedStoreSlavePolicyConfigur import org.apache.activemq.artemis.core.server.ActiveMQMessageBundle; import org.apache.activemq.artemis.core.server.ActiveMQServer; import org.apache.activemq.artemis.core.server.ActiveMQServerLogger; +import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationBackupPolicy; +import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationPrimaryPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.BackupPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.ColocatedPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.HAPolicy; @@ -79,6 +83,11 @@ public final class ConfigurationUtils { ReplicaPolicyConfiguration pc = (ReplicaPolicyConfiguration) conf; return new ReplicaPolicy(pc.getClusterName(), pc.getMaxSavedReplicatedJournalsSize(), pc.getGroupName(), pc.isRestartBackup(), pc.isAllowFailBack(), pc.getInitialReplicationSyncTimeout(), getScaleDownPolicy(pc.getScaleDownConfiguration()), server.getNetworkHealthCheck(), pc.getVoteOnReplicationFailure(), pc.getQuorumSize(), pc.getVoteRetries(), pc.getVoteRetryWait(), pc.getQuorumVoteWait(), pc.getRetryReplicationWait()); } + case PRIMARY: + return ReplicationPrimaryPolicy.with((ReplicationPrimaryPolicyConfiguration) conf); + case BACKUP: { + return ReplicationBackupPolicy.with((ReplicationBackupPolicyConfiguration) conf); + } case SHARED_STORE_MASTER: { SharedStoreMasterPolicyConfiguration pc = (SharedStoreMasterPolicyConfiguration) conf; return new SharedStoreMasterPolicy(pc.isFailoverOnServerShutdown(), pc.isWaitForActivation()); diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/HAPolicyConfiguration.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/HAPolicyConfiguration.java index d1b2a260b4..c18f4dc710 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/HAPolicyConfiguration.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/HAPolicyConfiguration.java @@ -26,7 +26,9 @@ public interface HAPolicyConfiguration extends Serializable { REPLICA("Replica"), SHARED_STORE_MASTER("Shared Store Master"), SHARED_STORE_SLAVE("Shared Store Slave"), - COLOCATED("Colocated"); + 
COLOCATED("Colocated"), + PRIMARY("Primary"), + BACKUP("Backup"); private String name; diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/DistributedPrimitiveManagerConfiguration.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/DistributedPrimitiveManagerConfiguration.java new file mode 100644 index 0000000000..0dc7971e92 --- /dev/null +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/DistributedPrimitiveManagerConfiguration.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.core.config.ha; + +import java.io.Serializable; +import java.util.Map; + +public class DistributedPrimitiveManagerConfiguration implements Serializable { + + private final String className; + private final Map properties; + + public DistributedPrimitiveManagerConfiguration(String className, Map properties) { + this.className = className; + this.properties = properties; + } + + public Map getProperties() { + return properties; + } + + public String getClassName() { + return className; + } +} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationBackupPolicyConfiguration.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationBackupPolicyConfiguration.java new file mode 100644 index 0000000000..f1c97eb7ea --- /dev/null +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationBackupPolicyConfiguration.java @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.activemq.artemis.core.config.ha; + +import org.apache.activemq.artemis.api.config.ActiveMQDefaultConfiguration; +import org.apache.activemq.artemis.core.config.HAPolicyConfiguration; + +public class ReplicationBackupPolicyConfiguration implements HAPolicyConfiguration { + + private String clusterName = null; + + private int maxSavedReplicatedJournalsSize = ActiveMQDefaultConfiguration.getDefaultMaxSavedReplicatedJournalsSize(); + + private String groupName = null; + + /* + * used in the replicated policy after failover + * */ + private boolean allowFailBack = false; + + private long initialReplicationSyncTimeout = ActiveMQDefaultConfiguration.getDefaultInitialReplicationSyncTimeout(); + + private int voteRetries = ActiveMQDefaultConfiguration.getDefaultVoteRetries(); + + /** + * TODO: move if into {@link ActiveMQDefaultConfiguration} when the configuration is stable. + */ + private long voteRetryWait = 2000; + + private long retryReplicationWait = ActiveMQDefaultConfiguration.getDefaultRetryReplicationWait(); + + private DistributedPrimitiveManagerConfiguration distributedManagerConfiguration = null; + + public static final ReplicationBackupPolicyConfiguration withDefault() { + return new ReplicationBackupPolicyConfiguration(); + } + + private ReplicationBackupPolicyConfiguration() { + } + + @Override + public HAPolicyConfiguration.TYPE getType() { + return TYPE.BACKUP; + } + + public String getClusterName() { + return clusterName; + } + + public ReplicationBackupPolicyConfiguration setClusterName(String clusterName) { + this.clusterName = clusterName; + return this; + } + + public int getMaxSavedReplicatedJournalsSize() { + return maxSavedReplicatedJournalsSize; + } + + public ReplicationBackupPolicyConfiguration setMaxSavedReplicatedJournalsSize(int maxSavedReplicatedJournalsSize) { + this.maxSavedReplicatedJournalsSize = maxSavedReplicatedJournalsSize; + return this; + } + + public String getGroupName() { + return groupName; + } + + public ReplicationBackupPolicyConfiguration setGroupName(String groupName) { + this.groupName = groupName; + return this; + } + + public boolean isAllowFailBack() { + return allowFailBack; + } + + public ReplicationBackupPolicyConfiguration setAllowFailBack(boolean allowFailBack) { + this.allowFailBack = allowFailBack; + return this; + } + + public long getInitialReplicationSyncTimeout() { + return initialReplicationSyncTimeout; + } + + public ReplicationBackupPolicyConfiguration setInitialReplicationSyncTimeout(long initialReplicationSyncTimeout) { + this.initialReplicationSyncTimeout = initialReplicationSyncTimeout; + return this; + } + + public int getVoteRetries() { + return voteRetries; + } + + public ReplicationBackupPolicyConfiguration setVoteRetries(int voteRetries) { + this.voteRetries = voteRetries; + return this; + } + + public ReplicationBackupPolicyConfiguration setVoteRetryWait(long voteRetryWait) { + this.voteRetryWait = voteRetryWait; + return this; + } + + public long getVoteRetryWait() { + return voteRetryWait; + } + + public long getRetryReplicationWait() { + return retryReplicationWait; + } + + public ReplicationBackupPolicyConfiguration setRetryReplicationWait(long retryReplicationWait) { + this.retryReplicationWait = retryReplicationWait; + return this; + } + + public ReplicationBackupPolicyConfiguration setDistributedManagerConfiguration(DistributedPrimitiveManagerConfiguration configuration) { + this.distributedManagerConfiguration = configuration; + return this; + } + + public 
DistributedPrimitiveManagerConfiguration getDistributedManagerConfiguration() { + return distributedManagerConfiguration; + } +} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationPrimaryPolicyConfiguration.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationPrimaryPolicyConfiguration.java new file mode 100644 index 0000000000..7bb53184dd --- /dev/null +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationPrimaryPolicyConfiguration.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.core.config.ha; + +import org.apache.activemq.artemis.api.config.ActiveMQDefaultConfiguration; +import org.apache.activemq.artemis.core.config.HAPolicyConfiguration; + +public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfiguration { + + private boolean checkForLiveServer = ActiveMQDefaultConfiguration.isDefaultCheckForLiveServer(); + + private String groupName = null; + + private String clusterName = null; + + private long initialReplicationSyncTimeout = ActiveMQDefaultConfiguration.getDefaultInitialReplicationSyncTimeout(); + + private int voteRetries = ActiveMQDefaultConfiguration.getDefaultVoteRetries(); + + /** + * TODO: move if into {@link ActiveMQDefaultConfiguration} when the configuration is stable. 
+ */ + private long voteRetryWait = 2000; + + private Long retryReplicationWait = ActiveMQDefaultConfiguration.getDefaultRetryReplicationWait(); + + private DistributedPrimitiveManagerConfiguration distributedManagerConfiguration = null; + + public static ReplicationPrimaryPolicyConfiguration withDefault() { + return new ReplicationPrimaryPolicyConfiguration(); + } + + private ReplicationPrimaryPolicyConfiguration() { + } + + @Override + public TYPE getType() { + return TYPE.PRIMARY; + } + + public boolean isCheckForLiveServer() { + return checkForLiveServer; + } + + public ReplicationPrimaryPolicyConfiguration setCheckForLiveServer(boolean checkForLiveServer) { + this.checkForLiveServer = checkForLiveServer; + return this; + } + + public String getGroupName() { + return groupName; + } + + public ReplicationPrimaryPolicyConfiguration setGroupName(String groupName) { + this.groupName = groupName; + return this; + } + + public String getClusterName() { + return clusterName; + } + + public ReplicationPrimaryPolicyConfiguration setClusterName(String clusterName) { + this.clusterName = clusterName; + return this; + } + + public long getInitialReplicationSyncTimeout() { + return initialReplicationSyncTimeout; + } + + public ReplicationPrimaryPolicyConfiguration setInitialReplicationSyncTimeout(long initialReplicationSyncTimeout) { + this.initialReplicationSyncTimeout = initialReplicationSyncTimeout; + return this; + } + + public int getVoteRetries() { + return voteRetries; + } + + public ReplicationPrimaryPolicyConfiguration setVoteRetries(int voteRetries) { + this.voteRetries = voteRetries; + return this; + } + + public ReplicationPrimaryPolicyConfiguration setVoteRetryWait(long voteRetryWait) { + this.voteRetryWait = voteRetryWait; + return this; + } + + public long getVoteRetryWait() { + return voteRetryWait; + } + + public void setRetryReplicationWait(Long retryReplicationWait) { + this.retryReplicationWait = retryReplicationWait; + } + + public Long getRetryReplicationWait() { + return retryReplicationWait; + } + + public ReplicationPrimaryPolicyConfiguration setDistributedManagerConfiguration(DistributedPrimitiveManagerConfiguration configuration) { + this.distributedManagerConfiguration = configuration; + return this; + } + + public DistributedPrimitiveManagerConfiguration getDistributedManagerConfiguration() { + return distributedManagerConfiguration; + } +} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java index bc3c0a779e..4932e01439 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java @@ -69,7 +69,10 @@ import org.apache.activemq.artemis.core.config.federation.FederationQueuePolicyC import org.apache.activemq.artemis.core.config.federation.FederationStreamConfiguration; import org.apache.activemq.artemis.core.config.federation.FederationTransformerConfiguration; import org.apache.activemq.artemis.core.config.federation.FederationUpstreamConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ColocatedPolicyConfiguration; +import 
org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; import org.apache.activemq.artemis.core.config.ha.LiveOnlyPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicaPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicatedPolicyConfiguration; @@ -1607,6 +1610,16 @@ public final class FileConfigurationParser extends XMLConfigurationUtil { Element colocatedNode = (Element) colocatedNodeList.item(0); mainConfig.setHAPolicyConfiguration(createColocatedHaPolicy(colocatedNode, true)); }
+ NodeList primaryNodeList = e.getElementsByTagName("primary");
+ if (primaryNodeList.getLength() > 0) {
+ Element primaryNode = (Element) primaryNodeList.item(0);
+ mainConfig.setHAPolicyConfiguration(createReplicationPrimaryHaPolicy(primaryNode, mainConfig));
+ }
+ NodeList backupNodeList = e.getElementsByTagName("backup");
+ if (backupNodeList.getLength() > 0) {
+ Element backupNode = (Element) backupNodeList.item(0);
+ mainConfig.setHAPolicyConfiguration(createReplicationBackupHaPolicy(backupNode, mainConfig));
+ }
} else if (haNode.getTagName().equals("shared-store")) { NodeList masterNodeList = e.getElementsByTagName("master"); if (masterNodeList.getLength() > 0) { @@ -1699,6 +1712,75 @@ public final class FileConfigurationParser extends XMLConfigurationUtil { return configuration; }
+ private ReplicationPrimaryPolicyConfiguration createReplicationPrimaryHaPolicy(Element policyNode, Configuration config) {
+ ReplicationPrimaryPolicyConfiguration configuration = ReplicationPrimaryPolicyConfiguration.withDefault();
+
+ configuration.setCheckForLiveServer(getBoolean(policyNode, "check-for-live-server", configuration.isCheckForLiveServer()));
+
+ configuration.setGroupName(getString(policyNode, "group-name", configuration.getGroupName(), Validators.NO_CHECK));
+
+ configuration.setClusterName(getString(policyNode, "cluster-name", configuration.getClusterName(), Validators.NO_CHECK));
+
+ configuration.setInitialReplicationSyncTimeout(getLong(policyNode, "initial-replication-sync-timeout", configuration.getInitialReplicationSyncTimeout(), Validators.GT_ZERO));
+
+ configuration.setVoteRetries(getInteger(policyNode, "vote-retries", configuration.getVoteRetries(), Validators.MINUS_ONE_OR_GE_ZERO));
+
+ configuration.setVoteRetryWait(getLong(policyNode, "vote-retry-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO));
+
+ configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getRetryReplicationWait(), Validators.GT_ZERO));
+
+ configuration.setDistributedManagerConfiguration(createDistributedPrimitiveManagerConfiguration(policyNode, config));
+
+ return configuration;
+ }
+
+ private ReplicationBackupPolicyConfiguration createReplicationBackupHaPolicy(Element policyNode, Configuration config) {
+
+ ReplicationBackupPolicyConfiguration configuration = ReplicationBackupPolicyConfiguration.withDefault();
+
+ configuration.setGroupName(getString(policyNode, "group-name", configuration.getGroupName(), Validators.NO_CHECK));
+
+ configuration.setAllowFailBack(getBoolean(policyNode, "allow-failback", configuration.isAllowFailBack()));
+
+ configuration.setInitialReplicationSyncTimeout(getLong(policyNode, "initial-replication-sync-timeout", configuration.getInitialReplicationSyncTimeout(), Validators.GT_ZERO));
+
+ configuration.setClusterName(getString(policyNode, "cluster-name", configuration.getClusterName(), Validators.NO_CHECK));
+
+ configuration.setMaxSavedReplicatedJournalsSize(getInteger(policyNode, "max-saved-replicated-journals-size", configuration.getMaxSavedReplicatedJournalsSize(), Validators.MINUS_ONE_OR_GE_ZERO));
+
+ configuration.setVoteRetries(getInteger(policyNode, "vote-retries", configuration.getVoteRetries(), Validators.MINUS_ONE_OR_GE_ZERO));
+
+ configuration.setVoteRetryWait(getLong(policyNode, "vote-retry-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO));
+
+ configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getRetryReplicationWait(), Validators.GT_ZERO));
+
+ configuration.setDistributedManagerConfiguration(createDistributedPrimitiveManagerConfiguration(policyNode, config));
+
+ return configuration;
+ }
+
+ private DistributedPrimitiveManagerConfiguration createDistributedPrimitiveManagerConfiguration(Element policyNode, Configuration config) {
+ final Element managerNode = (Element) policyNode.getElementsByTagName("manager").item(0);
+ final String className = getString(managerNode, "class-name",
+ ActiveMQDefaultConfiguration.getDefaultDistributedPrimitiveManagerClassName(),
+ Validators.NO_CHECK);
+ final Map properties;
+ if (parameterExists(managerNode, "properties")) {
+ final NodeList propertyNodeList = managerNode.getElementsByTagName("property");
+ final int propertiesCount = propertyNodeList.getLength();
+ properties = new HashMap<>(propertiesCount);
+ for (int i = 0; i < propertiesCount; i++) {
+ final Element propertyNode = (Element) propertyNodeList.item(i);
+ final String propertyName = propertyNode.getAttributeNode("key").getValue();
+ final String propertyValue = propertyNode.getAttributeNode("value").getValue();
+ properties.put(propertyName, propertyValue);
+ }
+ } else {
+ properties = new HashMap<>(1);
+ }
+ return new DistributedPrimitiveManagerConfiguration(className, properties);
+ }
+ private SharedStoreMasterPolicyConfiguration createSharedStoreMasterHaPolicy(Element policyNode) { SharedStoreMasterPolicyConfiguration configuration = new SharedStoreMasterPolicyConfiguration(); diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationEndpoint.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationEndpoint.java index b173f3afdc..495b35d281 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationEndpoint.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationEndpoint.java @@ -37,7 +37,6 @@ import org.apache.activemq.artemis.api.core.Interceptor; import org.apache.activemq.artemis.api.core.Message; import org.apache.activemq.artemis.api.core.SimpleString; import org.apache.activemq.artemis.core.config.Configuration; -import org.apache.activemq.artemis.core.io.IOCriticalErrorListener; import org.apache.activemq.artemis.core.io.SequentialFile; import org.apache.activemq.artemis.core.journal.EncoderPersister; import org.apache.activemq.artemis.core.journal.Journal; @@ -82,9 +81,8 @@ import org.apache.activemq.artemis.core.replication.ReplicationManager.ADD_OPERA import org.apache.activemq.artemis.core.server.ActiveMQComponent; import org.apache.activemq.artemis.core.server.ActiveMQMessageBundle; import org.apache.activemq.artemis.core.server.ActiveMQServerLogger; -import org.apache.activemq.artemis.core.server.cluster.qourum.SharedNothingBackupQuorum; + import org.apache.activemq.artemis.core.server.impl.ActiveMQServerImpl; -import
org.apache.activemq.artemis.core.server.impl.SharedNothingBackupActivation; import org.apache.activemq.artemis.utils.actors.OrderedExecutorFactory; import org.jboss.logging.Logger; @@ -94,12 +92,20 @@ import org.jboss.logging.Logger; */ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQComponent { + public interface ReplicationEndpointEventListener { + + void onRemoteBackupUpToDate(); + + void onLiveStopping(ReplicationLiveIsStoppingMessage.LiveStopping message) throws ActiveMQException; + + void onLiveNodeId(String nodeId); + } + private static final Logger logger = Logger.getLogger(ReplicationEndpoint.class); - private final IOCriticalErrorListener criticalErrorListener; private final ActiveMQServerImpl server; private final boolean wantedFailBack; - private final SharedNothingBackupActivation activation; + private final ReplicationEndpointEventListener eventListener; private final boolean noSync = false; private Channel channel; private boolean supportResponseBatching; @@ -129,8 +135,6 @@ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQCompon private boolean deletePages = true; private volatile boolean started; - private SharedNothingBackupQuorum backupQuorum; - private Executor executor; private List outgoingInterceptors = null; @@ -140,13 +144,11 @@ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQCompon // Constructors -------------------------------------------------- public ReplicationEndpoint(final ActiveMQServerImpl server, - IOCriticalErrorListener criticalErrorListener, boolean wantedFailBack, - SharedNothingBackupActivation activation) { + ReplicationEndpointEventListener eventListener) { this.server = server; - this.criticalErrorListener = criticalErrorListener; this.wantedFailBack = wantedFailBack; - this.activation = activation; + this.eventListener = eventListener; this.pendingPackets = new ArrayDeque<>(); this.supportResponseBatching = false; } @@ -287,7 +289,7 @@ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQCompon * @throws ActiveMQException */ private void handleLiveStopping(ReplicationLiveIsStoppingMessage packet) throws ActiveMQException { - activation.remoteFailOver(packet.isFinalMessage()); + eventListener.onLiveStopping(packet.isFinalMessage()); } @Override @@ -474,8 +476,8 @@ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQCompon } journalsHolder = null; - backupQuorum.liveIDSet(liveID); - activation.setRemoteBackupUpToDate(); + eventListener.onLiveNodeId(liveID); + eventListener.onRemoteBackupUpToDate(); if (logger.isTraceEnabled()) { logger.trace("Backup is synchronized / BACKUP-SYNC-DONE"); @@ -597,7 +599,7 @@ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQCompon if (packet.getNodeID() != null) { // At the start of replication, we still do not know which is the nodeID that the live uses. // This is the point where the backup gets this information. - backupQuorum.liveIDSet(packet.getNodeID()); + eventListener.onLiveNodeId(packet.getNodeID()); } break; @@ -900,16 +902,6 @@ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQCompon } } - /** - * Sets the quorumManager used by the server in the replicationEndpoint. It is used to inform the - * backup server of the live's nodeID. 
- * - * @param backupQuorum - */ - public void setBackupQuorum(SharedNothingBackupQuorum backupQuorum) { - this.backupQuorum = backupQuorum; - } - /** * @param executor2 */ diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQServer.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQServer.java index 33d8d1834e..e3249793e6 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQServer.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQServer.java @@ -40,7 +40,6 @@ import org.apache.activemq.artemis.core.persistence.OperationContext; import org.apache.activemq.artemis.core.persistence.StorageManager; import org.apache.activemq.artemis.core.postoffice.PostOffice; import org.apache.activemq.artemis.core.remoting.server.RemotingService; -import org.apache.activemq.artemis.core.replication.ReplicationEndpoint; import org.apache.activemq.artemis.core.replication.ReplicationManager; import org.apache.activemq.artemis.core.security.Role; import org.apache.activemq.artemis.core.security.SecurityAuth; @@ -166,11 +165,6 @@ public interface ActiveMQServer extends ServiceComponent { CriticalAnalyzer getCriticalAnalyzer(); - /** - * @return - */ - ReplicationEndpoint getReplicationEndpoint(); - /** * it will release hold a lock for the activation. */ diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/LiveNodeLocator.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/LiveNodeLocator.java index 56fef7c77c..1515bdc51c 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/LiveNodeLocator.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/LiveNodeLocator.java @@ -21,7 +21,6 @@ import org.apache.activemq.artemis.api.core.Pair; import org.apache.activemq.artemis.api.core.TransportConfiguration; import org.apache.activemq.artemis.api.core.client.ClusterTopologyListener; import org.apache.activemq.artemis.core.client.impl.ServerLocatorInternal; -import org.apache.activemq.artemis.core.server.cluster.qourum.SharedNothingBackupQuorum; /** * A class that will locate a particular live server running in a cluster. How this live is chosen @@ -31,16 +30,23 @@ import org.apache.activemq.artemis.core.server.cluster.qourum.SharedNothingBacku */ public abstract class LiveNodeLocator implements ClusterTopologyListener { - private SharedNothingBackupQuorum backupQuorum; + @FunctionalInterface + public interface BackupRegistrationListener { - public LiveNodeLocator(SharedNothingBackupQuorum backupQuorum) { - this.backupQuorum = backupQuorum; + void onBackupRegistrationFailed(boolean alreadyReplicating); + } + + private final BackupRegistrationListener backupRegistrationListener; + + public LiveNodeLocator(BackupRegistrationListener backupRegistrationListener) { + this.backupRegistrationListener = backupRegistrationListener; } /** * Use this constructor when the LiveNodeLocator is used for scaling down rather than replicating */ public LiveNodeLocator() { + this(null); } /** @@ -67,12 +73,8 @@ public abstract class LiveNodeLocator implements ClusterTopologyListener { * tells the locator the the current connector has failed. 
*/ public void notifyRegistrationFailed(boolean alreadyReplicating) { - if (backupQuorum != null) { - if (alreadyReplicating) { - backupQuorum.notifyAlreadyReplicating(); - } else { - backupQuorum.notifyRegistrationFailed(); - } + if (backupRegistrationListener != null) { + backupRegistrationListener.onBackupRegistrationFailed(alreadyReplicating); } } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/NodeManager.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/NodeManager.java index a6b05e1787..d136402f69 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/NodeManager.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/NodeManager.java @@ -79,6 +79,16 @@ public abstract class NodeManager implements ActiveMQComponent { } } + public long readDataVersion() throws NodeManagerException { + // TODO make it abstract + throw new UnsupportedOperationException("TODO"); + } + + public void writeDataVersion(long version) throws NodeManagerException { + // TODO make it abstract + throw new UnsupportedOperationException("TODO"); + } + public abstract SimpleString readNodeId() throws NodeManagerException; public UUID getUUID() { diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ClusterController.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ClusterController.java index 6ef9f262f4..ec9f153cd9 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ClusterController.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ClusterController.java @@ -80,10 +80,16 @@ public class ClusterController implements ActiveMQComponent { private boolean started; private SimpleString replicatedClusterName; - public ClusterController(ActiveMQServer server, ScheduledExecutorService scheduledExecutor) { + public ClusterController(ActiveMQServer server, + ScheduledExecutorService scheduledExecutor, + boolean useQuorumManager) { this.server = server; executor = server.getExecutorFactory().getExecutor(); - quorumManager = new QuorumManager(scheduledExecutor, this); + quorumManager = useQuorumManager ? new QuorumManager(scheduledExecutor, this) : null; + } + + public ClusterController(ActiveMQServer server, ScheduledExecutorService scheduledExecutor) { + this(server, scheduledExecutor, true); } @Override @@ -108,11 +114,11 @@ public class ClusterController implements ActiveMQComponent { //latch so we know once we are connected replicationClusterConnectedLatch = new CountDownLatch(1); //and add the quorum manager as a topology listener - if (defaultLocator != null) { - defaultLocator.addClusterTopologyListener(quorumManager); - } - if (quorumManager != null) { + if (defaultLocator != null) { + defaultLocator.addClusterTopologyListener(quorumManager); + } + //start the quorum manager quorumManager.start(); } @@ -126,6 +132,26 @@ public class ClusterController implements ActiveMQComponent { } } + /** + * It adds {@code clusterTopologyListener} to {@code defaultLocator}. + */ + public void addClusterTopologyListener(ClusterTopologyListener clusterTopologyListener) { + if (!this.started || defaultLocator == null) { + throw new IllegalStateException("the controller must be started and with a locator initialized"); + } + this.defaultLocator.addClusterTopologyListener(clusterTopologyListener); + } + + /** + * It remove {@code clusterTopologyListener} from {@code defaultLocator}. 
+ */ + public void removeClusterTopologyListener(ClusterTopologyListener clusterTopologyListener) { + if (!this.started || defaultLocator == null) { + throw new IllegalStateException("the controller must be started and with a locator initialized"); + } + this.defaultLocator.removeClusterTopologyListener(clusterTopologyListener); + } + @Override public void stop() throws Exception { if (logger.isDebugEnabled()) { @@ -138,7 +164,9 @@ public class ClusterController implements ActiveMQComponent { serverLocatorInternal.close(); } //stop the quorum manager - quorumManager.stop(); + if (quorumManager != null) { + quorumManager.stop(); + } } @Override @@ -223,6 +251,17 @@ public class ClusterController implements ActiveMQComponent { } } + /** + * add a cluster listener + * + * @param listener + */ + public void removeClusterTopologyListenerForReplication(ClusterTopologyListener listener) { + if (replicationLocator != null) { + replicationLocator.removeClusterTopologyListener(listener); + } + } + /** * add an interceptor * @@ -232,6 +271,15 @@ public class ClusterController implements ActiveMQComponent { replicationLocator.addIncomingInterceptor(interceptor); } + /** + * remove an interceptor + * + * @param interceptor + */ + public void removeIncomingInterceptorForReplication(Interceptor interceptor) { + replicationLocator.removeIncomingInterceptor(interceptor); + } + /** * connect to a specific node in the cluster used for replication * @@ -406,7 +454,11 @@ public class ClusterController implements ActiveMQComponent { logger.debug("there is no acceptor used configured at the CoreProtocolManager " + this); } } else if (packet.getType() == PacketImpl.QUORUM_VOTE) { - quorumManager.handleQuorumVote(clusterChannel, packet); + if (quorumManager != null) { + quorumManager.handleQuorumVote(clusterChannel, packet); + } else { + logger.warnf("Received %s on a cluster connection that's using the new quorum vote algorithm.", packet); + } } else if (packet.getType() == PacketImpl.SCALEDOWN_ANNOUNCEMENT) { ScaleDownAnnounceMessage message = (ScaleDownAnnounceMessage) packet; //we don't really need to check as it should always be true diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ClusterManager.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ClusterManager.java index 6950120a37..44a81e4b65 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ClusterManager.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ClusterManager.java @@ -157,7 +157,7 @@ public class ClusterManager implements ActiveMQComponent { final ManagementService managementService, final Configuration configuration, final NodeManager nodeManager, - final boolean backup) { + final boolean useQuorumManager) { this.executorFactory = executorFactory; executor = executorFactory.getExecutor(); @@ -174,7 +174,7 @@ public class ClusterManager implements ActiveMQComponent { this.nodeManager = nodeManager; - clusterController = new ClusterController(server, scheduledExecutor); + clusterController = new ClusterController(server, scheduledExecutor, useQuorumManager); haManager = server.getActivation().getHAManager(); } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/HAPolicy.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/HAPolicy.java index c5d62ac378..34c8412767 100644 --- 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/HAPolicy.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/HAPolicy.java @@ -57,4 +57,8 @@ public interface HAPolicy { String getScaleDownClustername(); + default boolean useQuorumManager() { + return true; + } + } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationBackupPolicy.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationBackupPolicy.java new file mode 100644 index 0000000000..5427360fcf --- /dev/null +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationBackupPolicy.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.core.server.cluster.ha; + +import java.util.Map; +import java.util.Objects; + +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; +import org.apache.activemq.artemis.core.io.IOCriticalErrorListener; +import org.apache.activemq.artemis.core.server.impl.ActiveMQServerImpl; +import org.apache.activemq.artemis.core.server.impl.ReplicationBackupActivation; +import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; + +public class ReplicationBackupPolicy implements HAPolicy { + + private final ReplicationPrimaryPolicy livePolicy; + private final String groupName; + private final String clusterName; + private final int maxSavedReplicatedJournalsSize; + private final int voteRetries; + private final long voteRetryWait; + private final long retryReplicationWait; + private final DistributedPrimitiveManagerConfiguration distributedManagerConfiguration; + private final boolean tryFailback; + + private ReplicationBackupPolicy(ReplicationBackupPolicyConfiguration configuration, + ReplicationPrimaryPolicy livePolicy) { + Objects.requireNonNull(livePolicy); + this.clusterName = configuration.getClusterName(); + this.maxSavedReplicatedJournalsSize = configuration.getMaxSavedReplicatedJournalsSize(); + this.groupName = configuration.getGroupName(); + this.voteRetries = configuration.getVoteRetries(); + this.voteRetryWait = configuration.getVoteRetryWait(); + this.retryReplicationWait = configuration.getRetryReplicationWait(); + this.distributedManagerConfiguration = configuration.getDistributedManagerConfiguration(); + this.tryFailback = true; + this.livePolicy = livePolicy; + } + + private ReplicationBackupPolicy(ReplicationBackupPolicyConfiguration configuration) { + this.clusterName = configuration.getClusterName(); + this.maxSavedReplicatedJournalsSize = 
configuration.getMaxSavedReplicatedJournalsSize(); + this.groupName = configuration.getGroupName(); + this.voteRetries = configuration.getVoteRetries(); + this.voteRetryWait = configuration.getVoteRetryWait(); + this.retryReplicationWait = configuration.getRetryReplicationWait(); + this.distributedManagerConfiguration = configuration.getDistributedManagerConfiguration(); + this.tryFailback = false; + livePolicy = ReplicationPrimaryPolicy.failoverPolicy( + configuration.getInitialReplicationSyncTimeout(), + configuration.getGroupName(), + configuration.getClusterName(), + this, + configuration.isAllowFailBack(), + configuration.getDistributedManagerConfiguration()); + } + + public boolean isTryFailback() { + return tryFailback; + } + + /** + * It creates a policy which live policy won't cause to broker to try failback. + */ + public static ReplicationBackupPolicy with(ReplicationBackupPolicyConfiguration configuration) { + return new ReplicationBackupPolicy(configuration); + } + + /** + * It creates a companion backup policy for a natural-born primary: it would cause the broker to try failback. + */ + static ReplicationBackupPolicy failback(int voteRetries, + long voteRetryWait, + long retryReplicationWait, + String clusterName, + String groupName, + ReplicationPrimaryPolicy livePolicy, + DistributedPrimitiveManagerConfiguration distributedManagerConfiguration) { + return new ReplicationBackupPolicy(ReplicationBackupPolicyConfiguration.withDefault() + .setVoteRetries(voteRetries) + .setVoteRetryWait(voteRetryWait) + .setRetryReplicationWait(retryReplicationWait) + .setClusterName(clusterName) + .setGroupName(groupName) + .setDistributedManagerConfiguration(distributedManagerConfiguration), + livePolicy); + } + + @Override + public ReplicationBackupActivation createActivation(ActiveMQServerImpl server, + boolean wasLive, + Map activationParams, + IOCriticalErrorListener shutdownOnCriticalIO) throws Exception { + return new ReplicationBackupActivation(server, wasLive, DistributedPrimitiveManager.newInstanceOf( + distributedManagerConfiguration.getClassName(), + distributedManagerConfiguration.getProperties()), this); + } + + @Override + public boolean isSharedStore() { + return false; + } + + @Override + public boolean isBackup() { + return true; + } + + @Override + public boolean canScaleDown() { + return false; + } + + @Override + public String getScaleDownGroupName() { + return null; + } + + @Override + public String getScaleDownClustername() { + return null; + } + + public String getClusterName() { + return clusterName; + } + + @Override + public String getBackupGroupName() { + return groupName; + } + + public String getGroupName() { + return groupName; + } + + public ReplicationPrimaryPolicy getLivePolicy() { + return livePolicy; + } + + public int getMaxSavedReplicatedJournalsSize() { + return maxSavedReplicatedJournalsSize; + } + + public int getVoteRetries() { + return voteRetries; + } + + public long getVoteRetryWait() { + return voteRetryWait; + } + + public long getRetryReplicationWait() { + return retryReplicationWait; + } + + @Override + public boolean useQuorumManager() { + return false; + } +} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationPrimaryPolicy.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationPrimaryPolicy.java new file mode 100644 index 0000000000..fe9b99358e --- /dev/null +++ 
b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationPrimaryPolicy.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.core.server.cluster.ha; + +import java.util.Map; +import java.util.Objects; + +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; +import org.apache.activemq.artemis.core.io.IOCriticalErrorListener; +import org.apache.activemq.artemis.core.server.impl.ActiveMQServerImpl; +import org.apache.activemq.artemis.core.server.impl.ReplicationPrimaryActivation; +import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; + +public class ReplicationPrimaryPolicy implements HAPolicy { + + private final ReplicationBackupPolicy backupPolicy; + private final String clusterName; + private final String groupName; + private final boolean checkForLiveServer; + private final long initialReplicationSyncTimeout; + private final DistributedPrimitiveManagerConfiguration distributedManagerConfiguration; + private final boolean allowAutoFailBack; + + private ReplicationPrimaryPolicy(ReplicationPrimaryPolicyConfiguration configuration, + ReplicationBackupPolicy backupPolicy, + boolean allowAutoFailBack) { + Objects.requireNonNull(backupPolicy); + clusterName = configuration.getClusterName(); + groupName = configuration.getGroupName(); + checkForLiveServer = configuration.isCheckForLiveServer(); + initialReplicationSyncTimeout = configuration.getInitialReplicationSyncTimeout(); + distributedManagerConfiguration = configuration.getDistributedManagerConfiguration(); + this.allowAutoFailBack = allowAutoFailBack; + this.backupPolicy = backupPolicy; + } + + private ReplicationPrimaryPolicy(ReplicationPrimaryPolicyConfiguration config) { + clusterName = config.getClusterName(); + groupName = config.getGroupName(); + checkForLiveServer = config.isCheckForLiveServer(); + initialReplicationSyncTimeout = config.getInitialReplicationSyncTimeout(); + distributedManagerConfiguration = config.getDistributedManagerConfiguration(); + this.allowAutoFailBack = false; + backupPolicy = ReplicationBackupPolicy.failback(config.getVoteRetries(), config.getVoteRetryWait(), + config.getRetryReplicationWait(), config.getClusterName(), + config.getGroupName(), this, + config.getDistributedManagerConfiguration()); + } + + /** + * It creates a companion failing-over primary policy for a natural-born backup: it's allowed to allow auto fail-back + * only if configured to do it. 
+ */ + static ReplicationPrimaryPolicy failoverPolicy(long initialReplicationSyncTimeout, + String groupName, + String clusterName, + ReplicationBackupPolicy replicaPolicy, + boolean allowAutoFailback, + DistributedPrimitiveManagerConfiguration distributedManagerConfiguration) { + return new ReplicationPrimaryPolicy(ReplicationPrimaryPolicyConfiguration.withDefault() + .setCheckForLiveServer(false) + .setInitialReplicationSyncTimeout(initialReplicationSyncTimeout) + .setGroupName(groupName) + .setClusterName(clusterName) + .setDistributedManagerConfiguration(distributedManagerConfiguration), + replicaPolicy, allowAutoFailback); + } + + /** + * It creates a primary policy that never allows auto fail-back.
+ * It's meant to be used for natural-born primary brokers: its backup policy is set to always try to fail-back. + */ + public static ReplicationPrimaryPolicy with(ReplicationPrimaryPolicyConfiguration configuration) { + return new ReplicationPrimaryPolicy(configuration); + } + + public ReplicationBackupPolicy getBackupPolicy() { + return backupPolicy; + } + + @Override + public ReplicationPrimaryActivation createActivation(ActiveMQServerImpl server, + boolean wasLive, + Map activationParams, + IOCriticalErrorListener shutdownOnCriticalIO) throws Exception { + return new ReplicationPrimaryActivation(server, + DistributedPrimitiveManager.newInstanceOf( + distributedManagerConfiguration.getClassName(), + distributedManagerConfiguration.getProperties()), this); + } + + @Override + public boolean isSharedStore() { + return false; + } + + @Override + public boolean isBackup() { + return false; + } + + @Override + public boolean isWaitForActivation() { + return true; + } + + @Override + public boolean canScaleDown() { + return false; + } + + @Override + public String getBackupGroupName() { + return groupName; + } + + @Override + public String getScaleDownGroupName() { + return null; + } + + @Override + public String getScaleDownClustername() { + return null; + } + + public boolean isCheckForLiveServer() { + return checkForLiveServer; + } + + public boolean isAllowAutoFailBack() { + return allowAutoFailBack; + } + + public String getClusterName() { + return clusterName; + } + + public long getInitialReplicationSyncTimeout() { + return initialReplicationSyncTimeout; + } + + public String getGroupName() { + return groupName; + } + + @Override + public boolean useQuorumManager() { + return false; + } +} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/SharedNothingBackupQuorum.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/SharedNothingBackupQuorum.java index 2e4b0f7523..82b0a3f4ba 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/SharedNothingBackupQuorum.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/qourum/SharedNothingBackupQuorum.java @@ -28,11 +28,12 @@ import org.apache.activemq.artemis.core.client.impl.Topology; import org.apache.activemq.artemis.core.protocol.core.CoreRemotingConnection; import org.apache.activemq.artemis.core.protocol.core.impl.wireformat.ReplicationLiveIsStoppingMessage; import org.apache.activemq.artemis.core.server.ActiveMQServerLogger; +import org.apache.activemq.artemis.core.server.LiveNodeLocator.BackupRegistrationListener; import org.apache.activemq.artemis.core.server.NetworkHealthCheck; import org.apache.activemq.artemis.core.server.NodeManager; import org.jboss.logging.Logger; -public class SharedNothingBackupQuorum implements Quorum, SessionFailureListener { +public class SharedNothingBackupQuorum implements Quorum, SessionFailureListener, BackupRegistrationListener { private static final Logger LOGGER = Logger.getLogger(SharedNothingBackupQuorum.class); @@ -236,13 +237,9 @@ public class SharedNothingBackupQuorum implements Quorum, SessionFailureListener } } - public void notifyRegistrationFailed() { - signal = BACKUP_ACTIVATION.FAILURE_REPLICATING; - latch.countDown(); - } - - public void notifyAlreadyReplicating() { - signal = BACKUP_ACTIVATION.ALREADY_REPLICATING; + @Override + public void onBackupRegistrationFailed(boolean alreadyReplicating) { + signal = alreadyReplicating ? 
BACKUP_ACTIVATION.ALREADY_REPLICATING : BACKUP_ACTIVATION.FAILURE_REPLICATING; latch.countDown(); } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/files/FileMoveManager.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/files/FileMoveManager.java index f29e4a1741..5e2c1cb441 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/files/FileMoveManager.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/files/FileMoveManager.java @@ -36,6 +36,7 @@ public class FileMoveManager { private static final Logger logger = Logger.getLogger(FileMoveManager.class); private final File folder; + private final String[] prefixesToPreserve; private int maxFolders; public static final String PREFIX = "oldreplica."; @@ -70,9 +71,10 @@ public class FileMoveManager { this(folder, -1); } - public FileMoveManager(File folder, int maxFolders) { + public FileMoveManager(File folder, int maxFolders, String... prefixesToPreserve) { this.folder = folder; this.maxFolders = maxFolders; + this.prefixesToPreserve = prefixesToPreserve != null ? Arrays.copyOf(prefixesToPreserve, prefixesToPreserve.length) : null; } public int getMaxFolders() { @@ -99,8 +101,23 @@ public class FileMoveManager { ActiveMQServerLogger.LOGGER.backupDeletingData(folder.getPath()); for (String fileMove : files) { File fileFrom = new File(folder, fileMove); - logger.tracef("deleting %s", fileFrom); - deleteTree(fileFrom); + if (prefixesToPreserve != null) { + boolean skip = false; + for (String prefixToPreserve : prefixesToPreserve) { + if (fileMove.startsWith(prefixToPreserve)) { + logger.tracef("skipping %s", fileFrom); + skip = true; + break; + } + } + if (!skip) { + logger.tracef("deleting %s", fileFrom); + deleteTree(fileFrom); + } + } else { + logger.tracef("deleting %s", fileFrom); + deleteTree(fileFrom); + } } } else { // Since we will create one folder, we are already taking that one into consideration @@ -113,8 +130,26 @@ public class FileMoveManager { for (String fileMove : files) { File fileFrom = new File(folder, fileMove); File fileTo = new File(folderTo, fileMove); - logger.tracef("doMove:: moving %s as %s", fileFrom, fileTo); - Files.move(fileFrom.toPath(), fileTo.toPath()); + if (prefixesToPreserve != null) { + boolean copy = false; + for (String prefixToPreserve : prefixesToPreserve) { + if (fileMove.startsWith(prefixToPreserve)) { + logger.tracef("skipping %s", fileFrom); + copy = true; + break; + } + } + if (copy) { + logger.tracef("copying %s to %s", fileFrom, fileTo); + Files.copy(fileFrom.toPath(), fileTo.toPath()); + } else { + logger.tracef("doMove:: moving %s as %s", fileFrom, fileTo); + Files.move(fileFrom.toPath(), fileTo.toPath()); + } + } else { + logger.tracef("doMove:: moving %s as %s", fileFrom, fileTo); + Files.move(fileFrom.toPath(), fileTo.toPath()); + } } } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/Activation.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/Activation.java index 0eab1ba30f..a055036ef8 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/Activation.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/Activation.java @@ -110,4 +110,8 @@ public abstract class Activation implements Runnable { public ReplicationManager getReplicationManager() { return null; } + + public boolean isReplicaSync() { + return false; + } } diff --git 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ActiveMQServerImpl.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ActiveMQServerImpl.java index 740c3f7b8d..cfd929199f 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ActiveMQServerImpl.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ActiveMQServerImpl.java @@ -109,7 +109,6 @@ import org.apache.activemq.artemis.core.postoffice.impl.LocalQueueBinding; import org.apache.activemq.artemis.core.postoffice.impl.PostOfficeImpl; import org.apache.activemq.artemis.core.remoting.server.RemotingService; import org.apache.activemq.artemis.core.remoting.server.impl.RemotingServiceImpl; -import org.apache.activemq.artemis.core.replication.ReplicationEndpoint; import org.apache.activemq.artemis.core.replication.ReplicationManager; import org.apache.activemq.artemis.core.security.CheckType; import org.apache.activemq.artemis.core.security.Role; @@ -797,14 +796,6 @@ public class ActiveMQServerImpl implements ActiveMQServer { } } - @Override - public ReplicationEndpoint getReplicationEndpoint() { - if (activation instanceof SharedNothingBackupActivation) { - return ((SharedNothingBackupActivation) activation).getReplicationEndpoint(); - } - return null; - } - @Override public void unlockActivation() { activationLock.release(); @@ -921,7 +912,7 @@ public class ActiveMQServerImpl implements ActiveMQServer { return threadPool; } - public void setActivation(SharedNothingLiveActivation activation) { + public void setActivation(Activation activation) { this.activation = activation; } @@ -1145,19 +1136,7 @@ public class ActiveMQServerImpl implements ActiveMQServer { @Override public boolean isReplicaSync() { - if (activation instanceof SharedNothingLiveActivation) { - ReplicationManager replicationManager = getReplicationManager(); - - if (replicationManager == null) { - return false; - } else { - return !replicationManager.isSynchronizing(); - } - } else if (activation instanceof SharedNothingBackupActivation) { - return ((SharedNothingBackupActivation) activation).isRemoteBackupUpToDate(); - } else { - return false; - } + return activation.isReplicaSync(); } public void stop(boolean failoverOnServerShutdown, final boolean criticalIOError, boolean restarting) { @@ -3116,7 +3095,7 @@ public class ActiveMQServerImpl implements ActiveMQServer { postOffice = new PostOfficeImpl(this, storageManager, pagingManager, queueFactory, managementService, configuration.getMessageExpiryScanPeriod(), configuration.getAddressQueueScanPeriod(), configuration.getWildcardConfiguration(), configuration.getIDCacheSize(), configuration.isPersistIDCache(), addressSettingsRepository); // This can't be created until node id is set - clusterManager = new ClusterManager(executorFactory, this, postOffice, scheduledPool, managementService, configuration, nodeManager, haPolicy.isBackup()); + clusterManager = new ClusterManager(executorFactory, this, postOffice, scheduledPool, managementService, configuration, nodeManager, haPolicy.useQuorumManager()); federationManager = new FederationManager(this); @@ -4191,10 +4170,16 @@ public class ActiveMQServerImpl implements ActiveMQServer { * move any older data away and log a warning about it. 
*/ void moveServerData(int maxSavedReplicated) throws IOException { + moveServerData(maxSavedReplicated, false); + } + + void moveServerData(int maxSavedReplicated, boolean preserveLockFiles) throws IOException { File[] dataDirs = new File[]{configuration.getBindingsLocation(), configuration.getJournalLocation(), configuration.getPagingLocation(), configuration.getLargeMessagesLocation()}; for (File data : dataDirs) { - FileMoveManager moveManager = new FileMoveManager(data, maxSavedReplicated); + final boolean isLockFolder = preserveLockFiles ? data.equals(configuration.getNodeManagerLockLocation()) : false; + final String[] lockPrefixes = isLockFolder ? new String[]{FileBasedNodeManager.SERVER_LOCK_NAME, "serverlock"} : null; + FileMoveManager moveManager = new FileMoveManager(data, maxSavedReplicated, lockPrefixes); moveManager.doMove(); } } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/AnyLiveNodeLocatorForReplication.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/AnyLiveNodeLocatorForReplication.java index 015339aafe..de4b409942 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/AnyLiveNodeLocatorForReplication.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/AnyLiveNodeLocatorForReplication.java @@ -29,7 +29,6 @@ import org.apache.activemq.artemis.api.core.Pair; import org.apache.activemq.artemis.api.core.TransportConfiguration; import org.apache.activemq.artemis.api.core.client.TopologyMember; import org.apache.activemq.artemis.core.server.LiveNodeLocator; -import org.apache.activemq.artemis.core.server.cluster.qourum.SharedNothingBackupQuorum; import org.apache.activemq.artemis.utils.ConcurrentUtil; /** @@ -47,8 +46,9 @@ public class AnyLiveNodeLocatorForReplication extends LiveNodeLocator { private String nodeID; - public AnyLiveNodeLocatorForReplication(SharedNothingBackupQuorum backupQuorum, ActiveMQServerImpl server, long retryReplicationWait) { - super(backupQuorum); + public AnyLiveNodeLocatorForReplication(BackupRegistrationListener backupRegistrationListener, + ActiveMQServerImpl server, long retryReplicationWait) { + super(backupRegistrationListener); this.server = server; this.retryReplicationWait = retryReplicationWait; } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ClusterTopologySearch.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ClusterTopologySearch.java new file mode 100644 index 0000000000..a8ceef42b1 --- /dev/null +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ClusterTopologySearch.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.activemq.artemis.core.server.impl; + +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; + +import org.apache.activemq.artemis.api.core.ActiveMQException; +import org.apache.activemq.artemis.api.core.ActiveMQExceptionType; +import org.apache.activemq.artemis.api.core.DiscoveryGroupConfiguration; +import org.apache.activemq.artemis.api.core.TransportConfiguration; +import org.apache.activemq.artemis.api.core.client.ActiveMQClient; +import org.apache.activemq.artemis.api.core.client.ClientSession; +import org.apache.activemq.artemis.api.core.client.ClientSessionFactory; +import org.apache.activemq.artemis.api.core.client.ClusterTopologyListener; +import org.apache.activemq.artemis.api.core.client.ServerLocator; +import org.apache.activemq.artemis.api.core.client.TopologyMember; +import org.apache.activemq.artemis.core.client.impl.ClientSessionFactoryInternal; +import org.apache.activemq.artemis.core.client.impl.ServerLocatorInternal; +import org.apache.activemq.artemis.core.config.ClusterConnectionConfiguration; +import org.apache.activemq.artemis.core.config.Configuration; +import org.apache.activemq.artemis.core.config.ConfigurationUtils; +import org.apache.activemq.artemis.core.server.ActiveMQMessageBundle; +import org.apache.activemq.artemis.core.server.ActiveMQServerLogger; +import org.jboss.logging.Logger; + +/** + * This class contains some utils to allow a broker to check presence and role of another broker in the cluster. + */ +final class ClusterTopologySearch { + + private ClusterTopologySearch() { + + } + + /** + * Determines whether there is a live server already running with nodeID.
+ * This search isn't filtering the caller broker transport and is meant to be used + * when the broker acceptors aren't running yet. + */ + public static boolean searchActiveLiveNodeId(String clusterName, + String nodeId, + long timeout, + TimeUnit unit, + Configuration serverConfiguration) throws ActiveMQException { + if (serverConfiguration.getClusterConfigurations().isEmpty()) + return false; + final ClusterConnectionConfiguration clusterConnectionConfiguration = ConfigurationUtils.getReplicationClusterConfiguration(serverConfiguration, clusterName); + + final LiveNodeIdListener liveNodeIdListener = new LiveNodeIdListener(nodeId, serverConfiguration.getClusterUser(), serverConfiguration.getClusterPassword()); + + try (ServerLocatorInternal locator = createLocator(serverConfiguration, clusterConnectionConfiguration)) { + // if would like to filter out a transport configuration: + // locator.setClusterTransportConfiguration(callerBrokerTransportConfiguration) + locator.addClusterTopologyListener(liveNodeIdListener); + locator.setReconnectAttempts(0); + try (ClientSessionFactoryInternal ignored = locator.connectNoWarnings()) { + return liveNodeIdListener.awaitNodePresent(timeout, unit); + } catch (Exception notConnected) { + if (!(notConnected instanceof ActiveMQException) || ActiveMQExceptionType.INTERNAL_ERROR.equals(((ActiveMQException) notConnected).getType())) { + // report all exceptions that aren't ActiveMQException and all INTERNAL_ERRORs + ActiveMQServerLogger.LOGGER.failedConnectingToCluster(notConnected); + } + return false; + } + } + } + + private static final class LiveNodeIdListener implements ClusterTopologyListener { + + private static final Logger logger = Logger.getLogger(LiveNodeIdListener.class); + private final String nodeId; + private final String user; + private final String password; + private final CountDownLatch searchCompleted; + private boolean isNodePresent = false; + + LiveNodeIdListener(String nodeId, String user, String password) { + this.nodeId = nodeId; + this.user = user; + this.password = password; + this.searchCompleted = new CountDownLatch(1); + } + + @Override + public void nodeUP(TopologyMember topologyMember, boolean last) { + boolean isOurNodeId = nodeId != null && nodeId.equals(topologyMember.getNodeId()); + if (isOurNodeId && isActive(topologyMember.getLive())) { + isNodePresent = true; + } + if (isOurNodeId || last) { + searchCompleted.countDown(); + } + } + + public boolean awaitNodePresent(long timeout, TimeUnit unit) throws InterruptedException { + searchCompleted.await(timeout, unit); + return isNodePresent; + } + + /** + * In a cluster of replicated live/backup pairs if a backup crashes and then its live crashes the cluster will + * retain the topology information of the live such that when the live server restarts it will check the + * cluster to see if its nodeID is present (which it will be) and then it will activate as a backup rather than + * a live. To prevent this situation an additional check is necessary to see if the server with the matching + * nodeID is actually active or not which is done by attempting to make a connection to it. 
+ * + * @param transportConfiguration + * @return + */ + private boolean isActive(TransportConfiguration transportConfiguration) { + try (ServerLocator serverLocator = ActiveMQClient.createServerLocator(false, transportConfiguration); + ClientSessionFactory clientSessionFactory = serverLocator.createSessionFactory(); + ClientSession clientSession = clientSessionFactory.createSession(user, password, false, false, false, false, 0)) { + return true; + } catch (Exception e) { + logger.debug("isActive check failed", e); + return false; + } + } + + @Override + public void nodeDown(long eventUID, String nodeID) { + // no-op + } + } + + private static ServerLocatorInternal createLocator(Configuration configuration, + ClusterConnectionConfiguration config) throws ActiveMQException { + final ServerLocatorInternal locator; + if (config.getDiscoveryGroupName() != null) { + DiscoveryGroupConfiguration dg = configuration.getDiscoveryGroupConfigurations().get(config.getDiscoveryGroupName()); + + if (dg == null) { + throw ActiveMQMessageBundle.BUNDLE.noDiscoveryGroupFound(null); + } + locator = (ServerLocatorInternal) ActiveMQClient.createServerLocatorWithHA(dg); + } else { + TransportConfiguration[] tcConfigs = config.getStaticConnectors() != null ? configuration.getTransportConfigurations(config.getStaticConnectors()) : null; + + locator = (ServerLocatorInternal) ActiveMQClient.createServerLocatorWithHA(tcConfigs); + } + return locator; + } + +} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileBasedNodeManager.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileBasedNodeManager.java index cfbcb47d02..4a8b59f27d 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileBasedNodeManager.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileBasedNodeManager.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.nio.channels.FileChannel; import org.apache.activemq.artemis.core.server.ActiveMQServerLogger; @@ -27,19 +28,66 @@ import org.apache.activemq.artemis.core.server.NodeManager; import org.apache.activemq.artemis.utils.UUID; import org.apache.activemq.artemis.utils.UUIDGenerator; +import static java.nio.file.StandardOpenOption.CREATE_NEW; +import static java.nio.file.StandardOpenOption.READ; +import static java.nio.file.StandardOpenOption.WRITE; + public abstract class FileBasedNodeManager extends NodeManager { protected static final byte FIRST_TIME_START = '0'; public static final String SERVER_LOCK_NAME = "server.lock"; + public static final String DATA_VERSION_NAME = "server.data.version"; private static final String ACCESS_MODE = "rw"; private final File directory; protected FileChannel channel; + protected FileChannel dataVersionChannel; public FileBasedNodeManager(boolean replicatedBackup, File directory) { super(replicatedBackup); this.directory = directory; } + protected void useDataVersionChannel() throws IOException { + if (dataVersionChannel != null) { + return; + } + dataVersionChannel = FileChannel.open(newFile(DATA_VERSION_NAME).toPath(), READ, WRITE, CREATE_NEW); + } + + @Override + public long readDataVersion() throws NodeManagerException { + if (!isStarted()) { + throw new NodeManagerException(new IllegalStateException("node manager must be started first")); + } + try { + useDataVersionChannel(); + ByteBuffer tmpBuffer = 
ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN); + if (dataVersionChannel.read(tmpBuffer, 0) != Long.BYTES) { + return 0; + } + tmpBuffer.flip(); + return tmpBuffer.getLong(0); + } catch (IOException ie) { + throw new NodeManagerException(ie); + } + } + + @Override + public void writeDataVersion(long version) throws NodeManagerException { + if (!isStarted()) { + throw new NodeManagerException(new IllegalStateException("node manager must be started first")); + } + try { + useDataVersionChannel(); + ByteBuffer tmpBuffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN); + tmpBuffer.putLong(0, version); + dataVersionChannel.write(tmpBuffer, 0); + dataVersionChannel.force(false); + } catch (IOException ie) { + throw new NodeManagerException(ie); + } + } + /** * Ensures existence of persistent information about the server's nodeID. *

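(For reference: the on-disk format implied by readDataVersion/writeDataVersion above is a single 8-byte big-endian long stored at offset 0 of the server.data.version file, created next to server.lock. A minimal standalone reader is sketched below; the DataVersionReader class name and the lock-directory parameter are illustrative assumptions, not part of this patch.)

    import java.io.IOException;
    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;
    import java.nio.channels.FileChannel;
    import java.nio.file.Path;

    import static java.nio.file.StandardOpenOption.READ;

    // Illustrative reader for the format used by FileBasedNodeManager above:
    // a single big-endian long at offset 0 of "server.data.version".
    public final class DataVersionReader {

       public static long read(Path nodeManagerLockDirectory) throws IOException {
          try (FileChannel channel = FileChannel.open(nodeManagerLockDirectory.resolve("server.data.version"), READ)) {
             final ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN);
             if (channel.read(buffer, 0) != Long.BYTES) {
                // same convention as readDataVersion: a short read means "no version recorded yet"
                return 0;
             }
             return buffer.getLong(0);
          }
       }
    }
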
@@ -137,9 +185,20 @@ public abstract class FileBasedNodeManager extends NodeManager { @Override public synchronized void stop() throws Exception { FileChannel channelCopy = channel; - if (channelCopy != null) - channelCopy.close(); - super.stop(); + try { + if (channelCopy != null) + channelCopy.close(); + } finally { + try { + FileChannel dataVersionChannel = this.dataVersionChannel; + this.dataVersionChannel = null; + if (dataVersionChannel != null) { + dataVersionChannel.close(); + } + } finally { + super.stop(); + } + } } @Override diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/NamedLiveNodeIdLocatorForReplication.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/NamedLiveNodeIdLocatorForReplication.java new file mode 100644 index 0000000000..58cb32a4b9 --- /dev/null +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/NamedLiveNodeIdLocatorForReplication.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.activemq.artemis.core.server.impl; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.Queue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.activemq.artemis.api.core.ActiveMQException; +import org.apache.activemq.artemis.api.core.Pair; +import org.apache.activemq.artemis.api.core.TransportConfiguration; +import org.apache.activemq.artemis.api.core.client.TopologyMember; +import org.apache.activemq.artemis.core.server.LiveNodeLocator; +import org.apache.activemq.artemis.utils.ConcurrentUtil; + +/** + * It looks for a live server in the cluster with a specific NodeID + */ +public class NamedLiveNodeIdLocatorForReplication extends LiveNodeLocator { + + private final Lock lock = new ReentrantLock(); + private final Condition condition = lock.newCondition(); + private final String nodeID; + private final long retryReplicationWait; + private final Queue> liveConfigurations = new LinkedList<>(); + private final ArrayList> triedConfigurations = new ArrayList<>(); + private boolean found; + + public NamedLiveNodeIdLocatorForReplication(String nodeID, + BackupRegistrationListener backupRegistrationListener, + long retryReplicationWait) { + super(backupRegistrationListener); + this.nodeID = nodeID; + this.retryReplicationWait = retryReplicationWait; + } + + @Override + public void locateNode() throws ActiveMQException { + locateNode(-1L); + } + + @Override + public void locateNode(long timeout) throws ActiveMQException { + try { + lock.lock(); + if (liveConfigurations.size() == 0) { + try { + if (timeout != -1L) { + ConcurrentUtil.await(condition, timeout); + } else { + while (liveConfigurations.size() == 0) { + condition.await(retryReplicationWait, TimeUnit.MILLISECONDS); + liveConfigurations.addAll(triedConfigurations); + triedConfigurations.clear(); + } + } + } catch (InterruptedException e) { + //ignore + } + } + } finally { + lock.unlock(); + } + } + + @Override + public void nodeUP(TopologyMember topologyMember, boolean last) { + try { + lock.lock(); + if (nodeID.equals(topologyMember.getNodeId()) && topologyMember.getLive() != null) { + Pair liveConfiguration = new Pair<>(topologyMember.getLive(), topologyMember.getBackup()); + if (!liveConfigurations.contains(liveConfiguration)) { + liveConfigurations.add(liveConfiguration); + } + found = true; + condition.signal(); + } + } finally { + lock.unlock(); + } + } + + @Override + public void nodeDown(long eventUID, String nodeID) { + //no op + } + + @Override + public String getNodeID() { + return found ? 
nodeID : null; + } + + @Override + public Pair getLiveConfiguration() { + return liveConfigurations.peek(); + } + + @Override + public void notifyRegistrationFailed(boolean alreadyReplicating) { + try { + lock.lock(); + triedConfigurations.add(liveConfigurations.poll()); + super.notifyRegistrationFailed(alreadyReplicating); + } finally { + lock.unlock(); + } + } +} + diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/NamedLiveNodeLocatorForReplication.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/NamedLiveNodeLocatorForReplication.java index 624808d1f0..c4775d2758 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/NamedLiveNodeLocatorForReplication.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/NamedLiveNodeLocatorForReplication.java @@ -29,7 +29,6 @@ import org.apache.activemq.artemis.api.core.Pair; import org.apache.activemq.artemis.api.core.TransportConfiguration; import org.apache.activemq.artemis.api.core.client.TopologyMember; import org.apache.activemq.artemis.core.server.LiveNodeLocator; -import org.apache.activemq.artemis.core.server.cluster.qourum.SharedNothingBackupQuorum; import org.apache.activemq.artemis.utils.ConcurrentUtil; /** @@ -48,8 +47,10 @@ public class NamedLiveNodeLocatorForReplication extends LiveNodeLocator { private String nodeID; - public NamedLiveNodeLocatorForReplication(String backupGroupName, SharedNothingBackupQuorum quorumManager, long retryReplicationWait) { - super(quorumManager); + public NamedLiveNodeLocatorForReplication(String backupGroupName, + BackupRegistrationListener backupRegistrationListener, + long retryReplicationWait) { + super(backupRegistrationListener); this.backupGroupName = backupGroupName; this.retryReplicationWait = retryReplicationWait; } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationBackupActivation.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationBackupActivation.java new file mode 100644 index 0000000000..4441bf0018 --- /dev/null +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationBackupActivation.java @@ -0,0 +1,599 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.activemq.artemis.core.server.impl;
+
+import javax.annotation.concurrent.GuardedBy;
+
+import java.util.Objects;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.Consumer;
+
+import org.apache.activemq.artemis.api.core.ActiveMQException;
+import org.apache.activemq.artemis.api.core.ActiveMQIllegalStateException;
+import org.apache.activemq.artemis.api.core.Pair;
+import org.apache.activemq.artemis.api.core.SimpleString;
+import org.apache.activemq.artemis.api.core.TransportConfiguration;
+import org.apache.activemq.artemis.core.protocol.core.Channel;
+import org.apache.activemq.artemis.core.replication.ReplicationEndpoint;
+import org.apache.activemq.artemis.core.server.ActiveMQServer;
+import org.apache.activemq.artemis.core.server.ActiveMQServerLogger;
+import org.apache.activemq.artemis.core.server.LiveNodeLocator;
+import org.apache.activemq.artemis.core.server.NodeManager;
+import org.apache.activemq.artemis.core.server.cluster.ClusterControl;
+import org.apache.activemq.artemis.core.server.cluster.ClusterController;
+import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationBackupPolicy;
+import org.apache.activemq.artemis.quorum.DistributedLock;
+import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
+import org.apache.activemq.artemis.quorum.UnavailableStateException;
+import org.jboss.logging.Logger;
+
+import static org.apache.activemq.artemis.core.server.impl.ReplicationObserver.ReplicationFailure;
+
+/**
+ * This activation can be used by a primary while trying to fail back, i.e. {@code failback == true}, or
+ * by a natural-born backup, i.e. {@code failback == false}.
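+ * Summarizing the flow implemented below: {@link #run()} starts the pluggable {@link DistributedPrimitiveManager}
+ * and registers for its unavailability events, resets the {@link NodeManager} so that no NodeID is written to the
+ * lock file until this broker turns live, locates a live broker to replicate from and, once replication ends with
+ * a fail-over, tries to acquire the live lock before switching to a {@link ReplicationPrimaryActivation}.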
+ */ +public final class ReplicationBackupActivation extends Activation implements DistributedPrimitiveManager.UnavailableManagerListener { + + private static final Logger LOGGER = Logger.getLogger(ReplicationBackupActivation.class); + + private final boolean wasLive; + private final ReplicationBackupPolicy policy; + private final ActiveMQServerImpl activeMQServer; + // This field is != null iff this node is a primary during a fail-back ie acting as a backup in order to become live again. + private final String expectedNodeID; + @GuardedBy("this") + private boolean closed; + private final DistributedPrimitiveManager distributedManager; + // Used for monitoring purposes + private volatile ReplicationObserver replicationObserver; + // Used for testing purposes + private volatile ReplicationEndpoint replicationEndpoint; + // Used for testing purposes + private Consumer onReplicationEndpointCreation; + // Used to arbiter one-shot server stop/restart + private final AtomicBoolean stopping; + + public ReplicationBackupActivation(final ActiveMQServerImpl activeMQServer, + final boolean wasLive, + final DistributedPrimitiveManager distributedManager, + final ReplicationBackupPolicy policy) { + this.wasLive = wasLive; + this.activeMQServer = activeMQServer; + if (policy.isTryFailback()) { + final SimpleString serverNodeID = activeMQServer.getNodeID(); + if (serverNodeID == null || serverNodeID.isEmpty()) { + throw new IllegalStateException("A failback activation must be biased around a specific NodeID"); + } + this.expectedNodeID = serverNodeID.toString(); + } else { + this.expectedNodeID = null; + } + this.distributedManager = distributedManager; + this.policy = policy; + this.replicationObserver = null; + this.replicationEndpoint = null; + this.stopping = new AtomicBoolean(false); + } + + /** + * used for testing purposes. + */ + public DistributedPrimitiveManager getDistributedManager() { + return distributedManager; + } + + @Override + public void onUnavailableManagerEvent() { + synchronized (this) { + if (closed) { + return; + } + } + LOGGER.info("Unavailable quorum service detected: try restart server"); + asyncRestartServer(activeMQServer, true); + } + + /** + * This util class exists because {@link LiveNodeLocator} need a {@link LiveNodeLocator.BackupRegistrationListener} + * to forward backup registration failure events: this is used to switch on/off backup registration event listening + * on an existing locator. 
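+ * A single forwarder is created for the whole replication loop, registered with the shared
+ * {@link LiveNodeLocator} and re-targeted to the {@link ReplicationObserver} of the current attempt via
+ * {@link #to}; {@link #close()} detaches it again between attempts.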
+ */ + private static final class RegistrationFailureForwarder implements LiveNodeLocator.BackupRegistrationListener, AutoCloseable { + + private static final LiveNodeLocator.BackupRegistrationListener NOOP_LISTENER = ignore -> { + }; + private volatile LiveNodeLocator.BackupRegistrationListener listener = NOOP_LISTENER; + + public RegistrationFailureForwarder to(LiveNodeLocator.BackupRegistrationListener listener) { + this.listener = listener; + return this; + } + + @Override + public void onBackupRegistrationFailed(boolean alreadyReplicating) { + listener.onBackupRegistrationFailed(alreadyReplicating); + } + + @Override + public void close() { + listener = NOOP_LISTENER; + } + } + + @Override + public void run() { + synchronized (this) { + if (closed) { + return; + } + } + try { + LOGGER.info("Trying to reach majority of quorum service nodes"); + distributedManager.start(); + LOGGER.info("Quorum service available: starting broker"); + distributedManager.addUnavailableManagerListener(this); + // Stop the previous node manager and create a new one with NodeManager::replicatedBackup == true: + // NodeManager::start skip setup lock file with NodeID, until NodeManager::stopBackup is called. + activeMQServer.resetNodeManager(); + activeMQServer.getNodeManager().stop(); + // A primary need to preserve NodeID across runs + activeMQServer.moveServerData(policy.getMaxSavedReplicatedJournalsSize(), policy.isTryFailback()); + activeMQServer.getNodeManager().start(); + if (!activeMQServer.initialisePart1(false)) { + return; + } + synchronized (this) { + if (closed) + return; + } + final ClusterController clusterController = activeMQServer.getClusterManager().getClusterController(); + clusterController.awaitConnectionToReplicationCluster(); + activeMQServer.getBackupManager().start(); + ActiveMQServerLogger.LOGGER.backupServerStarted(activeMQServer.getVersion().getFullVersion(), + activeMQServer.getNodeManager().getNodeId()); + activeMQServer.setState(ActiveMQServerImpl.SERVER_STATE.STARTED); + final DistributedLock liveLock = replicateAndFailover(clusterController); + if (liveLock == null) { + return; + } + startAsLive(liveLock); + } catch (Exception e) { + if ((e instanceof InterruptedException || e instanceof IllegalStateException) && !activeMQServer.isStarted()) { + // do not log these errors if the server is being stopped. + return; + } + ActiveMQServerLogger.LOGGER.initializationError(e); + } + } + + private void startAsLive(final DistributedLock liveLock) throws Exception { + activeMQServer.setHAPolicy(policy.getLivePolicy()); + + synchronized (activeMQServer) { + if (!activeMQServer.isStarted()) { + liveLock.close(); + return; + } + ActiveMQServerLogger.LOGGER.becomingLive(activeMQServer); + // stopBackup is going to write the NodeID previously set on the NodeManager, + // because activeMQServer.resetNodeManager() has created a NodeManager with replicatedBackup == true. 
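+         // The NodeID itself was learned while replicating (see ReplicationObserver::onLiveNodeId), so from
+         // this point on this broker owns that identity.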
+ activeMQServer.getNodeManager().stopBackup(); + activeMQServer.getStorageManager().start(); + activeMQServer.getBackupManager().activated(); + // IMPORTANT: + // we're setting this activation JUST because it would allow the server to use its + // getActivationChannelHandler to handle replication + final ReplicationPrimaryActivation primaryActivation = new ReplicationPrimaryActivation(activeMQServer, distributedManager, policy.getLivePolicy()); + liveLock.addListener(primaryActivation); + activeMQServer.setActivation(primaryActivation); + activeMQServer.initialisePart2(false); + // calling primaryActivation.stateChanged !isHelByCaller is necessary in case the lock was unavailable + // before liveLock.addListener: just throwing an exception won't stop the broker. + final boolean stillLive; + try { + stillLive = liveLock.isHeldByCaller(); + } catch (UnavailableStateException e) { + LOGGER.warn(e); + primaryActivation.onUnavailableLockEvent(); + throw new ActiveMQIllegalStateException("This server cannot check its role as a live: activation is failed"); + } + if (!stillLive) { + primaryActivation.onUnavailableLockEvent(); + throw new ActiveMQIllegalStateException("This server is not live anymore: activation is failed"); + } + if (activeMQServer.getIdentity() != null) { + ActiveMQServerLogger.LOGGER.serverIsLive(activeMQServer.getIdentity()); + } else { + ActiveMQServerLogger.LOGGER.serverIsLive(); + } + activeMQServer.completeActivation(true); + } + } + + private LiveNodeLocator createLiveNodeLocator(final LiveNodeLocator.BackupRegistrationListener registrationListener) { + if (expectedNodeID != null) { + assert policy.isTryFailback(); + return new NamedLiveNodeIdLocatorForReplication(expectedNodeID, registrationListener, policy.getRetryReplicationWait()); + } + return policy.getGroupName() == null ? 
+ new AnyLiveNodeLocatorForReplication(registrationListener, activeMQServer, policy.getRetryReplicationWait()) : + new NamedLiveNodeLocatorForReplication(policy.getGroupName(), registrationListener, policy.getRetryReplicationWait()); + } + + private DistributedLock replicateAndFailover(final ClusterController clusterController) throws ActiveMQException, InterruptedException { + final RegistrationFailureForwarder registrationFailureForwarder = new RegistrationFailureForwarder(); + // node locator isn't stateless and contains a live-list of candidate nodes to connect too, hence + // it MUST be reused for each replicateLive attempt + final LiveNodeLocator nodeLocator = createLiveNodeLocator(registrationFailureForwarder); + clusterController.addClusterTopologyListenerForReplication(nodeLocator); + try { + while (true) { + synchronized (this) { + if (closed) { + return null; + } + } + final ReplicationFailure failure = replicateLive(clusterController, nodeLocator, registrationFailureForwarder); + if (failure == null) { + Thread.sleep(clusterController.getRetryIntervalForReplicatedCluster()); + continue; + } + if (!activeMQServer.isStarted()) { + return null; + } + LOGGER.debugf("ReplicationFailure = %s", failure); + boolean voluntaryFailOver = false; + switch (failure) { + case VoluntaryFailOver: + voluntaryFailOver = true; + case NonVoluntaryFailover: + final DistributedLock liveLock = tryAcquireLiveLock(); + // from now on we're meant to stop: + // - due to failover + // - due to restart/stop + assert stopping.get(); + if (liveLock != null) { + return liveLock; + } + boolean restart = true; + if (voluntaryFailOver && isFirstFailbackAttempt()) { + restart = false; + LOGGER.error("Failed to fail-back: stopping broker based on quorum results"); + } else { + ActiveMQServerLogger.LOGGER.restartingAsBackupBasedOnQuorumVoteResults(); + } + // let's ignore the stopping flag here, we're in control of it + asyncRestartServer(activeMQServer, restart, false); + return null; + case RegistrationError: + LOGGER.error("Stopping broker because of critical registration error"); + asyncRestartServer(activeMQServer, false); + return null; + case AlreadyReplicating: + // can just retry here, data should be clean and nodeLocator + // should remove the live node that has answered this + LOGGER.info("Live broker was already replicating: retry sync with another live"); + continue; + case ClosedObserver: + return null; + case BackupNotInSync: + LOGGER.info("Replication failure while initial sync not yet completed: restart as backup"); + asyncRestartServer(activeMQServer, true); + return null; + case WrongNodeId: + LOGGER.error("Stopping broker because of wrong node ID communication from live: maybe a misbehaving live?"); + asyncRestartServer(activeMQServer, false); + return null; + default: + throw new AssertionError("Unsupported failure " + failure); + } + } + } finally { + silentExecution("Errored on cluster topology listener for replication cleanup", () -> clusterController.removeClusterTopologyListenerForReplication(nodeLocator)); + } + } + + /** + * {@code wasLive} is {code true} only while transitioning from primary to backup.
+    * If a natural-born backup becomes live and allows failback, while transitioning back again
+    * {@code wasLive} is still {@code false}.
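+    * Only when both conditions hold does a failed fail-back stop the broker instead of restarting it as a
+    * backup (see the {@code VoluntaryFailOver} handling in {@code replicateAndFailover}).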
+ * The check on {@link ReplicationBackupPolicy#isTryFailback()} is redundant but still useful for correctness. + *

+ * In case of fail-back, any event that's going to restart this broker as backup (eg quorum service unavailable + * or some replication failures) will cause {@code wasLive} to be {@code false}, because the HA policy set isn't + * a primary anymore. + */ + private boolean isFirstFailbackAttempt() { + return wasLive && policy.isTryFailback(); + } + + private DistributedLock tryAcquireLiveLock() throws InterruptedException { + // disable quorum service unavailability handling and just treat this imperatively + if (!stopping.compareAndSet(false, true)) { + // already unavailable quorum service: fail fast + return null; + } + distributedManager.removeUnavailableManagerListener(this); + assert activeMQServer.getNodeManager().getNodeId() != null; + final String liveID = activeMQServer.getNodeManager().getNodeId().toString(); + final int voteRetries = policy.getVoteRetries(); + final long maxAttempts = voteRetries >= 0 ? (voteRetries + 1) : -1; + if (maxAttempts == -1) { + LOGGER.error("It's not safe to retry an infinite amount of time to acquire a live lock: please consider setting a vote-retries value"); + } + final long voteRetryWait = policy.getVoteRetryWait(); + final DistributedLock liveLock = getLock(distributedManager, liveID); + if (liveLock == null) { + return null; + } + for (long attempt = 0; maxAttempts >= 0 ? (attempt < maxAttempts) : true; attempt++) { + try { + if (liveLock.tryLock(voteRetryWait, TimeUnit.MILLISECONDS)) { + LOGGER.debugf("%s live lock acquired after %d attempts.", liveID, (attempt + 1)); + return liveLock; + } + } catch (UnavailableStateException e) { + LOGGER.warnf(e, "Failed to acquire live lock %s because of unavailable quorum service: stop trying", liveID); + distributedManager.stop(); + return null; + } + } + LOGGER.warnf("Failed to acquire live lock %s after %d tries", liveID, maxAttempts); + distributedManager.stop(); + return null; + } + + private DistributedLock getLock(final DistributedPrimitiveManager manager, + final String lockId) throws InterruptedException { + if (!manager.isStarted()) { + return null; + } + try { + return manager.getDistributedLock(lockId); + } catch (ExecutionException e) { + LOGGER.warnf(e, "Errored while getting lock %s", lockId); + return null; + } catch (TimeoutException te) { + LOGGER.warnf(te, "Timeout while getting lock %s", lockId); + return null; + } + } + + private ReplicationObserver replicationObserver() { + if (policy.isTryFailback()) { + return ReplicationObserver.failbackObserver(activeMQServer.getNodeManager(), activeMQServer.getBackupManager(), activeMQServer.getScheduledPool(), expectedNodeID); + } + return ReplicationObserver.failoverObserver(activeMQServer.getNodeManager(), activeMQServer.getBackupManager(), activeMQServer.getScheduledPool()); + } + + private ReplicationFailure replicateLive(final ClusterController clusterController, + final LiveNodeLocator liveLocator, + final RegistrationFailureForwarder registrationFailureForwarder) throws ActiveMQException { + try (ReplicationObserver replicationObserver = replicationObserver(); + RegistrationFailureForwarder ignored = registrationFailureForwarder.to(replicationObserver)) { + this.replicationObserver = replicationObserver; + clusterController.addClusterTopologyListener(replicationObserver); + // ReplicationError notifies backup registration failures to live locator -> forwarder -> observer + final ReplicationError replicationError = new ReplicationError(liveLocator); + clusterController.addIncomingInterceptorForReplication(replicationError); + try { + final 
ClusterControl liveControl = tryLocateAndConnectToLive(liveLocator, clusterController); + if (liveControl == null) { + return null; + } + try { + final ReplicationEndpoint replicationEndpoint = tryAuthorizeAndAsyncRegisterAsBackupToLive(liveControl, replicationObserver); + if (replicationEndpoint == null) { + return ReplicationFailure.RegistrationError; + } + this.replicationEndpoint = replicationEndpoint; + assert replicationEndpoint != null; + try { + return replicationObserver.awaitReplicationFailure(); + } finally { + this.replicationEndpoint = null; + ActiveMQServerImpl.stopComponent(replicationEndpoint); + closeChannelOf(replicationEndpoint); + } + } finally { + silentExecution("Errored on live control close", liveControl::close); + } + } finally { + silentExecution("Errored on cluster topology listener cleanup", () -> clusterController.removeClusterTopologyListener(replicationObserver)); + silentExecution("Errored while removing incoming interceptor for replication", () -> clusterController.removeIncomingInterceptorForReplication(replicationError)); + } + } finally { + this.replicationObserver = null; + } + } + + private static void silentExecution(String debugErrorMessage, Runnable task) { + try { + task.run(); + } catch (Throwable ignore) { + LOGGER.debug(debugErrorMessage, ignore); + } + } + + private static void closeChannelOf(final ReplicationEndpoint replicationEndpoint) { + if (replicationEndpoint == null) { + return; + } + if (replicationEndpoint.getChannel() != null) { + silentExecution("Errored while closing replication endpoint channel", () -> replicationEndpoint.getChannel().close()); + replicationEndpoint.setChannel(null); + } + } + + private boolean asyncRestartServer(final ActiveMQServer server, boolean restart) { + return asyncRestartServer(server, restart, true); + } + + private boolean asyncRestartServer(final ActiveMQServer server, boolean restart, boolean checkStopping) { + if (checkStopping) { + if (!stopping.compareAndSet(false, true)) { + return false; + } + } + new Thread(() -> { + if (server.getState() != ActiveMQServer.SERVER_STATE.STOPPED && server.getState() != ActiveMQServer.SERVER_STATE.STOPPING) { + try { + server.stop(!restart); + if (restart) { + server.start(); + } + } catch (Exception e) { + if (restart) { + ActiveMQServerLogger.LOGGER.errorRestartingBackupServer(e, server); + } else { + ActiveMQServerLogger.LOGGER.errorStoppingServer(e); + } + } + } + }).start(); + return true; + } + + private ClusterControl tryLocateAndConnectToLive(final LiveNodeLocator liveLocator, + final ClusterController clusterController) throws ActiveMQException { + liveLocator.locateNode(); + final Pair possibleLive = liveLocator.getLiveConfiguration(); + final String nodeID = liveLocator.getNodeID(); + if (nodeID == null) { + throw new RuntimeException("Could not establish the connection with any live"); + } + if (!policy.isTryFailback()) { + assert expectedNodeID == null; + activeMQServer.getNodeManager().setNodeID(nodeID); + } else { + assert expectedNodeID.equals(nodeID); + } + if (possibleLive == null) { + return null; + } + final ClusterControl liveControl = tryConnectToNodeInReplicatedCluster(clusterController, possibleLive.getA()); + if (liveControl != null) { + return liveControl; + } + return tryConnectToNodeInReplicatedCluster(clusterController, possibleLive.getB()); + } + + private static ClusterControl tryConnectToNodeInReplicatedCluster(final ClusterController clusterController, + final TransportConfiguration tc) { + try { + if (tc != null) { + return 
clusterController.connectToNodeInReplicatedCluster(tc); + } + } catch (Exception e) { + LOGGER.debug(e.getMessage(), e); + } + return null; + } + + @Override + public void close(final boolean permanently, final boolean restarting) throws Exception { + synchronized (this) { + closed = true; + final ReplicationObserver replicationObserver = this.replicationObserver; + if (replicationObserver != null) { + replicationObserver.close(); + } + } + //we have to check as the server policy may have changed + try { + if (activeMQServer.getHAPolicy().isBackup()) { + // To avoid a NPE cause by the stop + final NodeManager nodeManager = activeMQServer.getNodeManager(); + + activeMQServer.interruptActivationThread(nodeManager); + + if (nodeManager != null) { + nodeManager.stopBackup(); + } + } + } finally { + // this one need to happen after interrupting the activation thread + // in order to unblock distributedManager::start + distributedManager.stop(); + } + } + + @Override + public void preStorageClose() throws Exception { + // TODO replication endpoint close? + } + + private ReplicationEndpoint tryAuthorizeAndAsyncRegisterAsBackupToLive(final ClusterControl liveControl, + final ReplicationObserver liveObserver) { + ReplicationEndpoint replicationEndpoint = null; + try { + liveControl.getSessionFactory().setReconnectAttempts(1); + liveObserver.listenConnectionFailuresOf(liveControl.getSessionFactory()); + liveControl.authorize(); + replicationEndpoint = new ReplicationEndpoint(activeMQServer, policy.isTryFailback(), liveObserver); + final Consumer onReplicationEndpointCreation = this.onReplicationEndpointCreation; + if (onReplicationEndpointCreation != null) { + onReplicationEndpointCreation.accept(replicationEndpoint); + } + replicationEndpoint.setExecutor(activeMQServer.getExecutorFactory().getExecutor()); + connectToReplicationEndpoint(liveControl, replicationEndpoint); + replicationEndpoint.start(); + liveControl.announceReplicatingBackupToLive(policy.isTryFailback(), policy.getClusterName()); + return replicationEndpoint; + } catch (Exception e) { + ActiveMQServerLogger.LOGGER.replicationStartProblem(e); + ActiveMQServerImpl.stopComponent(replicationEndpoint); + closeChannelOf(replicationEndpoint); + return null; + } + } + + private static boolean connectToReplicationEndpoint(final ClusterControl liveControl, + final ReplicationEndpoint replicationEndpoint) { + final Channel replicationChannel = liveControl.createReplicationChannel(); + replicationChannel.setHandler(replicationEndpoint); + replicationEndpoint.setChannel(replicationChannel); + return true; + } + + @Override + public boolean isReplicaSync() { + // NOTE: this method is just for monitoring purposes, not suitable to perform logic! + // During a failover this backup won't have any active liveObserver and will report `false`!! + final ReplicationObserver liveObserver = this.replicationObserver; + if (liveObserver == null) { + return false; + } + return liveObserver.isBackupUpToDate(); + } + + public ReplicationEndpoint getReplicationEndpoint() { + return replicationEndpoint; + } + + /** + * This must be used just for testing purposes. 
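+    * The given consumer is invoked with every {@link ReplicationEndpoint} created by subsequent replication
+    * attempts, before the endpoint is started. For instance (illustrative sketch only, {@code backupActivation}
+    * being this activation instance):
+    * <pre>{@code
+    * AtomicReference<ReplicationEndpoint> spy = new AtomicReference<>();
+    * backupActivation.spyReplicationEndpointCreation(spy::set);
+    * }</pre>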
+ */ + public void spyReplicationEndpointCreation(Consumer onReplicationEndpointCreation) { + Objects.requireNonNull(onReplicationEndpointCreation); + this.onReplicationEndpointCreation = onReplicationEndpointCreation; + } +} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationObserver.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationObserver.java new file mode 100644 index 0000000000..be2737c26c --- /dev/null +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationObserver.java @@ -0,0 +1,332 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.core.server.impl; + +import javax.annotation.concurrent.GuardedBy; +import java.util.Objects; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.TimeUnit; + +import org.apache.activemq.artemis.api.core.ActiveMQException; +import org.apache.activemq.artemis.api.core.client.ClusterTopologyListener; +import org.apache.activemq.artemis.api.core.client.SessionFailureListener; +import org.apache.activemq.artemis.api.core.client.TopologyMember; +import org.apache.activemq.artemis.core.client.impl.ClientSessionFactoryInternal; +import org.apache.activemq.artemis.core.protocol.core.CoreRemotingConnection; +import org.apache.activemq.artemis.core.protocol.core.impl.wireformat.ReplicationLiveIsStoppingMessage; +import org.apache.activemq.artemis.core.replication.ReplicationEndpoint; +import org.apache.activemq.artemis.core.server.LiveNodeLocator.BackupRegistrationListener; +import org.apache.activemq.artemis.core.server.NodeManager; +import org.apache.activemq.artemis.core.server.cluster.BackupManager; +import org.jboss.logging.Logger; + +final class ReplicationObserver implements ClusterTopologyListener, SessionFailureListener, BackupRegistrationListener, ReplicationEndpoint.ReplicationEndpointEventListener, AutoCloseable { + + private static final Logger LOGGER = Logger.getLogger(ReplicationObserver.class); + + public enum ReplicationFailure { + VoluntaryFailOver, BackupNotInSync, NonVoluntaryFailover, RegistrationError, AlreadyReplicating, ClosedObserver, WrongNodeId; + } + + private final NodeManager nodeManager; + private final BackupManager backupManager; + private final ScheduledExecutorService scheduledPool; + private final boolean failback; + private final String expectedNodeID; + private final CompletableFuture replicationFailure; + + @GuardedBy("this") + private ClientSessionFactoryInternal sessionFactory; + @GuardedBy("this") + private CoreRemotingConnection connection; + @GuardedBy("this") + private ScheduledFuture 
forcedFailover; + + private volatile String liveID; + private volatile boolean backupUpToDate; + private volatile boolean closed; + + /** + * This is a safety net in case the live sends the first {@link ReplicationLiveIsStoppingMessage} + * with code {@link org.apache.activemq.artemis.core.protocol.core.impl.wireformat.ReplicationLiveIsStoppingMessage.LiveStopping#STOP_CALLED} and crashes before sending the second with + * {@link org.apache.activemq.artemis.core.protocol.core.impl.wireformat.ReplicationLiveIsStoppingMessage.LiveStopping#FAIL_OVER}. + *

+    * If the second message doesn't come within this deadline, we fail over anyway.
+    */
+   public static final int WAIT_TIME_AFTER_FIRST_LIVE_STOPPING_MSG = 60;
+
+   private ReplicationObserver(final NodeManager nodeManager,
+                               final BackupManager backupManager,
+                               final ScheduledExecutorService scheduledPool,
+                               final boolean failback,
+                               final String expectedNodeID) {
+      this.nodeManager = nodeManager;
+      this.backupManager = backupManager;
+      this.scheduledPool = scheduledPool;
+      this.failback = failback;
+      this.expectedNodeID = expectedNodeID;
+      this.replicationFailure = new CompletableFuture<>();
+
+      this.sessionFactory = null;
+      this.connection = null;
+      this.forcedFailover = null;
+
+      this.liveID = null;
+      this.backupUpToDate = false;
+      this.closed = false;
+   }
+
+   public static ReplicationObserver failbackObserver(final NodeManager nodeManager,
+                                                      final BackupManager backupManager,
+                                                      final ScheduledExecutorService scheduledPool,
+                                                      final String expectedNodeID) {
+      Objects.requireNonNull(expectedNodeID);
+      return new ReplicationObserver(nodeManager, backupManager, scheduledPool, true, expectedNodeID);
+   }
+
+   public static ReplicationObserver failoverObserver(final NodeManager nodeManager,
+                                                      final BackupManager backupManager,
+                                                      final ScheduledExecutorService scheduledPool) {
+      return new ReplicationObserver(nodeManager, backupManager, scheduledPool, false, null);
+   }
+
+   private void onLiveDown(boolean voluntaryFailover) {
+      if (closed || replicationFailure.isDone()) {
+         return;
+      }
+      synchronized (this) {
+         if (closed || replicationFailure.isDone()) {
+            return;
+         }
+         stopForcedFailoverAfterDelay();
+         unlistenConnectionFailures();
+         if (!isRemoteBackupUpToDate()) {
+            replicationFailure.complete(ReplicationFailure.BackupNotInSync);
+         } else if (voluntaryFailover) {
+            replicationFailure.complete(ReplicationFailure.VoluntaryFailOver);
+         } else {
+            replicationFailure.complete(ReplicationFailure.NonVoluntaryFailover);
+         }
+      }
+   }
+
+   @Override
+   public void nodeDown(long eventUID, String nodeID) {
+      // ignore it during a failback:
+      // a failing slave closes all connections but the one used for replication,
+      // triggering a nodeDown before the restarted master receives a STOP_CALLED from it.
+      // This can make the master fire a useless quorum vote during a normal failback.
+ if (failback) { + return; + } + if (nodeID.equals(liveID)) { + onLiveDown(false); + } + } + + @Override + public void nodeUP(TopologyMember member, boolean last) { + } + + /** + * if the connection to our replicated live goes down then decide on an action + */ + @Override + public void connectionFailed(ActiveMQException exception, boolean failedOver) { + onLiveDown(false); + } + + @Override + public void connectionFailed(final ActiveMQException me, boolean failedOver, String scaleDownTargetNodeID) { + connectionFailed(me, failedOver); + } + + @Override + public void beforeReconnect(ActiveMQException exception) { + //noop + } + + @Override + public void close() { + if (closed) { + return; + } + synchronized (this) { + if (closed) { + return; + } + unlistenConnectionFailures(); + closed = true; + replicationFailure.complete(ReplicationFailure.ClosedObserver); + } + } + + /** + * @param liveSessionFactory the session factory used to connect to the live server + */ + public synchronized void listenConnectionFailuresOf(final ClientSessionFactoryInternal liveSessionFactory) { + if (closed) { + throw new IllegalStateException("the observer is closed: cannot listen to any failures"); + } + if (sessionFactory != null || connection != null) { + throw new IllegalStateException("this observer is already listening to other session factory failures"); + } + this.sessionFactory = liveSessionFactory; + //belts and braces, there are circumstances where the connection listener doesn't get called but the session does. + this.sessionFactory.addFailureListener(this); + connection = (CoreRemotingConnection) liveSessionFactory.getConnection(); + connection.addFailureListener(this); + } + + public synchronized void unlistenConnectionFailures() { + if (connection != null) { + connection.removeFailureListener(this); + connection = null; + } + if (sessionFactory != null) { + sessionFactory.removeFailureListener(this); + sessionFactory = null; + } + } + + @Override + public void onBackupRegistrationFailed(boolean alreadyReplicating) { + if (closed || replicationFailure.isDone()) { + return; + } + synchronized (this) { + if (closed || replicationFailure.isDone()) { + return; + } + stopForcedFailoverAfterDelay(); + unlistenConnectionFailures(); + replicationFailure.complete(alreadyReplicating ? 
ReplicationFailure.AlreadyReplicating : ReplicationFailure.RegistrationError); + } + } + + public ReplicationFailure awaitReplicationFailure() { + try { + return replicationFailure.get(); + } catch (Throwable e) { + return ReplicationFailure.ClosedObserver; + } + } + + private synchronized void scheduleForcedFailoverAfterDelay() { + if (forcedFailover != null) { + return; + } + forcedFailover = scheduledPool.schedule(() -> onLiveDown(false), WAIT_TIME_AFTER_FIRST_LIVE_STOPPING_MSG, TimeUnit.SECONDS); + } + + private synchronized void stopForcedFailoverAfterDelay() { + if (forcedFailover == null) { + return; + } + forcedFailover.cancel(false); + forcedFailover = null; + } + + @Override + public void onRemoteBackupUpToDate() { + if (backupUpToDate || closed || replicationFailure.isDone()) { + return; + } + synchronized (this) { + if (backupUpToDate || closed || replicationFailure.isDone()) { + return; + } + assert liveID != null; + backupManager.announceBackup(); + backupUpToDate = true; + } + } + + public boolean isBackupUpToDate() { + return backupUpToDate; + } + + public String getLiveID() { + return liveID; + } + + private boolean validateNodeId(String nodeID) { + if (nodeID == null) { + return false; + } + final String existingNodeId = this.liveID; + if (existingNodeId == null) { + if (!failback) { + return true; + } + return nodeID.equals(expectedNodeID); + } + return existingNodeId.equals(nodeID); + } + + @Override + public void onLiveNodeId(String nodeId) { + if (closed || replicationFailure.isDone()) { + return; + } + final String existingNodeId = this.liveID; + if (existingNodeId != null && existingNodeId.equals(nodeId)) { + return; + } + synchronized (this) { + if (closed || replicationFailure.isDone()) { + return; + } + if (!validateNodeId(nodeId)) { + stopForcedFailoverAfterDelay(); + unlistenConnectionFailures(); + replicationFailure.complete(ReplicationFailure.WrongNodeId); + } else if (liveID == null) { + liveID = nodeId; + nodeManager.setNodeID(nodeId); + } + } + } + + public boolean isRemoteBackupUpToDate() { + return backupUpToDate; + } + + @Override + public void onLiveStopping(ReplicationLiveIsStoppingMessage.LiveStopping finalMessage) { + if (closed || replicationFailure.isDone()) { + return; + } + synchronized (this) { + if (closed || replicationFailure.isDone()) { + return; + } + switch (finalMessage) { + case STOP_CALLED: + scheduleForcedFailoverAfterDelay(); + break; + case FAIL_OVER: + onLiveDown(true); + break; + default: + LOGGER.errorf("unsupported LiveStopping type: %s", finalMessage); + } + } + } +} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationPrimaryActivation.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationPrimaryActivation.java new file mode 100644 index 0000000000..285f516220 --- /dev/null +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationPrimaryActivation.java @@ -0,0 +1,439 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.core.server.impl; + +import javax.annotation.concurrent.GuardedBy; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; + +import org.apache.activemq.artemis.api.core.ActiveMQAlreadyReplicatingException; +import org.apache.activemq.artemis.api.core.ActiveMQException; +import org.apache.activemq.artemis.api.core.ActiveMQIllegalStateException; +import org.apache.activemq.artemis.api.core.Pair; +import org.apache.activemq.artemis.api.core.TransportConfiguration; +import org.apache.activemq.artemis.core.protocol.core.Channel; +import org.apache.activemq.artemis.core.protocol.core.ChannelHandler; +import org.apache.activemq.artemis.core.protocol.core.CoreRemotingConnection; +import org.apache.activemq.artemis.core.protocol.core.impl.PacketImpl; +import org.apache.activemq.artemis.core.protocol.core.impl.wireformat.BackupRegistrationMessage; +import org.apache.activemq.artemis.core.protocol.core.impl.wireformat.BackupReplicationStartFailedMessage; +import org.apache.activemq.artemis.core.protocol.core.impl.wireformat.ReplicationLiveIsStoppingMessage; +import org.apache.activemq.artemis.core.remoting.CloseListener; +import org.apache.activemq.artemis.core.remoting.FailureListener; +import org.apache.activemq.artemis.core.remoting.server.RemotingService; +import org.apache.activemq.artemis.core.replication.ReplicationManager; +import org.apache.activemq.artemis.core.server.ActiveMQServerLogger; +import org.apache.activemq.artemis.core.server.NodeManager; +import org.apache.activemq.artemis.core.server.cluster.ClusterConnection; +import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationPrimaryPolicy; +import org.apache.activemq.artemis.quorum.DistributedLock; +import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; +import org.apache.activemq.artemis.quorum.UnavailableStateException; +import org.apache.activemq.artemis.spi.core.remoting.Acceptor; +import org.jboss.logging.Logger; + +import static org.apache.activemq.artemis.core.server.impl.ClusterTopologySearch.searchActiveLiveNodeId; + +/** + * This is going to be {@link #run()} just by natural born primary, at the first start. + * Both during a failover or a failback, {@link #run()} isn't going to be used, but only {@link #getActivationChannelHandler(Channel, Acceptor)}. 
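+ * On the first start {@link #run()} reads the NodeID from the {@link NodeManager}, optionally searches the cluster
+ * for a live broker already using it (the fail-back case, logged as "restarting as backup") and otherwise acquires
+ * the live lock from the pluggable quorum manager before completing the activation. The quorum interaction boils
+ * down to the following simplified sketch of the calls made below (error, timeout and stop handling omitted):
+ * <pre>{@code
+ * distributedManager.start(DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS);
+ * DistributedLock liveLock = distributedManager.getDistributedLock(nodeId);
+ * if (liveLock.tryLock(BLOCKING_CALLS_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS)) {
+ *    // ... load journal/data (initialisePart1 and initialisePart2) ...
+ *    liveLock.addListener(this);       // get notified if the lock becomes unavailable
+ *    if (liveLock.isHeldByCaller()) {  // loading the journal may have paused the JVM long enough to lose it
+ *       activeMQServer.completeActivation(true);
+ *    }
+ * }
+ * }</pre>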
+ */ +public class ReplicationPrimaryActivation extends LiveActivation implements DistributedLock.UnavailableLockListener { + + private static final Logger LOGGER = Logger.getLogger(ReplicationPrimaryActivation.class); + private static final long DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS = 20_000; + private static final long BLOCKING_CALLS_TIMEOUT_MILLIS = 5_000; + + private final ReplicationPrimaryPolicy policy; + + private final ActiveMQServerImpl activeMQServer; + + @GuardedBy("replicationLock") + private ReplicationManager replicationManager; + + private final Object replicationLock; + + private final DistributedPrimitiveManager distributedManager; + + private volatile boolean stoppingServer; + + public ReplicationPrimaryActivation(final ActiveMQServerImpl activeMQServer, + final DistributedPrimitiveManager distributedManager, + final ReplicationPrimaryPolicy policy) { + this.activeMQServer = activeMQServer; + this.policy = policy; + this.replicationLock = new Object(); + this.distributedManager = distributedManager; + } + + /** + * used for testing purposes. + */ + public DistributedPrimitiveManager getDistributedManager() { + return distributedManager; + } + + @Override + public void freezeConnections(RemotingService remotingService) { + final ReplicationManager replicationManager = getReplicationManager(); + + if (remotingService != null && replicationManager != null) { + remotingService.freeze(null, replicationManager.getBackupTransportConnection()); + } else if (remotingService != null) { + remotingService.freeze(null, null); + } + } + + @Override + public void run() { + try { + + final NodeManager nodeManager = activeMQServer.getNodeManager(); + + final String nodeId = nodeManager.readNodeId().toString(); + + final long dataVersion = nodeManager.readDataVersion(); + + final DistributedLock liveLock = searchLiveOrAcquireLiveLock(nodeId, BLOCKING_CALLS_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS); + + if (liveLock == null) { + return; + } + + activeMQServer.initialisePart1(false); + + activeMQServer.initialisePart2(false); + + // must be registered before checking the caller + liveLock.addListener(this); + + // This control is placed here because initialisePart2 is going to load the journal that + // could pause the JVM for enough time to lose lock ownership + if (!liveLock.isHeldByCaller()) { + throw new IllegalStateException("This broker isn't live anymore, probably due to application pauses eg GC, OS etc: failing now"); + } + + activeMQServer.completeActivation(true); + + if (activeMQServer.getIdentity() != null) { + ActiveMQServerLogger.LOGGER.serverIsLive(activeMQServer.getIdentity()); + } else { + ActiveMQServerLogger.LOGGER.serverIsLive(); + } + } catch (Exception e) { + // async stop it, we don't need to await this to complete + distributedManager.stop(); + ActiveMQServerLogger.LOGGER.initializationError(e); + activeMQServer.callActivationFailureListeners(e); + } + } + + private DistributedLock searchLiveOrAcquireLiveLock(final String nodeId, + final long blockingCallTimeout, + final TimeUnit unit) throws ActiveMQException, InterruptedException { + if (policy.isCheckForLiveServer()) { + LOGGER.infof("Searching a live server with NodeID = %s", nodeId); + if (searchActiveLiveNodeId(policy.getClusterName(), nodeId, blockingCallTimeout, unit, activeMQServer.getConfiguration())) { + LOGGER.infof("Found a live server with NodeID = %s: restarting as backup", nodeId); + activeMQServer.setHAPolicy(policy.getBackupPolicy()); + return null; + } + } + startDistributedPrimitiveManager(); + 
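+      // getDistributeLock stops the manager and throws if no lock instance can be obtained; acquireDistributeLock
+      // then bounds the actual lock attempt with the same blocking-call timeout.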
return acquireDistributeLock(getDistributeLock(nodeId), blockingCallTimeout, unit); + } + + private void startDistributedPrimitiveManager() throws InterruptedException, ActiveMQException { + LOGGER.infof("Trying to reach the majority of quorum nodes in %d ms.", DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS); + try { + if (distributedManager.start(DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS)) { + return; + } + } catch (InterruptedException ie) { + throw ie; + } catch (Throwable t) { + LOGGER.debug(t); + } + assert !distributedManager.isStarted(); + throw new ActiveMQException("Cannot reach the majority of quorum nodes"); + } + + private DistributedLock getDistributeLock(final String nodeId) throws InterruptedException, ActiveMQException { + try { + return distributedManager.getDistributedLock(nodeId); + } catch (Throwable t) { + try { + distributedManager.stop(); + } catch (Throwable ignore) { + // don't care + } + if (t instanceof InterruptedException) { + throw (InterruptedException) t; + } + throw new ActiveMQException("Cannot obtain a live lock instance"); + } + } + + private DistributedLock acquireDistributeLock(final DistributedLock liveLock, + final long acquireLockTimeout, + final TimeUnit unit) throws InterruptedException, ActiveMQException { + try { + if (liveLock.tryLock(acquireLockTimeout, unit)) { + return liveLock; + } + } catch (UnavailableStateException e) { + LOGGER.debug(e); + } + try { + distributedManager.stop(); + } catch (Throwable ignore) { + // don't care + } + throw new ActiveMQException("Failed to become live"); + } + + @Override + public ChannelHandler getActivationChannelHandler(final Channel channel, final Acceptor acceptorUsed) { + if (stoppingServer) { + return null; + } + return packet -> { + if (packet.getType() == PacketImpl.BACKUP_REGISTRATION) { + onBackupRegistration(channel, acceptorUsed, (BackupRegistrationMessage) packet); + } + }; + } + + private void onBackupRegistration(final Channel channel, + final Acceptor acceptorUsed, + final BackupRegistrationMessage msg) { + try { + startAsyncReplication(channel.getConnection(), acceptorUsed.getClusterConnection(), msg.getConnector(), msg.isFailBackRequest()); + } catch (ActiveMQAlreadyReplicatingException are) { + channel.send(new BackupReplicationStartFailedMessage(BackupReplicationStartFailedMessage.BackupRegistrationProblem.ALREADY_REPLICATING)); + } catch (ActiveMQException e) { + LOGGER.debug("Failed to process backup registration packet", e); + channel.send(new BackupReplicationStartFailedMessage(BackupReplicationStartFailedMessage.BackupRegistrationProblem.EXCEPTION)); + } + } + + private void startAsyncReplication(final CoreRemotingConnection remotingConnection, + final ClusterConnection clusterConnection, + final TransportConfiguration backupTransport, + final boolean isFailBackRequest) throws ActiveMQException { + synchronized (replicationLock) { + if (replicationManager != null) { + throw new ActiveMQAlreadyReplicatingException(); + } + if (!activeMQServer.isStarted()) { + throw new ActiveMQIllegalStateException(); + } + final ReplicationFailureListener listener = new ReplicationFailureListener(); + remotingConnection.addCloseListener(listener); + remotingConnection.addFailureListener(listener); + final ReplicationManager replicationManager = new ReplicationManager(activeMQServer, remotingConnection, clusterConnection.getCallTimeout(), policy.getInitialReplicationSyncTimeout(), activeMQServer.getIOExecutorFactory()); + this.replicationManager = replicationManager; + 
replicationManager.start(); + final Thread replicatingThread = new Thread(() -> replicate(replicationManager, clusterConnection, isFailBackRequest, backupTransport)); + replicatingThread.setName("async-replication-thread"); + replicatingThread.start(); + } + } + + private void replicate(final ReplicationManager replicationManager, + final ClusterConnection clusterConnection, + final boolean isFailBackRequest, + final TransportConfiguration backupTransport) { + try { + final String nodeID = activeMQServer.getNodeID().toString(); + activeMQServer.getStorageManager().startReplication(replicationManager, activeMQServer.getPagingManager(), nodeID, isFailBackRequest && policy.isAllowAutoFailBack(), policy.getInitialReplicationSyncTimeout()); + + clusterConnection.nodeAnnounced(System.currentTimeMillis(), nodeID, policy.getGroupName(), policy.getScaleDownGroupName(), new Pair<>(null, backupTransport), true); + + if (isFailBackRequest && policy.isAllowAutoFailBack()) { + awaitBackupAnnouncementOnFailbackRequest(clusterConnection); + } + } catch (Exception e) { + if (activeMQServer.getState() == ActiveMQServerImpl.SERVER_STATE.STARTED) { + /* + * The reasoning here is that the exception was either caused by (1) the + * (interaction with) the backup, or (2) by an IO Error at the storage. If (1), we + * can swallow the exception and ignore the replication request. If (2) the live + * will crash shortly. + */ + ActiveMQServerLogger.LOGGER.errorStartingReplication(e); + } + try { + ActiveMQServerImpl.stopComponent(replicationManager); + } catch (Exception amqe) { + ActiveMQServerLogger.LOGGER.errorStoppingReplication(amqe); + } finally { + synchronized (replicationLock) { + this.replicationManager = null; + } + } + } + } + + /** + * This is handling awaiting backup announcement before trying to failover. + * This broker is a backup broker, acting as a live and ready to restart as a backup + */ + private void awaitBackupAnnouncementOnFailbackRequest(ClusterConnection clusterConnection) throws Exception { + final String nodeID = activeMQServer.getNodeID().toString(); + final BackupTopologyListener topologyListener = new BackupTopologyListener(nodeID, clusterConnection.getConnector()); + clusterConnection.addClusterTopologyListener(topologyListener); + try { + if (topologyListener.waitForBackup()) { + restartAsBackupAfterFailback(); + } else { + ActiveMQServerLogger.LOGGER.failbackMissedBackupAnnouncement(); + } + } finally { + clusterConnection.removeClusterTopologyListener(topologyListener); + } + } + + /** + * If {@link #asyncStopServer()} happens before this call, the restart just won't happen. + * If {@link #asyncStopServer()} happens after this call, will make the server to stop right after being restarted. 
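+    * Both races are benign: each method re-checks {@code stoppingServer} inside a block synchronized on
+    * {@code this}, so exactly one of the two outcomes above can win.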
+ */ + private void restartAsBackupAfterFailback() throws Exception { + if (stoppingServer) { + return; + } + synchronized (this) { + if (stoppingServer) { + return; + } + distributedManager.stop(); + activeMQServer.fail(true); + ActiveMQServerLogger.LOGGER.restartingReplicatedBackupAfterFailback(); + activeMQServer.setHAPolicy(policy.getBackupPolicy()); + activeMQServer.start(); + } + } + + private void asyncStopServer() { + if (stoppingServer) { + return; + } + synchronized (this) { + if (stoppingServer) { + return; + } + stoppingServer = true; + new Thread(() -> { + try { + activeMQServer.stop(); + } catch (Exception e) { + ActiveMQServerLogger.LOGGER.errorRestartingBackupServer(e, activeMQServer); + } + }).start(); + } + } + + @Override + public void onUnavailableLockEvent() { + LOGGER.error("Quorum UNAVAILABLE: async stopping broker."); + asyncStopServer(); + } + + private final class ReplicationFailureListener implements FailureListener, CloseListener { + + @Override + public void connectionFailed(ActiveMQException exception, boolean failedOver) { + onReplicationConnectionClose(); + } + + @Override + public void connectionFailed(final ActiveMQException me, boolean failedOver, String scaleDownTargetNodeID) { + connectionFailed(me, failedOver); + } + + @Override + public void connectionClosed() { + onReplicationConnectionClose(); + } + } + + private void onReplicationConnectionClose() { + ExecutorService executorService = activeMQServer.getThreadPool(); + if (executorService != null) { + synchronized (replicationLock) { + if (replicationManager == null) { + return; + } + } + executorService.execute(() -> { + synchronized (replicationLock) { + if (replicationManager == null) { + return; + } + // this is going to stop the replication manager + activeMQServer.getStorageManager().stopReplication(); + assert !replicationManager.isStarted(); + replicationManager = null; + } + }); + } + } + + @Override + public void close(boolean permanently, boolean restarting) throws Exception { + synchronized (replicationLock) { + replicationManager = null; + } + distributedManager.stop(); + // To avoid a NPE cause by the stop + final NodeManager nodeManager = activeMQServer.getNodeManager(); + if (nodeManager != null) { + if (permanently) { + nodeManager.crashLiveServer(); + } else { + nodeManager.pauseLiveServer(); + } + } + } + + @Override + public void sendLiveIsStopping() { + final ReplicationManager replicationManager = getReplicationManager(); + if (replicationManager == null) { + return; + } + replicationManager.sendLiveIsStopping(ReplicationLiveIsStoppingMessage.LiveStopping.STOP_CALLED); + // this pool gets a 'hard' shutdown, no need to manage the Future of this Runnable. 
+ activeMQServer.getScheduledPool().schedule(replicationManager::clearReplicationTokens, 30, TimeUnit.SECONDS); + } + + @Override + public ReplicationManager getReplicationManager() { + synchronized (replicationLock) { + return replicationManager; + } + } + + @Override + public boolean isReplicaSync() { + final ReplicationManager replicationManager = getReplicationManager(); + if (replicationManager == null) { + return false; + } + return !replicationManager.isSynchronizing(); + } +} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/SharedNothingBackupActivation.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/SharedNothingBackupActivation.java index 0249cdfe53..3876185803 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/SharedNothingBackupActivation.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/SharedNothingBackupActivation.java @@ -32,6 +32,7 @@ import org.apache.activemq.artemis.core.postoffice.PostOffice; import org.apache.activemq.artemis.core.protocol.core.Channel; import org.apache.activemq.artemis.core.protocol.core.impl.wireformat.ReplicationLiveIsStoppingMessage; import org.apache.activemq.artemis.core.replication.ReplicationEndpoint; +import org.apache.activemq.artemis.core.replication.ReplicationEndpoint.ReplicationEndpointEventListener; import org.apache.activemq.artemis.core.server.ActivationParams; import org.apache.activemq.artemis.core.server.ActiveMQMessageBundle; import org.apache.activemq.artemis.core.server.ActiveMQServer; @@ -54,7 +55,7 @@ import static org.apache.activemq.artemis.core.server.cluster.qourum.SharedNothi import static org.apache.activemq.artemis.core.server.cluster.qourum.SharedNothingBackupQuorum.BACKUP_ACTIVATION.FAIL_OVER; import static org.apache.activemq.artemis.core.server.cluster.qourum.SharedNothingBackupQuorum.BACKUP_ACTIVATION.STOP; -public final class SharedNothingBackupActivation extends Activation { +public final class SharedNothingBackupActivation extends Activation implements ReplicationEndpointEventListener { private static final Logger logger = Logger.getLogger(SharedNothingBackupActivation.class); @@ -96,7 +97,7 @@ public final class SharedNothingBackupActivation extends Activation { assert replicationEndpoint == null; activeMQServer.resetNodeManager(); backupUpToDate = false; - replicationEndpoint = new ReplicationEndpoint(activeMQServer, ioCriticalErrorListener, attemptFailBack, this); + replicationEndpoint = new ReplicationEndpoint(activeMQServer, attemptFailBack, this); } @Override @@ -156,9 +157,6 @@ public final class SharedNothingBackupActivation extends Activation { logger.debug("Starting backup manager"); activeMQServer.getBackupManager().start(); - logger.debug("Set backup Quorum"); - replicationEndpoint.setBackupQuorum(backupQuorum); - replicationEndpoint.setExecutor(activeMQServer.getExecutorFactory().getExecutor()); EndpointConnector endpointConnector = new EndpointConnector(); @@ -461,7 +459,13 @@ public final class SharedNothingBackupActivation extends Activation { return backupUpToDate; } - public void setRemoteBackupUpToDate() { + @Override + public void onLiveNodeId(String nodeId) { + backupQuorum.liveIDSet(nodeId); + } + + @Override + public void onRemoteBackupUpToDate() { activeMQServer.getBackupManager().announceBackup(); backupUpToDate = true; backupSyncLatch.countDown(); @@ -470,7 +474,8 @@ public final class SharedNothingBackupActivation extends Activation { /** * 
@throws ActiveMQException */ - public void remoteFailOver(ReplicationLiveIsStoppingMessage.LiveStopping finalMessage) throws ActiveMQException { + @Override + public void onLiveStopping(ReplicationLiveIsStoppingMessage.LiveStopping finalMessage) throws ActiveMQException { if (logger.isTraceEnabled()) { logger.trace("Remote fail-over, got message=" + finalMessage + ", backupUpToDate=" + backupUpToDate); @@ -526,4 +531,9 @@ public final class SharedNothingBackupActivation extends Activation { return replicationEndpoint; } } + + @Override + public boolean isReplicaSync() { + return isRemoteBackupUpToDate(); + } } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/SharedNothingLiveActivation.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/SharedNothingLiveActivation.java index 9de4be057b..f876a7604b 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/SharedNothingLiveActivation.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/SharedNothingLiveActivation.java @@ -462,4 +462,13 @@ public class SharedNothingLiveActivation extends LiveActivation { private TransportConfiguration[] connectorNameListToArray(final List connectorNames) { return activeMQServer.getConfiguration().getTransportConfigurations(connectorNames); } + + @Override + public boolean isReplicaSync() { + final ReplicationManager replicationManager = getReplicationManager(); + if (replicationManager == null) { + return false; + } + return !replicationManager.isSynchronizing(); + } } diff --git a/artemis-server/src/main/resources/schema/artemis-configuration.xsd b/artemis-server/src/main/resources/schema/artemis-configuration.xsd index 9dc5b5c0fc..9e79a54b7f 100644 --- a/artemis-server/src/main/resources/schema/artemis-configuration.xsd +++ b/artemis-server/src/main/resources/schema/artemis-configuration.xsd @@ -2605,7 +2605,7 @@ - + A key-value pair option for the DataSource @@ -2682,7 +2682,7 @@ - + @@ -2726,6 +2726,36 @@ + + + + + + The distributed-primitive-manager class name + + + + + + + A list of options for the distributed-primitive-manager + + + + + + + + A key-value pair option for the distributed-primitive-manager + + + + + + + + + @@ -2749,6 +2779,20 @@ + + + + A primary server configured to replicate. + + + + + + + A backup server configured to replicate. + + + @@ -3119,6 +3163,155 @@ + + + + + + It's the manager used to manager distributed locks used for this type of replication. + + + + + + + used for replication, if set, (remote) backup servers will only pair with live servers with matching + group-name + + + + + + + Name of the cluster configuration to use for replication. This setting is only necessary in case you + configure multiple cluster connections. It is used by a replicating backups and by live servers that + may attempt fail-back. + + + + + + + Whether to check the cluster for a (live) server using our own server ID when starting + up. This option is only necessary for performing 'fail-back' on replicating + servers. Strictly speaking this setting only applies to live servers and not to + backups. + + + + + + + The amount of time to wait for the replica to acknowledge it has received all the necessary data from + the replicating server at the final step of the initial replication synchronization process. 
+ + + + + + + If we start as a replica and lose connection to the master, how many times should we attempt to vote + for quorum before restarting + + + + + + + How long to wait (in milliseconds) between each vote + + + + + + + If we start as a replica how long to wait (in milliseconds) before trying to replicate again after failing to find a replica + + + + + + + + + + + + It's the manager used to manager distributed locks used for this type of replication. + + + + + + + used for replication, if set, (remote) backup servers will only pair with live servers with matching + group-name + + + + + + + Name of the cluster configuration to use for replication. This setting is only necessary in case you + configure multiple cluster connections. It is used by a replicating backups and by live servers that + may attempt fail-back. + + + + + + + This specifies how many times a replicated backup server can restart after moving its files on start. + Once there are this number of backup journal files the server will stop permanently after if fails + back. + + + + + + + Whether a server will automatically stop when a another places a request to take over + its place. The use case is when a regular server stops and its backup takes over its + duties, later the main server restarts and requests the server (the former backup) to + stop operating. + + + + + + + If we have to start as a replicated server this is the amount of time to wait for the replica to + acknowledge it has received all the necessary data from the replicating server at the final step + of the initial replication synchronization process. + + + + + + + If we lose connection to the master, how many times should we attempt to vote for quorum before restarting + + + + + + + How long to wait (in milliseconds) between each vote + + + + + + + How long to wait (in milliseconds) before trying to replicate again after failing to find a replica + + + + + + diff --git a/artemis-server/src/test/java/org/apache/activemq/artemis/core/config/impl/HAPolicyConfigurationTest.java b/artemis-server/src/test/java/org/apache/activemq/artemis/core/config/impl/HAPolicyConfigurationTest.java index 425ff7b244..4b37a76125 100644 --- a/artemis-server/src/test/java/org/apache/activemq/artemis/core/config/impl/HAPolicyConfigurationTest.java +++ b/artemis-server/src/test/java/org/apache/activemq/artemis/core/config/impl/HAPolicyConfigurationTest.java @@ -17,7 +17,12 @@ package org.apache.activemq.artemis.core.config.impl; import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import org.apache.activemq.artemis.api.config.ActiveMQDefaultConfiguration; import org.apache.activemq.artemis.core.config.Configuration; import org.apache.activemq.artemis.core.config.FileDeploymentManager; import org.apache.activemq.artemis.core.config.HAPolicyConfiguration; @@ -27,6 +32,8 @@ import org.apache.activemq.artemis.core.server.cluster.ha.HAPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.LiveOnlyPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.ReplicaPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.ReplicatedPolicy; +import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationBackupPolicy; +import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationPrimaryPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.ScaleDownPolicy; import 
org.apache.activemq.artemis.core.server.cluster.ha.SharedStoreMasterPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.SharedStoreSlavePolicy; @@ -35,11 +42,19 @@ import org.apache.activemq.artemis.core.server.impl.ActiveMQServerImpl; import org.apache.activemq.artemis.core.server.impl.ColocatedActivation; import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager; import org.apache.activemq.artemis.core.server.impl.LiveOnlyActivation; +import org.apache.activemq.artemis.core.server.impl.ReplicationBackupActivation; +import org.apache.activemq.artemis.core.server.impl.ReplicationPrimaryActivation; import org.apache.activemq.artemis.core.server.impl.SharedNothingBackupActivation; import org.apache.activemq.artemis.core.server.impl.SharedNothingLiveActivation; import org.apache.activemq.artemis.core.server.impl.SharedStoreBackupActivation; import org.apache.activemq.artemis.core.server.impl.SharedStoreLiveActivation; +import org.apache.activemq.artemis.quorum.DistributedLock; +import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; +import org.apache.activemq.artemis.quorum.MutableLong; +import org.apache.activemq.artemis.quorum.UnavailableStateException; import org.apache.activemq.artemis.tests.util.ActiveMQTestBase; +import org.hamcrest.MatcherAssert; +import org.hamcrest.core.IsInstanceOf; import org.junit.Test; import static org.hamcrest.CoreMatchers.instanceOf; @@ -124,6 +139,248 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase { liveOnlyTest("live-only-hapolicy-config5.xml"); } + public static class FakeDistributedPrimitiveManager implements DistributedPrimitiveManager { + + private final Map config; + private boolean started; + private DistributedLock lock; + + public FakeDistributedPrimitiveManager(Map config) { + this.config = config; + this.started = false; + } + + public Map getConfig() { + return config; + } + + @Override + public void addUnavailableManagerListener(UnavailableManagerListener listener) { + // no op + } + + @Override + public void removeUnavailableManagerListener(UnavailableManagerListener listener) { + // no op + } + + @Override + public boolean start(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException { + started = true; + return true; + } + + @Override + public void start() throws InterruptedException, ExecutionException { + started = true; + } + + @Override + public boolean isStarted() { + return started; + } + + @Override + public void stop() { + started = false; + if (lock != null) { + lock.close(); + } + lock = null; + } + + @Override + public DistributedLock getDistributedLock(String lockId) { + if (!started) { + throw new IllegalStateException("need to start first"); + } + if (lock == null) { + lock = new DistributedLock() { + + private boolean held; + + @Override + public String getLockId() { + return lockId; + } + + @Override + public boolean isHeldByCaller() throws UnavailableStateException { + return held; + } + + @Override + public boolean tryLock() throws UnavailableStateException, InterruptedException { + if (held) { + return false; + } + held = true; + return true; + } + + @Override + public void unlock() throws UnavailableStateException { + held = false; + } + + @Override + public void addListener(UnavailableLockListener listener) { + + } + + @Override + public void removeListener(UnavailableLockListener listener) { + + } + + @Override + public void close() { + held = false; + } + }; + } else if (!lock.getLockId().equals(lockId)) { + throw new 
IllegalStateException("This shouldn't happen"); + } + return lock; + } + + @Override + public MutableLong getMutableLong(String mutableLongId) throws InterruptedException, ExecutionException, TimeoutException { + // TODO + return null; + } + + @Override + public void close() { + stop(); + } + } + + private static void validateManagerConfig(Map config) { + assertEquals("127.0.0.1:6666", config.get("connect-string")); + assertEquals("16000", config.get("session-ms")); + assertEquals("2000", config.get("connection-ms")); + assertEquals("2", config.get("retries")); + assertEquals("2000", config.get("retries-ms")); + assertEquals("test", config.get("namespace")); + assertEquals("10", config.get("session-percent")); + assertEquals(7, config.size()); + } + + @Test + public void PrimaryReplicationTest() throws Exception { + Configuration configuration = createConfiguration("primary-hapolicy-config.xml"); + ActiveMQServerImpl server = new ActiveMQServerImpl(configuration); + try { + server.start(); + Activation activation = server.getActivation(); + assertTrue(activation instanceof ReplicationPrimaryActivation); + HAPolicy haPolicy = server.getHAPolicy(); + assertTrue(haPolicy instanceof ReplicationPrimaryPolicy); + ReplicationPrimaryPolicy policy = (ReplicationPrimaryPolicy) haPolicy; + assertFalse(policy.isAllowAutoFailBack()); + assertEquals(9876, policy.getInitialReplicationSyncTimeout()); + assertFalse(policy.canScaleDown()); + assertFalse(policy.isBackup()); + assertFalse(policy.isSharedStore()); + assertTrue(policy.isCheckForLiveServer()); + assertTrue(policy.isWaitForActivation()); + assertEquals("purple", policy.getGroupName()); + assertEquals("purple", policy.getBackupGroupName()); + assertEquals("abcdefg", policy.getClusterName()); + assertFalse(policy.useQuorumManager()); + // check failback companion backup policy + ReplicationBackupPolicy failbackPolicy = policy.getBackupPolicy(); + assertNotNull(failbackPolicy); + assertSame(policy, failbackPolicy.getLivePolicy()); + assertEquals(policy.getGroupName(), failbackPolicy.getGroupName()); + assertEquals(policy.getBackupGroupName(), failbackPolicy.getBackupGroupName()); + assertEquals(policy.getClusterName(), failbackPolicy.getClusterName()); + assertEquals(failbackPolicy.getMaxSavedReplicatedJournalsSize(), ActiveMQDefaultConfiguration.getDefaultMaxSavedReplicatedJournalsSize()); + assertEquals(1, failbackPolicy.getVoteRetries()); + assertEquals(1000, failbackPolicy.getVoteRetryWait()); + assertTrue(failbackPolicy.isTryFailback()); + assertTrue(failbackPolicy.isBackup()); + assertFalse(failbackPolicy.isSharedStore()); + assertTrue(failbackPolicy.isWaitForActivation()); + assertFalse(failbackPolicy.useQuorumManager()); + assertEquals(12345, failbackPolicy.getRetryReplicationWait()); + // check scale-down properties + assertFalse(failbackPolicy.canScaleDown()); + assertNull(failbackPolicy.getScaleDownClustername()); + assertNull(failbackPolicy.getScaleDownGroupName()); + // validate manager + DistributedPrimitiveManager manager = ((ReplicationPrimaryActivation) activation).getDistributedManager(); + assertNotNull(manager); + assertEquals(FakeDistributedPrimitiveManager.class.getName(), manager.getClass().getName()); + MatcherAssert.assertThat(manager, IsInstanceOf.instanceOf(FakeDistributedPrimitiveManager.class)); + FakeDistributedPrimitiveManager forwardingManager = (FakeDistributedPrimitiveManager) manager; + // validate manager config + validateManagerConfig(forwardingManager.getConfig()); + } finally { + server.stop(); + } + } + + 
@Test + public void BackupReplicationTest() throws Exception { + Configuration configuration = createConfiguration("backup-hapolicy-config.xml"); + ActiveMQServerImpl server = new ActiveMQServerImpl(configuration); + try { + server.start(); + Activation activation = server.getActivation(); + assertTrue(activation instanceof ReplicationBackupActivation); + HAPolicy haPolicy = server.getHAPolicy(); + assertTrue(haPolicy instanceof ReplicationBackupPolicy); + ReplicationBackupPolicy policy = (ReplicationBackupPolicy) haPolicy; + assertEquals("tiddles", policy.getGroupName()); + assertEquals("tiddles", policy.getBackupGroupName()); + assertEquals("33rrrrr", policy.getClusterName()); + assertEquals(22, policy.getMaxSavedReplicatedJournalsSize()); + assertEquals(1, policy.getVoteRetries()); + assertEquals(1000, policy.getVoteRetryWait()); + assertFalse(policy.isTryFailback()); + assertTrue(policy.isBackup()); + assertFalse(policy.isSharedStore()); + assertTrue(policy.isWaitForActivation()); + assertFalse(policy.useQuorumManager()); + assertEquals(12345, policy.getRetryReplicationWait()); + // check scale-down properties + assertFalse(policy.canScaleDown()); + assertNull(policy.getScaleDownClustername()); + assertNull(policy.getScaleDownGroupName()); + // check failover companion live policy + ReplicationPrimaryPolicy failoverLivePolicy = policy.getLivePolicy(); + assertNotNull(failoverLivePolicy); + assertSame(policy, failoverLivePolicy.getBackupPolicy()); + assertFalse(failoverLivePolicy.isAllowAutoFailBack()); + assertEquals(9876, failoverLivePolicy.getInitialReplicationSyncTimeout()); + assertFalse(failoverLivePolicy.canScaleDown()); + assertFalse(failoverLivePolicy.isBackup()); + assertFalse(failoverLivePolicy.isSharedStore()); + assertFalse(failoverLivePolicy.isCheckForLiveServer()); + assertTrue(failoverLivePolicy.isWaitForActivation()); + assertEquals(policy.getGroupName(), failoverLivePolicy.getGroupName()); + assertEquals(policy.getClusterName(), failoverLivePolicy.getClusterName()); + assertEquals(policy.getBackupGroupName(), failoverLivePolicy.getBackupGroupName()); + assertFalse(failoverLivePolicy.useQuorumManager()); + // check scale-down properties + assertFalse(failoverLivePolicy.canScaleDown()); + assertNull(failoverLivePolicy.getScaleDownClustername()); + assertNull(failoverLivePolicy.getScaleDownGroupName()); + // validate manager + DistributedPrimitiveManager manager = ((ReplicationBackupActivation) activation).getDistributedManager(); + assertNotNull(manager); + assertEquals(FakeDistributedPrimitiveManager.class.getName(), manager.getClass().getName()); + MatcherAssert.assertThat(manager, IsInstanceOf.instanceOf(FakeDistributedPrimitiveManager.class)); + FakeDistributedPrimitiveManager forwardingManager = (FakeDistributedPrimitiveManager) manager; + // validate manager config + validateManagerConfig(forwardingManager.getConfig()); + } finally { + server.stop(); + } + } + @Test public void ReplicatedTest() throws Exception { Configuration configuration = createConfiguration("replicated-hapolicy-config.xml"); diff --git a/artemis-server/src/test/java/org/apache/activemq/artemis/tests/util/ActiveMQTestBase.java b/artemis-server/src/test/java/org/apache/activemq/artemis/tests/util/ActiveMQTestBase.java index adfceeed84..5a16f3e5f6 100644 --- a/artemis-server/src/test/java/org/apache/activemq/artemis/tests/util/ActiveMQTestBase.java +++ b/artemis-server/src/test/java/org/apache/activemq/artemis/tests/util/ActiveMQTestBase.java @@ -113,6 +113,7 @@ import 
org.apache.activemq.artemis.core.remoting.impl.invm.TransportConstants; import org.apache.activemq.artemis.core.remoting.impl.netty.NettyAcceptorFactory; import org.apache.activemq.artemis.core.remoting.impl.netty.NettyConnector; import org.apache.activemq.artemis.core.remoting.impl.netty.NettyConnectorFactory; +import org.apache.activemq.artemis.core.replication.ReplicationEndpoint; import org.apache.activemq.artemis.core.server.ActiveMQComponent; import org.apache.activemq.artemis.core.server.ActiveMQServer; import org.apache.activemq.artemis.core.server.ActiveMQServerLogger; @@ -129,6 +130,7 @@ import org.apache.activemq.artemis.core.server.impl.Activation; import org.apache.activemq.artemis.core.server.impl.ActiveMQServerImpl; import org.apache.activemq.artemis.core.server.impl.AddressInfo; import org.apache.activemq.artemis.core.server.impl.LiveOnlyActivation; +import org.apache.activemq.artemis.core.server.impl.ReplicationBackupActivation; import org.apache.activemq.artemis.core.server.impl.SharedNothingBackupActivation; import org.apache.activemq.artemis.core.settings.impl.AddressFullMessagePolicy; import org.apache.activemq.artemis.core.settings.impl.AddressSettings; @@ -1384,6 +1386,8 @@ public abstract class ActiveMQTestBase extends Assert { if (isReplicated) { if (activation instanceof SharedNothingBackupActivation) { isRemoteUpToDate = backup.isReplicaSync(); + } else if (activation instanceof ReplicationBackupActivation) { + isRemoteUpToDate = backup.isReplicaSync(); } else { //we may have already failed over and changed the Activation if (actualServer.isStarted()) { @@ -2517,6 +2521,17 @@ public abstract class ActiveMQTestBase extends Assert { return !hadToInterrupt; } + protected static ReplicationEndpoint getReplicationEndpoint(ActiveMQServer server) { + final Activation activation = server.getActivation(); + if (activation instanceof SharedNothingBackupActivation) { + return ((SharedNothingBackupActivation) activation).getReplicationEndpoint(); + } + if (activation instanceof ReplicationBackupActivation) { + return ((ReplicationBackupActivation) activation).getReplicationEndpoint(); + } + return null; + } + // Private ------------------------------------------------------- // Inner classes ------------------------------------------------- diff --git a/artemis-server/src/test/resources/backup-hapolicy-config.xml b/artemis-server/src/test/resources/backup-hapolicy-config.xml new file mode 100644 index 0000000000..be55236676 --- /dev/null +++ b/artemis-server/src/test/resources/backup-hapolicy-config.xml @@ -0,0 +1,54 @@ + + + + + + + + + + tiddles + 22 + 33rrrrr + 9876 + 12345 + 1 + 1000 + false + + + org.apache.activemq.artemis.core.config.impl.HAPolicyConfigurationTest$FakeDistributedPrimitiveManager + + + + + + + + + + + + + + + + diff --git a/artemis-server/src/test/resources/primary-hapolicy-config.xml b/artemis-server/src/test/resources/primary-hapolicy-config.xml new file mode 100644 index 0000000000..5b88bcd80f --- /dev/null +++ b/artemis-server/src/test/resources/primary-hapolicy-config.xml @@ -0,0 +1,52 @@ + + + + + + + + purple + abcdefg + 9876 + 12345 + true + 1 + 1000 + + + org.apache.activemq.artemis.core.config.impl.HAPolicyConfigurationTest$FakeDistributedPrimitiveManager + + + + + + + + + + + + + + + + + diff --git a/docs/user-manual/en/ha.md b/docs/user-manual/en/ha.md index b2f61d5d8b..96045e95e4 100644 --- a/docs/user-manual/en/ha.md +++ b/docs/user-manual/en/ha.md @@ -98,6 +98,36 @@ or ``` +*Replication* allows too to configure 2 new roles to enable 
*pluggable quorum* provider configuration, by using:
+```xml
+<ha-policy>
+   <replication>
+      <primary/>
+   </replication>
+</ha-policy>
+```
+to configure the classic *master* role, and
+```xml
+<ha-policy>
+   <replication>
+      <backup/>
+   </replication>
+</ha-policy>
+```
+for the classic *slave* one.
+
+If *replication* is configured using these new roles, some additional elements are required to complete the configuration, as detailed later.
+
+### IMPORTANT NOTE ON PLUGGABLE QUORUM VOTE FEATURE
+
+This feature is still **EXPERIMENTAL** and not meant to be run in production yet.
+
+This means:
+- its configuration can change until it is declared **officially stable**
+- it still has to solve an inherent data misalignment issue with replication (which can happen with `classic` replication as well)
+
+More information about this issue is available at [ARTEMIS-3340](https://issues.apache.org/jira/browse/ARTEMIS-3340).
+
 ### Data Replication
 
 When using replication, the live and the backup servers do not share the
@@ -199,16 +229,26 @@ Much like in the shared-store case, when the live server stops or
 crashes, its replicating backup will become active and take over its
 duties. Specifically, the backup will become active when it loses
 connection to its live server. This can be problematic because this can
-also happen because of a temporary network problem. In order to address
-this issue, the backup will try to determine whether it still can
+also happen because of a temporary network problem.
+
+This issue is solved in two different ways, depending on which replication roles are configured:
+- **classic replication** (`master`/`slave` roles): the backup will try to determine whether it still can
 connect to the other servers in the cluster. If it can connect to more
 than half the servers, it will become active, if more than half the
 servers also disappeared with the live, the backup will wait and try
 reconnecting with the live. This avoids a split brain situation.
+- **pluggable quorum vote replication** (`primary`/`backup` roles): the backup relies on a pluggable quorum provider
+  (configured via the `manager` XML element) to detect whether there is any active live.
+
+> ***NOTE***
+>
+> A backup in the **pluggable quorum vote replication** still needs to carefully configure
+> [connection-ttl](connection-ttl.md) in order to promptly issue a request to become live to the quorum service
+> before failing over.
 
 #### Configuration
 
-To configure the live and backup servers to be a replicating pair,
+To configure the live and backup servers of a classic replication as a replicating pair,
 configure the live server in `broker.xml` to have:
 
 ```xml
@@ -235,6 +275,30 @@ The backup server must be similarly configured but as a `slave`
 ```
 
+To configure a pluggable quorum replication's primary and backup instead:
+
+```xml
+<ha-policy>
+   <replication>
+      <primary/>
+   </replication>
+</ha-policy>
+...
+<cluster-connections>
+   <cluster-connection name="my-cluster">
+      ...
+   </cluster-connection>
+</cluster-connections>
+```
+and
+```xml
+<ha-policy>
+   <replication>
+      <backup/>
+   </replication>
+</ha-policy>
+```
+
 #### All Replication Configuration
 
 The following table lists all the `ha-policy` configuration elements for
@@ -308,6 +372,142 @@ replica to acknowledge it has received all the necessary data. The
 default is 30,000 milliseconds. **Note:** during this interval any
 journal related operations will be blocked.
 
+#### Pluggable Quorum Vote Replication configurations
+
+Pluggable Quorum Vote replication configuration options differ a bit from classic
+replication's, mostly because of their customizable nature.
+
+[Apache Curator](https://curator.apache.org/) is used by the default quorum provider.
+
+Below are some example configurations showing how it works.
+
+For `primary`:
+```xml
+<ha-policy>
+   <replication>
+      <primary>
+         <manager>
+            <class-name>org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager</class-name>
+            <properties>
+               <property key="connect-string" value="127.0.0.1:6666"/>
+            </properties>
+         </manager>
+         <check-for-live-server>true</check-for-live-server>
+      </primary>
+   </replication>
+</ha-policy>
+```
+And `backup`:
+```xml
+<ha-policy>
+   <replication>
+      <backup>
+         <manager>
+            <class-name>org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager</class-name>
+            <properties>
+               <property key="connect-string" value="127.0.0.1:6666"/>
+            </properties>
+         </manager>
+         <allow-failback>true</allow-failback>
+      </backup>
+   </replication>
+</ha-policy>
+```
+Configuring `class-name` as follows
+```xml
+<class-name>org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager</class-name>
+```
+isn't strictly needed, because Apache Curator is the default provider; it is shown here only for completeness.
+
+The `properties` element, instead,
+```xml
+<properties>
+   <property key="connect-string" value="127.0.0.1:6666"/>
+</properties>
+```
+can specify a list of `property` elements as key-value pairs, whose accepted keys depend on the specified `class-name` provider.
+
+Apache Curator's provider allows configuring these properties:
+
+- [`connect-string`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#connectString(java.lang.String)): (no default)
+- [`session-ms`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#sessionTimeoutMs(int)): (default is 18000 ms)
+- [`session-percent`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#simulatedSessionExpirationPercent(int)): (default is 33); should be <= default,
+  see https://cwiki.apache.org/confluence/display/CURATOR/TN14 for more info
+- [`connection-ms`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#connectionTimeoutMs(int)): (default is 8000 ms)
+- [`retries`](https://curator.apache.org/apidocs/org/apache/curator/retry/RetryNTimes.html#%3Cinit%3E(int,int)): (default is 1)
+- [`retries-ms`](https://curator.apache.org/apidocs/org/apache/curator/retry/RetryNTimes.html#%3Cinit%3E(int,int)): (default is 1000 ms)
+- [`namespace`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#namespace(java.lang.String)): (no default)
+
+Configuration of the [Apache Zookeeper](https://zookeeper.apache.org/) nodes is left to the user, but there are a few
+**suggestions to improve the reliability of the quorum service**:
+- broker `session-ms` must be `>= 2 * server tick time` and `<= 20 * server tick time` as per the
+  [Zookeeper 3.6.3 admin guide](https://zookeeper.apache.org/doc/r3.6.3/zookeeperAdmin.html): it directly impacts how fast a backup
+  can fail over to an isolated/killed/unresponsive live; the higher, the slower.
+- GC on the broker machine should keep pauses within 1/3 of `session-ms` in order to let the Zookeeper heartbeat protocol
+  work reliably; if that's not possible, it is better to increase `session-ms`, accepting a slower failover.
+- Zookeeper must have enough resources to keep GC (and OS) pauses much smaller than the server tick time: please consider carefully
+  whether broker and Zookeeper node should share the same physical machine, depending on the expected load of the broker.
+- network isolation protection requires configuring >= 3 Zookeeper nodes.
+
+#### *Important*: Notes on pluggable quorum replication configuration
+
+The first `classic` replication configuration that doesn't apply to pluggable quorum replication
+is `vote-on-replication-failure`; configuring it produces a startup error. Pluggable quorum replication
+always behaves as if `vote-on-replication-failure` were `true`, i.e. it shuts down a live broker (and its JVM) in case of quorum loss.
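The XML shown earlier in this section also has a programmatic counterpart used by the new integration tests in this patch: a `DistributedPrimitiveManagerConfiguration` (class name plus properties) is attached to a `ReplicationPrimaryPolicyConfiguration`/`ReplicationBackupPolicyConfiguration`. A minimal sketch for an embedded broker follows, using the file-based manager that the integration tests rely on; the `/tmp/quorum-locks` folder is a placeholder.

```java
import java.util.Collections;

import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager;

public final class PluggableQuorumPolicySketch {

   // The same manager configuration is shared by both sides of the replicating pair.
   static DistributedPrimitiveManagerConfiguration manager() {
      return new DistributedPrimitiveManagerConfiguration(
         FileBasedPrimitiveManager.class.getName(),
         Collections.singletonMap("locks-folder", "/tmp/quorum-locks"));
   }

   static ReplicationPrimaryPolicyConfiguration primaryPolicy() {
      return ReplicationPrimaryPolicyConfiguration.withDefault()
         .setDistributedManagerConfiguration(manager());
   }

   static ReplicationBackupPolicyConfiguration backupPolicy() {
      return ReplicationBackupPolicyConfiguration.withDefault()
         .setDistributedManagerConfiguration(manager());
   }
}
```

These are the same calls used by `ClusterTestBase` and `PluggableQuorumInfiniteRedeliveryTest` later in this patch; for a ZooKeeper-backed setup the Curator manager class name and the `connect-string`/`session-ms` properties listed above would be used instead.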
+
+The second deprecated `classic` replication configuration is `quorum-vote-wait`: given that pluggable quorum vote replication
+requires the backup to have an always-on, reliable quorum service, there is no need to specify a timeout to reach
+the majority of quorum nodes. A backup remains inactive (i.e. its JVM is still up, the console too, but it is unable to sync with the live or to fail over)
+until the majority of quorum nodes is reachable again, re-activating when that happens.
+
+The only exception is a primary failing back to an existing live backup using `<allow-failback>true</allow-failback>`:
+if the quorum service isn't immediately available, the primary (and its JVM) just stops, allowing a fail-fast fail-back.
+
+There are a few *semantic differences* in other existing properties:
+- `vote-retry-wait`: in `classic` replication it means how long to wait between each quorum vote try, while with pluggable quorum replication
+  it means how long to keep requesting to fail over on each attempt
+- `vote-retries`: unlike `classic`, the number of vote attempts is `1 + vote-retries` (with classic it is just `vote-retries`).
+  Setting `0` means no retries, leaving the backup to still perform an initial attempt.
+
+**Notes on replication configuration with the [Apache Curator](https://curator.apache.org/) quorum provider**
+
+As noted some paragraphs above, `session-ms` affects the failover duration: a backup can
+fail over after `session-ms` expires or, if the live broker voluntarily gives up its role
+(e.g. during a fail-back or a manual broker stop), immediately.
+
+For the former case (session expiration with the live no longer present), the backup broker can detect an unresponsive live by using:
+1. cluster connection PINGs (affected by [connection-ttl](connection-ttl.md) tuning)
+2. closed TCP connection notification (depends on the TCP configuration and the networking stack/topology)
+
+These two cases have different failover durations, depending on different factors:
+1. `connection-ttl` affects how much of the expiring `session-ms` is spent just detecting a missing live broker: the higher `connection-ttl`,
+   the slower it reacts; the backup can attempt to fail over for the remaining `session-ms - connection-ttl`
+2. `session-ms` expiration is detected immediately: the backup must try to fail over for >= `session-ms` to be sure to catch
+   the session expiration and complete the failover
+
+The previous comments are meant to suggest to the careful reader that the minimum time spent attempting to fail over
+cannot be below the full `session-ms` expiration.
+In short, it means
+```
+ total failover attempt time > session-ms
+```
+with
+```
+ total failover attempt time = vote-retry-wait * (vote-retries + 1)
+```
+and by consequence:
+```
+ vote-retry-wait * (vote-retries + 1) > session-ms
+```
+For example, with `session-ms = 18000 ms`, safe values for the failover timeout are:
+```xml
+ <vote-retries>11</vote-retries>
+ <vote-retry-wait>2000</vote-retry-wait>
+```
+because `11 * 2000 = 22000 ms`, which is bigger than `18000 ms`.
+
+This way there is no risk that a backup broker stops attempting to fail over too early, losing its chance to become live.
+
 ### Shared Store
 
 When using a shared store, both live and backup servers share the
 *same*
@@ -406,8 +606,32 @@ stop. This configuration would look like:
 ```
-In replication HA mode you need to set an extra property
-`check-for-live-server` to `true` in the `master` configuration.
If set +The same configuration option can be set for both replications, classic: +```xml + + + + true + + + +``` +and with pluggable quorum provider: +```xml + + + + + + + true + + + +``` + +In both replication HA mode you need to set an extra property +`check-for-live-server` to `true` in the `master`/`primary` configuration. If set to true, during start-up a live server will first search the cluster for another server using its nodeID. If it finds one, it will contact this server and try to "fail-back". Since this is a remote replication @@ -418,7 +642,7 @@ to shutdown for it to take over. This is necessary because otherwise the live server has no means to know whether there was a fail-over or not, and if there was if the server that took its duties is still running or not. To configure this option at your `broker.xml` -configuration file as follows: +configuration file as follows, for classic replication: ```xml @@ -430,6 +654,29 @@ configuration file as follows: ``` +And pluggable quorum replication: + +```xml + + + + + + + true + + + +``` + +The key difference from classic replication is that if `master` cannot reach any +live server with its same nodeID, it's going straight to become live, while `primary` +request it to the quorum provider, searching again for any existing live if +the quorum provider is not available (eg connectivity loss, consensus absence) or +if there's another live broker with the same nodeID alive, in an endless loop. + +In short: a started `primary` cannot become live without consensus. + > **Warning** > > Be aware that if you restart a live server while after failover has diff --git a/pom.xml b/pom.xml index 8d76bbbc22..5e59e4999c 100644 --- a/pom.xml +++ b/pom.xml @@ -64,6 +64,8 @@ artemis-distribution tests artemis-features + artemis-quorum-api + artemis-quorum-ri ActiveMQ Artemis Parent @@ -105,6 +107,9 @@ 3.11.2 2.1.2 4.1.66.Final + 5.1.0 + + 3.6.3 2.0.40.Final @@ -851,6 +856,32 @@ jakarta.security.auth.message-api ${jakarta.security.auth.message-api.version} + + + org.apache.curator + curator-recipes + ${curator.version} + + + org.apache.curator + curator-framework + ${curator.version} + + + org.apache.curator + curator-client + ${curator.version} + + + org.apache.zookeeper + zookeeper + ${zookeeper.version} + + + org.apache.zookeeper + zookeeper-jute + ${zookeeper.version} + diff --git a/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/ScaleDownFailoverTest.java b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/ScaleDownFailoverTest.java index 4ccb6f9310..e4e705bf3c 100644 --- a/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/ScaleDownFailoverTest.java +++ b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/ScaleDownFailoverTest.java @@ -44,9 +44,9 @@ public class ScaleDownFailoverTest extends ClusterTestBase { public void setUp() throws Exception { super.setUp(); stopCount = 0; - setupLiveServer(0, isFileStorage(), false, isNetty(), true); - setupLiveServer(1, isFileStorage(), false, isNetty(), true); - setupLiveServer(2, isFileStorage(), false, isNetty(), true); + setupLiveServer(0, isFileStorage(), HAType.SharedNothingReplication, isNetty(), true); + setupLiveServer(1, isFileStorage(), HAType.SharedNothingReplication, isNetty(), true); + setupLiveServer(2, isFileStorage(), HAType.SharedNothingReplication, isNetty(), true); ScaleDownConfiguration scaleDownConfiguration = new ScaleDownConfiguration(); ScaleDownConfiguration 
scaleDownConfiguration2 = new ScaleDownConfiguration(); scaleDownConfiguration2.setEnabled(false); diff --git a/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/ScaleDownFailureTest.java b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/ScaleDownFailureTest.java index e592b16371..1551c501be 100644 --- a/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/ScaleDownFailureTest.java +++ b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/ScaleDownFailureTest.java @@ -35,8 +35,8 @@ public class ScaleDownFailureTest extends ClusterTestBase { @Before public void setUp() throws Exception { super.setUp(); - setupLiveServer(0, isFileStorage(), false, isNetty(), true); - setupLiveServer(1, isFileStorage(), false, isNetty(), true); + setupLiveServer(0, isFileStorage(), HAType.SharedNothingReplication, isNetty(), true); + setupLiveServer(1, isFileStorage(), HAType.SharedNothingReplication, isNetty(), true); if (isGrouped()) { ScaleDownConfiguration scaleDownConfiguration = new ScaleDownConfiguration(); scaleDownConfiguration.setGroupName("bill"); diff --git a/tests/integration-tests/pom.xml b/tests/integration-tests/pom.xml index 0d9a4f6154..4a74049f69 100644 --- a/tests/integration-tests/pom.xml +++ b/tests/integration-tests/pom.xml @@ -51,6 +51,12 @@ test test-jar + + org.apache.activemq + artemis-quorum-ri + ${project.version} + test + org.apache.activemq.tests unit-tests diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/InfiniteRedeliveryTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/InfiniteRedeliveryTest.java index efbc8efb0e..a69c44ec18 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/InfiniteRedeliveryTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/InfiniteRedeliveryTest.java @@ -93,12 +93,11 @@ public class InfiniteRedeliveryTest extends ActiveMQTestBase { backupConfig = createDefaultConfig(0, true); liveConfig = createDefaultConfig(0, true); - ReplicatedBackupUtils.configureReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, null); + configureReplicationPair(backupConnector, backupAcceptor, liveConnector); backupConfig.setBindingsDirectory(getBindingsDir(0, true)).setJournalDirectory(getJournalDir(0, true)).setPagingDirectory(getPageDir(0, true)).setLargeMessagesDirectory(getLargeMessagesDir(0, true)).setSecurityEnabled(false); - ((ReplicaPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(-1).setAllowFailBack(true); - ((ReplicaPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setRestartBackup(false); + nodeManager = new InVMNodeManager(true, backupConfig.getJournalLocation()); @@ -109,6 +108,14 @@ public class InfiniteRedeliveryTest extends ActiveMQTestBase { liveServer = createTestableServer(liveConfig, nodeManager); } + protected void configureReplicationPair(TransportConfiguration backupConnector, + TransportConfiguration backupAcceptor, + TransportConfiguration liveConnector) { + ReplicatedBackupUtils.configureReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, null); + ((ReplicaPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(-1).setAllowFailBack(true); + 
((ReplicaPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setRestartBackup(false); + } + @Before @Override diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/PluggableQuorumInfiniteRedeliveryTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/PluggableQuorumInfiniteRedeliveryTest.java new file mode 100644 index 0000000000..2fbacac267 --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/PluggableQuorumInfiniteRedeliveryTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.tests.integration.client; + +import java.util.Collections; + +import org.apache.activemq.artemis.api.core.TransportConfiguration; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; +import org.apache.activemq.artemis.tests.util.ReplicatedBackupUtils; +import org.junit.Before; +import org.junit.Rule; +import org.junit.rules.TemporaryFolder; + +public class PluggableQuorumInfiniteRedeliveryTest extends InfiniteRedeliveryTest { + + @Rule + public TemporaryFolder tmpFolder = new TemporaryFolder(); + + private DistributedPrimitiveManagerConfiguration managerConfiguration; + + public PluggableQuorumInfiniteRedeliveryTest(String protocol, boolean useCLI) { + super(protocol, useCLI); + } + + @Before + @Override + public void setUp() throws Exception { + super.setUp(); + this.managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), + Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + } + + @Override + protected void configureReplicationPair(TransportConfiguration backupConnector, + TransportConfiguration backupAcceptor, + TransportConfiguration liveConnector) { + + ReplicatedBackupUtils.configurePluggableQuorumReplicationPair(backupConfig, backupConnector, backupAcceptor, + liveConfig, liveConnector, null, + managerConfiguration, managerConfiguration); + ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()) + .setMaxSavedReplicatedJournalsSize(-1).setAllowFailBack(true); + } +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/ClusterTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/ClusterTestBase.java index a6aeeaa564..b364ad09eb 100644 --- 
a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/ClusterTestBase.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/ClusterTestBase.java @@ -17,6 +17,7 @@ package org.apache.activemq.artemis.tests.integration.cluster.distribution; import java.io.File; +import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; import java.net.URI; @@ -56,9 +57,12 @@ import org.apache.activemq.artemis.core.client.impl.TopologyMemberImpl; import org.apache.activemq.artemis.core.config.ClusterConnectionConfiguration; import org.apache.activemq.artemis.core.config.Configuration; import org.apache.activemq.artemis.core.config.HAPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; import org.apache.activemq.artemis.core.config.ha.LiveOnlyPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicaPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicatedPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.SharedStoreMasterPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.SharedStoreSlavePolicyConfiguration; import org.apache.activemq.artemis.core.postoffice.Binding; @@ -85,6 +89,7 @@ import org.apache.activemq.artemis.core.server.group.GroupingHandler; import org.apache.activemq.artemis.core.server.group.impl.GroupingHandlerConfiguration; import org.apache.activemq.artemis.core.server.impl.AddressInfo; import org.apache.activemq.artemis.core.server.impl.InVMNodeManager; +import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; import org.apache.activemq.artemis.tests.util.ActiveMQTestBase; import org.apache.activemq.artemis.utils.PortCheckRule; import org.jboss.logging.Logger; @@ -92,9 +97,14 @@ import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.rules.TemporaryFolder; public abstract class ClusterTestBase extends ActiveMQTestBase { + @Rule + public TemporaryFolder tmpFolder = new TemporaryFolder(); + private static final Logger log = Logger.getLogger(ClusterTestBase.class); private static final int[] PORTS = {TransportConstants.DEFAULT_PORT, TransportConstants.DEFAULT_PORT + 1, TransportConstants.DEFAULT_PORT + 2, TransportConstants.DEFAULT_PORT + 3, TransportConstants.DEFAULT_PORT + 4, TransportConstants.DEFAULT_PORT + 5, TransportConstants.DEFAULT_PORT + 6, TransportConstants.DEFAULT_PORT + 7, TransportConstants.DEFAULT_PORT + 8, TransportConstants.DEFAULT_PORT + 9,}; @@ -134,6 +144,21 @@ public abstract class ClusterTestBase extends ActiveMQTestBase { return true; } + private DistributedPrimitiveManagerConfiguration pluggableQuorumConfiguration = null; + + private DistributedPrimitiveManagerConfiguration getOrCreatePluggableQuorumConfiguration() { + if (pluggableQuorumConfiguration != null) { + return pluggableQuorumConfiguration; + } + try { + pluggableQuorumConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + } catch (IOException ioException) { + log.error(ioException); + return null; + } + return 
pluggableQuorumConfiguration; + } + @Override @Before public void setUp() throws Exception { @@ -159,11 +184,19 @@ public abstract class ClusterTestBase extends ActiveMQTestBase { } + public enum HAType { + SharedStore, SharedNothingReplication, PluggableQuorumReplication + } + + protected HAType haType() { + return HAType.SharedNothingReplication; + } + /** * Whether the servers share the storage or not. */ - protected boolean isSharedStore() { - return false; + protected final boolean isSharedStore() { + return HAType.SharedStore.equals(haType()); } @Override @@ -1481,14 +1514,14 @@ public abstract class ClusterTestBase extends ActiveMQTestBase { } protected void setupServer(final int node, final boolean fileStorage, final boolean netty) throws Exception { - setupLiveServer(node, fileStorage, false, netty, false); + setupLiveServer(node, fileStorage, HAType.SharedNothingReplication, netty, false); } protected void setupLiveServer(final int node, final boolean fileStorage, final boolean netty, boolean isLive) throws Exception { - setupLiveServer(node, fileStorage, false, netty, isLive); + setupLiveServer(node, fileStorage, HAType.SharedNothingReplication, netty, isLive); } protected boolean isResolveProtocols() { @@ -1497,27 +1530,26 @@ public abstract class ClusterTestBase extends ActiveMQTestBase { protected void setupLiveServer(final int node, final boolean fileStorage, - final boolean sharedStorage, + final HAType haType, final boolean netty, boolean liveOnly) throws Exception { if (servers[node] != null) { throw new IllegalArgumentException("Already a server at node " + node); } - HAPolicyConfiguration haPolicyConfiguration = null; + final HAPolicyConfiguration haPolicyConfiguration; if (liveOnly) { haPolicyConfiguration = new LiveOnlyPolicyConfiguration(); } else { - if (sharedStorage) - haPolicyConfiguration = new SharedStoreMasterPolicyConfiguration(); - else - haPolicyConfiguration = new ReplicatedPolicyConfiguration(); + haPolicyConfiguration = haPolicyLiveConfiguration(haType); } Configuration configuration = createBasicConfig(node).setJournalMaxIO_AIO(1000).setThreadPoolMaxSize(10).clearAcceptorConfigurations().addAcceptorConfiguration(createTransportConfiguration(netty, true, generateParams(node, netty))).setHAPolicyConfiguration(haPolicyConfiguration).setResolveProtocols(isResolveProtocols()); ActiveMQServer server; + final boolean sharedStorage = HAType.SharedStore.equals(haType); + if (fileStorage) { if (sharedStorage) { server = createInVMFailoverServer(true, configuration, nodeManagers[node], node); @@ -1538,6 +1570,20 @@ public abstract class ClusterTestBase extends ActiveMQTestBase { servers[node] = addServer(server); } + private HAPolicyConfiguration haPolicyLiveConfiguration(HAType haType) { + switch (haType) { + case SharedStore: + return new SharedStoreMasterPolicyConfiguration(); + case SharedNothingReplication: + return new ReplicatedPolicyConfiguration(); + case PluggableQuorumReplication: + return ReplicationPrimaryPolicyConfiguration.withDefault() + .setDistributedManagerConfiguration(getOrCreatePluggableQuorumConfiguration()); + default: + throw new AssertionError("Unsupported haType = " + haType); + } + } + /** * Server lacks a {@link ClusterConnectionConfiguration} necessary for the remote (replicating) * backup case. 
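With the boolean `sharedStorage` flag replaced by the `HAType` enum, existing test hierarchies can opt into the pluggable-quorum pair by overriding `haType()` alone, as `GroupingFailoverReplicationTest` does further below for classic replication. A hypothetical subclass switching to the new mode could look like the following sketch (the class name is illustrative and not part of this patch):

```java
// Hypothetical test subclass living alongside the other failover tests: selecting
// PluggableQuorumReplication makes ClusterTestBase build the
// ReplicationPrimaryPolicyConfiguration/ReplicationBackupPolicyConfiguration pair
// backed by the shared FileBasedPrimitiveManager configuration created in tmpFolder.
public class PluggableQuorumGroupingFailoverTest extends GroupingFailoverTestBase {

   @Override
   protected HAType haType() {
      return HAType.PluggableQuorumReplication;
   }
}
```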
@@ -1549,14 +1595,14 @@ public abstract class ClusterTestBase extends ActiveMQTestBase { * @param node * @param liveNode * @param fileStorage - * @param sharedStorage + * @param haType * @param netty * @throws Exception */ protected void setupBackupServer(final int node, final int liveNode, final boolean fileStorage, - final boolean sharedStorage, + final HAType haType, final boolean netty) throws Exception { if (servers[node] != null) { throw new IllegalArgumentException("Already a server at node " + node); @@ -1566,7 +1612,9 @@ public abstract class ClusterTestBase extends ActiveMQTestBase { TransportConfiguration backupConfig = createTransportConfiguration(netty, false, generateParams(node, netty)); TransportConfiguration acceptorConfig = createTransportConfiguration(netty, true, generateParams(node, netty)); - Configuration configuration = createBasicConfig(sharedStorage ? liveNode : node).clearAcceptorConfigurations().addAcceptorConfiguration(acceptorConfig).addConnectorConfiguration(liveConfig.getName(), liveConfig).addConnectorConfiguration(backupConfig.getName(), backupConfig).setHAPolicyConfiguration(sharedStorage ? new SharedStoreSlavePolicyConfiguration() : new ReplicaPolicyConfiguration()); + final boolean sharedStorage = HAType.SharedStore.equals(haType); + + Configuration configuration = createBasicConfig(sharedStorage ? liveNode : node).clearAcceptorConfigurations().addAcceptorConfiguration(acceptorConfig).addConnectorConfiguration(liveConfig.getName(), liveConfig).addConnectorConfiguration(backupConfig.getName(), backupConfig).setHAPolicyConfiguration(haPolicyBackupConfiguration(haType)); ActiveMQServer server; @@ -1580,6 +1628,21 @@ public abstract class ClusterTestBase extends ActiveMQTestBase { servers[node] = addServer(server); } + private HAPolicyConfiguration haPolicyBackupConfiguration(HAType haType) { + switch (haType) { + + case SharedStore: + return new SharedStoreSlavePolicyConfiguration(); + case SharedNothingReplication: + return new ReplicaPolicyConfiguration(); + case PluggableQuorumReplication: + return ReplicationBackupPolicyConfiguration.withDefault() + .setDistributedManagerConfiguration(getOrCreatePluggableQuorumConfiguration()); + default: + throw new AssertionError("Unsupported ha type = " + haType); + } + } + protected void setupLiveServerWithDiscovery(final int node, final String groupAddress, final int port, diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/ClusterWithBackupTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/ClusterWithBackupTest.java index ed0c637579..5b0df53b77 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/ClusterWithBackupTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/ClusterWithBackupTest.java @@ -87,14 +87,14 @@ public class ClusterWithBackupTest extends ClusterTestBase { protected void setupServers() throws Exception { // The backups - setupBackupServer(0, 3, isFileStorage(), true, isNetty()); - setupBackupServer(1, 4, isFileStorage(), true, isNetty()); - setupBackupServer(2, 5, isFileStorage(), true, isNetty()); + setupBackupServer(0, 3, isFileStorage(), HAType.SharedStore, isNetty()); + setupBackupServer(1, 4, isFileStorage(), HAType.SharedStore, isNetty()); + setupBackupServer(2, 5, isFileStorage(), HAType.SharedStore, isNetty()); // The lives - 
setupLiveServer(3, isFileStorage(), true, isNetty(), false); - setupLiveServer(4, isFileStorage(), true, isNetty(), false); - setupLiveServer(5, isFileStorage(), true, isNetty(), false); + setupLiveServer(3, isFileStorage(), HAType.SharedStore, isNetty(), false); + setupLiveServer(4, isFileStorage(), HAType.SharedStore, isNetty(), false); + setupLiveServer(5, isFileStorage(), HAType.SharedStore, isNetty(), false); } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/SimpleSymmetricClusterTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/SimpleSymmetricClusterTest.java index c9f2c6f73d..4884ed1696 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/SimpleSymmetricClusterTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/SimpleSymmetricClusterTest.java @@ -46,14 +46,14 @@ public class SimpleSymmetricClusterTest extends ClusterTestBase { @Test public void testSimpleWithBackup() throws Exception { // The backups - setupBackupServer(0, 3, isFileStorage(), true, isNetty()); - setupBackupServer(1, 4, isFileStorage(), true, isNetty()); - setupBackupServer(2, 5, isFileStorage(), true, isNetty()); + setupBackupServer(0, 3, isFileStorage(), HAType.SharedStore, isNetty()); + setupBackupServer(1, 4, isFileStorage(), HAType.SharedStore, isNetty()); + setupBackupServer(2, 5, isFileStorage(), HAType.SharedStore, isNetty()); // The lives - setupLiveServer(3, isFileStorage(), true, isNetty(), false); - setupLiveServer(4, isFileStorage(), true, isNetty(), false); - setupLiveServer(5, isFileStorage(), true, isNetty(), false); + setupLiveServer(3, isFileStorage(), HAType.SharedStore, isNetty(), false); + setupLiveServer(4, isFileStorage(), HAType.SharedStore, isNetty(), false); + setupLiveServer(5, isFileStorage(), HAType.SharedStore, isNetty(), false); setupClusterConnection("cluster0", "queues", MessageLoadBalancingType.ON_DEMAND, 1, isNetty(), 3, 4, 5); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/SymmetricClusterWithBackupTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/SymmetricClusterWithBackupTest.java index 4fcadcdc32..9ac836bd1a 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/SymmetricClusterWithBackupTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/SymmetricClusterWithBackupTest.java @@ -453,18 +453,18 @@ public class SymmetricClusterWithBackupTest extends SymmetricClusterTest { @Override protected void setupServers() throws Exception { // The backups - setupBackupServer(5, 0, isFileStorage(), true, isNetty()); - setupBackupServer(6, 1, isFileStorage(), true, isNetty()); - setupBackupServer(7, 2, isFileStorage(), true, isNetty()); - setupBackupServer(8, 3, isFileStorage(), true, isNetty()); - setupBackupServer(9, 4, isFileStorage(), true, isNetty()); + setupBackupServer(5, 0, isFileStorage(), HAType.SharedStore, isNetty()); + setupBackupServer(6, 1, isFileStorage(), HAType.SharedStore, isNetty()); + setupBackupServer(7, 2, isFileStorage(), HAType.SharedStore, isNetty()); + setupBackupServer(8, 3, isFileStorage(), HAType.SharedStore, isNetty()); + 
setupBackupServer(9, 4, isFileStorage(), HAType.SharedStore, isNetty()); // The lives - setupLiveServer(0, isFileStorage(), true, isNetty(), false); - setupLiveServer(1, isFileStorage(), true, isNetty(), false); - setupLiveServer(2, isFileStorage(), true, isNetty(), false); - setupLiveServer(3, isFileStorage(), true, isNetty(), false); - setupLiveServer(4, isFileStorage(), true, isNetty(), false); + setupLiveServer(0, isFileStorage(), HAType.SharedStore, isNetty(), false); + setupLiveServer(1, isFileStorage(), HAType.SharedStore, isNetty(), false); + setupLiveServer(2, isFileStorage(), HAType.SharedStore, isNetty(), false); + setupLiveServer(3, isFileStorage(), HAType.SharedStore, isNetty(), false); + setupLiveServer(4, isFileStorage(), HAType.SharedStore, isNetty(), false); } @Override diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTest.java index b79c418329..a13ef3dbbb 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTest.java @@ -60,6 +60,8 @@ import org.apache.activemq.artemis.core.server.cluster.ha.BackupPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.HAPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.ReplicaPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.ReplicatedPolicy; +import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationBackupPolicy; +import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationPrimaryPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.SharedStoreMasterPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.SharedStoreSlavePolicy; import org.apache.activemq.artemis.core.server.files.FileMoveManager; @@ -786,7 +788,7 @@ public class FailoverTest extends FailoverTestBase { ((ReplicaPolicy) haPolicy).setMaxSavedReplicatedJournalsSize(1); } - simpleFailover(haPolicy instanceof ReplicaPolicy, doFailBack); + simpleFailover(haPolicy instanceof ReplicaPolicy || haPolicy instanceof ReplicationBackupPolicy, doFailBack); } @Test(timeout = 120000) @@ -816,7 +818,9 @@ public class FailoverTest extends FailoverTestBase { Thread.sleep(100); Assert.assertFalse("backup is not running", backupServer.isStarted()); - Assert.assertFalse("must NOT be a backup", liveServer.getServer().getHAPolicy() instanceof BackupPolicy); + final boolean isBackup = liveServer.getServer().getHAPolicy() instanceof BackupPolicy || + liveServer.getServer().getHAPolicy() instanceof ReplicationBackupPolicy; + Assert.assertFalse("must NOT be a backup", isBackup); adaptLiveConfigForReplicatedFailBack(liveServer); beforeRestart(liveServer); liveServer.start(); @@ -827,7 +831,8 @@ public class FailoverTest extends FailoverTestBase { ClientSession session2 = createSession(sf, false, false); session2.start(); ClientConsumer consumer2 = session2.createConsumer(FailoverTestBase.ADDRESS); - boolean replication = liveServer.getServer().getHAPolicy() instanceof ReplicatedPolicy; + final boolean replication = liveServer.getServer().getHAPolicy() instanceof ReplicatedPolicy || + liveServer.getServer().getHAPolicy() instanceof ReplicationPrimaryPolicy; if (replication) receiveMessages(consumer2, 0, NUM_MESSAGES, true); 
assertNoMoreMessages(consumer2); @@ -838,7 +843,7 @@ public class FailoverTest extends FailoverTestBase { public void testSimpleFailover() throws Exception { HAPolicy haPolicy = backupServer.getServer().getHAPolicy(); - simpleFailover(haPolicy instanceof ReplicaPolicy, false); + simpleFailover(haPolicy instanceof ReplicaPolicy || haPolicy instanceof ReplicationBackupPolicy, false); } @Test(timeout = 120000) diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java index a3e3dfc0b0..1bcd820a1b 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java @@ -19,6 +19,7 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover; import java.io.IOException; import java.net.ServerSocket; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; @@ -36,15 +37,19 @@ import org.apache.activemq.artemis.core.client.impl.ClientSessionFactoryInternal import org.apache.activemq.artemis.core.client.impl.ServerLocatorInternal; import org.apache.activemq.artemis.core.config.ClusterConnectionConfiguration; import org.apache.activemq.artemis.core.config.Configuration; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicaPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.SharedStoreMasterPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.SharedStoreSlavePolicyConfiguration; import org.apache.activemq.artemis.core.remoting.impl.invm.InVMConnector; import org.apache.activemq.artemis.core.remoting.impl.invm.InVMRegistry; import org.apache.activemq.artemis.core.server.NodeManager; +import org.apache.activemq.artemis.core.server.cluster.ha.HAPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.ReplicatedPolicy; +import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationPrimaryPolicy; import org.apache.activemq.artemis.core.server.impl.ActiveMQServerImpl; import org.apache.activemq.artemis.core.server.impl.InVMNodeManager; +import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; import org.apache.activemq.artemis.tests.integration.cluster.util.SameProcessActiveMQServer; import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer; import org.apache.activemq.artemis.tests.util.ActiveMQTestBase; @@ -52,9 +57,13 @@ import org.apache.activemq.artemis.tests.util.ReplicatedBackupUtils; import org.junit.After; import org.junit.Assert; import org.junit.Before; +import org.junit.Rule; +import org.junit.rules.TemporaryFolder; public abstract class FailoverTestBase extends ActiveMQTestBase { // Constants ----------------------------------------------------- + @Rule + public TemporaryFolder tmpFolder = new TemporaryFolder(); protected static final SimpleString ADDRESS = new SimpleString("FailoverTestAddress"); @@ -216,7 +225,34 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { } } + protected void createPluggableReplicatedConfigs() throws Exception { + final TransportConfiguration liveConnector 
= getConnectorTransportConfiguration(true); + final TransportConfiguration backupConnector = getConnectorTransportConfiguration(false); + final TransportConfiguration backupAcceptor = getAcceptorTransportConfiguration(false); + + backupConfig = createDefaultInVMConfig(); + liveConfig = createDefaultInVMConfig(); + + DistributedPrimitiveManagerConfiguration managerConfiguration = + new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), + Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + + ReplicatedBackupUtils.configurePluggableQuorumReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, null, managerConfiguration, managerConfiguration); + + backupConfig.setBindingsDirectory(getBindingsDir(0, true)).setJournalDirectory(getJournalDir(0, true)).setPagingDirectory(getPageDir(0, true)).setLargeMessagesDirectory(getLargeMessagesDir(0, true)).setSecurityEnabled(false); + + setupHAPolicyConfiguration(); + nodeManager = createReplicatedBackupNodeManager(backupConfig); + + backupServer = createTestableServer(backupConfig); + + liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true)); + + liveServer = createTestableServer(liveConfig); + } + protected void setupHAPolicyConfiguration() { + Assert.assertTrue(backupConfig.getHAPolicyConfiguration() instanceof ReplicaPolicyConfiguration); ((ReplicaPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(-1).setAllowFailBack(true); ((ReplicaPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setRestartBackup(false); } @@ -233,8 +269,13 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { configuration.getConnectorConfigurations().put(backupConnector.getName(), backupConnector); return; } - ReplicatedPolicy haPolicy = (ReplicatedPolicy) server.getServer().getHAPolicy(); - haPolicy.setCheckForLiveServer(true); + HAPolicy policy = server.getServer().getHAPolicy(); + if (policy instanceof ReplicatedPolicy) { + ((ReplicatedPolicy) policy).setCheckForLiveServer(true); + } else if (policy instanceof ReplicationPrimaryPolicy) { + Assert.assertTrue("Adapting won't work for the current configuration", ((ReplicationPrimaryPolicy) policy).isCheckForLiveServer()); + } + } @Override diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/GroupingFailoverReplicationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/GroupingFailoverReplicationTest.java index 1b1388970d..cc187a2d43 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/GroupingFailoverReplicationTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/GroupingFailoverReplicationTest.java @@ -19,8 +19,8 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover; public class GroupingFailoverReplicationTest extends GroupingFailoverTestBase { @Override - protected boolean isSharedStore() { - return false; + protected HAType haType() { + return HAType.SharedNothingReplication; } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/GroupingFailoverSharedServerTest.java 
b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/GroupingFailoverSharedServerTest.java index a2849447b6..9d576683b3 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/GroupingFailoverSharedServerTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/GroupingFailoverSharedServerTest.java @@ -19,7 +19,7 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover; public class GroupingFailoverSharedServerTest extends GroupingFailoverTestBase { @Override - protected boolean isSharedStore() { - return true; + protected HAType haType() { + return HAType.SharedStore; } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/GroupingFailoverTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/GroupingFailoverTestBase.java index de8c02c3e5..2face9db41 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/GroupingFailoverTestBase.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/GroupingFailoverTestBase.java @@ -26,22 +26,26 @@ import org.apache.activemq.artemis.api.core.client.ServerLocator; import org.apache.activemq.artemis.core.client.impl.TopologyMemberImpl; import org.apache.activemq.artemis.core.config.ha.ReplicaPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicatedPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.core.server.cluster.impl.MessageLoadBalancingType; import org.apache.activemq.artemis.core.server.group.impl.GroupingHandlerConfiguration; +import org.apache.activemq.artemis.core.server.impl.ReplicationBackupActivation; import org.apache.activemq.artemis.core.server.impl.SharedNothingBackupActivation; import org.apache.activemq.artemis.tests.integration.cluster.distribution.ClusterTestBase; import org.apache.activemq.artemis.tests.util.ActiveMQTestBase; +import org.apache.activemq.artemis.utils.Wait; import org.junit.Test; public abstract class GroupingFailoverTestBase extends ClusterTestBase { @Test public void testGroupingLocalHandlerFails() throws Exception { - setupBackupServer(2, 0, isFileStorage(), isSharedStore(), isNetty()); + setupBackupServer(2, 0, isFileStorage(), haType(), isNetty()); - setupLiveServer(0, isFileStorage(), isSharedStore(), isNetty(), false); + setupLiveServer(0, isFileStorage(), haType(), isNetty(), false); - setupLiveServer(1, isFileStorage(), isSharedStore(), isNetty(), false); + setupLiveServer(1, isFileStorage(), haType(), isNetty(), false); setupClusterConnection("cluster0", "queues", MessageLoadBalancingType.ON_DEMAND, 1, isNetty(), 0, 1); @@ -54,10 +58,18 @@ public abstract class GroupingFailoverTestBase extends ClusterTestBase { setUpGroupHandler(GroupingHandlerConfiguration.TYPE.REMOTE, 1); setUpGroupHandler(GroupingHandlerConfiguration.TYPE.LOCAL, 2); - if (!isSharedStore()) { - ((ReplicatedPolicyConfiguration) servers[0].getConfiguration().getHAPolicyConfiguration()).setGroupName("group1"); - ((ReplicatedPolicyConfiguration) servers[1].getConfiguration().getHAPolicyConfiguration()).setGroupName("group2"); 
- ((ReplicaPolicyConfiguration) servers[2].getConfiguration().getHAPolicyConfiguration()).setGroupName("group1"); + switch (haType()) { + + case SharedNothingReplication: + ((ReplicatedPolicyConfiguration) servers[0].getConfiguration().getHAPolicyConfiguration()).setGroupName("group1"); + ((ReplicatedPolicyConfiguration) servers[1].getConfiguration().getHAPolicyConfiguration()).setGroupName("group2"); + ((ReplicaPolicyConfiguration) servers[2].getConfiguration().getHAPolicyConfiguration()).setGroupName("group1"); + break; + case PluggableQuorumReplication: + ((ReplicationPrimaryPolicyConfiguration) servers[0].getConfiguration().getHAPolicyConfiguration()).setGroupName("group1"); + ((ReplicationPrimaryPolicyConfiguration) servers[1].getConfiguration().getHAPolicyConfiguration()).setGroupName("group2"); + ((ReplicationBackupPolicyConfiguration) servers[2].getConfiguration().getHAPolicyConfiguration()).setGroupName("group1"); + break; } startServers(0, 1, 2); @@ -129,11 +141,11 @@ public abstract class GroupingFailoverTestBase extends ClusterTestBase { @Test public void testGroupingLocalHandlerFailsMultipleGroups() throws Exception { - setupBackupServer(2, 0, isFileStorage(), isSharedStore(), isNetty()); + setupBackupServer(2, 0, isFileStorage(), haType(), isNetty()); - setupLiveServer(0, isFileStorage(), isSharedStore(), isNetty(), false); + setupLiveServer(0, isFileStorage(), haType(), isNetty(), false); - setupLiveServer(1, isFileStorage(), isSharedStore(), isNetty(), false); + setupLiveServer(1, isFileStorage(), haType(), isNetty(), false); setupClusterConnection("cluster0", "queues", MessageLoadBalancingType.ON_DEMAND, 1, isNetty(), 0, 1); @@ -147,10 +159,18 @@ public abstract class GroupingFailoverTestBase extends ClusterTestBase { setUpGroupHandler(GroupingHandlerConfiguration.TYPE.LOCAL, 2); - if (!isSharedStore()) { - ((ReplicatedPolicyConfiguration) servers[0].getConfiguration().getHAPolicyConfiguration()).setGroupName("group1"); - ((ReplicatedPolicyConfiguration) servers[1].getConfiguration().getHAPolicyConfiguration()).setGroupName("group2"); - ((ReplicaPolicyConfiguration) servers[2].getConfiguration().getHAPolicyConfiguration()).setGroupName("group1"); + switch (haType()) { + + case SharedNothingReplication: + ((ReplicatedPolicyConfiguration) servers[0].getConfiguration().getHAPolicyConfiguration()).setGroupName("group1"); + ((ReplicatedPolicyConfiguration) servers[1].getConfiguration().getHAPolicyConfiguration()).setGroupName("group2"); + ((ReplicaPolicyConfiguration) servers[2].getConfiguration().getHAPolicyConfiguration()).setGroupName("group1"); + break; + case PluggableQuorumReplication: + ((ReplicationPrimaryPolicyConfiguration) servers[0].getConfiguration().getHAPolicyConfiguration()).setGroupName("group1"); + ((ReplicationPrimaryPolicyConfiguration) servers[1].getConfiguration().getHAPolicyConfiguration()).setGroupName("group2"); + ((ReplicationBackupPolicyConfiguration) servers[2].getConfiguration().getHAPolicyConfiguration()).setGroupName("group1"); + break; } startServers(0, 1, 2); @@ -187,9 +207,17 @@ public abstract class GroupingFailoverTestBase extends ClusterTestBase { verifyReceiveAllWithGroupIDRoundRobin(0, 30, 0, 1); - if (!isSharedStore()) { - SharedNothingBackupActivation backupActivation = (SharedNothingBackupActivation) servers[2].getActivation(); - assertTrue(backupActivation.waitForBackupSync(10, TimeUnit.SECONDS)); + switch (haType()) { + case SharedNothingReplication: { + SharedNothingBackupActivation backupActivation = (SharedNothingBackupActivation) 
servers[2].getActivation(); + assertTrue(backupActivation.waitForBackupSync(10, TimeUnit.SECONDS)); + } + break; + case PluggableQuorumReplication: { + ReplicationBackupActivation backupActivation = (ReplicationBackupActivation) servers[2].getActivation(); + Wait.assertTrue(backupActivation::isReplicaSync, TimeUnit.SECONDS.toMillis(10)); + } + break; } closeSessionFactory(0); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/LiveVoteOnBackupFailureClusterTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/LiveVoteOnBackupFailureClusterTest.java index 71281e6a40..e060e076a2 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/LiveVoteOnBackupFailureClusterTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/LiveVoteOnBackupFailureClusterTest.java @@ -49,14 +49,14 @@ public class LiveVoteOnBackupFailureClusterTest extends ClusterWithBackupFailove @Override protected void setupServers() throws Exception { // The backups - setupBackupServer(3, 0, isFileStorage(), isSharedStorage(), isNetty()); - setupBackupServer(4, 1, isFileStorage(), isSharedStorage(), isNetty()); - setupBackupServer(5, 2, isFileStorage(), isSharedStorage(), isNetty()); + setupBackupServer(3, 0, isFileStorage(), haType(), isNetty()); + setupBackupServer(4, 1, isFileStorage(), haType(), isNetty()); + setupBackupServer(5, 2, isFileStorage(), haType(), isNetty()); // The lives - setupLiveServer(0, isFileStorage(), isSharedStorage(), isNetty(), false); - setupLiveServer(1, isFileStorage(), isSharedStorage(), isNetty(), false); - setupLiveServer(2, isFileStorage(), isSharedStorage(), isNetty(), false); + setupLiveServer(0, isFileStorage(), haType(), isNetty(), false); + setupLiveServer(1, isFileStorage(), haType(), isNetty(), false); + setupLiveServer(2, isFileStorage(), haType(), isNetty(), false); //we need to know who is connected to who ((ReplicatedPolicyConfiguration) servers[0].getConfiguration().getHAPolicyConfiguration()).setGroupName("group0"); @@ -71,9 +71,9 @@ public class LiveVoteOnBackupFailureClusterTest extends ClusterWithBackupFailove ((ReplicatedPolicyConfiguration) servers[1].getConfiguration().getHAPolicyConfiguration()).setVoteOnReplicationFailure(true); ((ReplicatedPolicyConfiguration) servers[2].getConfiguration().getHAPolicyConfiguration()).setVoteOnReplicationFailure(true); } - - protected boolean isSharedStorage() { - return false; + @Override + protected HAType haType() { + return HAType.SharedNothingReplication; } @Test diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java index c13e2a7349..a3f19cf412 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java @@ -16,7 +16,9 @@ */ package org.apache.activemq.artemis.tests.integration.cluster.failover; +import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import 
org.apache.activemq.artemis.api.core.SimpleString; @@ -27,22 +29,47 @@ import org.apache.activemq.artemis.api.core.client.ClientSessionFactory; import org.apache.activemq.artemis.core.client.impl.ServerLocatorInternal; import org.apache.activemq.artemis.core.config.Configuration; import org.apache.activemq.artemis.core.config.HAPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicaPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicatedPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.SharedStoreMasterPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.SharedStoreSlavePolicyConfiguration; import org.apache.activemq.artemis.core.server.ActiveMQServer; import org.apache.activemq.artemis.core.server.ActiveMQServerLogger; import org.apache.activemq.artemis.core.server.NodeManager; import org.apache.activemq.artemis.core.server.Queue; +import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; +import org.apache.activemq.artemis.tests.integration.cluster.distribution.ClusterTestBase; import org.apache.activemq.artemis.tests.util.Wait; import org.apache.activemq.artemis.tests.integration.cluster.util.SameProcessActiveMQServer; import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer; import org.apache.activemq.artemis.tests.util.ActiveMQTestBase; import org.apache.activemq.artemis.tests.util.TransportConfigurationUtils; import org.junit.Before; +import org.junit.Rule; +import org.junit.rules.TemporaryFolder; public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase { + + @Rule + public TemporaryFolder tmpFolder = new TemporaryFolder(); + + private DistributedPrimitiveManagerConfiguration pluggableQuorumConfiguration = null; + + private DistributedPrimitiveManagerConfiguration getOrCreatePluggableQuorumConfiguration() { + if (pluggableQuorumConfiguration != null) { + return pluggableQuorumConfiguration; + } + try { + pluggableQuorumConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + } catch (IOException ioException) { + return null; + } + return pluggableQuorumConfiguration; + } + // Constants ----------------------------------------------------- // TODO: find a better solution for this @@ -67,7 +94,15 @@ public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase { public abstract boolean isNetty(); - public abstract boolean isSharedStore(); + public enum HAType { + SharedStore, SharedNothingReplication, PluggableQuorumReplication + } + + public abstract HAType haType(); + + protected final boolean isSharedStore() { + return ClusterTestBase.HAType.SharedStore.equals(haType()); + } public abstract String getNodeGroupName(); @@ -82,14 +117,22 @@ public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase { for (int i = 0; i < getLiveServerCount(); i++) { HAPolicyConfiguration haPolicyConfiguration = null; + switch (haType()) { - if (isSharedStore()) { - haPolicyConfiguration = new SharedStoreMasterPolicyConfiguration(); - } else { - haPolicyConfiguration = new ReplicatedPolicyConfiguration(); - if 
(getNodeGroupName() != null) { - ((ReplicatedPolicyConfiguration) haPolicyConfiguration).setGroupName(getNodeGroupName() + "-" + i); - } + case SharedStore: + haPolicyConfiguration = new SharedStoreMasterPolicyConfiguration(); + break; + case SharedNothingReplication: + haPolicyConfiguration = new ReplicatedPolicyConfiguration(); + if (getNodeGroupName() != null) { + ((ReplicatedPolicyConfiguration) haPolicyConfiguration).setGroupName(getNodeGroupName() + "-" + i); + } + break; + case PluggableQuorumReplication: + haPolicyConfiguration = ReplicationPrimaryPolicyConfiguration.withDefault() + .setDistributedManagerConfiguration(getOrCreatePluggableQuorumConfiguration()) + .setGroupName(getNodeGroupName() != null ? (getNodeGroupName() + "-" + i) : null); + break; } Configuration configuration = createDefaultConfig(isNetty()).clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true, i)).setHAPolicyConfiguration(haPolicyConfiguration); @@ -126,13 +169,24 @@ public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase { for (int i = 0; i < getBackupServerCount(); i++) { HAPolicyConfiguration haPolicyConfiguration = null; - if (isSharedStore()) { - haPolicyConfiguration = new SharedStoreSlavePolicyConfiguration(); - } else { - haPolicyConfiguration = new ReplicaPolicyConfiguration(); - if (getNodeGroupName() != null) { - ((ReplicaPolicyConfiguration) haPolicyConfiguration).setGroupName(getNodeGroupName() + "-" + i); - } + switch (haType()) { + + case SharedStore: + haPolicyConfiguration = new SharedStoreSlavePolicyConfiguration(); + break; + case SharedNothingReplication: + haPolicyConfiguration = new ReplicaPolicyConfiguration(); + if (getNodeGroupName() != null) { + ((ReplicaPolicyConfiguration) haPolicyConfiguration).setGroupName(getNodeGroupName() + "-" + i); + } + break; + case PluggableQuorumReplication: + haPolicyConfiguration = ReplicationBackupPolicyConfiguration.withDefault() + .setVoteRetries(1) + .setVoteRetryWait(1000) + .setDistributedManagerConfiguration(getOrCreatePluggableQuorumConfiguration()) + .setGroupName(getNodeGroupName() != null ? 
(getNodeGroupName() + "-" + i) : null); + break; } Configuration configuration = createDefaultConfig(isNetty()).clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(false, i)).setHAPolicyConfiguration(haPolicyConfiguration); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/NettyReplicationStopTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/NettyReplicationStopTest.java index 9f4eb9971b..c8afba5b4f 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/NettyReplicationStopTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/NettyReplicationStopTest.java @@ -103,7 +103,7 @@ public class NettyReplicationStopTest extends FailoverTestBase { final int numMessages = 10; - ReplicationEndpoint endpoint = backupServer.getServer().getReplicationEndpoint(); + ReplicationEndpoint endpoint = getReplicationEndpoint(backupServer.getServer()); endpoint.pause(); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/NetworkIsolationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/NetworkIsolationTest.java index aeea179e88..3f8aabf840 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/NetworkIsolationTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/NetworkIsolationTest.java @@ -124,14 +124,14 @@ public class NetworkIsolationTest extends FailoverTestBase { liveServer.start(); - for (int i = 0; i < 1000 && backupServer.getServer().getReplicationEndpoint() != null && !backupServer.getServer().getReplicationEndpoint().isStarted(); i++) { + for (int i = 0; i < 1000 && getReplicationEndpoint(backupServer.getServer()) != null && !getReplicationEndpoint(backupServer.getServer()).isStarted(); i++) { Thread.sleep(10); } backupServer.getServer().getNetworkHealthCheck().clearAddresses(); // This will make sure the backup got synchronized after the network was activated again - Wait.assertTrue(() -> backupServer.getServer().getReplicationEndpoint().isStarted()); + Assert.assertTrue(getReplicationEndpoint(backupServer.getServer()).isStarted()); } finally { AssertionLoggerHandler.stopCapture(); } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicaTimeoutTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicaTimeoutTest.java index 91bbd20cfd..d4a9c3d1e1 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicaTimeoutTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicaTimeoutTest.java @@ -17,12 +17,10 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover; +import java.io.IOException; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; -import org.apache.activemq.artemis.api.core.ActiveMQException; -import org.apache.activemq.artemis.api.core.Interceptor; import org.apache.activemq.artemis.api.core.QueueConfiguration; import 
org.apache.activemq.artemis.api.core.SimpleString; import org.apache.activemq.artemis.api.core.TransportConfiguration; @@ -34,16 +32,18 @@ import org.apache.activemq.artemis.core.client.impl.ClientSessionFactoryInternal import org.apache.activemq.artemis.core.config.Configuration; import org.apache.activemq.artemis.core.config.ha.ReplicaPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicatedPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.SharedStoreSlavePolicyConfiguration; -import org.apache.activemq.artemis.core.protocol.core.Packet; import org.apache.activemq.artemis.core.protocol.core.impl.PacketImpl; +import org.apache.activemq.artemis.core.replication.ReplicationEndpoint; import org.apache.activemq.artemis.core.server.NodeManager; +import org.apache.activemq.artemis.core.server.impl.Activation; import org.apache.activemq.artemis.core.server.impl.ActiveMQServerImpl; import org.apache.activemq.artemis.core.server.impl.InVMNodeManager; +import org.apache.activemq.artemis.core.server.impl.ReplicationBackupActivation; import org.apache.activemq.artemis.core.server.impl.SharedNothingBackupActivation; import org.apache.activemq.artemis.tests.util.Wait; import org.apache.activemq.artemis.logs.AssertionLoggerHandler; -import org.apache.activemq.artemis.spi.core.protocol.RemotingConnection; import org.apache.activemq.artemis.tests.integration.cluster.util.SameProcessActiveMQServer; import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer; import org.apache.activemq.artemis.tests.util.ActiveMQTestBase; @@ -77,7 +77,9 @@ public class ReplicaTimeoutTest extends ActiveMQTestBase { } protected TestableServer createTestableServer(Configuration config, NodeManager nodeManager) throws Exception { - boolean isBackup = config.getHAPolicyConfiguration() instanceof ReplicaPolicyConfiguration || config.getHAPolicyConfiguration() instanceof SharedStoreSlavePolicyConfiguration; + boolean isBackup = config.getHAPolicyConfiguration() instanceof ReplicationBackupPolicyConfiguration || + config.getHAPolicyConfiguration() instanceof ReplicaPolicyConfiguration || + config.getHAPolicyConfiguration() instanceof SharedStoreSlavePolicyConfiguration; return new SameProcessActiveMQServer(createInVMFailoverServer(true, config, nodeManager, isBackup ? 
2 : 1)); } @@ -119,6 +121,19 @@ public class ReplicaTimeoutTest extends ActiveMQTestBase { liveServer.crash(true, true, sessions); } + protected void configureReplicationPair(Configuration backupConfig, + Configuration liveConfig, + TransportConfiguration backupConnector, + TransportConfiguration backupAcceptor, + TransportConfiguration liveConnector) throws IOException { + ReplicatedBackupUtils.configureReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, null); + ((ReplicatedPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setInitialReplicationSyncTimeout(1000); + ((ReplicaPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setInitialReplicationSyncTimeout(1000); + ((ReplicatedPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); + ((ReplicaPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); + ((ReplicaPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setRestartBackup(false); + } + @Test//(timeout = 120000) public void testFailbackTimeout() throws Exception { AssertionLoggerHandler.startCapture(); @@ -134,19 +149,13 @@ public class ReplicaTimeoutTest extends ActiveMQTestBase { Configuration backupConfig = createDefaultInVMConfig(); Configuration liveConfig = createDefaultInVMConfig(); - ReplicatedBackupUtils.configureReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, null); - ((ReplicatedPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setInitialReplicationSyncTimeout(1000); - ((ReplicaPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setInitialReplicationSyncTimeout(1000); + configureReplicationPair(backupConfig, liveConfig, backupConnector, backupAcceptor, liveConnector); backupConfig.setBindingsDirectory(getBindingsDir(0, true)).setJournalDirectory(getJournalDir(0, true)). setPagingDirectory(getPageDir(0, true)).setLargeMessagesDirectory(getLargeMessagesDir(0, true)).setSecurityEnabled(false); liveConfig.setBindingsDirectory(getBindingsDir(0, false)).setJournalDirectory(getJournalDir(0, false)). 
setPagingDirectory(getPageDir(0, false)).setLargeMessagesDirectory(getLargeMessagesDir(0, false)).setSecurityEnabled(false); - ((ReplicatedPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); - ((ReplicaPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); - ((ReplicaPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setRestartBackup(false); - NodeManager nodeManager = createReplicatedBackupNodeManager(backupConfig); backupServer = createTestableServer(backupConfig, nodeManager); @@ -155,8 +164,6 @@ public class ReplicaTimeoutTest extends ActiveMQTestBase { liveServer = createTestableServer(liveConfig, nodeManager); - AtomicBoolean ignoreIntercept = new AtomicBoolean(false); - final TestableServer theBackup = backupServer; liveServer.start(); @@ -174,23 +181,30 @@ public class ReplicaTimeoutTest extends ActiveMQTestBase { Wait.assertTrue(backupServer.getServer()::isActive); - ignoreIntercept.set(true); - ((ActiveMQServerImpl) backupServer.getServer()).setAfterActivationCreated(new Runnable() { @Override public void run() { - //theBackup.getServer().getActivation() - - SharedNothingBackupActivation activation = (SharedNothingBackupActivation) theBackup.getServer().getActivation(); - activation.getReplicationEndpoint().addOutgoingInterceptorForReplication(new Interceptor() { - @Override - public boolean intercept(Packet packet, RemotingConnection connection) throws ActiveMQException { - if (ignoreIntercept.get() && packet.getType() == PacketImpl.REPLICATION_RESPONSE_V2) { + final Activation backupActivation = theBackup.getServer().getActivation(); + if (backupActivation instanceof SharedNothingBackupActivation) { + SharedNothingBackupActivation activation = (SharedNothingBackupActivation) backupActivation; + ReplicationEndpoint repEnd = activation.getReplicationEndpoint(); + repEnd.addOutgoingInterceptorForReplication((packet, connection) -> { + if (packet.getType() == PacketImpl.REPLICATION_RESPONSE_V2) { return false; } return true; - } - }); + }); + } else if (backupActivation instanceof ReplicationBackupActivation) { + ReplicationBackupActivation activation = (ReplicationBackupActivation) backupActivation; + activation.spyReplicationEndpointCreation(replicationEndpoint -> { + replicationEndpoint.addOutgoingInterceptorForReplication((packet, connection) -> { + if (packet.getType() == PacketImpl.REPLICATION_RESPONSE_V2) { + return false; + } + return true; + }); + }); + } } }); @@ -198,7 +212,9 @@ public class ReplicaTimeoutTest extends ActiveMQTestBase { Assert.assertTrue(Wait.waitFor(() -> AssertionLoggerHandler.findText("AMQ229114"))); - Wait.assertFalse(liveServer.getServer()::isStarted); + if (expectLiveSuicide()) { + Wait.assertFalse(liveServer.getServer()::isStarted); + } } finally { if (sf != null) { @@ -218,4 +234,8 @@ public class ReplicaTimeoutTest extends ActiveMQTestBase { } } + protected boolean expectLiveSuicide() { + return true; + } + } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedDistributionTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedDistributionTest.java index a7f058c121..35f396672e 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedDistributionTest.java +++ 
b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedDistributionTest.java @@ -178,9 +178,9 @@ public class ReplicatedDistributionTest extends ClusterTestBase { public void setUp() throws Exception { super.setUp(); - setupLiveServer(1, true, isSharedStore(), true, false); - setupLiveServer(3, true, isSharedStore(), true, false); - setupBackupServer(2, 3, true, isSharedStore(), true); + setupLiveServer(1, true, haType(), true, false); + setupLiveServer(3, true, haType(), true, false); + setupBackupServer(2, 3, true, haType(), true); final String address = ReplicatedDistributionTest.ADDRESS.toString(); // notice the abuse of the method call, '3' is not a backup for '1' @@ -210,7 +210,7 @@ public class ReplicatedDistributionTest extends ClusterTestBase { } @Override - protected boolean isSharedStore() { - return false; + protected HAType haType() { + return HAType.SharedNothingReplication; } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverExtraBackupsTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverExtraBackupsTest.java index 9e8d2902cd..6712c74692 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverExtraBackupsTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverExtraBackupsTest.java @@ -29,6 +29,7 @@ import org.apache.activemq.artemis.api.core.client.ClientSessionFactory; import org.apache.activemq.artemis.api.core.client.FailoverEventType; import org.apache.activemq.artemis.api.core.client.ServerLocator; import org.apache.activemq.artemis.core.config.ha.ReplicaPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; import org.apache.activemq.artemis.core.server.ActiveMQServer; import org.apache.activemq.artemis.tests.util.Wait; import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer; @@ -51,8 +52,16 @@ public class ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated @Override @Test public void testStartLiveFirst() throws Exception { - ((ReplicaPolicyConfiguration) backupServers.get(2).getServer().getConfiguration().getHAPolicyConfiguration()).setGroupName(getNodeGroupName() + "-0"); - ((ReplicaPolicyConfiguration) backupServers.get(3).getServer().getConfiguration().getHAPolicyConfiguration()).setGroupName(getNodeGroupName() + "-1"); + switch (haType()) { + case SharedNothingReplication: + ((ReplicaPolicyConfiguration) backupServers.get(2).getServer().getConfiguration().getHAPolicyConfiguration()).setGroupName(getNodeGroupName() + "-0"); + ((ReplicaPolicyConfiguration) backupServers.get(3).getServer().getConfiguration().getHAPolicyConfiguration()).setGroupName(getNodeGroupName() + "-1"); + break; + case PluggableQuorumReplication: + ((ReplicationBackupPolicyConfiguration) backupServers.get(2).getServer().getConfiguration().getHAPolicyConfiguration()).setGroupName(getNodeGroupName() + "-0"); + ((ReplicationBackupPolicyConfiguration) backupServers.get(3).getServer().getConfiguration().getHAPolicyConfiguration()).setGroupName(getNodeGroupName() + "-1"); + break; + } startServers(liveServers); backupServers.get(0).start(); @@ -85,8 +94,17 @@ public class 
ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated @Override @Test public void testStartBackupFirst() throws Exception { - ((ReplicaPolicyConfiguration) backupServers.get(2).getServer().getConfiguration().getHAPolicyConfiguration()).setGroupName(getNodeGroupName() + "-0"); - ((ReplicaPolicyConfiguration) backupServers.get(3).getServer().getConfiguration().getHAPolicyConfiguration()).setGroupName(getNodeGroupName() + "-1"); + switch (haType()) { + case SharedNothingReplication: + ((ReplicaPolicyConfiguration) backupServers.get(2).getServer().getConfiguration().getHAPolicyConfiguration()).setGroupName(getNodeGroupName() + "-0"); + ((ReplicaPolicyConfiguration) backupServers.get(3).getServer().getConfiguration().getHAPolicyConfiguration()).setGroupName(getNodeGroupName() + "-1"); + break; + case PluggableQuorumReplication: + ((ReplicationBackupPolicyConfiguration) backupServers.get(2).getServer().getConfiguration().getHAPolicyConfiguration()).setGroupName(getNodeGroupName() + "-0"); + ((ReplicationBackupPolicyConfiguration) backupServers.get(3).getServer().getConfiguration().getHAPolicyConfiguration()).setGroupName(getNodeGroupName() + "-1"); + break; + } + startServers(backupServers); startServers(liveServers); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverTest.java index 82777a71db..383f97b8d3 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverTest.java @@ -16,6 +16,9 @@ */ package org.apache.activemq.artemis.tests.integration.cluster.failover; +import java.util.Arrays; +import java.util.Collection; + import org.apache.activemq.artemis.api.core.QueueConfiguration; import org.apache.activemq.artemis.api.core.client.ClientConsumer; import org.apache.activemq.artemis.api.core.client.ClientMessage; @@ -25,9 +28,20 @@ import org.apache.activemq.artemis.api.core.client.ClientSessionFactory; import org.apache.activemq.artemis.api.core.client.ServerLocator; import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +@RunWith(Parameterized.class) public class ReplicatedMultipleServerFailoverTest extends MultipleServerFailoverTestBase { + @Parameterized.Parameter + public HAType haType; + + @Parameterized.Parameters(name = "ha={0}") + public static Collection getParams() { + return Arrays.asList(new Object[][]{{HAType.SharedNothingReplication}, {HAType.PluggableQuorumReplication}}); + } + @Test public void testStartLiveFirst() throws Exception { for (TestableServer liveServer : liveServers) { @@ -140,8 +154,8 @@ public class ReplicatedMultipleServerFailoverTest extends MultipleServerFailover } @Override - public boolean isSharedStore() { - return false; + public HAType haType() { + return haType; } @Override diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreDistributionTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreDistributionTest.java 
index ee972e9279..ea3accd1f4 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreDistributionTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreDistributionTest.java @@ -19,7 +19,7 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover; public class SharedStoreDistributionTest extends ReplicatedDistributionTest { @Override - protected boolean isSharedStore() { - return true; + protected HAType haType() { + return HAType.SharedStore; } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreDontWaitForActivationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreDontWaitForActivationTest.java index a4424e43f3..f23e59ee48 100755 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreDontWaitForActivationTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreDontWaitForActivationTest.java @@ -41,8 +41,8 @@ public class SharedStoreDontWaitForActivationTest extends ClusterTestBase { // 1. configure 0 as backup of one to share the same node manager and file // storage locations - setupBackupServer(0, 1, isFileStorage(), true, isNetty()); - setupLiveServer(1, isFileStorage(), true, isNetty(), false); + setupBackupServer(0, 1, isFileStorage(), HAType.SharedStore, isNetty()); + setupLiveServer(1, isFileStorage(), HAType.SharedStore, isNetty(), false); // now reconfigure the HA policy for both servers to master with automatic // failover and wait-for-activation disabled. 
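The test hunks above and below replace the old boolean isSharedStore()/isSharedStorage() hooks with the new HAType enum (SharedStore, SharedNothingReplication, PluggableQuorumReplication) and, for the pluggable flavour, wire a DistributedPrimitiveManagerConfiguration into the new ReplicationPrimaryPolicyConfiguration/ReplicationBackupPolicyConfiguration pair. The sketch below is illustrative only and is not part of the patch: it condenses that wiring into one place, reusing only the constructors and setters that appear in the hunks; the class name and the locks-folder argument are made up (the tests themselves use a JUnit TemporaryFolder).

// Illustrative sketch, not part of the patch: condensed pluggable-quorum wiring as used by
// FailoverTestBase.createPluggableReplicatedConfigs() and MultipleServerFailoverTestBase above.
import java.util.Collections;

import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager;

final class PluggableQuorumPolicySketch {

   // The file-based reference implementation only needs a folder shared by live and backup.
   static DistributedPrimitiveManagerConfiguration fileBasedManager(String locksFolder) {
      return new DistributedPrimitiveManagerConfiguration(
         FileBasedPrimitiveManager.class.getName(),
         Collections.singletonMap("locks-folder", locksFolder));
   }

   // Live side: used instead of ReplicatedPolicyConfiguration when haType() is PluggableQuorumReplication.
   static ReplicationPrimaryPolicyConfiguration primary(DistributedPrimitiveManagerConfiguration manager) {
      return ReplicationPrimaryPolicyConfiguration.withDefault()
         .setDistributedManagerConfiguration(manager);
   }

   // Backup side: the retry values match those used by MultipleServerFailoverTestBase above.
   static ReplicationBackupPolicyConfiguration backup(DistributedPrimitiveManagerConfiguration manager) {
      return ReplicationBackupPolicyConfiguration.withDefault()
         .setVoteRetries(1)
         .setVoteRetryWait(1000)
         .setDistributedManagerConfiguration(manager);
   }
}

In the hunks above the same manager configuration instance is handed to both the live and the backup configuration, so both brokers coordinate through the same locks folder.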
diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreMetricsLeakTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreMetricsLeakTest.java index 43c904bda1..c78fba5411 100755 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreMetricsLeakTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreMetricsLeakTest.java @@ -40,8 +40,8 @@ public class SharedStoreMetricsLeakTest extends ClusterTestBase { } private void setupServers() throws Exception { - setupLiveServer(0, isFileStorage(), true, isNetty(), false); - setupBackupServer(1, 0, isFileStorage(), true, isNetty()); + setupLiveServer(0, isFileStorage(), HAType.SharedStore, isNetty(), false); + setupBackupServer(1, 0, isFileStorage(), HAType.SharedStore, isNetty()); getServer(0).getConfiguration().setHAPolicyConfiguration(new SharedStoreMasterPolicyConfiguration().setFailoverOnServerShutdown(true)); getServer(0).getConfiguration().setMetricsConfiguration(new MetricsConfiguration().setJvmThread(false).setJvmGc(false).setJvmMemory(false).setPlugin(new SimpleMetricsPlugin().init(null))); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreScaleDownBackupTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreScaleDownBackupTest.java index fb4c5002d8..9bd16e8b48 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreScaleDownBackupTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/SharedStoreScaleDownBackupTest.java @@ -41,9 +41,9 @@ public class SharedStoreScaleDownBackupTest extends ClusterTestBase { public void setUp() throws Exception { super.setUp(); - setupLiveServer(0, isFileStorage(), true, isNetty(), false); - setupLiveServer(1, isFileStorage(), true, isNetty(), false); - setupBackupServer(2, 0, isFileStorage(), true, isNetty()); + setupLiveServer(0, isFileStorage(), HAType.SharedStore, isNetty(), false); + setupLiveServer(1, isFileStorage(), HAType.SharedStore, isNetty(), false); + setupBackupServer(2, 0, isFileStorage(), HAType.SharedStore, isNetty()); setupClusterConnection("cluster0", "testAddress", MessageLoadBalancingType.ON_DEMAND, 1, isNetty(), 0, 1); setupClusterConnection("cluster1", "testAddress", MessageLoadBalancingType.ON_DEMAND, 1, isNetty(), 1, 0); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/StaticClusterWithBackupFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/StaticClusterWithBackupFailoverTest.java index fa39efd2ac..be5d841035 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/StaticClusterWithBackupFailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/StaticClusterWithBackupFailoverTest.java @@ -42,13 +42,13 @@ public class StaticClusterWithBackupFailoverTest extends ClusterWithBackupFailov @Override protected void setupServers() throws Exception { // The backups - setupBackupServer(3, 0, 
isFileStorage(), isSharedStorage(), isNetty()); - setupBackupServer(4, 1, isFileStorage(), isSharedStorage(), isNetty()); - setupBackupServer(5, 2, isFileStorage(), isSharedStorage(), isNetty()); + setupBackupServer(3, 0, isFileStorage(), haType(), isNetty()); + setupBackupServer(4, 1, isFileStorage(), haType(), isNetty()); + setupBackupServer(5, 2, isFileStorage(), haType(), isNetty()); // The lives - setupLiveServer(0, isFileStorage(), isSharedStorage(), isNetty(), false); - setupLiveServer(1, isFileStorage(), isSharedStorage(), isNetty(), false); - setupLiveServer(2, isFileStorage(), isSharedStorage(), isNetty(), false); + setupLiveServer(0, isFileStorage(), haType(), isNetty(), false); + setupLiveServer(1, isFileStorage(), haType(), isNetty(), false); + setupLiveServer(2, isFileStorage(), haType(), isNetty(), false); } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupAuthenticationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupAuthenticationTest.java new file mode 100644 index 0000000000..b1e6a94dc2 --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupAuthenticationTest.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum;
+
+import java.util.Arrays;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.activemq.artemis.api.core.ActiveMQException;
+import org.apache.activemq.artemis.api.core.Interceptor;
+import org.apache.activemq.artemis.api.core.TransportConfiguration;
+import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
+import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
+import org.apache.activemq.artemis.core.protocol.core.Packet;
+import org.apache.activemq.artemis.core.protocol.core.impl.PacketImpl;
+import org.apache.activemq.artemis.spi.core.protocol.RemotingConnection;
+import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTestBase;
+import org.apache.activemq.artemis.tests.integration.cluster.failover.FakeServiceComponent;
+import org.apache.activemq.artemis.tests.util.TransportConfigurationUtils;
+import org.apache.activemq.artemis.tests.util.Wait;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import static java.util.Arrays.asList;
+
+@RunWith(Parameterized.class)
+public class PluggableQuorumBackupAuthenticationTest extends FailoverTestBase {
+
+   private static CountDownLatch registrationStarted;
+
+   @Parameterized.Parameter
+   public boolean useNetty;
+
+   @Parameterized.Parameters(name = "useNetty={0}")
+   public static Iterable getParams() {
+      return asList(new Object[][]{{false}, {true}});
+   }
+
+   @Override
+   @Before
+   public void setUp() throws Exception {
+      startBackupServer = false;
+      registrationStarted = new CountDownLatch(1);
+      super.setUp();
+   }
+
+   @Test
+   public void testWrongPasswordSetting() throws Exception {
+      FakeServiceComponent fakeServiceComponent = new FakeServiceComponent("fake web server");
+      Wait.assertTrue(liveServer.getServer()::isActive);
+      waitForServerToStart(liveServer.getServer());
+      backupServer.start();
+      backupServer.getServer().addExternalComponent(fakeServiceComponent, true);
+      assertTrue(registrationStarted.await(5, TimeUnit.SECONDS));
+      /*
+       * Can't intercept the message at the backup, so we intercept the registration message at the
+       * live.
+       */
+      Wait.waitFor(() -> !backupServer.isStarted());
+      assertFalse("backup should have stopped", backupServer.isStarted());
+      Wait.assertFalse(fakeServiceComponent::isStarted);
+      backupServer.stop();
+      liveServer.stop();
+   }
+
+   @Override
+   protected void createConfigs() throws Exception {
+      createPluggableReplicatedConfigs();
+      backupConfig.setClusterPassword("crocodile");
+      liveConfig.setIncomingInterceptorClassNames(Arrays.asList(NotifyingInterceptor.class.getName()));
+      backupConfig.setSecurityEnabled(true);
+      liveConfig.setSecurityEnabled(true);
+   }
+
+   @Override
+   protected void setupHAPolicyConfiguration() {
+      ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
+      ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
+   }
+
+   @Override
+   protected TransportConfiguration getAcceptorTransportConfiguration(final boolean live) {
+      return useNetty ?
getNettyAcceptorTransportConfiguration(live) : + TransportConfigurationUtils.getInVMAcceptor(live); + } + + @Override + protected TransportConfiguration getConnectorTransportConfiguration(final boolean live) { + return useNetty ? getNettyConnectorTransportConfiguration(live) : + TransportConfigurationUtils.getInVMConnector(live); + } + + public static final class NotifyingInterceptor implements Interceptor { + + @Override + public boolean intercept(Packet packet, RemotingConnection connection) throws ActiveMQException { + if (packet.getType() == PacketImpl.BACKUP_REGISTRATION) { + registrationStarted.countDown(); + } else if (packet.getType() == PacketImpl.CLUSTER_CONNECT) { + registrationStarted.countDown(); + } + return true; + } + } +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupSyncJournalTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupSyncJournalTest.java new file mode 100644 index 0000000000..6f620a64f2 --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupSyncJournalTest.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; + +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.tests.integration.cluster.failover.BackupSyncJournalTest; + +public class PluggableQuorumBackupSyncJournalTest extends BackupSyncJournalTest { + + @Override + protected void createConfigs() throws Exception { + createPluggableReplicatedConfigs(); + } + + @Override + protected void setupHAPolicyConfiguration() { + ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()) + .setCheckForLiveServer(true); + ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()) + .setMaxSavedReplicatedJournalsSize(2) + .setAllowFailBack(true); + } +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumExtraBackupReplicatedFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumExtraBackupReplicatedFailoverTest.java new file mode 100644 index 0000000000..e9f7576dc0 --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumExtraBackupReplicatedFailoverTest.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; + +import java.util.Arrays; + +import org.apache.activemq.artemis.api.core.TransportConfiguration; +import org.apache.activemq.artemis.core.config.Configuration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.core.server.ActiveMQServer; +import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTestBase; +import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer; +import org.apache.activemq.artemis.tests.util.TransportConfigurationUtils; +import org.apache.activemq.artemis.tests.util.Wait; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +@RunWith(Parameterized.class) +public class PluggableQuorumExtraBackupReplicatedFailoverTest extends FailoverTestBase { + + private static final String GROUP_NAME = "foo"; + + @Parameterized.Parameter + public boolean useGroupName; + + @Parameterized.Parameters(name = "useGroupName={0}") + public static Iterable getParams() { + return Arrays.asList(new Object[][]{{false}, {true}}); + } + + @Override + protected void createConfigs() throws Exception { + createPluggableReplicatedConfigs(); + } + + @Override + protected void setupHAPolicyConfiguration() { + if (useGroupName) { + ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setGroupName(GROUP_NAME); + ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setGroupName(GROUP_NAME); + } + } + + @Override + protected TransportConfiguration getAcceptorTransportConfiguration(final boolean live) { + return TransportConfigurationUtils.getInVMAcceptor(live); + } + + @Override + protected TransportConfiguration getConnectorTransportConfiguration(final boolean live) { + return TransportConfigurationUtils.getInVMConnector(live); + } + + @Test + public void testExtraBackupReplicates() throws Exception { + Configuration secondBackupConfig = backupConfig.copy(); + String secondBackupGroupName = ((ReplicationBackupPolicyConfiguration) secondBackupConfig.getHAPolicyConfiguration()).getGroupName(); + Assert.assertEquals(((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).getGroupName(), + secondBackupGroupName); + if (useGroupName) { + Assert.assertEquals(GROUP_NAME, secondBackupGroupName); + } else { + Assert.assertNull(secondBackupGroupName); + } + TestableServer secondBackupServer = createTestableServer(secondBackupConfig); + secondBackupConfig.setBindingsDirectory(getBindingsDir(1, true)) + .setJournalDirectory(getJournalDir(1, true)) + .setPagingDirectory(getPageDir(1, true)) + .setLargeMessagesDirectory(getLargeMessagesDir(1, true)) + .setSecurityEnabled(false); + + waitForRemoteBackupSynchronization(backupServer.getServer()); + + secondBackupServer.start(); + Thread.sleep(5000); + backupServer.stop(); + waitForSync(secondBackupServer.getServer()); + waitForRemoteBackupSynchronization(secondBackupServer.getServer()); + + } + + private void waitForSync(ActiveMQServer server) throws Exception { + Wait.waitFor(server::isReplicaSync); + } + +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumGroupingFailoverReplicationTest.java 
b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumGroupingFailoverReplicationTest.java new file mode 100644 index 0000000000..80ea376470 --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumGroupingFailoverReplicationTest.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; + +import org.apache.activemq.artemis.tests.integration.cluster.distribution.ClusterTestBase; +import org.apache.activemq.artemis.tests.integration.cluster.failover.GroupingFailoverTestBase; + +public class PluggableQuorumGroupingFailoverReplicationTest extends GroupingFailoverTestBase { + + @Override + protected ClusterTestBase.HAType haType() { + return HAType.PluggableQuorumReplication; + } + +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.java new file mode 100644 index 0000000000..eb2f1dc2a0 --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; + +import java.io.IOException; +import java.io.OutputStream; +import java.net.InetSocketAddress; +import java.util.ArrayList; +import java.util.concurrent.TimeUnit; + +import com.sun.net.httpserver.HttpExchange; +import com.sun.net.httpserver.HttpHandler; +import com.sun.net.httpserver.HttpServer; +import org.apache.activemq.artemis.api.core.QueueConfiguration; +import org.apache.activemq.artemis.api.core.TransportConfiguration; +import org.apache.activemq.artemis.api.core.client.ClientSession; +import org.apache.activemq.artemis.component.WebServerComponent; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.core.server.ActiveMQServer; +import org.apache.activemq.artemis.core.server.ServiceComponent; +import org.apache.activemq.artemis.dto.AppDTO; +import org.apache.activemq.artemis.dto.WebServerDTO; +import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTest; +import org.apache.activemq.artemis.tests.util.Wait; +import org.junit.Assert; +import org.junit.Test; + +public class PluggableQuorumNettyNoGroupNameReplicatedFailoverTest extends FailoverTest { + + protected void beforeWaitForRemoteBackupSynchronization() { + } + + private void waitForSync(ActiveMQServer server) throws Exception { + Wait.waitFor(server::isReplicaSync); + } + + /** + * Default maxSavedReplicatedJournalsSize is 2, this means the backup will fall back to replicated only twice, after this + * it is stopped permanently. + */ + @Test(timeout = 120000) + public void testReplicatedFailback() throws Exception { + try { + beforeWaitForRemoteBackupSynchronization(); + + waitForSync(backupServer.getServer()); + + createSessionFactory(); + + ClientSession session = createSession(sf, true, true); + + session.createQueue(new QueueConfiguration(ADDRESS)); + + crash(session); + + liveServer.start(); + + waitForSync(liveServer.getServer()); + + waitForSync(backupServer.getServer()); + + waitForServerToStart(liveServer.getServer()); + + session = createSession(sf, true, true); + + crash(session); + + liveServer.start(); + + waitForSync(liveServer.getServer()); + + waitForSync(backupServer.getServer()); + + waitForServerToStart(liveServer.getServer()); + + session = createSession(sf, true, true); + + crash(session); + + liveServer.start(); + + waitForSync(liveServer.getServer()); + + liveServer.getServer().waitForActivation(5, TimeUnit.SECONDS); + + waitForSync(liveServer.getServer()); + + waitForServerToStart(backupServer.getServer()); + + assertTrue(backupServer.getServer().isStarted()); + + } finally { + if (sf != null) { + sf.close(); + } + try { + liveServer.getServer().stop(); + } catch (Throwable ignored) { + } + try { + backupServer.getServer().stop(); + } catch (Throwable ignored) { + } + } + } + + @Test + public void testReplicatedFailbackBackupFromLiveBackToBackup() throws Exception { + + InetSocketAddress address = new InetSocketAddress("127.0.0.1", 8787); + HttpServer httpServer = HttpServer.create(address, 100); + httpServer.start(); + + try { + httpServer.createContext("/", new HttpHandler() { + @Override + public void handle(HttpExchange t) throws IOException { + String response = "This is a unit test"; + t.sendResponseHeaders(200, response.length()); + OutputStream os = t.getResponseBody(); + os.write(response.getBytes()); + os.close(); + } + }); + 
WebServerDTO wdto = new WebServerDTO(); + AppDTO appDTO = new AppDTO(); + appDTO.war = "console.war"; + appDTO.url = "console"; + wdto.apps = new ArrayList(); + wdto.apps.add(appDTO); + wdto.bind = "http://localhost:0"; + wdto.path = "console"; + WebServerComponent webServerComponent = new WebServerComponent(); + webServerComponent.configure(wdto, ".", "."); + webServerComponent.start(); + + backupServer.getServer().getNetworkHealthCheck().parseURIList("http://localhost:8787"); + Assert.assertTrue(backupServer.getServer().getNetworkHealthCheck().isStarted()); + backupServer.getServer().addExternalComponent(webServerComponent, false); + // this is called when backup servers go from live back to backup + backupServer.getServer().fail(true); + Assert.assertTrue(backupServer.getServer().getNetworkHealthCheck().isStarted()); + Assert.assertTrue(backupServer.getServer().getExternalComponents().get(0).isStarted()); + ((ServiceComponent) (backupServer.getServer().getExternalComponents().get(0))).stop(true); + } finally { + httpServer.stop(0); + } + + } + + @Override + protected void createConfigs() throws Exception { + createPluggableReplicatedConfigs(); + } + + @Override + protected void setupHAPolicyConfiguration() { + ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()) + .setCheckForLiveServer(true); + ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()) + .setMaxSavedReplicatedJournalsSize(2) + .setAllowFailBack(true); + } + + @Override + protected TransportConfiguration getAcceptorTransportConfiguration(final boolean live) { + return getNettyAcceptorTransportConfiguration(live); + } + + @Override + protected TransportConfiguration getConnectorTransportConfiguration(final boolean live) { + return getNettyConnectorTransportConfiguration(live); + } + + @Override + protected void crash(boolean waitFailure, ClientSession... sessions) throws Exception { + if (sessions.length > 0) { + for (ClientSession session : sessions) { + waitForRemoteBackup(session.getSessionFactory(), 5, true, backupServer.getServer()); + } + } else { + waitForRemoteBackup(null, 5, true, backupServer.getServer()); + } + super.crash(waitFailure, sessions); + } + + @Override + protected void crash(ClientSession... sessions) throws Exception { + if (sessions.length > 0) { + for (ClientSession session : sessions) { + waitForRemoteBackup(session.getSessionFactory(), 5, true, backupServer.getServer()); + } + } else { + waitForRemoteBackup(null, 5, true, backupServer.getServer()); + } + super.crash(sessions); + } +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyReplicationStopTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyReplicationStopTest.java new file mode 100644 index 0000000000..1381cad3c4 --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyReplicationStopTest.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; + +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.tests.integration.cluster.failover.NettyReplicationStopTest; + +public class PluggableQuorumNettyReplicationStopTest extends NettyReplicationStopTest { + + @Override + protected void createConfigs() throws Exception { + createPluggableReplicatedConfigs(); + } + + @Override + protected void setupHAPolicyConfiguration() { + ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); + ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); + } + +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumPageCleanupWhileReplicaCatchupTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumPageCleanupWhileReplicaCatchupTest.java new file mode 100644 index 0000000000..b5b8b3af02 --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumPageCleanupWhileReplicaCatchupTest.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; + +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.tests.integration.cluster.failover.PageCleanupWhileReplicaCatchupTest; + +public class PluggableQuorumPageCleanupWhileReplicaCatchupTest extends PageCleanupWhileReplicaCatchupTest { + + @Override + protected void createConfigs() throws Exception { + createPluggableReplicatedConfigs(); + } + + @Override + protected void setupHAPolicyConfiguration() { + ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); + ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); + } + +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java new file mode 100644 index 0000000000..4c15d7bd3b --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; + +import java.io.IOException; +import java.util.Collections; + +import org.apache.activemq.artemis.api.core.TransportConfiguration; +import org.apache.activemq.artemis.core.config.Configuration; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; +import org.apache.activemq.artemis.tests.integration.cluster.failover.ReplicaTimeoutTest; +import org.apache.activemq.artemis.tests.util.ReplicatedBackupUtils; +import org.junit.Rule; +import org.junit.rules.TemporaryFolder; + +public class PluggableQuorumReplicaTimeoutTest extends ReplicaTimeoutTest { + + @Rule + public TemporaryFolder tmpFolder = new TemporaryFolder(); + + @Override + protected void configureReplicationPair(Configuration backupConfig, + Configuration liveConfig, + TransportConfiguration backupConnector, + TransportConfiguration backupAcceptor, + TransportConfiguration liveConnector) throws IOException { + DistributedPrimitiveManagerConfiguration managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + + ReplicatedBackupUtils.configurePluggableQuorumReplicationPair(backupConfig, backupConnector, backupAcceptor, + liveConfig, liveConnector, null, + managerConfiguration, managerConfiguration); + ReplicationPrimaryPolicyConfiguration primaryConfiguration = ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()); + primaryConfiguration.setInitialReplicationSyncTimeout(1000); + primaryConfiguration.setCheckForLiveServer(true); + ReplicationBackupPolicyConfiguration backupConfiguration = ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()); + backupConfiguration.setInitialReplicationSyncTimeout(1000); + backupConfiguration.setMaxSavedReplicatedJournalsSize(2) + .setAllowFailBack(true); + } + + @Override + protected boolean expectLiveSuicide() { + return false; + } +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedDistributionTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedDistributionTest.java new file mode 100644 index 0000000000..afbd01e9e6 --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedDistributionTest.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; + +import org.apache.activemq.artemis.tests.integration.cluster.failover.ReplicatedDistributionTest; + +public class PluggableQuorumReplicatedDistributionTest extends ReplicatedDistributionTest { + + @Override + protected HAType haType() { + return HAType.PluggableQuorumReplication; + } +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedLargeMessageFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageFailoverTest.java similarity index 63% rename from tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedLargeMessageFailoverTest.java rename to tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageFailoverTest.java index 7479257bd2..9d7cb4d3b2 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedLargeMessageFailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageFailoverTest.java @@ -14,23 +14,31 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.activemq.artemis.tests.integration.cluster.failover; +package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; import org.apache.activemq.artemis.api.core.client.ClientSession; -import org.apache.activemq.artemis.core.client.impl.ClientSessionInternal; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.tests.integration.cluster.failover.LargeMessageFailoverTest; -public class ReplicatedLargeMessageFailoverTest extends LargeMessageFailoverTest { +public class PluggableQuorumReplicatedLargeMessageFailoverTest extends LargeMessageFailoverTest { @Override protected void createConfigs() throws Exception { - createReplicatedConfigs(); + createPluggableReplicatedConfigs(); + } + + @Override + protected void setupHAPolicyConfiguration() { + ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); + ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } @Override protected void crash(boolean waitFailure, ClientSession... 
sessions) throws Exception { if (sessions.length > 0) { for (ClientSession session : sessions) { - waitForRemoteBackup(((ClientSessionInternal) session).getSessionFactory(), 5, true, backupServer.getServer()); + waitForRemoteBackup(session.getSessionFactory(), 5, true, backupServer.getServer()); } } else { waitForRemoteBackup(null, 5, true, backupServer.getServer()); @@ -42,11 +50,12 @@ public class ReplicatedLargeMessageFailoverTest extends LargeMessageFailoverTest protected void crash(ClientSession... sessions) throws Exception { if (sessions.length > 0) { for (ClientSession session : sessions) { - waitForRemoteBackup(((ClientSessionInternal) session).getSessionFactory(), 5, true, backupServer.getServer()); + waitForRemoteBackup(session.getSessionFactory(), 5, true, backupServer.getServer()); } } else { waitForRemoteBackup(null, 5, true, backupServer.getServer()); } super.crash(sessions); } + } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedLargeMessageWithDelayFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest.java similarity index 73% rename from tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedLargeMessageWithDelayFailoverTest.java rename to tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest.java index cb6ae62fd1..5954d8992d 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedLargeMessageWithDelayFailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest.java @@ -14,17 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.activemq.artemis.tests.integration.cluster.failover; +package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; import org.apache.activemq.artemis.api.core.client.ClientSession; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.tests.integration.cluster.util.BackupSyncDelay; import org.junit.After; import org.junit.Before; -/** - * See {@link BackupSyncDelay} for the rationale about these 'WithDelay' tests. 
- */ -public class ReplicatedLargeMessageWithDelayFailoverTest extends ReplicatedLargeMessageFailoverTest { +public class PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest extends PluggableQuorumReplicatedLargeMessageFailoverTest { private BackupSyncDelay syncDelay; @@ -60,10 +59,23 @@ public class ReplicatedLargeMessageWithDelayFailoverTest extends ReplicatedLarge super.crash(waitFailure, sessions); } + @Override + protected void createConfigs() throws Exception { + createPluggableReplicatedConfigs(); + } + + @Override + protected void setupHAPolicyConfiguration() { + ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); + ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()) + .setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); + } + @Override @After public void tearDown() throws Exception { syncDelay.deliverUpToDateMsg(); super.tearDown(); } + } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedPagingFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedPagingFailoverTest.java new file mode 100644 index 0000000000..d80460855b --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedPagingFailoverTest.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; + +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.tests.integration.cluster.failover.PagingFailoverTest; + +public class PluggableQuorumReplicatedPagingFailoverTest extends PagingFailoverTest { + + @Override + protected void createConfigs() throws Exception { + createPluggableReplicatedConfigs(); + } + + @Override + protected void setupHAPolicyConfiguration() { + ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); + ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); + } +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/util/BackupSyncDelay.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/util/BackupSyncDelay.java index 9ddffd389a..b0af71b367 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/util/BackupSyncDelay.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/util/BackupSyncDelay.java @@ -32,6 +32,8 @@ import org.apache.activemq.artemis.core.protocol.core.impl.wireformat.Replicatio import org.apache.activemq.artemis.core.protocol.core.impl.wireformat.ReplicationStartSyncMessage; import org.apache.activemq.artemis.core.replication.ReplicationEndpoint; import org.apache.activemq.artemis.core.server.ActiveMQServer; +import org.apache.activemq.artemis.core.server.impl.Activation; +import org.apache.activemq.artemis.core.server.impl.ReplicationBackupActivation; import org.apache.activemq.artemis.core.server.impl.SharedNothingBackupActivation; import org.apache.activemq.artemis.spi.core.protocol.RemotingConnection; @@ -94,8 +96,18 @@ public class BackupSyncDelay implements Interceptor { public boolean intercept(Packet packet, RemotingConnection connection) throws ActiveMQException { if (packet.getType() == PacketImpl.BACKUP_REGISTRATION) { try { - SharedNothingBackupActivation activation = (SharedNothingBackupActivation) backup.getActivation(); - ReplicationEndpoint repEnd = activation.getReplicationEndpoint(); + Activation backupActivation = backup.getActivation(); + ReplicationEndpoint repEnd = null; + if (backupActivation instanceof SharedNothingBackupActivation) { + SharedNothingBackupActivation activation = (SharedNothingBackupActivation) backupActivation; + repEnd = activation.getReplicationEndpoint(); + } else if (backupActivation instanceof ReplicationBackupActivation) { + ReplicationBackupActivation activation = (ReplicationBackupActivation) backupActivation; + repEnd = activation.getReplicationEndpoint(); + } + if (repEnd == null) { + throw new NullPointerException("replication endpoint isn't supposed to be null"); + } handler.addSubHandler(repEnd); Channel repChannel = repEnd.getChannel(); repChannel.setHandler(handler); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/openwire/OpenWireProtocolManagerTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/openwire/OpenWireProtocolManagerTest.java index 4ee2a6c76c..e1d6ce8a07 100644 --- 
a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/openwire/OpenWireProtocolManagerTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/openwire/OpenWireProtocolManagerTest.java @@ -78,7 +78,7 @@ public class OpenWireProtocolManagerTest extends ActiveMQTestBase { @Override public ClusterManager getClusterManager() { - return new ClusterManager(getExecutorFactory(), this, null, null, null, null, null, false); + return new ClusterManager(getExecutorFactory(), this, null, null, null, null, null, true); } @Override diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java new file mode 100644 index 0000000000..11e4380246 --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.activemq.artemis.tests.integration.replication; + +import java.io.IOException; +import java.util.Collections; + +import org.apache.activemq.artemis.core.config.HAPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; +import org.junit.Before; +import org.junit.Rule; +import org.junit.rules.TemporaryFolder; + +public class PluggableQuorumReplicationFlowControlTest extends SharedNothingReplicationFlowControlTest { + + private DistributedPrimitiveManagerConfiguration managerConfiguration; + @Rule + public TemporaryFolder tmpFolder = new TemporaryFolder(); + + @Before + public void init() throws IOException { + managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + } + + @Override + protected HAPolicyConfiguration createReplicationBackupConfiguration() { + ReplicationBackupPolicyConfiguration haPolicy = ReplicationBackupPolicyConfiguration.withDefault(); + haPolicy.setDistributedManagerConfiguration(managerConfiguration); + haPolicy.setClusterName("cluster"); + // fail-fast in order to let the backup to quickly retry syncing with primary + haPolicy.setVoteRetries(0); + return haPolicy; + } + + @Override + protected HAPolicyConfiguration createReplicationLiveConfiguration() { + ReplicationPrimaryPolicyConfiguration haPolicy = ReplicationPrimaryPolicyConfiguration.withDefault(); + haPolicy.setDistributedManagerConfiguration(managerConfiguration); + haPolicy.setCheckForLiveServer(false); + return haPolicy; + } +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationOrderTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationOrderTest.java new file mode 100644 index 0000000000..f5ed720792 --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationOrderTest.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.tests.integration.replication; + +import org.apache.activemq.artemis.api.core.TransportConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import static java.util.Arrays.asList; + +@RunWith(Parameterized.class) +public class PluggableQuorumReplicationOrderTest extends ReplicationOrderTest { + + @Parameterized.Parameter + public boolean useNetty; + + @Parameterized.Parameters(name = "useNetty={0}") + public static Iterable getParams() { + return asList(new Object[][]{{false}, {true}}); + } + + @Override + protected void createConfigs() throws Exception { + createPluggableReplicatedConfigs(); + } + + @Override + protected void setupHAPolicyConfiguration() { + ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()) + .setCheckForLiveServer(true); + ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()) + .setMaxSavedReplicatedJournalsSize(2) + .setAllowFailBack(true); + } + + @Override + protected TransportConfiguration getAcceptorTransportConfiguration(final boolean live) { + return useNetty ? getNettyAcceptorTransportConfiguration(live) : + super.getAcceptorTransportConfiguration(live); + } + + @Override + protected TransportConfiguration getConnectorTransportConfiguration(final boolean live) { + return useNetty ? getNettyConnectorTransportConfiguration(live) : + super.getConnectorTransportConfiguration(live); + } +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java new file mode 100644 index 0000000000..1d702b5b88 --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.activemq.artemis.tests.integration.replication; + +import java.io.IOException; +import java.util.Collections; + +import org.apache.activemq.artemis.core.config.HAPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; +import org.junit.Before; +import org.junit.Rule; +import org.junit.rules.TemporaryFolder; + +public class PluggableQuorumReplicationTest extends SharedNothingReplicationTest { + + private DistributedPrimitiveManagerConfiguration managerConfiguration; + @Rule + public TemporaryFolder tmpFolder = new TemporaryFolder(); + + @Before + public void init() throws IOException { + managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + } + + @Override + protected HAPolicyConfiguration createReplicationLiveConfiguration() { + ReplicationPrimaryPolicyConfiguration haPolicy = ReplicationPrimaryPolicyConfiguration.withDefault(); + haPolicy.setDistributedManagerConfiguration(managerConfiguration); + haPolicy.setCheckForLiveServer(false); + return haPolicy; + } + + @Override + protected HAPolicyConfiguration createReplicationBackupConfiguration() { + ReplicationBackupPolicyConfiguration haPolicy = ReplicationBackupPolicyConfiguration.withDefault(); + haPolicy.setDistributedManagerConfiguration(managerConfiguration); + haPolicy.setClusterName("cluster"); + return haPolicy; + } + +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/ReplicationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/ReplicationTest.java index 539e845927..88da15e93d 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/ReplicationTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/ReplicationTest.java @@ -18,6 +18,7 @@ package org.apache.activemq.artemis.tests.integration.replication; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.concurrent.CountDownLatch; @@ -48,6 +49,7 @@ import org.apache.activemq.artemis.api.core.client.ClientSessionFactory; import org.apache.activemq.artemis.api.core.client.ServerLocator; import org.apache.activemq.artemis.core.config.ClusterConnectionConfiguration; import org.apache.activemq.artemis.core.config.Configuration; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; import org.apache.activemq.artemis.core.config.ha.SharedStoreSlavePolicyConfiguration; import org.apache.activemq.artemis.core.io.IOCallback; import org.apache.activemq.artemis.core.io.SequentialFileFactory; @@ -85,6 +87,7 @@ import org.apache.activemq.artemis.core.server.cluster.ClusterController; import org.apache.activemq.artemis.core.server.impl.ActiveMQServerImpl; import org.apache.activemq.artemis.core.settings.HierarchicalRepository; import org.apache.activemq.artemis.core.settings.impl.AddressSettings; +import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; 
import org.apache.activemq.artemis.spi.core.protocol.RemotingConnection; import org.apache.activemq.artemis.tests.util.ActiveMQTestBase; import org.apache.activemq.artemis.tests.util.ReplicatedBackupUtils; @@ -97,10 +100,26 @@ import org.apache.activemq.artemis.utils.critical.EmptyCriticalAnalyzer; import org.junit.After; import org.junit.Assert; import org.junit.Before; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +@RunWith(Parameterized.class) public final class ReplicationTest extends ActiveMQTestBase { + @Rule + public TemporaryFolder tmpFolder = new TemporaryFolder(); + + @Parameterized.Parameter + public boolean pluggableQuorum; + + @Parameterized.Parameters(name = "PluggableQuorum={0}") + public static Iterable data() { + return Arrays.asList(new Object[][]{{true}, {false}}); + } + private ThreadFactory tFactory; private ExecutorService executor; private ExecutorFactory factory; @@ -144,7 +163,15 @@ public final class ReplicationTest extends ActiveMQTestBase { Configuration backupConfig = createDefaultInVMConfig().setHAPolicyConfiguration(new SharedStoreSlavePolicyConfiguration()).setBindingsDirectory(getBindingsDir(0, true)).setJournalDirectory(getJournalDir(0, true)).setPagingDirectory(getPageDir(0, true)).setLargeMessagesDirectory(getLargeMessagesDir(0, true)).setIncomingInterceptorClassNames(incomingInterceptors.length > 0 ? Arrays.asList(incomingInterceptors) : new ArrayList()); - ReplicatedBackupUtils.configureReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, liveAcceptor); + if (!pluggableQuorum) { + ReplicatedBackupUtils.configureReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, liveAcceptor); + } else { + DistributedPrimitiveManagerConfiguration managerConfiguration = + new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), + Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + + ReplicatedBackupUtils.configurePluggableQuorumReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, liveAcceptor, managerConfiguration, managerConfiguration); + } if (extraConfig != null) { extraConfig.config(liveConfig, backupConfig); @@ -558,7 +585,7 @@ public final class ReplicationTest extends ActiveMQTestBase { blockOnReplication(storage, manager); - LargeServerMessageImpl message1 = (LargeServerMessageImpl) backupServer.getReplicationEndpoint().getLargeMessages().get(Long.valueOf(500)); + LargeServerMessageImpl message1 = (LargeServerMessageImpl) getReplicationEndpoint(backupServer).getLargeMessages().get(Long.valueOf(500)); Assert.assertNotNull(message1); Assert.assertFalse(largeMsg.getAppendFile().isOpen()); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationFlowControlTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationFlowControlTest.java index f2a8a28309..1d923731d6 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationFlowControlTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationFlowControlTest.java @@ -47,6 +47,7 @@ import 
org.apache.activemq.artemis.api.core.client.ServerLocator; import org.apache.activemq.artemis.core.client.impl.ServerLocatorImpl; import org.apache.activemq.artemis.core.config.ClusterConnectionConfiguration; import org.apache.activemq.artemis.core.config.Configuration; +import org.apache.activemq.artemis.core.config.HAPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicaPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicatedPolicyConfiguration; import org.apache.activemq.artemis.core.config.impl.ConfigurationImpl; @@ -173,7 +174,6 @@ public class SharedNothingReplicationFlowControlTest extends ActiveMQTestBase { SequentialFileFactory fileFactory; - File liveJournalDir = brokersFolder.getRoot().toPath().resolve("live").resolve("data").resolve("journal").toFile(); fileFactory = new MappedSequentialFileFactory(liveConfiguration.getJournalLocation(), liveConfiguration.getJournalFileSize(), false, liveConfiguration.getJournalBufferSize_NIO(), liveConfiguration.getJournalBufferTimeout_NIO(), null); JournalImpl liveMessageJournal = new JournalImpl(liveConfiguration.getJournalFileSize(), liveConfiguration.getJournalMinFiles(), liveConfiguration.getJournalPoolFiles(), liveConfiguration.getJournalCompactMinFiles(), liveConfiguration.getJournalCompactPercentage(), fileFactory, "activemq-data", "amq", fileFactory.getMaxIO()); @@ -355,6 +355,12 @@ public class SharedNothingReplicationFlowControlTest extends ActiveMQTestBase { } } + protected HAPolicyConfiguration createReplicationLiveConfiguration() { + return new ReplicatedPolicyConfiguration() + .setVoteOnReplicationFailure(false) + .setCheckForLiveServer(false); + } + // Set a small call timeout and write buffer high water mark value to trigger replication flow control private Configuration createLiveConfiguration() throws Exception { Configuration conf = new ConfigurationImpl(); @@ -370,10 +376,7 @@ public class SharedNothingReplicationFlowControlTest extends ActiveMQTestBase { conf.setClusterUser("mycluster"); conf.setClusterPassword("mypassword"); - ReplicatedPolicyConfiguration haPolicy = new ReplicatedPolicyConfiguration(); - haPolicy.setVoteOnReplicationFailure(false); - haPolicy.setCheckForLiveServer(false); - conf.setHAPolicyConfiguration(haPolicy); + conf.setHAPolicyConfiguration(createReplicationLiveConfiguration()); ClusterConnectionConfiguration ccconf = new ClusterConnectionConfiguration(); ccconf.setStaticConnectors(new ArrayList<>()).getStaticConnectors().add("backup"); @@ -387,6 +390,10 @@ public class SharedNothingReplicationFlowControlTest extends ActiveMQTestBase { return conf; } + protected HAPolicyConfiguration createReplicationBackupConfiguration() { + return new ReplicaPolicyConfiguration().setClusterName("cluster"); + } + private Configuration createBackupConfiguration() throws Exception { Configuration conf = new ConfigurationImpl(); conf.setName("localhost::backup"); @@ -394,9 +401,7 @@ public class SharedNothingReplicationFlowControlTest extends ActiveMQTestBase { File backupDir = brokersFolder.newFolder("backup"); conf.setBrokerInstance(backupDir); - ReplicaPolicyConfiguration haPolicy = new ReplicaPolicyConfiguration(); - haPolicy.setClusterName("cluster"); - conf.setHAPolicyConfiguration(haPolicy); + conf.setHAPolicyConfiguration(createReplicationBackupConfiguration()); conf.addAcceptorConfiguration("backup", "tcp://localhost:61617"); conf.addConnectorConfiguration("live", "tcp://localhost:61616"); diff --git 
a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationTest.java index 7cc8bb9397..1d3317c40e 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationTest.java @@ -31,6 +31,7 @@ import org.apache.activemq.artemis.api.core.client.TopologyMember; import org.apache.activemq.artemis.core.client.impl.ServerLocatorImpl; import org.apache.activemq.artemis.core.config.ClusterConnectionConfiguration; import org.apache.activemq.artemis.core.config.Configuration; +import org.apache.activemq.artemis.core.config.HAPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicaPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicatedPolicyConfiguration; import org.apache.activemq.artemis.core.config.impl.ConfigurationImpl; @@ -237,6 +238,12 @@ public class SharedNothingReplicationTest extends ActiveMQTestBase { Assert.assertTrue("The test is not valid, slow persister stopped being used", SlowMessagePersister._getInstance().used); } + protected HAPolicyConfiguration createReplicationLiveConfiguration() { + return new ReplicatedPolicyConfiguration() + .setVoteOnReplicationFailure(false) + .setCheckForLiveServer(false); + } + private Configuration createLiveConfiguration() throws Exception { Configuration conf = new ConfigurationImpl(); conf.setName("localhost::live"); @@ -251,10 +258,7 @@ public class SharedNothingReplicationTest extends ActiveMQTestBase { conf.setClusterUser("mycluster"); conf.setClusterPassword("mypassword"); - ReplicatedPolicyConfiguration haPolicy = new ReplicatedPolicyConfiguration(); - haPolicy.setVoteOnReplicationFailure(false); - haPolicy.setCheckForLiveServer(false); - conf.setHAPolicyConfiguration(haPolicy); + conf.setHAPolicyConfiguration(createReplicationLiveConfiguration()); ClusterConnectionConfiguration ccconf = new ClusterConnectionConfiguration(); ccconf.setStaticConnectors(new ArrayList<>()).getStaticConnectors().add("backup"); @@ -267,6 +271,10 @@ public class SharedNothingReplicationTest extends ActiveMQTestBase { return conf; } + protected HAPolicyConfiguration createReplicationBackupConfiguration() { + return new ReplicaPolicyConfiguration().setClusterName("cluster"); + } + private Configuration createBackupConfiguration() throws Exception { Configuration conf = new ConfigurationImpl(); conf.setName("localhost::backup"); @@ -274,9 +282,7 @@ public class SharedNothingReplicationTest extends ActiveMQTestBase { File backupDir = brokersFolder.newFolder("backup"); conf.setBrokerInstance(backupDir); - ReplicaPolicyConfiguration haPolicy = new ReplicaPolicyConfiguration(); - haPolicy.setClusterName("cluster"); - conf.setHAPolicyConfiguration(haPolicy); + conf.setHAPolicyConfiguration(createReplicationBackupConfiguration()); conf.addAcceptorConfiguration("backup", "tcp://localhost:61617"); conf.addConnectorConfiguration("live", "tcp://localhost:61616"); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java 
b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java new file mode 100644 index 0000000000..60f27a13c2 --- /dev/null +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.tests.integration.security; + +import java.util.Collections; + +import org.apache.activemq.artemis.api.core.ActiveMQException; +import org.apache.activemq.artemis.api.core.TransportConfiguration; +import org.apache.activemq.artemis.api.core.client.ClientSession; +import org.apache.activemq.artemis.api.core.client.ClientSessionFactory; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; +import org.apache.activemq.artemis.spi.core.security.ActiveMQBasicSecurityManager; +import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTestBase; +import org.apache.activemq.artemis.tests.util.ReplicatedBackupUtils; +import org.apache.activemq.artemis.tests.util.TransportConfigurationUtils; +import org.junit.Assert; +import org.junit.Test; + +public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTestBase { + + @Override + protected void createConfigs() throws Exception { + createPluggableReplicatedConfigs(); + } + + @Override + protected void createPluggableReplicatedConfigs() throws Exception { + final TransportConfiguration liveConnector = getConnectorTransportConfiguration(true); + final TransportConfiguration backupConnector = getConnectorTransportConfiguration(false); + final TransportConfiguration backupAcceptor = getAcceptorTransportConfiguration(false); + + backupConfig = createDefaultInVMConfig(); + liveConfig = createDefaultInVMConfig(); + + DistributedPrimitiveManagerConfiguration managerConfiguration = + new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), + Collections.singletonMap("locks-folder", + tmpFolder.newFolder("manager").toString())); + + ReplicatedBackupUtils.configurePluggableQuorumReplicationPair(backupConfig, backupConnector, backupAcceptor, + liveConfig, liveConnector, null, + managerConfiguration, managerConfiguration); + + backupConfig + .setSecurityEnabled(true) + .setBindingsDirectory(getBindingsDir(0, true)) + .setJournalDirectory(getJournalDir(0, true)) + .setPagingDirectory(getPageDir(0, true)) 
+ .setLargeMessagesDirectory(getLargeMessagesDir(0, true)); + + setupHAPolicyConfiguration(); + nodeManager = createReplicatedBackupNodeManager(backupConfig); + + backupServer = createTestableServer(backupConfig); + + backupServer.getServer().setSecurityManager(new ActiveMQBasicSecurityManager()); + + liveConfig + .setSecurityEnabled(true) + .clearAcceptorConfigurations() + .addAcceptorConfiguration(getAcceptorTransportConfiguration(true)); + + liveServer = createTestableServer(liveConfig); + + liveServer.getServer().setSecurityManager(new ActiveMQBasicSecurityManager()); + } + + @Override + protected TransportConfiguration getAcceptorTransportConfiguration(final boolean live) { + return TransportConfigurationUtils.getInVMAcceptor(live); + } + + @Override + protected TransportConfiguration getConnectorTransportConfiguration(final boolean live) { + return TransportConfigurationUtils.getInVMConnector(live); + } + + @Override + protected void setupHAPolicyConfiguration() { + ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); + ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); + } + + @Test + public void testFailover() throws Exception { + + liveServer.getServer().getActiveMQServerControl().addUser("foo", "bar", "baz", false); + + ClientSessionFactory cf = createSessionFactory(getServerLocator()); + ClientSession session = null; + + try { + session = cf.createSession("foo", "bar", false, true, true, false, 0); + } catch (ActiveMQException e) { + e.printStackTrace(); + Assert.fail("should not throw exception"); + } + + crash(session); + waitForServerToStart(backupServer.getServer()); + + try { + cf = createSessionFactory(getServerLocator()); + session = cf.createSession("foo", "bar", false, true, true, false, 0); + } catch (ActiveMQException e) { + e.printStackTrace(); + Assert.fail("should not throw exception"); + } + } +} + diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/server/ScaleDown3NodeTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/server/ScaleDown3NodeTest.java index 726b079e79..d7e3061f84 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/server/ScaleDown3NodeTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/server/ScaleDown3NodeTest.java @@ -48,11 +48,11 @@ public class ScaleDown3NodeTest extends ClusterTestBase { @Before public void setUp() throws Exception { super.setUp(); - setupLiveServer(0, isFileStorage(), false, isNetty(), true); + setupLiveServer(0, isFileStorage(), HAType.SharedNothingReplication, isNetty(), true); servers[0].getConfiguration().setSecurityEnabled(true); - setupLiveServer(1, isFileStorage(), false, isNetty(), true); + setupLiveServer(1, isFileStorage(), HAType.SharedNothingReplication, isNetty(), true); servers[1].getConfiguration().setSecurityEnabled(true); - setupLiveServer(2, isFileStorage(), false, isNetty(), true); + setupLiveServer(2, isFileStorage(), HAType.SharedNothingReplication, isNetty(), true); servers[2].getConfiguration().setSecurityEnabled(true); LiveOnlyPolicyConfiguration haPolicyConfiguration0 = (LiveOnlyPolicyConfiguration) servers[0].getConfiguration().getHAPolicyConfiguration(); ScaleDownConfiguration scaleDownConfiguration0 = new ScaleDownConfiguration(); diff --git 
a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/util/ReplicatedBackupUtils.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/util/ReplicatedBackupUtils.java index 1a38a6ac70..e2a4fcf8f4 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/util/ReplicatedBackupUtils.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/util/ReplicatedBackupUtils.java @@ -18,8 +18,11 @@ package org.apache.activemq.artemis.tests.util; import org.apache.activemq.artemis.api.core.TransportConfiguration; import org.apache.activemq.artemis.core.config.Configuration; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicaPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicatedPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; public final class ReplicatedBackupUtils { @@ -48,4 +51,30 @@ public final class ReplicatedBackupUtils { liveConfig.setName(LIVE_NODE_NAME).addConnectorConfiguration(LIVE_NODE_NAME, liveConnector).addConnectorConfiguration(BACKUP_NODE_NAME, backupConnector).setSecurityEnabled(false).addClusterConfiguration(ActiveMQTestBase.basicClusterConnectionConfig(LIVE_NODE_NAME, BACKUP_NODE_NAME)).setHAPolicyConfiguration(new ReplicatedPolicyConfiguration()); } + + + public static void configurePluggableQuorumReplicationPair(Configuration backupConfig, + TransportConfiguration backupConnector, + TransportConfiguration backupAcceptor, + Configuration liveConfig, + TransportConfiguration liveConnector, + TransportConfiguration liveAcceptor, + DistributedPrimitiveManagerConfiguration primaryManagerConfiguration, + DistributedPrimitiveManagerConfiguration backupManagerConfiguration) { + if (backupAcceptor != null) { + backupConfig.clearAcceptorConfigurations().addAcceptorConfiguration(backupAcceptor); + } + + if (liveAcceptor != null) { + liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(liveAcceptor); + } + + backupConfig.addConnectorConfiguration(BACKUP_NODE_NAME, backupConnector).addConnectorConfiguration(LIVE_NODE_NAME, liveConnector).addClusterConfiguration(ActiveMQTestBase.basicClusterConnectionConfig(BACKUP_NODE_NAME, LIVE_NODE_NAME)) + .setHAPolicyConfiguration(ReplicationBackupPolicyConfiguration.withDefault() + .setDistributedManagerConfiguration(backupManagerConfiguration)); + + liveConfig.setName(LIVE_NODE_NAME).addConnectorConfiguration(LIVE_NODE_NAME, liveConnector).addConnectorConfiguration(BACKUP_NODE_NAME, backupConnector).setSecurityEnabled(false).addClusterConfiguration(ActiveMQTestBase.basicClusterConnectionConfig(LIVE_NODE_NAME, BACKUP_NODE_NAME)) + .setHAPolicyConfiguration(ReplicationPrimaryPolicyConfiguration.withDefault() + .setDistributedManagerConfiguration(primaryManagerConfiguration)); + } } diff --git a/tests/smoke-tests/pom.xml b/tests/smoke-tests/pom.xml index cc5a141216..764fa0a396 100644 --- a/tests/smoke-tests/pom.xml +++ b/tests/smoke-tests/pom.xml @@ -167,6 +167,38 @@ test test-jar + + + org.apache.curator + curator-recipes + test + + + org.apache.curator + curator-client + test + + + org.apache.curator + curator-framework + test + + + org.apache.curator + curator-test + ${curator.version} + test + + + org.apache.zookeeper + zookeeper + test + + + org.apache.zookeeper + 
zookeeper-jute + test + @@ -624,6 +656,48 @@ + + + test-compile + create-zk-replication-primary + + create + + + + ${basedir}/target/classes/servers/zkReplicationPrimary + true + admin + admin + ${basedir}/target/zkReplicationPrimary + + + --java-options + -Djava.rmi.server.hostname=localhost + + + + + test-compile + create-zk-replication-backup + + create + + + + ${basedir}/target/classes/servers/zkReplicationBackup + true + admin + admin + ${basedir}/target/zkReplicationBackup + + + --java-options + -Djava.rmi.server.hostname=localhost + + + + test-compile create-replicated-failback-master2 diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationBackup/broker.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationBackup/broker.xml new file mode 100644 index 0000000000..8b97bf9831 --- /dev/null +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationBackup/broker.xml @@ -0,0 +1,138 @@ + + + + + + backup + + ./data/bindings + + ./data/journal + + ./data/largemessages + + ./data/paging + + + + + + org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager + + + + + true + + + + + + + tcp://localhost:61916 + tcp://localhost:61616 + + + + + + tcp://localhost:61916 + + + admin + + password + + + + artemis + OFF + 1 + + primary + + + + + + + + + + + + + + + + + + + + + + + + + + + DLQ + ExpiryQueue + 0 + + -1 + 10 + PAGE + true + true + true + true + + + + DLQ + ExpiryQueue + 0 + + 10MB + 1MB + + 10 + PAGE + true + true + true + true + + + + +
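The zkReplicationBackup broker.xml above wires its ha-policy to org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager; the matching zkReplicationPrimary configuration follows below. A minimal programmatic sketch of the same live/backup pairing, using the configuration classes this patch adds (the "connect-string" property key and the 127.0.0.1:2181 address are illustrative assumptions, not values taken from these files):

    import java.util.Collections;

    import org.apache.activemq.artemis.core.config.Configuration;
    import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
    import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
    import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
    import org.apache.activemq.artemis.core.config.impl.ConfigurationImpl;

    public final class ZkQuorumConfigSketch {

       public static void main(String[] args) {
          // ZooKeeper-backed manager shared by both brokers of the pair;
          // connect-string and address are assumed values for illustration
          DistributedPrimitiveManagerConfiguration manager =
             new DistributedPrimitiveManagerConfiguration(
                "org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager",
                Collections.singletonMap("connect-string", "127.0.0.1:2181"));

          // primary side: programmatic equivalent of the primary replication ha-policy
          // in zkReplicationPrimary/broker.xml
          Configuration primary = new ConfigurationImpl().setName("primary")
             .setHAPolicyConfiguration(ReplicationPrimaryPolicyConfiguration.withDefault()
                .setDistributedManagerConfiguration(manager));

          // backup side: programmatic equivalent of the backup replication ha-policy;
          // allow-failback mirrors the failover test configuration earlier in this patch
          Configuration backup = new ConfigurationImpl().setName("backup")
             .setHAPolicyConfiguration(ReplicationBackupPolicyConfiguration.withDefault()
                .setAllowFailBack(true)
                .setDistributedManagerConfiguration(manager));

          System.out.println(primary.getHAPolicyConfiguration().getType() + " / "
             + backup.getHAPolicyConfiguration().getType());
       }
    }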

+ + + diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationBackup/management.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationBackup/management.xml new file mode 100644 index 0000000000..14bbaf2218 --- /dev/null +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationBackup/management.xml @@ -0,0 +1,20 @@ + + + + + \ No newline at end of file diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/broker.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/broker.xml new file mode 100644 index 0000000000..2095d20830 --- /dev/null +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/broker.xml @@ -0,0 +1,137 @@ + + + + + + primary + + ./data/bindings + + ./data/journal + + ./data/largemessages + + ./data/paging + + + + + + org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager + + + + + true + + + + + + + tcp://localhost:61616 + tcp://localhost:61916 + + + + + tcp://localhost:61616 + + + admin + + password + + + + artemis + OFF + 1 + + backup + + + + + + + + + + + + + + + + + + + + + + + + + + + DLQ + ExpiryQueue + 0 + + -1 + 10 + PAGE + true + true + true + true + + + + DLQ + ExpiryQueue + 0 + + 10MB + 1MB + + 10 + PAGE + true + true + true + true + + + + +
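The Curator manager class named in these broker.xml files implements the DistributedPrimitiveManager API introduced by this patch series: a distributed lock plus a shared long, as exercised by DistributedLockTest and FileBasedPrimitiveManager further down. A minimal sketch of that API against the file-based test implementation; the start()/stop() lifecycle calls and the lock/counter identifiers are assumptions made for illustration:

    import java.util.Collections;

    import org.apache.activemq.artemis.quorum.DistributedLock;
    import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
    import org.apache.activemq.artemis.quorum.MutableLong;

    public final class QuorumApiSketch {

       public static void main(String[] args) throws Exception {
          // file-based manager from this patch, intended for local tests only; the ZooKeeper
          // CuratorDistributedPrimitiveManager is created the same way with its own properties.
          // "locks-folder" must point to an existing directory.
          DistributedPrimitiveManager manager = DistributedPrimitiveManager.newInstanceOf(
             "org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager",
             Collections.singletonMap("locks-folder", "/tmp/locks"));
          manager.start(); // lifecycle call assumed from the quorum API added by this series
          try {
             // hypothetical lock name; the broker coordinates on its own id
             DistributedLock liveLock = manager.getDistributedLock("primary-live-lock");
             if (liveLock.tryLock()) {
                try {
                   // shared counter of the kind patch 2 uses to track activation sequences
                   MutableLong sequence = manager.getMutableLong("primary-activation-sequence");
                   sequence.set(sequence.get() + 1);
                } finally {
                   liveLock.unlock();
                }
             }
          } finally {
             manager.stop(); // assumed counterpart of start()
          }
       }
    }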
diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/management.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/management.xml new file mode 100644 index 0000000000..576f1e5995 --- /dev/null +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/management.xml @@ -0,0 +1,20 @@ + + + + + \ No newline at end of file diff --git a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/PluggableQuorumSinglePairTest.java b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/PluggableQuorumSinglePairTest.java new file mode 100644 index 0000000000..d5948f57a1 --- /dev/null +++ b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/PluggableQuorumSinglePairTest.java @@ -0,0 +1,276 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.activemq.artemis.tests.smoke.quorum; + +import javax.management.remote.JMXServiceURL; +import java.net.MalformedURLException; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Objects; +import java.util.Optional; +import java.util.concurrent.TimeUnit; + +import org.apache.activemq.artemis.api.config.ActiveMQDefaultConfiguration; +import org.apache.activemq.artemis.api.core.management.ObjectNameBuilder; +import org.apache.activemq.artemis.tests.smoke.common.SmokeTestBase; +import org.apache.activemq.artemis.tests.smoke.utils.Jmx; +import org.apache.activemq.artemis.util.ServerUtil; +import org.apache.activemq.artemis.utils.Wait; +import org.jboss.logging.Logger; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.backupOf; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.containsExactNodeIds; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.decodeNetworkTopologyJson; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.liveOf; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.validateNetworkTopology; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withBackup; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withLive; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withMembers; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withNodes; + +@RunWith(Parameterized.class) +public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { + + private static final Logger LOGGER = Logger.getLogger(PluggableQuorumSinglePairTest.class); + + private static final String JMX_SERVER_HOSTNAME = 
"localhost"; + private static final int JMX_PORT_PRIMARY = 10099; + private static final int JMX_PORT_BACKUP = 10199; + + private static final String PRIMARY_DATA_FOLDER = "ReplicationPrimary";; + private static final String BACKUP_DATA_FOLDER = "ReplicationBackup"; + + private static final int PRIMARY_PORT_OFFSET = 0; + private static final int BACKUP_PORT_OFFSET = PRIMARY_PORT_OFFSET + 100; + + public static class BrokerControl { + + final String name; + final ObjectNameBuilder objectNameBuilder; + final String dataFolder; + final JMXServiceURL jmxServiceURL; + final int portID; + + private BrokerControl(final String name, int jmxPort, String dataFolder, int portID) { + this.portID = portID; + this.dataFolder = dataFolder; + try { + jmxServiceURL = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://" + JMX_SERVER_HOSTNAME + ":" + jmxPort + "/jmxrmi"); + } catch (MalformedURLException e) { + throw new RuntimeException(e); + } + this.objectNameBuilder = ObjectNameBuilder.create(ActiveMQDefaultConfiguration.getDefaultJmxDomain(), name, true); + this.name = name; + } + + public Process startServer(SmokeTestBase env, int millisTimeout) throws Exception { + return env.startServer(dataFolder, portID, millisTimeout); + } + + public void cleanupData() { + SmokeTestBase.cleanupData(dataFolder); + } + + public Optional isReplicaSync() throws Exception { + return Jmx.isReplicaSync(jmxServiceURL, objectNameBuilder); + } + + public Optional isBackup() throws Exception { + return Jmx.isBackup(jmxServiceURL, objectNameBuilder); + } + + public Optional getNodeID() throws Exception { + return Jmx.getNodeID(jmxServiceURL, objectNameBuilder); + } + + public Optional listNetworkTopology() throws Exception { + return Jmx.listNetworkTopology(jmxServiceURL, objectNameBuilder); + } + } + + @Parameterized.Parameter + public boolean forceKill; + + @Parameterized.Parameters(name = "forceKill={0}") + public static Iterable getParams() { + return Arrays.asList(new Object[][]{{false}, {true}}); + } + + private final BrokerControl primary; + private final BrokerControl backup; + private final Collection brokers; + + public PluggableQuorumSinglePairTest(String brokerFolderPrefix) { + primary = new BrokerControl("primary", JMX_PORT_PRIMARY, brokerFolderPrefix + PRIMARY_DATA_FOLDER, PRIMARY_PORT_OFFSET); + backup = new BrokerControl("backup", JMX_PORT_BACKUP, brokerFolderPrefix + BACKUP_DATA_FOLDER, BACKUP_PORT_OFFSET); + brokers = Collections.unmodifiableList(Arrays.asList(primary, backup)); + } + + protected abstract boolean awaitAsyncSetupCompleted(long timeout, TimeUnit unit) throws InterruptedException; + + protected abstract void stopMajority() throws Exception; + + @Before + public void setup() throws Exception { + brokers.forEach(BrokerControl::cleanupData); + } + + @Override + @After + public void after() throws Exception { + super.after(); + } + + @Test + public void testBackupFailoverAndPrimaryFailback() throws Exception { + final int timeout = (int) TimeUnit.SECONDS.toMillis(30); + LOGGER.info("starting primary"); + Process primaryInstance = primary.startServer(this, timeout); + Assert.assertTrue(awaitAsyncSetupCompleted(timeout, TimeUnit.MILLISECONDS)); + Wait.assertTrue(() -> !primary.isBackup().orElse(true), timeout); + LOGGER.info("started primary"); + LOGGER.info("starting backup"); + Process backupInstance = backup.startServer(this, 0); + Wait.assertTrue(() -> backup.isBackup().orElse(false), timeout); + final String nodeID = primary.getNodeID().get(); + Assert.assertNotNull(nodeID); + 
LOGGER.infof("NodeID: %s", nodeID); + for (BrokerControl broker : brokers) { + Wait.assertTrue(() -> validateNetworkTopology(broker.listNetworkTopology().orElse(""), + containsExactNodeIds(nodeID) + .and(withLive(nodeID, Objects::nonNull)) + .and(withBackup(nodeID, Objects::nonNull)) + .and(withMembers(1)) + .and(withNodes(2))), timeout); + } + LOGGER.infof("primary topology is: %s", primary.listNetworkTopology().get()); + LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get()); + Assert.assertTrue(backup.isReplicaSync().get()); + LOGGER.infof("backup is synchronized with live"); + final String urlBackup = backupOf(nodeID, decodeNetworkTopologyJson(backup.listNetworkTopology().get())); + Assert.assertNotNull(urlBackup); + LOGGER.infof("backup: %s", urlBackup); + final String urlPrimary = liveOf(nodeID, decodeNetworkTopologyJson(primary.listNetworkTopology().get())); + Assert.assertNotNull(urlPrimary); + LOGGER.infof("primary: %s", urlPrimary); + Assert.assertNotEquals(urlPrimary, urlBackup); + LOGGER.info("killing primary"); + ServerUtil.killServer(primaryInstance, forceKill); + LOGGER.info("killed primary"); + Wait.assertTrue(() -> !backup.isBackup().orElse(true), timeout); + Wait.assertTrue(() -> validateNetworkTopology(backup.listNetworkTopology().orElse(""), + containsExactNodeIds(nodeID) + .and(withLive(nodeID, urlBackup::equals)) + .and(withBackup(nodeID, Objects::isNull)) + .and(withMembers(1)) + .and(withNodes(1))), timeout); + LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get()); + Assert.assertEquals(nodeID, backup.getNodeID().get()); + // wait a bit before restarting primary + LOGGER.info("waiting before starting primary"); + TimeUnit.SECONDS.sleep(4); + LOGGER.info("starting primary"); + primary.startServer(this, 0); + LOGGER.info("started primary"); + Wait.assertTrue(() -> backup.isBackup().orElse(false), timeout); + Assert.assertTrue(!primary.isBackup().get()); + for (BrokerControl broker : brokers) { + Wait.assertTrue(() -> validateNetworkTopology(broker.listNetworkTopology().orElse(""), + containsExactNodeIds(nodeID) + .and(withLive(nodeID, urlPrimary::equals)) + .and(withBackup(nodeID, urlBackup::equals)) + .and(withMembers(1)) + .and(withNodes(2))), timeout); + } + LOGGER.infof("primary topology is: %s", primary.listNetworkTopology().get()); + LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get()); + Assert.assertTrue(backup.isReplicaSync().get()); + LOGGER.infof("backup is synchronized with live"); + Assert.assertEquals(nodeID, primary.getNodeID().get()); + } + + @Test + public void testLivePrimarySuicideOnLostQuorum() throws Exception { + final int timeout = (int) TimeUnit.SECONDS.toMillis(30); + Process primaryInstance = primary.startServer(this, timeout); + Assert.assertTrue(awaitAsyncSetupCompleted(timeout, TimeUnit.MILLISECONDS)); + Wait.assertTrue(() -> !primary.isBackup().orElse(true), timeout); + final String nodeID = primary.getNodeID().get(); + Wait.assertTrue(() -> validateNetworkTopology(primary.listNetworkTopology().orElse(""), + containsExactNodeIds(nodeID) + .and(withLive(nodeID, Objects::nonNull)) + .and(withBackup(nodeID, Objects::isNull)) + .and(withMembers(1)) + .and(withNodes(1))), timeout); + final String urlLive = liveOf(nodeID, decodeNetworkTopologyJson(primary.listNetworkTopology().get())); + Assert.assertTrue(validateNetworkTopology(primary.listNetworkTopology().orElse(""), + containsExactNodeIds(nodeID) + .and(withLive(nodeID, urlLive::equals)) + .and(withBackup(nodeID, 
Objects::isNull)) + .and(withMembers(1)) + .and(withNodes(1)))); + stopMajority(); + Wait.waitFor(()-> !primaryInstance.isAlive(), timeout); + } + + @Test + public void testLiveBackupSuicideOnLostQuorum() throws Exception { + final int timeout = (int) TimeUnit.SECONDS.toMillis(30); + Process primaryInstance = primary.startServer(this, timeout); + Assert.assertTrue(awaitAsyncSetupCompleted(timeout, TimeUnit.MILLISECONDS)); + Wait.assertTrue(() -> !primary.isBackup().orElse(true), timeout); + Process backupInstance = backup.startServer(this, 0); + Wait.assertTrue(() -> backup.isBackup().orElse(false), timeout); + final String nodeID = primary.getNodeID().get(); + Assert.assertNotNull(nodeID); + for (BrokerControl broker : brokers) { + Wait.assertTrue(() -> validateNetworkTopology(broker.listNetworkTopology().orElse(""), + containsExactNodeIds(nodeID) + .and(withLive(nodeID, Objects::nonNull)) + .and(withBackup(nodeID, Objects::nonNull)) + .and(withMembers(1)) + .and(withNodes(2))), timeout); + } + Assert.assertTrue(backup.isReplicaSync().get()); + final String urlBackup = backupOf(nodeID, decodeNetworkTopologyJson(backup.listNetworkTopology().get())); + Assert.assertNotNull(urlBackup); + final String urlPrimary = liveOf(nodeID, decodeNetworkTopologyJson(primary.listNetworkTopology().get())); + Assert.assertNotNull(urlPrimary); + Assert.assertNotEquals(urlPrimary, urlBackup); + ServerUtil.killServer(primaryInstance, forceKill); + Wait.assertTrue(() -> !backup.isBackup().orElse(true), timeout); + Wait.assertTrue(() -> validateNetworkTopology(backup.listNetworkTopology().orElse(""), + containsExactNodeIds(nodeID) + .and(withLive(nodeID, urlBackup::equals)) + .and(withBackup(nodeID, Objects::isNull)) + .and(withMembers(1)) + .and(withNodes(1))), timeout); + Assert.assertEquals(nodeID, backup.getNodeID().get()); + stopMajority(); + Wait.waitFor(()-> !backupInstance.isAlive(), timeout); + } + +} + diff --git a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java new file mode 100644 index 0000000000..df2123f7d6 --- /dev/null +++ b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java @@ -0,0 +1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.activemq.artemis.tests.smoke.quorum; + +import java.util.List; +import java.util.concurrent.TimeUnit; +import org.apache.curator.test.InstanceSpec; +import org.apache.curator.test.TestingCluster; +import org.apache.curator.test.TestingZooKeeperServer; +import org.jboss.logging.Logger; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.rules.TemporaryFolder; + +public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSinglePairTest { + + private static final Logger LOGGER = Logger.getLogger(ZookeeperPluggableQuorumSinglePairTest.class); + private static final int BASE_SERVER_PORT = 6666; + // Beware: the server tick must be small enough that to let the session to be correctly expired + private static final int SERVER_TICK_MS = 100; + + @Rule + public TemporaryFolder tmpFolder = new TemporaryFolder(); + private TestingCluster testingServer; + private InstanceSpec[] clusterSpecs; + private int nodes; + + @Before + @Override + public void setup() throws Exception { + super.setup(); + nodes = 3; + clusterSpecs = new InstanceSpec[nodes]; + for (int i = 0; i < nodes; i++) { + clusterSpecs[i] = new InstanceSpec(tmpFolder.newFolder(), BASE_SERVER_PORT + i, -1, -1, true, -1, SERVER_TICK_MS, -1); + } + testingServer = new TestingCluster(clusterSpecs); + testingServer.start(); + Assert.assertEquals("127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668", testingServer.getConnectString()); + LOGGER.infof("Cluster of %d nodes on: %s", 3, testingServer.getConnectString()); + } + + @Override + @After + public void after() throws Exception { + try { + super.after(); + } finally { + testingServer.close(); + } + } + + public ZookeeperPluggableQuorumSinglePairTest() { + super("zk"); + } + + @Override + protected boolean awaitAsyncSetupCompleted(long timeout, TimeUnit unit) { + return true; + } + + @Override + protected void stopMajority() throws Exception { + List followers = testingServer.getServers(); + final int quorum = (nodes / 2) + 1; + for (int i = 0; i < quorum; i++) { + followers.get(i).stop(); + } + } +} diff --git a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/utils/Jmx.java b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/utils/Jmx.java new file mode 100644 index 0000000000..079e2f75b0 --- /dev/null +++ b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/utils/Jmx.java @@ -0,0 +1,157 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.tests.smoke.utils; + +import javax.json.Json; +import javax.json.JsonArray; +import javax.json.JsonObject; +import javax.json.JsonReader; +import javax.management.MBeanServerInvocationHandler; +import javax.management.ObjectName; +import javax.management.remote.JMXConnector; +import javax.management.remote.JMXConnectorFactory; +import javax.management.remote.JMXServiceURL; +import java.io.StringReader; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Stream; + +import org.apache.activemq.artemis.api.core.Pair; +import org.apache.activemq.artemis.api.core.management.ActiveMQServerControl; +import org.apache.activemq.artemis.api.core.management.ObjectNameBuilder; +import org.jboss.logging.Logger; + +public class Jmx { + + private static final Logger LOGGER = Logger.getLogger(Jmx.class); + + @FunctionalInterface + public interface ThrowableFunction { + + R apply(T t) throws Throwable; + } + + private static Optional queryControl(JMXServiceURL serviceURI, + ObjectName objectName, + ThrowableFunction queryControl, + Class controlClass, + Function onThrowable) { + try { + try (JMXConnector jmx = JMXConnectorFactory.connect(serviceURI)) { + final C control = MBeanServerInvocationHandler.newProxyInstance(jmx.getMBeanServerConnection(), objectName, controlClass, false); + return Optional.ofNullable(queryControl.apply(control)); + } + } catch (Throwable t) { + return Optional.ofNullable(onThrowable.apply(t)); + } + } + + public static Optional isReplicaSync(JMXServiceURL serviceURI, ObjectNameBuilder builder) throws Exception { + return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::isReplicaSync, ActiveMQServerControl.class, throwable -> null); + } + + public static Optional isBackup(JMXServiceURL serviceURI, ObjectNameBuilder builder) throws Exception { + return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::isBackup, ActiveMQServerControl.class, throwable -> null); + } + + public static Optional getNodeID(JMXServiceURL serviceURI, ObjectNameBuilder builder) throws Exception { + return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::getNodeID, ActiveMQServerControl.class, throwable -> null); + } + + public static Optional listNetworkTopology(JMXServiceURL serviceURI, + ObjectNameBuilder builder) throws Exception { + return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::listNetworkTopology, ActiveMQServerControl.class, throwable -> null); + } + + public static Map> decodeNetworkTopologyJson(String networkTopologyJson) { + if (networkTopologyJson == null || networkTopologyJson.isEmpty()) { + return Collections.emptyMap(); + } + try (JsonReader jsonReader = Json.createReader(new StringReader(networkTopologyJson))) { + final JsonArray nodeIDs = jsonReader.readArray(); + final int nodeCount = nodeIDs.size(); + Map> networkTopology = new HashMap<>(nodeCount); + for (int i = 0; i < nodeCount; i++) { + final JsonObject nodePair = nodeIDs.getJsonObject(i); + try { + final String nodeID = nodePair.getString("nodeID"); + final String live = nodePair.getString("live"); + final String backup = nodePair.getString("backup", 
null); + networkTopology.put(nodeID, new Pair<>(live, backup)); + } catch (Exception e) { + LOGGER.warnf(e, "Error on %s", nodePair); + } + } + return networkTopology; + } + } + + private static long countMembers(Map> networkTopology) { + final long count = networkTopology.values().stream() + .map(Pair::getA).filter(live -> live != null && !live.isEmpty()) + .count(); + return count; + } + + private static long countNodes(Map> networkTopology) { + final long count = networkTopology.values().stream() + .flatMap(pair -> Stream.of(pair.getA(), pair.getB())) + .filter(liveOrBackup -> liveOrBackup != null && !liveOrBackup.isEmpty()) + .count(); + return count; + } + + public static boolean validateNetworkTopology(String networkTopologyJson, + Predicate>> checkTopology) { + final Map> networkTopology = decodeNetworkTopologyJson(networkTopologyJson); + return checkTopology.test(networkTopology); + } + + public static String backupOf(String nodeID, Map> networkTopology) { + return networkTopology.get(nodeID).getB(); + } + + public static String liveOf(String nodeID, Map> networkTopology) { + return networkTopology.get(nodeID).getA(); + } + + public static Predicate>> containsExactNodeIds(String... nodeID) { + Objects.requireNonNull(nodeID); + return topology -> topology.size() == nodeID.length && Stream.of(nodeID).allMatch(topology::containsKey); + } + + public static Predicate>> withMembers(int count) { + return topology -> countMembers(topology) == count; + } + + public static Predicate>> withNodes(int count) { + return topology -> countNodes(topology) == count; + } + + public static Predicate>> withBackup(String nodeId, Predicate compare) { + return topology -> compare.test(backupOf(nodeId, topology)); + } + + public static Predicate>> withLive(String nodeId, Predicate compare) { + return topology -> compare.test(liveOf(nodeId, topology)); + } +} From ca7a100de03ea665ef61701a0caccc00f9ac6ae2 Mon Sep 17 00:00:00 2001 From: gtully Date: Wed, 7 Jul 2021 20:54:49 +0100 Subject: [PATCH 2/4] ARTEMIS-3340 Sequential activation tracking for pluggable quorum replication policies + peer Co-authored-by: franz1981 --- .../activemq/artemis/cli/commands/Run.java | 18 +- .../activemq/artemis/logs/AuditLogger.java | 8 + .../management/ActiveMQServerControl.java | 9 + artemis-quorum-ri/pom.xml | 10 + .../file/FileBasedPrimitiveManager.java | 57 +- .../quorum/file/FileDistributedLock.java | 4 + .../artemis/quorum/DistributedLockTest.java | 1 + .../ReplicationBackupPolicyConfiguration.java | 25 - ...ReplicationPrimaryPolicyConfiguration.java | 61 +- .../impl/FileConfigurationParser.java | 16 +- .../impl/ActiveMQServerControlImpl.java | 11 + .../ReplicationStartSyncMessage.java | 23 +- .../core/replication/ReplicationEndpoint.java | 7 +- .../core/replication/ReplicationManager.java | 2 +- .../core/server/ActiveMQServerLogger.java | 4 +- .../artemis/core/server/NodeManager.java | 17 +- .../cluster/ha/ReplicationBackupPolicy.java | 31 +- .../cluster/ha/ReplicationPrimaryPolicy.java | 18 +- .../core/server/impl/ActiveMQServerImpl.java | 4 +- .../server/impl/ClusterTopologySearch.java | 160 ----- .../server/impl/FileBasedNodeManager.java | 40 +- .../impl/ReplicationBackupActivation.java | 166 ++--- .../impl/ReplicationPrimaryActivation.java | 197 +++--- .../ActivationSequenceStateMachine.java | 312 +++++++++ .../impl/ManagementServiceImpl.java | 4 +- .../schema/artemis-configuration.xsd | 38 +- .../impl/HAPolicyConfigurationTest.java | 6 - .../test/resources/backup-hapolicy-config.xml | 2 - 
.../resources/primary-hapolicy-config.xml | 3 - docs/user-manual/en/ha.md | 133 ++-- examples/features/ha/pom.xml | 2 + .../ha/zookeeper-single-pair-failback/pom.xml | 110 +++ .../zookeeper-single-pair-failback/readme.md | 94 +++ .../example/ZookeeperSinglePairFailback.java | 157 +++++ .../resources/activemq/server0/broker.xml | 90 +++ .../resources/activemq/server1/broker.xml | 91 +++ .../src/main/resources/jndi.properties | 20 + .../client/InfiniteRedeliveryTest.java | 7 +- .../cluster/failover/FailoverTest.java | 12 +- .../cluster/failover/FailoverTestBase.java | 28 +- .../MultipleServerFailoverTestBase.java | 8 +- .../cluster/failover/ReplicaTimeoutTest.java | 5 +- ...ultipleServerFailoverExtraBackupsTest.java | 15 +- ...uggableQuorumBackupAuthenticationTest.java | 2 - .../PluggableQuorumBackupSyncJournalTest.java | 5 +- ...ettyNoGroupNameReplicatedFailoverTest.java | 36 +- ...uggableQuorumNettyReplicationStopTest.java | 2 - ...rumPageCleanupWhileReplicaCatchupTest.java | 2 - .../PluggableQuorumReplicaTimeoutTest.java | 1 - ...rumReplicatedLargeMessageFailoverTest.java | 11 +- ...atedLargeMessageWithDelayFailoverTest.java | 2 - ...bleQuorumReplicatedPagingFailoverTest.java | 2 - .../ActiveMQServerControlUsingCoreTest.java | 5 + ...gableQuorumReplicationFlowControlTest.java | 3 - .../PluggableQuorumReplicationOrderTest.java | 5 +- .../PluggableQuorumReplicationTest.java | 636 +++++++++++++++++- .../SharedNothingReplicationTest.java | 4 +- ...uorumBasicSecurityManagerFailoverTest.java | 9 +- tests/smoke-tests/pom.xml | 38 ++ .../servers/zkReplicationPrimary/broker.xml | 1 - .../zkReplicationPrimaryPeerA/broker.xml | 138 ++++ .../zkReplicationPrimaryPeerA/management.xml | 20 + .../zkReplicationPrimaryPeerB/broker.xml | 138 ++++ .../zkReplicationPrimaryPeerB/management.xml | 20 + .../quorum/PluggableQuorumSinglePairTest.java | 140 +++- .../ZookeeperPluggableQuorumPeerTest.java | 109 +++ ...ookeeperPluggableQuorumSinglePairTest.java | 24 +- .../artemis/tests/smoke/utils/Jmx.java | 10 + .../unit/core/server/impl/FileLockTest.java | 20 + 69 files changed, 2691 insertions(+), 718 deletions(-) delete mode 100644 artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ClusterTopologySearch.java create mode 100644 artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/quorum/ActivationSequenceStateMachine.java create mode 100644 examples/features/ha/zookeeper-single-pair-failback/pom.xml create mode 100644 examples/features/ha/zookeeper-single-pair-failback/readme.md create mode 100644 examples/features/ha/zookeeper-single-pair-failback/src/main/java/org/apache/activemq/artemis/jms/example/ZookeeperSinglePairFailback.java create mode 100644 examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server0/broker.xml create mode 100644 examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server1/broker.xml create mode 100644 examples/features/ha/zookeeper-single-pair-failback/src/main/resources/jndi.properties create mode 100644 tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/broker.xml create mode 100644 tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/management.xml create mode 100644 tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/broker.xml create mode 100644 tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/management.xml create mode 100644 
tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumPeerTest.java diff --git a/artemis-cli/src/main/java/org/apache/activemq/artemis/cli/commands/Run.java b/artemis-cli/src/main/java/org/apache/activemq/artemis/cli/commands/Run.java index 8dfae7871b..b594e37785 100644 --- a/artemis-cli/src/main/java/org/apache/activemq/artemis/cli/commands/Run.java +++ b/artemis-cli/src/main/java/org/apache/activemq/artemis/cli/commands/Run.java @@ -19,7 +19,7 @@ package org.apache.activemq.artemis.cli.commands; import java.io.File; import java.util.Timer; import java.util.TimerTask; -import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; import io.airlift.airline.Command; import io.airlift.airline.Option; @@ -71,6 +71,7 @@ public class Run extends LockAbstract { public Object execute(ActionContext context) throws Exception { super.execute(context); + AtomicReference serverActivationFailed = new AtomicReference<>(); try { BrokerDTO broker = getBrokerDTO(); ActiveMQSecurityManager securityManager = SecurityManagerFactory.create(broker.security); @@ -110,8 +111,7 @@ public class Run extends LockAbstract { server = BrokerFactory.createServer(broker.server, securityManager, activateCallback); server.createComponents(); - AtomicBoolean serverActivationFailed = new AtomicBoolean(false); - server.getServer().registerActivationFailureListener(exception -> serverActivationFailed.set(true)); + server.getServer().registerActivationFailureListener(exception -> serverActivationFailed.set(exception)); server.start(); server.getServer().addExternalComponent(managementContext, false); @@ -126,14 +126,16 @@ public class Run extends LockAbstract { server.getServer().addExternalComponent(component, true); assert component.isStarted(); } - - if (serverActivationFailed.get()) { - stop(); - } } catch (Throwable t) { t.printStackTrace(); - stop(); + serverActivationFailed.set(t); } + + if (serverActivationFailed.get() != null) { + stop(); + return serverActivationFailed.get(); + } + return new Pair<>(managementContext, server.getServer()); } diff --git a/artemis-commons/src/main/java/org/apache/activemq/artemis/logs/AuditLogger.java b/artemis-commons/src/main/java/org/apache/activemq/artemis/logs/AuditLogger.java index ff9c7804dd..e9a63421c9 100644 --- a/artemis-commons/src/main/java/org/apache/activemq/artemis/logs/AuditLogger.java +++ b/artemis-commons/src/main/java/org/apache/activemq/artemis/logs/AuditLogger.java @@ -2851,4 +2851,12 @@ public interface AuditLogger extends BasicLogger { @LogMessage(level = Logger.Level.INFO) @Message(id = 601748, value = "User {0} is getting max retry interval on target resource: {1} {2}", format = Message.Format.MESSAGE_FORMAT) void getMaxRetryInterval(String user, Object source, Object... args); + + static void getActivationSequence(Object source) { + BASE_LOGGER.getActivationSequence(getCaller(), source); + } + + @LogMessage(level = Logger.Level.INFO) + @Message(id = 601749, value = "User {0} is getting activation sequence on target resource: {1} {2}", format = Message.Format.MESSAGE_FORMAT) + void getActivationSequence(String user, Object source, Object... 
args); } diff --git a/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/core/management/ActiveMQServerControl.java b/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/core/management/ActiveMQServerControl.java index e4db945cc2..f040fcaa7b 100644 --- a/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/core/management/ActiveMQServerControl.java +++ b/artemis-core-client/src/main/java/org/apache/activemq/artemis/api/core/management/ActiveMQServerControl.java @@ -321,6 +321,15 @@ public interface ActiveMQServerControl { @Attribute(desc = "Node ID of this server") String getNodeID(); + + /** + * Returns the current activation sequence number of this server. + *
+ * When replicated, peers may coordinate activation with this monotonic sequence + */ + @Attribute(desc = "Activation sequence of this server instance") + long getActivationSequence(); + /** * Returns the management notification address of this server. *
diff --git a/artemis-quorum-ri/pom.xml b/artemis-quorum-ri/pom.xml index 83c81678af..455c4d8b0c 100644 --- a/artemis-quorum-ri/pom.xml +++ b/artemis-quorum-ri/pom.xml @@ -44,6 +44,12 @@ org.apache.zookeeper zookeeper + + + org.slf4j + slf4j-log4j12 + + org.apache.curator @@ -59,6 +65,10 @@ org.jboss.logging jboss-logging + + org.jboss.slf4j + slf4j-jboss-logmanager + org.apache.activemq artemis-commons diff --git a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileBasedPrimitiveManager.java b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileBasedPrimitiveManager.java index 4764b7c614..2195d5a585 100644 --- a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileBasedPrimitiveManager.java +++ b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileBasedPrimitiveManager.java @@ -18,16 +18,18 @@ package org.apache.activemq.artemis.quorum.file; import java.io.File; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.HashMap; import java.util.Map; import java.util.Objects; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; import org.apache.activemq.artemis.quorum.DistributedLock; import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; import org.apache.activemq.artemis.quorum.MutableLong; +import org.apache.activemq.artemis.quorum.UnavailableStateException; /** * This is an implementation suitable to be used just on unit tests and it won't attempt @@ -127,8 +129,55 @@ public class FileBasedPrimitiveManager implements DistributedPrimitiveManager { } @Override - public MutableLong getMutableLong(String mutableLongId) throws InterruptedException, ExecutionException, TimeoutException { - // TODO - return null; + public MutableLong getMutableLong(final String mutableLongId) throws ExecutionException { + // use a lock file - but with a prefix + final FileDistributedLock fileDistributedLock = (FileDistributedLock) getDistributedLock("ML:" + mutableLongId); + return new MutableLong() { + @Override + public String getMutableLongId() { + return mutableLongId; + } + + @Override + public long get() throws UnavailableStateException { + try { + return readLong(fileDistributedLock); + } catch (IOException e) { + throw new UnavailableStateException(e); + } + } + + @Override + public void set(long value) throws UnavailableStateException { + try { + writeLong(fileDistributedLock, value); + } catch (IOException e) { + throw new UnavailableStateException(e); + } + } + + @Override + public void close() { + fileDistributedLock.close(); + } + }; + } + + private void writeLong(FileDistributedLock fileDistributedLock, long value) throws IOException { + ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN); + buffer.putLong(value); + buffer.flip(); + if (fileDistributedLock.getChannel().position(0).write(buffer) == Long.BYTES) { + fileDistributedLock.getChannel().force(false); + } + } + + private long readLong(FileDistributedLock fileDistributedLock) throws IOException { + ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN); + if (fileDistributedLock.getChannel().position(0).read(buffer, 0) != Long.BYTES) { + return 0; + } + buffer.flip(); + return buffer.getLong(); } } diff --git a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileDistributedLock.java 
b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileDistributedLock.java index 5e749698d6..13a9e78a9d 100644 --- a/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileDistributedLock.java +++ b/artemis-quorum-ri/src/main/java/org/apache/activemq/artemis/quorum/file/FileDistributedLock.java @@ -138,4 +138,8 @@ final class FileDistributedLock implements DistributedLock { public void close() { close(true); } + + public FileChannel getChannel() { + return channel; + } } diff --git a/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/DistributedLockTest.java b/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/DistributedLockTest.java index e6c8cab1d7..50e4d0c513 100644 --- a/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/DistributedLockTest.java +++ b/artemis-quorum-ri/src/test/java/org/apache/activemq/artemis/quorum/DistributedLockTest.java @@ -178,6 +178,7 @@ public abstract class DistributedLockTest { ownerManager.getDistributedLock("a").unlock(); Assert.assertFalse(observerManager.getDistributedLock("a").isHeldByCaller()); Assert.assertFalse(ownerManager.getDistributedLock("a").isHeldByCaller()); + Assert.assertTrue(observerManager.getDistributedLock("a").tryLock()); } @Test diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationBackupPolicyConfiguration.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationBackupPolicyConfiguration.java index f1c97eb7ea..2328d91ee9 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationBackupPolicyConfiguration.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationBackupPolicyConfiguration.java @@ -34,13 +34,6 @@ public class ReplicationBackupPolicyConfiguration implements HAPolicyConfigurati private long initialReplicationSyncTimeout = ActiveMQDefaultConfiguration.getDefaultInitialReplicationSyncTimeout(); - private int voteRetries = ActiveMQDefaultConfiguration.getDefaultVoteRetries(); - - /** - * TODO: move if into {@link ActiveMQDefaultConfiguration} when the configuration is stable. 
- */ - private long voteRetryWait = 2000; - private long retryReplicationWait = ActiveMQDefaultConfiguration.getDefaultRetryReplicationWait(); private DistributedPrimitiveManagerConfiguration distributedManagerConfiguration = null; @@ -102,24 +95,6 @@ public class ReplicationBackupPolicyConfiguration implements HAPolicyConfigurati return this; } - public int getVoteRetries() { - return voteRetries; - } - - public ReplicationBackupPolicyConfiguration setVoteRetries(int voteRetries) { - this.voteRetries = voteRetries; - return this; - } - - public ReplicationBackupPolicyConfiguration setVoteRetryWait(long voteRetryWait) { - this.voteRetryWait = voteRetryWait; - return this; - } - - public long getVoteRetryWait() { - return voteRetryWait; - } - public long getRetryReplicationWait() { return retryReplicationWait; } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationPrimaryPolicyConfiguration.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationPrimaryPolicyConfiguration.java index 7bb53184dd..a2e071fa1e 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationPrimaryPolicyConfiguration.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/config/ha/ReplicationPrimaryPolicyConfiguration.java @@ -21,25 +21,18 @@ import org.apache.activemq.artemis.core.config.HAPolicyConfiguration; public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfiguration { - private boolean checkForLiveServer = ActiveMQDefaultConfiguration.isDefaultCheckForLiveServer(); - private String groupName = null; private String clusterName = null; private long initialReplicationSyncTimeout = ActiveMQDefaultConfiguration.getDefaultInitialReplicationSyncTimeout(); - private int voteRetries = ActiveMQDefaultConfiguration.getDefaultVoteRetries(); - - /** - * TODO: move if into {@link ActiveMQDefaultConfiguration} when the configuration is stable. 
- */ - private long voteRetryWait = 2000; - private Long retryReplicationWait = ActiveMQDefaultConfiguration.getDefaultRetryReplicationWait(); private DistributedPrimitiveManagerConfiguration distributedManagerConfiguration = null; + private String coordinationId = null; + public static ReplicationPrimaryPolicyConfiguration withDefault() { return new ReplicationPrimaryPolicyConfiguration(); } @@ -52,15 +45,6 @@ public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfigurat return TYPE.PRIMARY; } - public boolean isCheckForLiveServer() { - return checkForLiveServer; - } - - public ReplicationPrimaryPolicyConfiguration setCheckForLiveServer(boolean checkForLiveServer) { - this.checkForLiveServer = checkForLiveServer; - return this; - } - public String getGroupName() { return groupName; } @@ -88,24 +72,6 @@ public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfigurat return this; } - public int getVoteRetries() { - return voteRetries; - } - - public ReplicationPrimaryPolicyConfiguration setVoteRetries(int voteRetries) { - this.voteRetries = voteRetries; - return this; - } - - public ReplicationPrimaryPolicyConfiguration setVoteRetryWait(long voteRetryWait) { - this.voteRetryWait = voteRetryWait; - return this; - } - - public long getVoteRetryWait() { - return voteRetryWait; - } - public void setRetryReplicationWait(Long retryReplicationWait) { this.retryReplicationWait = retryReplicationWait; } @@ -122,4 +88,27 @@ public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfigurat public DistributedPrimitiveManagerConfiguration getDistributedManagerConfiguration() { return distributedManagerConfiguration; } + + public String getCoordinationId() { + return coordinationId; + } + + public void setCoordinationId(String newCoordinationId) { + if (newCoordinationId == null) { + return; + } + final int len = newCoordinationId.length(); + if (len >= 16) { + this.coordinationId = newCoordinationId.substring(0, 16); + } else if (len % 2 != 0) { + // must be even for conversion to uuid, extend to next even + this.coordinationId = newCoordinationId + "+"; + } else if (len > 0 ) { + // run with it + this.coordinationId = newCoordinationId; + } + if (this.coordinationId != null) { + this.coordinationId = this.coordinationId.replace('-', '.'); + } + } } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java index 4932e01439..93eb2dbb0e 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java @@ -1715,22 +1715,18 @@ public final class FileConfigurationParser extends XMLConfigurationUtil { private ReplicationPrimaryPolicyConfiguration createReplicationPrimaryHaPolicy(Element policyNode, Configuration config) { ReplicationPrimaryPolicyConfiguration configuration = ReplicationPrimaryPolicyConfiguration.withDefault(); - configuration.setCheckForLiveServer(getBoolean(policyNode, "check-for-live-server", configuration.isCheckForLiveServer())); - configuration.setGroupName(getString(policyNode, "group-name", configuration.getGroupName(), Validators.NO_CHECK)); configuration.setClusterName(getString(policyNode, "cluster-name", configuration.getClusterName(), Validators.NO_CHECK)); 
configuration.setInitialReplicationSyncTimeout(getLong(policyNode, "initial-replication-sync-timeout", configuration.getInitialReplicationSyncTimeout(), Validators.GT_ZERO)); - configuration.setVoteRetries(getInteger(policyNode, "vote-retries", configuration.getVoteRetries(), Validators.MINUS_ONE_OR_GE_ZERO)); - - configuration.setVoteRetryWait(getLong(policyNode, "vote-retry-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO)); - - configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO)); + configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getRetryReplicationWait(), Validators.GT_ZERO)); configuration.setDistributedManagerConfiguration(createDistributedPrimitiveManagerConfiguration(policyNode, config)); + configuration.setCoordinationId(getString(policyNode, "coordination-id", configuration.getCoordinationId(), Validators.NOT_NULL_OR_EMPTY)); + return configuration; } @@ -1748,11 +1744,7 @@ public final class FileConfigurationParser extends XMLConfigurationUtil { configuration.setMaxSavedReplicatedJournalsSize(getInteger(policyNode, "max-saved-replicated-journals-size", configuration.getMaxSavedReplicatedJournalsSize(), Validators.MINUS_ONE_OR_GE_ZERO)); - configuration.setVoteRetries(getInteger(policyNode, "vote-retries", configuration.getVoteRetries(), Validators.MINUS_ONE_OR_GE_ZERO)); - - configuration.setVoteRetryWait(getLong(policyNode, "vote-retry-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO)); - - configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO)); + configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getRetryReplicationWait(), Validators.GT_ZERO)); configuration.setDistributedManagerConfiguration(createDistributedPrimitiveManagerConfiguration(policyNode, config)); diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/management/impl/ActiveMQServerControlImpl.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/management/impl/ActiveMQServerControlImpl.java index 28ea231994..145a23ad75 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/management/impl/ActiveMQServerControlImpl.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/management/impl/ActiveMQServerControlImpl.java @@ -4204,6 +4204,17 @@ public class ActiveMQServerControlImpl extends AbstractControl implements Active return server.getNodeID() == null ? 
null : server.getNodeID().toString(); } + @Override + public long getActivationSequence() { + if (AuditLogger.isBaseLoggingEnabled()) { + AuditLogger.getActivationSequence(this.server); + } + if (server.getNodeManager() != null) { + return server.getNodeManager().getNodeActivationSequence(); + } + return 0; + } + @Override public String getManagementNotificationAddress() { if (AuditLogger.isBaseLoggingEnabled()) { diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/protocol/core/impl/wireformat/ReplicationStartSyncMessage.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/protocol/core/impl/wireformat/ReplicationStartSyncMessage.java index 018535f426..ebd5d3603a 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/protocol/core/impl/wireformat/ReplicationStartSyncMessage.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/protocol/core/impl/wireformat/ReplicationStartSyncMessage.java @@ -43,7 +43,8 @@ public class ReplicationStartSyncMessage extends PacketImpl { public enum SyncDataType { JournalBindings(AbstractJournalStorageManager.JournalContent.BINDINGS.typeByte), JournalMessages(AbstractJournalStorageManager.JournalContent.MESSAGES.typeByte), - LargeMessages((byte) 2); + LargeMessages((byte) 2), + ActivationSequence((byte) 3); private byte code; @@ -62,6 +63,9 @@ public class ReplicationStartSyncMessage extends PacketImpl { return JournalMessages; if (code == LargeMessages.code) return LargeMessages; + if (code == ActivationSequence.code) + return ActivationSequence; + throw new InvalidParameterException("invalid byte: " + code); } } @@ -80,6 +84,14 @@ public class ReplicationStartSyncMessage extends PacketImpl { nodeID = ""; // this value will be ignored } + + public ReplicationStartSyncMessage(String nodeID, long nodeDataVersion) { + this(nodeID); + ids = new long[1]; + ids[0] = nodeDataVersion; + dataType = SyncDataType.ActivationSequence; + } + public ReplicationStartSyncMessage(String nodeID) { this(); synchronizationIsFinished = true; @@ -118,10 +130,6 @@ public class ReplicationStartSyncMessage extends PacketImpl { DataConstants.SIZE_BOOLEAN + // buffer.writeBoolean(allowsAutoFailBack); nodeID.length() * 3; // buffer.writeString(nodeID); -- an estimate - - if (synchronizationIsFinished) { - return size; - } size += DataConstants.SIZE_BYTE + // buffer.writeByte(dataType.code); DataConstants.SIZE_INT + // buffer.writeInt(ids.length); DataConstants.SIZE_LONG * ids.length; // the write loop @@ -135,8 +143,6 @@ public class ReplicationStartSyncMessage extends PacketImpl { buffer.writeBoolean(synchronizationIsFinished); buffer.writeBoolean(allowsAutoFailBack); buffer.writeString(nodeID); - if (synchronizationIsFinished) - return; buffer.writeByte(dataType.code); buffer.writeInt(ids.length); for (long id : ids) { @@ -149,9 +155,6 @@ public class ReplicationStartSyncMessage extends PacketImpl { synchronizationIsFinished = buffer.readBoolean(); allowsAutoFailBack = buffer.readBoolean(); nodeID = buffer.readString(); - if (synchronizationIsFinished) { - return; - } dataType = SyncDataType.getDataType(buffer.readByte()); int length = buffer.readInt(); ids = new long[length]; diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationEndpoint.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationEndpoint.java index 495b35d281..ad2093ad8b 100644 --- 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationEndpoint.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationEndpoint.java @@ -483,7 +483,7 @@ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQCompon logger.trace("Backup is synchronized / BACKUP-SYNC-DONE"); } - ActiveMQServerLogger.LOGGER.backupServerSynched(server); + ActiveMQServerLogger.LOGGER.backupServerSynchronized(server, liveID); return; } @@ -560,6 +560,11 @@ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQCompon return replicationResponseMessage; if (packet.isSynchronizationFinished()) { + if (packet.getFileIds() != null && packet.getFileIds().length == 1) { + // this is the version sequence of the data we are replicating + // verified if we activate with this data + server.getNodeManager().writeNodeActivationSequence(packet.getFileIds()[0]); + } finishSynchronization(packet.getNodeID()); replicationResponseMessage.setSynchronizationIsFinishedAcknowledgement(true); return replicationResponseMessage; diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationManager.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationManager.java index ccebb6a3a0..7be1e17554 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationManager.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/replication/ReplicationManager.java @@ -821,7 +821,7 @@ public final class ReplicationManager implements ActiveMQComponent { } synchronizationIsFinishedAcknowledgement.countUp(); - sendReplicatePacket(new ReplicationStartSyncMessage(nodeID)); + sendReplicatePacket(new ReplicationStartSyncMessage(nodeID, server.getNodeManager().getNodeActivationSequence())); try { if (!synchronizationIsFinishedAcknowledgement.await(initialReplicationSyncTimeout)) { ActiveMQReplicationTimeooutException exception = ActiveMQMessageBundle.BUNDLE.replicationSynchronizationTimeout(initialReplicationSyncTimeout); diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQServerLogger.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQServerLogger.java index 1baf9ca49b..7d46a0d6a6 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQServerLogger.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQServerLogger.java @@ -190,8 +190,8 @@ public interface ActiveMQServerLogger extends BasicLogger { void errorStoppingConnectorService(@Cause Throwable e, String name); @LogMessage(level = Logger.Level.INFO) - @Message(id = 221024, value = "Backup server {0} is synchronized with live-server.", format = Message.Format.MESSAGE_FORMAT) - void backupServerSynched(ActiveMQServerImpl server); + @Message(id = 221024, value = "Backup server {0} is synchronized with live server, nodeID={1}.", format = Message.Format.MESSAGE_FORMAT) + void backupServerSynchronized(ActiveMQServerImpl server, String liveID); @LogMessage(level = Logger.Level.INFO) @Message(id = 221025, value = "Replication: sending {0} (size={1}) to replica.", format = Message.Format.MESSAGE_FORMAT) diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/NodeManager.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/NodeManager.java index d136402f69..b6dd08f19c 100644 --- 
a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/NodeManager.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/NodeManager.java @@ -39,6 +39,7 @@ public abstract class NodeManager implements ActiveMQComponent { private UUID uuid; private boolean isStarted = false; private final Set lockListeners; + protected long nodeActivationSequence; // local version of a coordinated sequence, tracking state transitions of ownership public NodeManager(final boolean replicatedBackup) { this.replicatedBackup = replicatedBackup; @@ -79,18 +80,30 @@ public abstract class NodeManager implements ActiveMQComponent { } } - public long readDataVersion() throws NodeManagerException { + public long readNodeActivationSequence() throws NodeManagerException { // TODO make it abstract throw new UnsupportedOperationException("TODO"); } - public void writeDataVersion(long version) throws NodeManagerException { + public void writeNodeActivationSequence(long version) throws NodeManagerException { // TODO make it abstract throw new UnsupportedOperationException("TODO"); } public abstract SimpleString readNodeId() throws NodeManagerException; + public long getNodeActivationSequence() { + synchronized (nodeIDGuard) { + return nodeActivationSequence; + } + } + + public void setNodeActivationSequence(long activationSequence) { + synchronized (nodeIDGuard) { + nodeActivationSequence = activationSequence; + } + } + public UUID getUUID() { synchronized (nodeIDGuard) { return uuid; diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationBackupPolicy.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationBackupPolicy.java index 5427360fcf..c3d0c87d59 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationBackupPolicy.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/cluster/ha/ReplicationBackupPolicy.java @@ -32,10 +32,8 @@ public class ReplicationBackupPolicy implements HAPolicy activationParams, IOCriticalErrorListener shutdownOnCriticalIO) throws Exception { - return new ReplicationBackupActivation(server, wasLive, DistributedPrimitiveManager.newInstanceOf( - distributedManagerConfiguration.getClassName(), - distributedManagerConfiguration.getProperties()), this); + return new ReplicationBackupActivation(server, DistributedPrimitiveManager.newInstanceOf( + managerConfiguration.getClassName(), managerConfiguration.getProperties()), this); } @Override @@ -157,14 +146,6 @@ public class ReplicationBackupPolicy implements HAPolicy - * This search isn't filtering the caller broker transport and is meant to be used - * when the broker acceptors aren't running yet. 
- */ - public static boolean searchActiveLiveNodeId(String clusterName, - String nodeId, - long timeout, - TimeUnit unit, - Configuration serverConfiguration) throws ActiveMQException { - if (serverConfiguration.getClusterConfigurations().isEmpty()) - return false; - final ClusterConnectionConfiguration clusterConnectionConfiguration = ConfigurationUtils.getReplicationClusterConfiguration(serverConfiguration, clusterName); - - final LiveNodeIdListener liveNodeIdListener = new LiveNodeIdListener(nodeId, serverConfiguration.getClusterUser(), serverConfiguration.getClusterPassword()); - - try (ServerLocatorInternal locator = createLocator(serverConfiguration, clusterConnectionConfiguration)) { - // if would like to filter out a transport configuration: - // locator.setClusterTransportConfiguration(callerBrokerTransportConfiguration) - locator.addClusterTopologyListener(liveNodeIdListener); - locator.setReconnectAttempts(0); - try (ClientSessionFactoryInternal ignored = locator.connectNoWarnings()) { - return liveNodeIdListener.awaitNodePresent(timeout, unit); - } catch (Exception notConnected) { - if (!(notConnected instanceof ActiveMQException) || ActiveMQExceptionType.INTERNAL_ERROR.equals(((ActiveMQException) notConnected).getType())) { - // report all exceptions that aren't ActiveMQException and all INTERNAL_ERRORs - ActiveMQServerLogger.LOGGER.failedConnectingToCluster(notConnected); - } - return false; - } - } - } - - private static final class LiveNodeIdListener implements ClusterTopologyListener { - - private static final Logger logger = Logger.getLogger(LiveNodeIdListener.class); - private final String nodeId; - private final String user; - private final String password; - private final CountDownLatch searchCompleted; - private boolean isNodePresent = false; - - LiveNodeIdListener(String nodeId, String user, String password) { - this.nodeId = nodeId; - this.user = user; - this.password = password; - this.searchCompleted = new CountDownLatch(1); - } - - @Override - public void nodeUP(TopologyMember topologyMember, boolean last) { - boolean isOurNodeId = nodeId != null && nodeId.equals(topologyMember.getNodeId()); - if (isOurNodeId && isActive(topologyMember.getLive())) { - isNodePresent = true; - } - if (isOurNodeId || last) { - searchCompleted.countDown(); - } - } - - public boolean awaitNodePresent(long timeout, TimeUnit unit) throws InterruptedException { - searchCompleted.await(timeout, unit); - return isNodePresent; - } - - /** - * In a cluster of replicated live/backup pairs if a backup crashes and then its live crashes the cluster will - * retain the topology information of the live such that when the live server restarts it will check the - * cluster to see if its nodeID is present (which it will be) and then it will activate as a backup rather than - * a live. To prevent this situation an additional check is necessary to see if the server with the matching - * nodeID is actually active or not which is done by attempting to make a connection to it. 
- * - * @param transportConfiguration - * @return - */ - private boolean isActive(TransportConfiguration transportConfiguration) { - try (ServerLocator serverLocator = ActiveMQClient.createServerLocator(false, transportConfiguration); - ClientSessionFactory clientSessionFactory = serverLocator.createSessionFactory(); - ClientSession clientSession = clientSessionFactory.createSession(user, password, false, false, false, false, 0)) { - return true; - } catch (Exception e) { - logger.debug("isActive check failed", e); - return false; - } - } - - @Override - public void nodeDown(long eventUID, String nodeID) { - // no-op - } - } - - private static ServerLocatorInternal createLocator(Configuration configuration, - ClusterConnectionConfiguration config) throws ActiveMQException { - final ServerLocatorInternal locator; - if (config.getDiscoveryGroupName() != null) { - DiscoveryGroupConfiguration dg = configuration.getDiscoveryGroupConfigurations().get(config.getDiscoveryGroupName()); - - if (dg == null) { - throw ActiveMQMessageBundle.BUNDLE.noDiscoveryGroupFound(null); - } - locator = (ServerLocatorInternal) ActiveMQClient.createServerLocatorWithHA(dg); - } else { - TransportConfiguration[] tcConfigs = config.getStaticConnectors() != null ? configuration.getTransportConfigurations(config.getStaticConnectors()) : null; - - locator = (ServerLocatorInternal) ActiveMQClient.createServerLocatorWithHA(tcConfigs); - } - return locator; - } - -} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileBasedNodeManager.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileBasedNodeManager.java index 4a8b59f27d..56b59252a8 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileBasedNodeManager.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileBasedNodeManager.java @@ -28,7 +28,7 @@ import org.apache.activemq.artemis.core.server.NodeManager; import org.apache.activemq.artemis.utils.UUID; import org.apache.activemq.artemis.utils.UUIDGenerator; -import static java.nio.file.StandardOpenOption.CREATE_NEW; +import static java.nio.file.StandardOpenOption.CREATE; import static java.nio.file.StandardOpenOption.READ; import static java.nio.file.StandardOpenOption.WRITE; @@ -36,33 +36,36 @@ public abstract class FileBasedNodeManager extends NodeManager { protected static final byte FIRST_TIME_START = '0'; public static final String SERVER_LOCK_NAME = "server.lock"; - public static final String DATA_VERSION_NAME = "server.data.version"; + public static final String SERVER_ACTIVATION_SEQUENCE_NAME = "server.activation.sequence"; private static final String ACCESS_MODE = "rw"; private final File directory; protected FileChannel channel; - protected FileChannel dataVersionChannel; + protected FileChannel activationSequenceChannel; public FileBasedNodeManager(boolean replicatedBackup, File directory) { super(replicatedBackup); this.directory = directory; + if (directory != null) { + directory.mkdirs(); + } } - protected void useDataVersionChannel() throws IOException { - if (dataVersionChannel != null) { + protected void useActivationSequenceChannel() throws IOException { + if (activationSequenceChannel != null) { return; } - dataVersionChannel = FileChannel.open(newFile(DATA_VERSION_NAME).toPath(), READ, WRITE, CREATE_NEW); + activationSequenceChannel = FileChannel.open(newFile(SERVER_ACTIVATION_SEQUENCE_NAME).toPath(), READ, WRITE, CREATE); } @Override - public long 
readDataVersion() throws NodeManagerException { + public long readNodeActivationSequence() throws NodeManagerException { if (!isStarted()) { throw new NodeManagerException(new IllegalStateException("node manager must be started first")); } try { - useDataVersionChannel(); + useActivationSequenceChannel(); ByteBuffer tmpBuffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN); - if (dataVersionChannel.read(tmpBuffer, 0) != Long.BYTES) { + if (activationSequenceChannel.read(tmpBuffer, 0) != Long.BYTES) { return 0; } tmpBuffer.flip(); @@ -73,16 +76,17 @@ public abstract class FileBasedNodeManager extends NodeManager { } @Override - public void writeDataVersion(long version) throws NodeManagerException { + public void writeNodeActivationSequence(long version) throws NodeManagerException { if (!isStarted()) { throw new NodeManagerException(new IllegalStateException("node manager must be started first")); } try { - useDataVersionChannel(); + useActivationSequenceChannel(); ByteBuffer tmpBuffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN); tmpBuffer.putLong(0, version); - dataVersionChannel.write(tmpBuffer, 0); - dataVersionChannel.force(false); + activationSequenceChannel.write(tmpBuffer, 0); + activationSequenceChannel.force(false); + setNodeActivationSequence(version); } catch (IOException ie) { throw new NodeManagerException(ie); } @@ -149,12 +153,8 @@ public abstract class FileBasedNodeManager extends NodeManager { createNodeId(); } - /** - * @return - */ protected final File newFile(final String fileName) { - File file = new File(directory, fileName); - return file; + return new File(directory, fileName); } protected final synchronized void createNodeId() throws IOException { @@ -190,8 +190,8 @@ public abstract class FileBasedNodeManager extends NodeManager { channelCopy.close(); } finally { try { - FileChannel dataVersionChannel = this.dataVersionChannel; - this.dataVersionChannel = null; + FileChannel dataVersionChannel = this.activationSequenceChannel; + this.activationSequenceChannel = null; if (dataVersionChannel != null) { dataVersionChannel.close(); } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationBackupActivation.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationBackupActivation.java index 4441bf0018..d925113b38 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationBackupActivation.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationBackupActivation.java @@ -19,9 +19,6 @@ package org.apache.activemq.artemis.core.server.impl; import javax.annotation.concurrent.GuardedBy; import java.util.Objects; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Consumer; @@ -35,7 +32,7 @@ import org.apache.activemq.artemis.core.replication.ReplicationEndpoint; import org.apache.activemq.artemis.core.server.ActiveMQServer; import org.apache.activemq.artemis.core.server.ActiveMQServerLogger; import org.apache.activemq.artemis.core.server.LiveNodeLocator; -import org.apache.activemq.artemis.core.server.NodeManager;; +import org.apache.activemq.artemis.core.server.NodeManager; import org.apache.activemq.artemis.core.server.cluster.ClusterControl; import org.apache.activemq.artemis.core.server.cluster.ClusterController; import 
org.apache.activemq.artemis.core.server.cluster.ha.ReplicationBackupPolicy; @@ -45,6 +42,8 @@ import org.apache.activemq.artemis.quorum.UnavailableStateException; import org.jboss.logging.Logger; import static org.apache.activemq.artemis.core.server.impl.ReplicationObserver.ReplicationFailure; +import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.ensureSequentialAccessToNodeData; +import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.tryActivate; /** * This activation can be used by a primary while trying to fail-back ie {@code failback == true} or @@ -54,7 +53,6 @@ public final class ReplicationBackupActivation extends Activation implements Dis private static final Logger LOGGER = Logger.getLogger(ReplicationBackupActivation.class); - private final boolean wasLive; private final ReplicationBackupPolicy policy; private final ActiveMQServerImpl activeMQServer; // This field is != null iff this node is a primary during a fail-back ie acting as a backup in order to become live again. @@ -72,10 +70,8 @@ public final class ReplicationBackupActivation extends Activation implements Dis private final AtomicBoolean stopping; public ReplicationBackupActivation(final ActiveMQServerImpl activeMQServer, - final boolean wasLive, final DistributedPrimitiveManager distributedManager, final ReplicationBackupPolicy policy) { - this.wasLive = wasLive; this.activeMQServer = activeMQServer; if (policy.isTryFailback()) { final SimpleString serverNodeID = activeMQServer.getNodeID(); @@ -146,14 +142,35 @@ public final class ReplicationBackupActivation extends Activation implements Dis } } try { - LOGGER.info("Trying to reach majority of quorum service nodes"); distributedManager.start(); - LOGGER.info("Quorum service available: starting broker"); + final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence(); + // only a backup with positive local activation sequence could contain valuable data + if (nodeActivationSequence > 0) { + final String nodeId = activeMQServer.getNodeManager().getNodeId().toString(); + DistributedLock liveLockWithInSyncReplica; + while (true) { + distributedManager.start(); + try { + liveLockWithInSyncReplica = tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER); + break; + } catch (UnavailableStateException canRecoverEx) { + distributedManager.stop(); + } + } + if (liveLockWithInSyncReplica != null) { + // retain state and start as live + if (!activeMQServer.initialisePart1(false)) { + return; + } + activeMQServer.setState(ActiveMQServerImpl.SERVER_STATE.STARTED); + startAsLive(liveLockWithInSyncReplica); + return; + } + } distributedManager.addUnavailableManagerListener(this); // Stop the previous node manager and create a new one with NodeManager::replicatedBackup == true: // NodeManager::start skip setup lock file with NodeID, until NodeManager::stopBackup is called. 
activeMQServer.resetNodeManager(); - activeMQServer.getNodeManager().stop(); // A primary need to preserve NodeID across runs activeMQServer.moveServerData(policy.getMaxSavedReplicatedJournalsSize(), policy.isTryFailback()); activeMQServer.getNodeManager().start(); @@ -164,11 +181,15 @@ public final class ReplicationBackupActivation extends Activation implements Dis if (closed) return; } + + final ClusterController clusterController = activeMQServer.getClusterManager().getClusterController(); + + LOGGER.infof("Apache ActiveMQ Artemis Backup Server version %s [%s] started, awaiting connection to a live cluster member to start replication", activeMQServer.getVersion().getFullVersion(), + activeMQServer.toString()); + clusterController.awaitConnectionToReplicationCluster(); activeMQServer.getBackupManager().start(); - ActiveMQServerLogger.LOGGER.backupServerStarted(activeMQServer.getVersion().getFullVersion(), - activeMQServer.getNodeManager().getNodeId()); activeMQServer.setState(ActiveMQServerImpl.SERVER_STATE.STARTED); final DistributedLock liveLock = replicateAndFailover(clusterController); if (liveLock == null) { @@ -192,6 +213,14 @@ public final class ReplicationBackupActivation extends Activation implements Dis liveLock.close(); return; } + try { + ensureSequentialAccessToNodeData(activeMQServer, distributedManager, LOGGER); + } catch (Throwable fatal) { + LOGGER.warn(fatal); + // policy is already live one, but there's no activation yet: we can just stop + asyncRestartServer(activeMQServer, false, false); + throw new ActiveMQIllegalStateException("This server cannot ensure sequential access to broker data: activation is failed"); + } ActiveMQServerLogger.LOGGER.becomingLive(activeMQServer); // stopBackup is going to write the NodeID previously set on the NodeManager, // because activeMQServer.resetNodeManager() has created a NodeManager with replicatedBackup == true. 
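The becoming-live path above defers to `ensureSequentialAccessToNodeData` to bump the coordinated activation sequence before serving clients. For orientation, a condensed, hypothetical sketch of the claim/commit protocol behind that call follows; it compresses the real state machine added later in this patch (which also handles self-repair and verifies live-lock ownership) into its two coordinated writes, and the helper class and method names are illustrative only.

```java
import org.apache.activemq.artemis.api.core.ActiveMQException;
import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
import org.apache.activemq.artemis.quorum.MutableLong;

// Hypothetical helper, not part of the patch: just the two coordinated writes.
final class ActivationSequenceSketch {

   static long claimAndCommit(DistributedPrimitiveManager manager,
                              String nodeId,
                              long localSequence) throws Exception {
      final MutableLong coordinated = manager.getMutableLong(nodeId);
      final long next = localSequence + 1;
      // claim: a negative value signals an in-progress activation to the other peers
      if (!coordinated.compareAndSet(localSequence, -next)) {
         throw new ActiveMQException("coordinated sequence moved: local data is no longer in sync");
      }
      // the real code persists `next` via NodeManager::writeNodeActivationSequence here,
      // so a crash between claim and commit can be self-repaired on restart
      // commit: publish the new sequence as the committed (positive) value
      if (!coordinated.compareAndSet(-next, next)) {
         throw new ActiveMQException("failed to commit the claimed activation sequence");
      }
      return next;
   }
}
```

A peer that later reads a committed value greater than its own locally stored sequence knows its data is stale and must not try to become live with it.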
@@ -260,28 +289,38 @@ public final class ReplicationBackupActivation extends Activation implements Dis return null; } LOGGER.debugf("ReplicationFailure = %s", failure); - boolean voluntaryFailOver = false; switch (failure) { case VoluntaryFailOver: - voluntaryFailOver = true; case NonVoluntaryFailover: - final DistributedLock liveLock = tryAcquireLiveLock(); // from now on we're meant to stop: // - due to failover // - due to restart/stop - assert stopping.get(); - if (liveLock != null) { - return liveLock; + if (!stopping.compareAndSet(false, true)) { + return null; } - boolean restart = true; - if (voluntaryFailOver && isFirstFailbackAttempt()) { - restart = false; - LOGGER.error("Failed to fail-back: stopping broker based on quorum results"); + // no more interested into these events: handling it manually from here + distributedManager.removeUnavailableManagerListener(this); + final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence(); + final String nodeId = activeMQServer.getNodeManager().getNodeId().toString(); + DistributedLock liveLockWithInSyncReplica = null; + if (nodeActivationSequence > 0) { + try { + liveLockWithInSyncReplica = tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER); + } catch (Throwable error) { + // no need to retry here, can just restart as backup that will handle a more resilient tryActivate + LOGGER.warn("Errored while attempting failover", error); + liveLockWithInSyncReplica = null; + } } else { - ActiveMQServerLogger.LOGGER.restartingAsBackupBasedOnQuorumVoteResults(); + LOGGER.warnf("We expect local activation sequence for NodeID = %s to be > 0 on a fail-over, while is %d", nodeId, nodeActivationSequence); } - // let's ignore the stopping flag here, we're in control of it - asyncRestartServer(activeMQServer, restart, false); + assert stopping.get(); + if (liveLockWithInSyncReplica != null) { + return liveLockWithInSyncReplica; + } + ActiveMQServerLogger.LOGGER.restartingAsBackupBasedOnQuorumVoteResults(); + // let's ignore the stopping flag here, we're already in control of it + asyncRestartServer(activeMQServer, true, false); return null; case RegistrationError: LOGGER.error("Stopping broker because of critical registration error"); @@ -307,73 +346,7 @@ public final class ReplicationBackupActivation extends Activation implements Dis } } } finally { - silentExecution("Errored on cluster topology listener for replication cleanup", () -> clusterController.removeClusterTopologyListenerForReplication(nodeLocator)); - } - } - - /** - * {@code wasLive} is {code true} only while transitioning from primary to backup.
- * If a natural born backup become live and allows failback, while transitioning to back again
- * {@code wasLive} is still {@code false}.<br>
- * The check on {@link ReplicationBackupPolicy#isTryFailback()} is redundant but still useful for correctness.
- * <p>
- * In case of fail-back, any event that's going to restart this broker as backup (eg quorum service unavailable - * or some replication failures) will cause {@code wasLive} to be {@code false}, because the HA policy set isn't - * a primary anymore. - */ - private boolean isFirstFailbackAttempt() { - return wasLive && policy.isTryFailback(); - } - - private DistributedLock tryAcquireLiveLock() throws InterruptedException { - // disable quorum service unavailability handling and just treat this imperatively - if (!stopping.compareAndSet(false, true)) { - // already unavailable quorum service: fail fast - return null; - } - distributedManager.removeUnavailableManagerListener(this); - assert activeMQServer.getNodeManager().getNodeId() != null; - final String liveID = activeMQServer.getNodeManager().getNodeId().toString(); - final int voteRetries = policy.getVoteRetries(); - final long maxAttempts = voteRetries >= 0 ? (voteRetries + 1) : -1; - if (maxAttempts == -1) { - LOGGER.error("It's not safe to retry an infinite amount of time to acquire a live lock: please consider setting a vote-retries value"); - } - final long voteRetryWait = policy.getVoteRetryWait(); - final DistributedLock liveLock = getLock(distributedManager, liveID); - if (liveLock == null) { - return null; - } - for (long attempt = 0; maxAttempts >= 0 ? (attempt < maxAttempts) : true; attempt++) { - try { - if (liveLock.tryLock(voteRetryWait, TimeUnit.MILLISECONDS)) { - LOGGER.debugf("%s live lock acquired after %d attempts.", liveID, (attempt + 1)); - return liveLock; - } - } catch (UnavailableStateException e) { - LOGGER.warnf(e, "Failed to acquire live lock %s because of unavailable quorum service: stop trying", liveID); - distributedManager.stop(); - return null; - } - } - LOGGER.warnf("Failed to acquire live lock %s after %d tries", liveID, maxAttempts); - distributedManager.stop(); - return null; - } - - private DistributedLock getLock(final DistributedPrimitiveManager manager, - final String lockId) throws InterruptedException { - if (!manager.isStarted()) { - return null; - } - try { - return manager.getDistributedLock(lockId); - } catch (ExecutionException e) { - LOGGER.warnf(e, "Errored while getting lock %s", lockId); - return null; - } catch (TimeoutException te) { - LOGGER.warnf(te, "Timeout while getting lock %s", lockId); - return null; + silentExecution("Error on cluster topology listener for replication cleanup", () -> clusterController.removeClusterTopologyListenerForReplication(nodeLocator)); } } @@ -405,7 +378,6 @@ public final class ReplicationBackupActivation extends Activation implements Dis return ReplicationFailure.RegistrationError; } this.replicationEndpoint = replicationEndpoint; - assert replicationEndpoint != null; try { return replicationObserver.awaitReplicationFailure(); } finally { @@ -414,11 +386,11 @@ public final class ReplicationBackupActivation extends Activation implements Dis closeChannelOf(replicationEndpoint); } } finally { - silentExecution("Errored on live control close", liveControl::close); + silentExecution("Error on live control close", liveControl::close); } } finally { - silentExecution("Errored on cluster topology listener cleanup", () -> clusterController.removeClusterTopologyListener(replicationObserver)); - silentExecution("Errored while removing incoming interceptor for replication", () -> clusterController.removeIncomingInterceptorForReplication(replicationError)); + silentExecution("Error on cluster topology listener cleanup", () -> 
clusterController.removeClusterTopologyListener(replicationObserver)); + silentExecution("Error while removing incoming interceptor for replication", () -> clusterController.removeIncomingInterceptorForReplication(replicationError)); } } finally { this.replicationObserver = null; @@ -438,7 +410,7 @@ public final class ReplicationBackupActivation extends Activation implements Dis return; } if (replicationEndpoint.getChannel() != null) { - silentExecution("Errored while closing replication endpoint channel", () -> replicationEndpoint.getChannel().close()); + silentExecution("Error while closing replication endpoint channel", () -> replicationEndpoint.getChannel().close()); replicationEndpoint.setChannel(null); } } diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationPrimaryActivation.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationPrimaryActivation.java index 285f516220..abf92226d7 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationPrimaryActivation.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ReplicationPrimaryActivation.java @@ -19,12 +19,14 @@ package org.apache.activemq.artemis.core.server.impl; import javax.annotation.concurrent.GuardedBy; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.activemq.artemis.api.core.ActiveMQAlreadyReplicatingException; import org.apache.activemq.artemis.api.core.ActiveMQException; import org.apache.activemq.artemis.api.core.ActiveMQIllegalStateException; import org.apache.activemq.artemis.api.core.Pair; import org.apache.activemq.artemis.api.core.TransportConfiguration; +import org.apache.activemq.artemis.core.persistence.StorageManager; import org.apache.activemq.artemis.core.protocol.core.Channel; import org.apache.activemq.artemis.core.protocol.core.ChannelHandler; import org.apache.activemq.artemis.core.protocol.core.CoreRemotingConnection; @@ -46,7 +48,10 @@ import org.apache.activemq.artemis.quorum.UnavailableStateException; import org.apache.activemq.artemis.spi.core.remoting.Acceptor; import org.jboss.logging.Logger; -import static org.apache.activemq.artemis.core.server.impl.ClusterTopologySearch.searchActiveLiveNodeId; +import static org.apache.activemq.artemis.core.server.ActiveMQServer.SERVER_STATE.STARTED; +import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.awaitNextCommittedActivationSequence; +import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.ensureSequentialAccessToNodeData; +import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.tryActivate; /** * This is going to be {@link #run()} just by natural born primary, at the first start. 
@@ -55,8 +60,9 @@ import static org.apache.activemq.artemis.core.server.impl.ClusterTopologySearch public class ReplicationPrimaryActivation extends LiveActivation implements DistributedLock.UnavailableLockListener { private static final Logger LOGGER = Logger.getLogger(ReplicationPrimaryActivation.class); - private static final long DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS = 20_000; - private static final long BLOCKING_CALLS_TIMEOUT_MILLIS = 5_000; + // This is the time we expect a replica to become a live from the quorum pov + // ie time to execute tryActivate and ensureSequentialAccessToNodeData + private static final long FAILBACK_TIMEOUT_MILLIS = 4_000; private final ReplicationPrimaryPolicy policy; @@ -69,7 +75,7 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist private final DistributedPrimitiveManager distributedManager; - private volatile boolean stoppingServer; + private final AtomicBoolean stoppingServer; public ReplicationPrimaryActivation(final ActiveMQServerImpl activeMQServer, final DistributedPrimitiveManager distributedManager, @@ -78,6 +84,7 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist this.policy = policy; this.replicationLock = new Object(); this.distributedManager = distributedManager; + this.stoppingServer = new AtomicBoolean(); } /** @@ -101,19 +108,36 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist @Override public void run() { try { - - final NodeManager nodeManager = activeMQServer.getNodeManager(); - - final String nodeId = nodeManager.readNodeId().toString(); - - final long dataVersion = nodeManager.readDataVersion(); - - final DistributedLock liveLock = searchLiveOrAcquireLiveLock(nodeId, BLOCKING_CALLS_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS); - + // we have a common nodeId that we can share and coordinate with between peers + if (policy.getCoordinationId() != null) { + LOGGER.infof("Applying shared peer NodeID=%s to enable coordinated live activation", policy.getCoordinationId()); + // REVISIT: this is quite clunky, also in backup activation, we just need new nodeID persisted! 
+ activeMQServer.resetNodeManager(); + activeMQServer.getNodeManager().start(); + activeMQServer.getNodeManager().setNodeID(policy.getCoordinationId()); + activeMQServer.getNodeManager().stopBackup(); + } + final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence(); + final String nodeId = activeMQServer.getNodeManager().readNodeId().toString(); + DistributedLock liveLock; + while (true) { + distributedManager.start(); + try { + liveLock = tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER); + break; + } catch (UnavailableStateException canRecoverEx) { + distributedManager.stop(); + } + } if (liveLock == null) { + distributedManager.stop(); + LOGGER.infof("This broker cannot become a live server with NodeID = %s: restarting as backup", nodeId); + activeMQServer.setHAPolicy(policy.getBackupPolicy()); return; } + ensureSequentialAccessToNodeData(activeMQServer, distributedManager, LOGGER); + activeMQServer.initialisePart1(false); activeMQServer.initialisePart2(false); @@ -142,73 +166,9 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist } } - private DistributedLock searchLiveOrAcquireLiveLock(final String nodeId, - final long blockingCallTimeout, - final TimeUnit unit) throws ActiveMQException, InterruptedException { - if (policy.isCheckForLiveServer()) { - LOGGER.infof("Searching a live server with NodeID = %s", nodeId); - if (searchActiveLiveNodeId(policy.getClusterName(), nodeId, blockingCallTimeout, unit, activeMQServer.getConfiguration())) { - LOGGER.infof("Found a live server with NodeID = %s: restarting as backup", nodeId); - activeMQServer.setHAPolicy(policy.getBackupPolicy()); - return null; - } - } - startDistributedPrimitiveManager(); - return acquireDistributeLock(getDistributeLock(nodeId), blockingCallTimeout, unit); - } - - private void startDistributedPrimitiveManager() throws InterruptedException, ActiveMQException { - LOGGER.infof("Trying to reach the majority of quorum nodes in %d ms.", DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS); - try { - if (distributedManager.start(DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS)) { - return; - } - } catch (InterruptedException ie) { - throw ie; - } catch (Throwable t) { - LOGGER.debug(t); - } - assert !distributedManager.isStarted(); - throw new ActiveMQException("Cannot reach the majority of quorum nodes"); - } - - private DistributedLock getDistributeLock(final String nodeId) throws InterruptedException, ActiveMQException { - try { - return distributedManager.getDistributedLock(nodeId); - } catch (Throwable t) { - try { - distributedManager.stop(); - } catch (Throwable ignore) { - // don't care - } - if (t instanceof InterruptedException) { - throw (InterruptedException) t; - } - throw new ActiveMQException("Cannot obtain a live lock instance"); - } - } - - private DistributedLock acquireDistributeLock(final DistributedLock liveLock, - final long acquireLockTimeout, - final TimeUnit unit) throws InterruptedException, ActiveMQException { - try { - if (liveLock.tryLock(acquireLockTimeout, unit)) { - return liveLock; - } - } catch (UnavailableStateException e) { - LOGGER.debug(e); - } - try { - distributedManager.stop(); - } catch (Throwable ignore) { - // don't care - } - throw new ActiveMQException("Failed to become live"); - } - @Override public ChannelHandler getActivationChannelHandler(final Channel channel, final Acceptor acceptorUsed) { - if (stoppingServer) { + if (stoppingServer.get()) { return null; } return packet -> { @@ 
-268,7 +228,7 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist awaitBackupAnnouncementOnFailbackRequest(clusterConnection); } } catch (Exception e) { - if (activeMQServer.getState() == ActiveMQServerImpl.SERVER_STATE.STARTED) { + if (activeMQServer.getState() == STARTED) { /* * The reasoning here is that the exception was either caused by (1) the * (interaction with) the backup, or (2) by an IO Error at the storage. If (1), we @@ -283,7 +243,9 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist ActiveMQServerLogger.LOGGER.errorStoppingReplication(amqe); } finally { synchronized (replicationLock) { - this.replicationManager = null; + if (this.replicationManager == replicationManager) { + this.replicationManager = null; + } } } } @@ -308,35 +270,55 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist } } - /** - * If {@link #asyncStopServer()} happens before this call, the restart just won't happen. - * If {@link #asyncStopServer()} happens after this call, will make the server to stop right after being restarted. - */ private void restartAsBackupAfterFailback() throws Exception { - if (stoppingServer) { + if (stoppingServer.get()) { return; } - synchronized (this) { - if (stoppingServer) { + final String coordinatedLockAndNodeId; + final long inSyncReplicaActivation; + synchronized (replicationLock) { + if (stoppingServer.get()) { return; } - distributedManager.stop(); + final ReplicationManager replicationManager = this.replicationManager; + if (replicationManager == null) { + LOGGER.warnf("Failback interrupted"); + // we got a disconnection from the replica *before* stopping acceptors: better not failback! + return; + } + // IMPORTANT: this is going to save server::fail to issue a replica connection failure (with failed == false) + // because onReplicationConnectionClose fail-fast on stopping == true. 
+ if (!stoppingServer.compareAndSet(false, true)) { + LOGGER.infof("Failback interrupted: server is already stopping"); + return; + } + coordinatedLockAndNodeId = activeMQServer.getNodeManager().getNodeId().toString(); + inSyncReplicaActivation = activeMQServer.getNodeManager().getNodeActivationSequence(); + // none can notice a concurrent drop of replica connection here: awaitNextCommittedActivationSequence defensively + // wait FAILBACK_TIMEOUT_MILLIS, proceed as backup and compete to become live again activeMQServer.fail(true); - ActiveMQServerLogger.LOGGER.restartingReplicatedBackupAfterFailback(); - activeMQServer.setHAPolicy(policy.getBackupPolicy()); - activeMQServer.start(); } + try { + distributedManager.start(); + if (!awaitNextCommittedActivationSequence(distributedManager, coordinatedLockAndNodeId, inSyncReplicaActivation, FAILBACK_TIMEOUT_MILLIS, LOGGER)) { + LOGGER.warnf("Timed out waiting for failback server activation with NodeID = %s: and sequence > %d: after %dms", + coordinatedLockAndNodeId, inSyncReplicaActivation, FAILBACK_TIMEOUT_MILLIS); + } + } catch (UnavailableStateException ignored) { + LOGGER.debug("Unavailable distributed manager while awaiting failback activation sequence: ignored", ignored); + } finally { + distributedManager.stop(); + } + ActiveMQServerLogger.LOGGER.restartingReplicatedBackupAfterFailback(); + activeMQServer.setHAPolicy(policy.getBackupPolicy()); + activeMQServer.start(); } private void asyncStopServer() { - if (stoppingServer) { + if (stoppingServer.get()) { return; } - synchronized (this) { - if (stoppingServer) { - return; - } - stoppingServer = true; + if (stoppingServer.compareAndSet(false, true)) { new Thread(() -> { try { activeMQServer.stop(); @@ -374,19 +356,28 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist private void onReplicationConnectionClose() { ExecutorService executorService = activeMQServer.getThreadPool(); if (executorService != null) { - synchronized (replicationLock) { - if (replicationManager == null) { - return; - } + if (stoppingServer.get()) { + return; } executorService.execute(() -> { synchronized (replicationLock) { if (replicationManager == null) { return; } + // we increment only if we are staying alive + if (!stoppingServer.get() && STARTED.equals(activeMQServer.getState())) { + try { + ensureSequentialAccessToNodeData(activeMQServer, distributedManager, LOGGER); + } catch (Throwable fatal) { + LOGGER.errorf(fatal, "Unexpected exception: %s on attempted activation sequence increment; stopping server async", fatal.getLocalizedMessage()); + asyncStopServer(); + } + } // this is going to stop the replication manager - activeMQServer.getStorageManager().stopReplication(); - assert !replicationManager.isStarted(); + final StorageManager storageManager = activeMQServer.getStorageManager(); + if (storageManager != null) { + storageManager.stopReplication(); + } replicationManager = null; } }); diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/quorum/ActivationSequenceStateMachine.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/quorum/ActivationSequenceStateMachine.java new file mode 100644 index 0000000000..96083634df --- /dev/null +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/quorum/ActivationSequenceStateMachine.java @@ -0,0 +1,312 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.activemq.artemis.core.server.impl.quorum; + +import java.util.Objects; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import org.apache.activemq.artemis.api.core.ActiveMQException; +import org.apache.activemq.artemis.core.server.ActiveMQServer; +import org.apache.activemq.artemis.core.server.NodeManager; +import org.apache.activemq.artemis.quorum.DistributedLock; +import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; +import org.apache.activemq.artemis.quorum.MutableLong; +import org.apache.activemq.artemis.quorum.UnavailableStateException; +import org.jboss.logging.Logger; + +/** + * This class contains the activation sequence logic of the pluggable quorum vote: + * it should be used by {@link org.apache.activemq.artemis.core.server.impl.ReplicationBackupActivation} + * and {@link org.apache.activemq.artemis.core.server.impl.ReplicationPrimaryActivation} to coordinate + * for replication. + */ +public final class ActivationSequenceStateMachine { + + private static final long CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS = 200; + private static final long CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS = 2000; + private static final long LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS = 2000; + + private ActivationSequenceStateMachine() { + + } + + /** + * It loops if the data of the broker is still valuable, but cannot become live. + * It loops (temporarly) if data is in sync or can self-heal, but cannot yet acquire the live lock. + *

+ * It stops looping and returns:
+ * <p>
+ * <ul>
+ * <li>{@code null}: if data is stale (and there are no rights to become live)</li>
+ * <li>{@code !=null}: if data is in sync and the {@link DistributedLock} is correctly acquired</li>
+ * </ul>
+ * <p>
+ * After successfully returning from this method ie not null return value, a broker should use + * {@link #ensureSequentialAccessToNodeData(ActiveMQServer, DistributedPrimitiveManager, Logger)} to complete + * the activation and guarantee the initial not-replicated ownership of data. + */ + public static DistributedLock tryActivate(final String nodeId, + final long nodeActivationSequence, + final DistributedPrimitiveManager distributedManager, + final Logger logger) throws InterruptedException, ExecutionException, TimeoutException, UnavailableStateException { + final DistributedLock activationLock = distributedManager.getDistributedLock(nodeId); + try (MutableLong coordinatedNodeSequence = distributedManager.getMutableLong(nodeId)) { + while (true) { + // dirty read is sufficient to know if we are *not* an in sync replica + // typically the lock owner will increment to signal our data is stale and we are happy without any + // further coordination at this point + switch (validateActivationSequence(coordinatedNodeSequence, activationLock, nodeId, nodeActivationSequence, logger)) { + + case Stale: + activationLock.close(); + return null; + case SelfRepair: + case InSync: + break; + case MaybeInSync: + if (activationLock.tryLock()) { + // BAD: where's the broker that should commit it? + activationLock.unlock(); + logger.warnf("Cannot assume live role for NodeID = %s: claimed activation sequence need to be repaired", + nodeId); + TimeUnit.MILLISECONDS.sleep(CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS); + continue; + } + // quick path while data is still valuable: wait until something change (commit/repair) + TimeUnit.MILLISECONDS.sleep(CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS); + continue; + } + // SelfRepair, InSync + if (!activationLock.tryLock(LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS)) { + logger.debugf("Candidate for Node ID = %s, with local activation sequence: %d, cannot acquire live lock within %dms; retrying", + nodeId, nodeActivationSequence, LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS); + continue; + } + switch (validateActivationSequence(coordinatedNodeSequence, activationLock, nodeId, nodeActivationSequence, logger)) { + + case Stale: + activationLock.close(); + return null; + case SelfRepair: + // Self-repair sequence ie we were the only one with the most up to date data. + // NOTE: We cannot move the sequence now, let's delay it on ensureSequentialAccessToNodeData + logger.infof("Assuming live role for NodeID = %s: local activation sequence %d matches claimed coordinated activation sequence %d. 
Repairing sequence", nodeId, nodeActivationSequence, nodeActivationSequence); + return activationLock; + case InSync: + // we are an in_sync_replica, good to go live as UNREPLICATED + logger.infof("Assuming live role for NodeID = %s, local activation sequence %d matches current coordinated activation sequence %d", nodeId, nodeActivationSequence, nodeActivationSequence); + return activationLock; + case MaybeInSync: + activationLock.unlock(); + logger.warnf("Cannot assume live role for NodeID = %s: claimed activation sequence need to be repaired", nodeId); + TimeUnit.MILLISECONDS.sleep(CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS); + continue; + } + } + } + } + + private enum ValidationResult { + /** + * coordinated activation sequence (claimed/committed) is far beyond the local one: data is not valuable anymore + **/ + Stale, + /** + * coordinated activation sequence is the same as local one: data is in sync + **/ + InSync, + /** + * next coordinated activation sequence is not committed yet: maybe data is in sync + **/ + MaybeInSync, + /** + * next coordinated activation sequence is not committed yet, but this broker can self-repair: data is in sync + **/ + SelfRepair + } + + private static ValidationResult validateActivationSequence(final MutableLong coordinatedNodeSequence, + final DistributedLock activationLock, + final String lockAndLongId, + final long nodeActivationSequence, + final Logger logger) throws UnavailableStateException { + assert coordinatedNodeSequence.getMutableLongId().equals(lockAndLongId); + assert activationLock.getLockId().equals(lockAndLongId); + final long currentCoordinatedNodeSequence = coordinatedNodeSequence.get(); + if (nodeActivationSequence == currentCoordinatedNodeSequence) { + return ValidationResult.InSync; + } + if (currentCoordinatedNodeSequence > 0) { + logger.infof("Not a candidate for NodeID = %s activation, local activation sequence %d does not match coordinated activation sequence %d", + lockAndLongId, nodeActivationSequence, currentCoordinatedNodeSequence); + return ValidationResult.Stale; + } + // claimed activation sequence + final long claimedCoordinatedNodeSequence = -currentCoordinatedNodeSequence; + final long sequenceGap = claimedCoordinatedNodeSequence - nodeActivationSequence; + if (sequenceGap == 0) { + return ValidationResult.SelfRepair; + } + if (sequenceGap == 1) { + // maybe data is still valuable + return ValidationResult.MaybeInSync; + } + assert sequenceGap > 1; + // sequence is moved so much that data is no longer valuable + logger.infof("Not a candidate for NodeID = %s activation, local activation sequence %d does not match coordinated activation sequence %d", + lockAndLongId, nodeActivationSequence, claimedCoordinatedNodeSequence); + return ValidationResult.Stale; + } + + /** + * It wait until {@code timeoutMillis ms} has passed or the coordinated activation sequence has progressed enough + */ + public static boolean awaitNextCommittedActivationSequence(final DistributedPrimitiveManager distributedManager, + final String coordinatedLockAndNodeId, + final long activationSequence, + final long timeoutMills, + final Logger logger) + throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { + Objects.requireNonNull(distributedManager); + Objects.requireNonNull(logger); + Objects.requireNonNull(coordinatedLockAndNodeId); + if (activationSequence < 0) { + throw new IllegalArgumentException("activationSequence must be >= 0, while is " + activationSequence); + } + if 
(!distributedManager.isStarted()) { + throw new IllegalStateException("manager must be started"); + } + final MutableLong coordinatedActivationSequence = distributedManager.getMutableLong(coordinatedLockAndNodeId); + // wait for the live to activate and run un replicated with a sequence > inSyncReplicaActivation + // this read can be dirty b/c we are just looking for an increment. + boolean anyNext = false; + final long timeoutNs = TimeUnit.MILLISECONDS.toNanos(timeoutMills); + final long started = System.nanoTime(); + long elapsedNs; + do { + final long coordinatedValue = coordinatedActivationSequence.get(); + if (coordinatedValue > activationSequence) { + // all good, some activation has gone ahead + logger.infof("Detected a new activation sequence with NodeID = %s: and sequence: %d", coordinatedLockAndNodeId, coordinatedValue); + anyNext = true; + break; + } + if (coordinatedValue < 0) { + // commit claim + final long claimedSequence = -coordinatedValue; + final long activationsGap = claimedSequence - activationSequence; + if (activationsGap > 1) { + // all good, some activation has gone ahead + logger.infof("Detected furthers sequential server activations from sequence %d, with NodeID = %s: and claimed sequence: %d", activationSequence, coordinatedLockAndNodeId, claimedSequence); + anyNext = true; + break; + } + // activation is still in progress + logger.debugf("Detected claiming of activation sequence = %d for NodeID = %s", claimedSequence, coordinatedLockAndNodeId); + } + try { + TimeUnit.MILLISECONDS.sleep(CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS); + } catch (InterruptedException ignored) { + } + elapsedNs = System.nanoTime() - started; + } + while (elapsedNs < timeoutNs); + return anyNext; + } + + /** + * This is going to increment the coordinated activation sequence while holding the live lock, failing with some exception otherwise.
+ * <p>
+ * The acceptable states are {@link ValidationResult#InSync} and {@link ValidationResult#SelfRepair}, throwing some exception otherwise.
+ * <p>
+ * This must be used while holding a live lock to ensure not-exclusive ownership of data ie can be both used + * while loosing connectivity with a replica or after successfully {@link #tryActivate(String, long, DistributedPrimitiveManager, Logger)}. + */ + public static void ensureSequentialAccessToNodeData(ActiveMQServer activeMQServer, + DistributedPrimitiveManager distributedPrimitiveManager, + final Logger logger) throws ActiveMQException, InterruptedException, UnavailableStateException, ExecutionException, TimeoutException { + + final NodeManager nodeManager = activeMQServer.getNodeManager(); + final String lockAndLongId = nodeManager.getNodeId().toString(); + final DistributedLock liveLock = distributedPrimitiveManager.getDistributedLock(lockAndLongId); + if (!liveLock.isHeldByCaller()) { + final String message = String.format("Server [%s], live lock for NodeID = %s, not held, activation sequence cannot be safely changed", + activeMQServer, lockAndLongId); + logger.info(message); + throw new UnavailableStateException(message); + } + final long nodeActivationSequence = nodeManager.readNodeActivationSequence(); + final MutableLong coordinatedNodeActivationSequence = distributedPrimitiveManager.getMutableLong(lockAndLongId); + final long currentCoordinatedActivationSequence = coordinatedNodeActivationSequence.get(); + final long nextActivationSequence; + if (currentCoordinatedActivationSequence < 0) { + // Check Self-Repair + if (nodeActivationSequence != -currentCoordinatedActivationSequence) { + final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, local activation sequence %d does not match current claimed coordinated sequence %d: need repair", + activeMQServer, lockAndLongId, nodeActivationSequence, -currentCoordinatedActivationSequence); + logger.info(message); + throw new ActiveMQException(message); + } + // auto-repair: this is the same server that failed to commit its claimed sequence + nextActivationSequence = nodeActivationSequence; + } else { + // Check InSync + if (nodeActivationSequence != currentCoordinatedActivationSequence) { + final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, local activation sequence %d does not match current coordinated sequence %d", + activeMQServer, lockAndLongId, nodeActivationSequence, currentCoordinatedActivationSequence); + logger.info(message); + throw new ActiveMQException(message); + } + nextActivationSequence = nodeActivationSequence + 1; + } + // UN_REPLICATED STATE ENTER: auto-repair doesn't need to claim and write locally + if (nodeActivationSequence != nextActivationSequence) { + // claim + if (!coordinatedNodeActivationSequence.compareAndSet(nodeActivationSequence, -nextActivationSequence)) { + final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, activation sequence claim failed, local activation sequence %d no longer matches current coordinated sequence %d", + activeMQServer, lockAndLongId, nodeActivationSequence, coordinatedNodeActivationSequence.get()); + logger.infof(message); + throw new ActiveMQException(message); + } + // claim success: write locally + try { + nodeManager.writeNodeActivationSequence(nextActivationSequence); + } catch (NodeManager.NodeManagerException fatal) { + logger.errorf("Server [%s] failed to set local activation sequence to: %d for NodeId =%s. 
Cannot continue committing coordinated activation sequence: REQUIRES ADMIN INTERVENTION", + activeMQServer, nextActivationSequence, lockAndLongId); + throw new UnavailableStateException(fatal); + } + logger.infof("Server [%s], incremented local activation sequence to: %d for NodeId = %s", + activeMQServer, nextActivationSequence, lockAndLongId); + } else { + // self-heal need to update the in-memory sequence, because no writes will do it + nodeManager.setNodeActivationSequence(nextActivationSequence); + } + // commit + if (!coordinatedNodeActivationSequence.compareAndSet(-nextActivationSequence, nextActivationSequence)) { + final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, activation sequence commit failed, local activation sequence %d no longer matches current coordinated sequence %d", + activeMQServer, lockAndLongId, nodeActivationSequence, coordinatedNodeActivationSequence.get()); + logger.infof(message); + throw new ActiveMQException(message); + } + logger.infof("Server [%s], incremented coordinated activation sequence to: %d for NodeId = %s", + activeMQServer, nextActivationSequence, lockAndLongId); + } + +} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/management/impl/ManagementServiceImpl.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/management/impl/ManagementServiceImpl.java index c5a49fa30d..5d9115b040 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/management/impl/ManagementServiceImpl.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/management/impl/ManagementServiceImpl.java @@ -244,7 +244,9 @@ public class ManagementServiceImpl implements ManagementService { ObjectName objectName = objectNameBuilder.getActiveMQServerObjectName(); unregisterFromJMX(objectName); unregisterFromRegistry(ResourceNames.BROKER); - unregisterMeters(ResourceNames.BROKER + "." + messagingServer.getConfiguration().getName()); + if (messagingServer != null) { + unregisterMeters(ResourceNames.BROKER + "." + messagingServer.getConfiguration().getName()); + } } @Override diff --git a/artemis-server/src/main/resources/schema/artemis-configuration.xsd b/artemis-server/src/main/resources/schema/artemis-configuration.xsd index 9e79a54b7f..69e06cdcc2 100644 --- a/artemis-server/src/main/resources/schema/artemis-configuration.xsd +++ b/artemis-server/src/main/resources/schema/artemis-configuration.xsd @@ -3189,13 +3189,12 @@ - + - Whether to check the cluster for a (live) server using our own server ID when starting - up. This option is only necessary for performing 'fail-back' on replicating - servers. Strictly speaking this setting only applies to live servers and not to - backups. + The common identity to use for coordination that is shared across instances that will replicate. + The value will be used as the internal server nodeId and as the identity of entities in the + distributed-primitive-manager. 
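The schema documentation above introduces a shared coordination identity for replicating peers. Assuming the element is named `coordination-id` (matching the `coordinationId` policy property applied in `ReplicationPrimaryActivation` earlier in this patch), a minimal, non-authoritative sketch of its use in a primary's `ha-policy` would be:

```xml
<ha-policy>
   <replication>
      <primary>
         <!-- assumed element name: the value becomes the server NodeID and the
              identity of the lock/mutable-long entities in the distributed-primitive-manager -->
         <coordination-id>peer-journal-001</coordination-id>
      </primary>
   </replication>
</ha-policy>
```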
@@ -3208,21 +3207,6 @@ - - - - If we start as a replica and lose connection to the master, how many times should we attempt to vote - for quorum before restarting - - - - - - - How long to wait (in milliseconds) between each vote - - - @@ -3288,20 +3272,6 @@ - - - - If we lose connection to the master, how many times should we attempt to vote for quorum before restarting - - - - - - - How long to wait (in milliseconds) between each vote - - - diff --git a/artemis-server/src/test/java/org/apache/activemq/artemis/core/config/impl/HAPolicyConfigurationTest.java b/artemis-server/src/test/java/org/apache/activemq/artemis/core/config/impl/HAPolicyConfigurationTest.java index 4b37a76125..4dc63779b0 100644 --- a/artemis-server/src/test/java/org/apache/activemq/artemis/core/config/impl/HAPolicyConfigurationTest.java +++ b/artemis-server/src/test/java/org/apache/activemq/artemis/core/config/impl/HAPolicyConfigurationTest.java @@ -283,7 +283,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase { assertFalse(policy.canScaleDown()); assertFalse(policy.isBackup()); assertFalse(policy.isSharedStore()); - assertTrue(policy.isCheckForLiveServer()); assertTrue(policy.isWaitForActivation()); assertEquals("purple", policy.getGroupName()); assertEquals("purple", policy.getBackupGroupName()); @@ -297,8 +296,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase { assertEquals(policy.getBackupGroupName(), failbackPolicy.getBackupGroupName()); assertEquals(policy.getClusterName(), failbackPolicy.getClusterName()); assertEquals(failbackPolicy.getMaxSavedReplicatedJournalsSize(), ActiveMQDefaultConfiguration.getDefaultMaxSavedReplicatedJournalsSize()); - assertEquals(1, failbackPolicy.getVoteRetries()); - assertEquals(1000, failbackPolicy.getVoteRetryWait()); assertTrue(failbackPolicy.isTryFailback()); assertTrue(failbackPolicy.isBackup()); assertFalse(failbackPolicy.isSharedStore()); @@ -337,8 +334,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase { assertEquals("tiddles", policy.getBackupGroupName()); assertEquals("33rrrrr", policy.getClusterName()); assertEquals(22, policy.getMaxSavedReplicatedJournalsSize()); - assertEquals(1, policy.getVoteRetries()); - assertEquals(1000, policy.getVoteRetryWait()); assertFalse(policy.isTryFailback()); assertTrue(policy.isBackup()); assertFalse(policy.isSharedStore()); @@ -358,7 +353,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase { assertFalse(failoverLivePolicy.canScaleDown()); assertFalse(failoverLivePolicy.isBackup()); assertFalse(failoverLivePolicy.isSharedStore()); - assertFalse(failoverLivePolicy.isCheckForLiveServer()); assertTrue(failoverLivePolicy.isWaitForActivation()); assertEquals(policy.getGroupName(), failoverLivePolicy.getGroupName()); assertEquals(policy.getClusterName(), failoverLivePolicy.getClusterName()); diff --git a/artemis-server/src/test/resources/backup-hapolicy-config.xml b/artemis-server/src/test/resources/backup-hapolicy-config.xml index be55236676..070c5672b4 100644 --- a/artemis-server/src/test/resources/backup-hapolicy-config.xml +++ b/artemis-server/src/test/resources/backup-hapolicy-config.xml @@ -30,8 +30,6 @@ 33rrrrr 9876 12345 - 1 - 1000 false diff --git a/artemis-server/src/test/resources/primary-hapolicy-config.xml b/artemis-server/src/test/resources/primary-hapolicy-config.xml index 5b88bcd80f..8e0ea49406 100644 --- a/artemis-server/src/test/resources/primary-hapolicy-config.xml +++ b/artemis-server/src/test/resources/primary-hapolicy-config.xml @@ -27,9 +27,6 @@ 
abcdefg 9876 12345 - true - 1 - 1000 org.apache.activemq.artemis.core.config.impl.HAPolicyConfigurationTest$FakeDistributedPrimitiveManager diff --git a/docs/user-manual/en/ha.md b/docs/user-manual/en/ha.md index 96045e95e4..0395fcb63e 100644 --- a/docs/user-manual/en/ha.md +++ b/docs/user-manual/en/ha.md @@ -65,8 +65,8 @@ which we will cover in a later chapter. > message data will not be available after failover. The `ha-policy` type configures which strategy a cluster should use to -provide the backing up of a servers data. Within this configuration -element is configured how a server should behave within the cluster, +provide the backing up of a server's data. Within this configuration +element we configure how a server should behave within the cluster, either as a master (live), slave (backup) or colocated (both live and backup). This would look something like: @@ -98,7 +98,7 @@ or ``` -*Replication* allows too to configure 2 new roles to enable *pluggable quorum* provider configuration, by using: +*Replication* allows the configuration of two new roles to enable *pluggable quorum* provider configuration, by using: ```xml @@ -116,17 +116,14 @@ to configure the classic *master* role, and ``` for the classic *slave* one. -If *replication* is configured using such new roles some additional element is required to complete configuration, detailed later. +If *replication* is configured using these new roles, some additional elements are required to complete the configuration, as detailed later. ### IMPORTANT NOTE ON PLUGGABLE QUORUM VOTE FEATURE -This feature is still **EXPERIMENTAL** and not meant to be run in production yet. +This feature is still **EXPERIMENTAL**. Extra testing should be done before running this feature in production. Please report any issues you find to the ActiveMQ Artemis mailing lists. It means: -- its configuration can change until declared as **officially stable** -- it has to solve yet an inherent data misalignment issue with replication (it can happen with `classic` replication as well) - -More info about this issue are on [ARTEMIS-3340](https://issues.apache.org/jira/browse/ARTEMIS-3340). +- its configuration can change until declared as **officially stable** ### Data Replication @@ -226,12 +223,12 @@ changes and repeats the process. > live server by changing `slave` to `master`. Much like in the shared-store case, when the live server stops or -crashes, its replicating backup will become active and take over its +crashes, its replicating backup will become active and take over its duties. Specifically, the backup will become active when it loses -connection to its live server. This can be problematic because this can -also happen because of a temporary network problem. +connection to its live server. This can be problematic because it can +also happen as the result of a temporary network problem. -This issue is solved in 2 different ways depending on which replication roles are configured: +The issue can be solved in two different ways, depending on which replication roles are configured: - **classic replication** (`master`/`slave` roles): backup will try to determine whether it still can connect to the other servers in the cluster.
If it can connect to more than half the servers, it will become active, if more than half the @@ -275,7 +272,7 @@ The backup server must be similarly configured but as a `slave` ``` -To configure a pluggable quorum replication's primary and backup instead: +To configure a pluggable quorum replication's primary and backup use: ```xml @@ -391,7 +388,6 @@ For `primary`: - true @@ -418,16 +414,16 @@ The configuration of `class-name` as follows ``` isn't really needed, because Apache Curator is the default provider, but has been shown for completeness. -The `properties` element, instead +The `properties` element: ```xml ``` -Can specify a list of `property` elements in the form of key-value pairs, depending the ones -accepted by the specified `class-name` provider. +can specify a list of `property` elements in the form of key-value pairs, appropriate to what is +supported by the specified `class-name` provider. -Apache Curator's provider allow to configure these properties: +Apache Curator's provider allows the following properties: - [`connect-string`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#connectString(java.lang.String)): (no default) - [`session-ms`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#sessionTimeoutMs(int)): (default is 18000 ms) @@ -438,37 +434,26 @@ Apache Curator's provider allow to configure these properties: - [`retries-ms`](https://curator.apache.org/apidocs/org/apache/curator/retry/RetryNTimes.html#%3Cinit%3E(int,int)): (default is 1000 ms) - [`namespace`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#namespace(java.lang.String)): (no default) -Configuration of the [Apache Zookeeper](https://zookeeper.apache.org/) nodes is left to the user, but there are few +Configuration of the [Apache Zookeeper](https://zookeeper.apache.org/) ensemble is the responsibility of the user, but there are few **suggestions to improve the reliability of the quorum service**: - broker `session_ms` must be `>= 2 * server tick time` and `<= 20 * server tick time` as by [Zookeeper 3.6.3 admin guide](https://zookeeper.apache.org/doc/r3.6.3/zookeeperAdmin.html): it directly impacts how fast a backup can failover to an isolated/killed/unresponsive live; the higher, the slower. - GC on broker machine should allow keeping GC pauses within 1/3 of `session_ms` in order to let the Zookeeper heartbeat protocol -to work reliably: if it's not possible, better increase `session_ms` accepting a slower failover + work reliably. If that is not possible, it is better to increase `session_ms`, accepting a slower failover. - Zookeeper must have enough resources to keep GC (and OS) pauses much smaller than server tick time: please consider carefully if broker and Zookeeper node should share the same physical machine, depending on the expected load of the broker - network isolation protection requires configuring >=3 Zookeeper nodes #### *Important*: Notes on pluggable quorum replication configuration -The first `classic` replication configuration that won't apply to the pluggable quorum replication -is `vote-on-replication-failure` and configure it produces a startup error: pluggable quorum replication -always behave like `vote-on-replication-failure` `true` ie shutting down a live broker (and its JVM) in case of quorum loss. 
+The following `classic` replication configurations are no longer needed: + - `vote-on-replication-failure` + - `quorum-vote-wait` + - `vote-retries` + - `vote-retry-wait` + - `check-for-live-server` -The second deprecated `classic` replication configuration is `quorum-vote-wait`: given that the pluggable quorum vote replication -requires backup to have an always-on reliable quorum service, there's no need to specify the timeout to reach -the majority of quorum nodes. A backup remains inactive (ie JVM still up, console too, unable to sync with live, to failover etc etc) -until the majority of quorum nodes is reachable again, re-activating if happens. - -The only exception is with primary failing-back to an existing live backup using `true`: -if the quorum service isn't immediately available the primary (and its JVM) just stop, allowing fail-fast failing-back. - -There are few *semantic differences* of other existing properties: -- `vote-retry-wait`: in `classic` replication means how long to wait between each quorum vote try, while with pluggable quorum replication - means how long request to failover for each attempt -- `vote-retries`: differently from `classic`, the amount of vote attempt is `1 + vote-retries` (with classic is just `vote-retries`). - Setting `0` means no retries, leaving backup to still perform an initial attempt. - **Notes on replication configuration with [Apache curator](https://curator.apache.org/) quorum provider** As said some paragraphs above, `session-ms` affect the failover duration: a backup can @@ -479,34 +464,34 @@ For the former case (session expiration with live no longer present), the backup 1. cluster connection PINGs (affected by [connection-ttl](connection-ttl.md) tuning) 2. closed TCP connection notification (depends by TCP configuration and networking stack/topology) -These 2 cases have 2 different failover duration depending on different factors: -1. `connection-ttl` affect how much time of the expiring `session-ms` is used to just detect a missing live broker: the higher `connection-tt`, - the slower it reacts; backup can attempt to failover for the remaining `session-ms - connection-ttl` -2. `session-ms` expiration is immediately detected: backup must try to failover for >=`session-ms` to be sure to catch - the session expiration and complete failover - -The previous comments are meant to suggest to the careful reader that the minimum time to attempt to failover -cannot be below the full `session-ms` expires. -In short, it means -``` - total failover attempt time > session-ms -``` -with -``` - total failover attempt time = vote-retry-wait * (vote-retries + 1) -``` -and by consequence: -``` - vote-retry-wait * (vote-retries + 1) > session-ms -``` -For example with `session-ms = 18000 ms`, safe values for failover timeout are: -```xml - 11 - 2000 -``` -Because `11 * 2000 = 22000 ms` that's bigger then `18000 ms`. +The suggestion is to tune `connection-ttl` low enough to attempt failover as soon as possible, while taking into consideration that +the whole fail-over duration cannot last less than the configured `session-ms`. -There's no risk that a backup broker will early stop attempting to failover, losing its chance to become live. +##### Peer or Multi Primary +With coordination delegated to the quorum service, roles are less important. It is possible to have two peer servers compete +for activation; the winner activating as live, the loser taking up a backup role. On restart, 'any' peer server +with the most up to date journal can activate.
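To make the competition between peers concrete, here is a hedged sketch against the quorum API shipped with this change (see also the `coordination-id` configuration that follows). It assumes the lock obtained from the manager exposes `tryLock()`/`unlock()` as the reference implementations do, uses the file-based manager for brevity, and the folder path and lock id are hypothetical example values.

```java
import java.util.Collections;

import org.apache.activemq.artemis.quorum.DistributedLock;
import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager;

public class PeerActivationSketch {

   public static void main(String[] args) throws Exception {
      // both peers must use the same coordination id, e.g. "peer-journal-001"
      DistributedPrimitiveManager manager =
         new FileBasedPrimitiveManager(Collections.singletonMap("locks-folder", "/tmp/quorum-locks"));
      manager.start();
      try {
         DistributedLock liveLock = manager.getDistributedLock("peer-journal-001");
         if (liveLock.tryLock()) {
            // winner: activate as live (hold the lock until an orderly shutdown unlocks it)
            System.out.println("won the live lock, activating");
         } else {
            // loser: stay passive and replicate from the winner
            System.out.println("lost the race, acting as backup");
         }
      } finally {
         manager.stop();
      }
   }
}
```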
+The instances need to know in advance, what identity they will coordinate on. +In the replication 'primary' ha policy we can explicitly set the 'coordination-id' to a common value for all peers in a cluster. + +For `multi primary`: +```xml + + + + + org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager + + + + + peer-journal-001 + + + +``` +Note: the string value provided will be converted internally into a 16 byte UUID, so it may not be immediately recognisable or human-readable, +however it will ensure that all 'peers' coordinate. ### Shared Store @@ -637,10 +622,10 @@ another server using its nodeID. If it finds one, it will contact this server and try to "fail-back". Since this is a remote replication scenario, the "starting live" will have to synchronize its data with the server running with its ID, once they are in sync, it will request the -other server (which it assumes it is a back that has assumed its duties) -to shutdown for it to take over. This is necessary because otherwise the +other server (which it assumes it is a backup that has assumed its duties) +to shutdown, for it to take over. This is necessary because otherwise the live server has no means to know whether there was a fail-over or not, -and if there was if the server that took its duties is still running or +and if there was, if the server that took its duties is still running or not. To configure this option at your `broker.xml` configuration file as follows, for classic replication: @@ -663,21 +648,21 @@ And pluggable quorum replication: - true + ``` The key difference from classic replication is that if `master` cannot reach any -live server with its same nodeID, it's going straight to become live, while `primary` -request it to the quorum provider, searching again for any existing live if -the quorum provider is not available (eg connectivity loss, consensus absence) or -if there's another live broker with the same nodeID alive, in an endless loop. +live server with its nodeID, it activates unilaterally. +With `primary`, the responsibilities of coordination are delegated to the quorum provider, +there are no unilateral decisions. The `primary` will only activate when +it knows that it has the most up to date version of the journal identified by its nodeID. In short: a started `primary` cannot become live without consensus. -> **Warning** +> **Warning for classic replication** > > Be aware that if you restart a live server while after failover has > occurred then `check-for-live-server` must be set to `true`. If not the live server diff --git a/examples/features/ha/pom.xml b/examples/features/ha/pom.xml index 36eb4403ab..92e2198842 100644 --- a/examples/features/ha/pom.xml +++ b/examples/features/ha/pom.xml @@ -62,6 +62,7 @@ under the License. scale-down stop-server-failover transaction-failover + zookeeper-single-pair-failback @@ -81,6 +82,7 @@ under the License. replicated-transaction-failover scale-down transaction-failover + zookeeper-single-pair-failback diff --git a/examples/features/ha/zookeeper-single-pair-failback/pom.xml b/examples/features/ha/zookeeper-single-pair-failback/pom.xml new file mode 100644 index 0000000000..51044d7145 --- /dev/null +++ b/examples/features/ha/zookeeper-single-pair-failback/pom.xml @@ -0,0 +1,110 @@ + + + + + 4.0.0 + + + org.apache.activemq.examples.failover + broker-failover + 2.18.0-SNAPSHOT + + + zookeeper-single-pair-ordered-failback + jar + ActiveMQ Artemis Zookeeper Single Pair Ordered Failback Example + + + ${project.basedir}/../../../.. 
+ + + + + org.apache.activemq + artemis-cli + ${project.version} + + + jakarta.jms + jakarta.jms-api + + + + + + + org.apache.activemq + artemis-maven-plugin + + + create0 + + create + + + + -Djava.net.preferIPv4Stack=true + ${basedir}/target/server0 + ${basedir}/target/classes/activemq/server0 + -Dudp-address=${udp-address} + + + + create1 + + create + + + + -Djava.net.preferIPv4Stack=true + ${basedir}/target/server1 + ${basedir}/target/classes/activemq/server1 + -Dudp-address=${udp-address} + + + + runClient + + runClient + + + org.apache.activemq.artemis.jms.example.ZookeeperSinglePairFailback + + ${basedir}/target/server0 + ${basedir}/target/server1 + + + + + + + org.apache.activemq.examples.failover + zookeeper-single-pair-ordered-failback + 2.18.0-SNAPSHOT + + + + + org.apache.maven.plugins + maven-clean-plugin + + + + diff --git a/examples/features/ha/zookeeper-single-pair-failback/readme.md b/examples/features/ha/zookeeper-single-pair-failback/readme.md new file mode 100644 index 0000000000..f9fc00f5b1 --- /dev/null +++ b/examples/features/ha/zookeeper-single-pair-failback/readme.md @@ -0,0 +1,94 @@ +# Zookeeper Single Pair Failback Example + +This example demonstrates two servers coupled as a primary-backup pair for high availability (HA) using +pluggable quorum vote replication Reference Implementation based on [Apache Curator](https://curator.apache.org/) to use +[Apache Zookeeper](https://zookeeper.apache.org/) as external quorum service. + +The example shows a client connection failing over from live to backup when the live broker is crashed and +then back to the original live when it is restarted (i.e. "failback"). + +To run the example, simply type **mvn verify** from this directory after running a Zookeeper node at `localhost:2181`. + +If no Zookeeper node is configured, can use the commands below (see [Official Zookeeper Docker Image Site](https://hub.docker.com/_/zookeeper) +for more details on how configure it). + +Run Zookeeper `3.6.3` with: +``` +$ docker run --name artemis-zk --network host --restart always -d zookeeper:3.6.3 +``` +By default, the official docker image exposes `2181 2888 3888 8080` as client, follower, election and AdminServer ports. + +Verify Zookeeper server is correctly started by running: +``` +$ docker logs --follow artemis-zk +``` +It should print the Zookeeper welcome ASCII logs: +``` +ZooKeeper JMX enabled by default +Using config: /conf/zoo.cfg +2021-08-05 14:29:29,431 [myid:] - INFO [main:QuorumPeerConfig@174] - Reading configuration from: /conf/zoo.cfg +2021-08-05 14:29:29,434 [myid:] - INFO [main:QuorumPeerConfig@451] - clientPort is not set +2021-08-05 14:29:29,434 [myid:] - INFO [main:QuorumPeerConfig@464] - secureClientPort is not set +2021-08-05 14:29:29,434 [myid:] - INFO [main:QuorumPeerConfig@480] - observerMasterPort is not set +2021-08-05 14:29:29,435 [myid:] - INFO [main:QuorumPeerConfig@497] - metricsProvider.className is org.apache.zookeeper.metrics.impl.DefaultMetricsProvider +2021-08-05 14:29:29,438 [myid:] - ERROR [main:QuorumPeerConfig@722] - Invalid configuration, only one server specified (ignoring) +2021-08-05 14:29:29,441 [myid:1] - INFO [main:DatadirCleanupManager@78] - autopurge.snapRetainCount set to 3 +2021-08-05 14:29:29,441 [myid:1] - INFO [main:DatadirCleanupManager@79] - autopurge.purgeInterval set to 0 +2021-08-05 14:29:29,441 [myid:1] - INFO [main:DatadirCleanupManager@101] - Purge task is not scheduled. 
+2021-08-05 14:29:29,441 [myid:1] - WARN [main:QuorumPeerMain@138] - Either no config or no quorum defined in config, running in standalone mode +2021-08-05 14:29:29,444 [myid:1] - INFO [main:ManagedUtil@44] - Log4j 1.2 jmx support found and enabled. +2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@174] - Reading configuration from: /conf/zoo.cfg +2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@451] - clientPort is not set +2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@464] - secureClientPort is not set +2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@480] - observerMasterPort is not set +2021-08-05 14:29:29,450 [myid:1] - INFO [main:QuorumPeerConfig@497] - metricsProvider.className is org.apache.zookeeper.metrics.impl.DefaultMetricsProvider +2021-08-05 14:29:29,450 [myid:1] - ERROR [main:QuorumPeerConfig@722] - Invalid configuration, only one server specified (ignoring) +2021-08-05 14:29:29,451 [myid:1] - INFO [main:ZooKeeperServerMain@122] - Starting server +2021-08-05 14:29:29,459 [myid:1] - INFO [main:ServerMetrics@62] - ServerMetrics initialized with provider org.apache.zookeeper.metrics.impl.DefaultMetricsProvider@525f1e4e +2021-08-05 14:29:29,461 [myid:1] - INFO [main:FileTxnSnapLog@124] - zookeeper.snapshot.trust.empty : false +2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - +2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - ______ _ +2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - |___ / | | +2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - / / ___ ___ | | __ ___ ___ _ __ ___ _ __ +2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - / / / _ \ / _ \ | |/ / / _ \ / _ \ | '_ \ / _ \ | '__| +2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - / /__ | (_) | | (_) | | < | __/ | __/ | |_) | | __/ | | +2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - /_____| \___/ \___/ |_|\_\ \___| \___| | .__/ \___| |_| +2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - | | +2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - |_| +2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - +``` +Alternatively, can run +``` +$ docker run -it --rm --network host zookeeper:3.6.3 zkCli.sh -server localhost:2181 +``` +Zookeeper server can be reached using localhost:2181 if it output something like: +``` +2021-08-05 14:56:03,739 [myid:localhost:2181] - INFO [main-SendThread(localhost:2181):ClientCnxn$SendThread@1448] - Session establishment complete on server localhost/0:0:0:0:0:0:0:1:2181, session id = 0x100078b8cfc0002, negotiated timeout = 30000 + +``` +Type +``` +[zk: localhost:2181(CONNECTED) 0] quit +``` +to quit the client instance. + +The 2 brokers of this example are already configured to connect to a single Zookeeper node at the mentioned address, thanks to the XML configuration of their `manager`: +```xml + + + + + + + +``` +**NOTE** the `namespace` parameter is used to separate the pair information from others if the Zookeeper node is shared with other applications. + +**WARNING** As already recommended on the [High Availability section](https://activemq.apache.org/components/artemis/documentation/latest/ha.html), a production environment needs >= 3 nodes to protect against network partitions. 
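For completeness, the same Zookeeper-backed manager can also be wired programmatically (e.g. for an embedded broker) through the configuration classes introduced by this change. This is only a sketch: the connect string matches the single Zookeeper node used by this example, while the namespace value is hypothetical.

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.activemq.artemis.core.config.Configuration;
import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.core.config.impl.ConfigurationImpl;

public class ZkPrimaryConfigSketch {

   public static Configuration primaryConfiguration() {
      Map<String, String> props = new HashMap<>();
      props.put("connect-string", "localhost:2181"); // the single Zookeeper node used by this example
      props.put("namespace", "examples");            // hypothetical namespace to separate this pair's znodes
      DistributedPrimitiveManagerConfiguration manager = new DistributedPrimitiveManagerConfiguration(
         "org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager", props);
      ReplicationPrimaryPolicyConfiguration haPolicy = ReplicationPrimaryPolicyConfiguration.withDefault();
      haPolicy.setDistributedManagerConfiguration(manager);
      return new ConfigurationImpl().setHAPolicyConfiguration(haPolicy);
   }
}
```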
+ +This example can be run with +``` +$ mvn verify +``` + +For more information on ActiveMQ Artemis failover and HA, and clustering in general, please see the clustering section of the user manual. \ No newline at end of file diff --git a/examples/features/ha/zookeeper-single-pair-failback/src/main/java/org/apache/activemq/artemis/jms/example/ZookeeperSinglePairFailback.java b/examples/features/ha/zookeeper-single-pair-failback/src/main/java/org/apache/activemq/artemis/jms/example/ZookeeperSinglePairFailback.java new file mode 100644 index 0000000000..fa1172c572 --- /dev/null +++ b/examples/features/ha/zookeeper-single-pair-failback/src/main/java/org/apache/activemq/artemis/jms/example/ZookeeperSinglePairFailback.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.jms.example; + +import javax.jms.Connection; +import javax.jms.ConnectionFactory; +import javax.jms.JMSException; +import javax.jms.MessageConsumer; +import javax.jms.MessageProducer; +import javax.jms.Queue; +import javax.jms.Session; +import javax.jms.TextMessage; +import javax.naming.InitialContext; + +import org.apache.activemq.artemis.util.ServerUtil; + +/** + * Example of live and replicating backup pair using Zookeeper as the quorum provider. + *

+ * After both servers are started, the live server is killed and the backup becomes active ("fails-over"). + *

+ * Later the live server is restarted and takes back its position by asking the backup to stop ("fail-back"). + */ +public class ZookeeperSinglePairFailback { + + private static Process server0; + + private static Process server1; + + public static void main(final String[] args) throws Exception { + // Step 0. Prepare Zookeeper Evironment as shown on readme.md + + final int numMessages = 30; + + Connection connection = null; + + InitialContext initialContext = null; + + try { + server0 = ServerUtil.startServer(args[0], ZookeeperSinglePairFailback.class.getSimpleName() + "-primary", 0, 30000); + server1 = ServerUtil.startServer(args[1], ZookeeperSinglePairFailback.class.getSimpleName() + "-backup", 1, 10000); + + // Step 2. Get an initial context for looking up JNDI from the server #1 + initialContext = new InitialContext(); + + // Step 3. Look up the JMS resources from JNDI + Queue queue = (Queue) initialContext.lookup("queue/exampleQueue"); + ConnectionFactory connectionFactory = (ConnectionFactory) initialContext.lookup("ConnectionFactory"); + + // Step 4. Create a JMS Connection + connection = connectionFactory.createConnection(); + + // Step 5. Create a *non-transacted* JMS Session with client acknowledgement + Session session = connection.createSession(false, Session.CLIENT_ACKNOWLEDGE); + + // Step 6. Start the connection to ensure delivery occurs + connection.start(); + + // Step 7. Create a JMS MessageProducer and a MessageConsumer + MessageProducer producer = session.createProducer(queue); + MessageConsumer consumer = session.createConsumer(queue); + + // Step 8. Send some messages to server #1, the live server + for (int i = 0; i < numMessages; i++) { + TextMessage message = session.createTextMessage("This is text message " + i); + producer.send(message); + System.out.println("Sent message: " + message.getText()); + } + + // Step 9. Receive and acknowledge a third of the sent messages + TextMessage message0 = null; + for (int i = 0; i < numMessages / 3; i++) { + message0 = (TextMessage) consumer.receive(5000); + System.out.println("Got message: " + message0.getText()); + } + message0.acknowledge(); + System.out.println("Received and acknowledged a third of the sent messages"); + + // Step 10. Receive the rest third of the sent messages but *do not* acknowledge them yet + for (int i = numMessages / 3; i < numMessages; i++) { + message0 = (TextMessage) consumer.receive(5000); + System.out.println("Got message: " + message0.getText()); + } + System.out.println("Received without acknowledged the rest of the sent messages"); + + Thread.sleep(2000); + // Step 11. Crash server #0, the live server, and wait a little while to make sure + // it has really crashed + ServerUtil.killServer(server0); + System.out.println("Killed primary"); + + Thread.sleep(2000); + + // Step 12. Acknowledging the received messages will fail as failover to the backup server has occurred + try { + message0.acknowledge(); + } catch (JMSException e) { + System.out.println("Got (the expected) exception while acknowledging message: " + e.getMessage()); + } + + // Step 13. Consume again the 2nd third of the messages again. Note that they are not considered as redelivered. + for (int i = numMessages / 3; i < (numMessages / 3) * 2; i++) { + message0 = (TextMessage) consumer.receive(5000); + System.out.printf("Got message: %s (redelivered?: %s)\n", message0.getText(), message0.getJMSRedelivered()); + } + + // Step 14. 
Acknowledging them on the failed-over broker works fine + message0.acknowledge(); + System.out.println("Acknowledged 2n third of messages"); + + // Step 15. Restarting primary + server0 = ServerUtil.startServer(args[0], ZookeeperSinglePairFailback.class.getSimpleName() + "-primary", 0, 10000); + System.out.println("Started primary"); + + // await fail-back to complete + Thread.sleep(4000); + + // Step 16. Consuming the 3rd third of the messages. Note that they are not considered as redelivered. + for (int i = (numMessages / 3) * 2; i < numMessages; i++) { + message0 = (TextMessage) consumer.receive(5000); + System.out.printf("Got message: %s (redelivered?: %s)\n", message0.getText(), message0.getJMSRedelivered()); + } + message0.acknowledge(); + System.out.println("Acknowledged 3d third of messages"); + + } finally { + // Step 17. Be sure to close our resources! + + if (connection != null) { + connection.close(); + } + + if (initialContext != null) { + initialContext.close(); + } + + ServerUtil.killServer(server0); + ServerUtil.killServer(server1); + + // Step 18. stop the ZK server + } + } +} diff --git a/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server0/broker.xml b/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server0/broker.xml new file mode 100644 index 0000000000..1223fca0e3 --- /dev/null +++ b/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server0/broker.xml @@ -0,0 +1,90 @@ + + + + + + ./data/bindings + + ./data/journal + + ./data/largemessages + + ./data/paging + + exampleUser + + secret + + + + + + + + + + + + + + + + + tcp://localhost:61616 + tcp://localhost:61617 + + + + + tcp://localhost:61616 + + + + + netty-connector + + netty-backup-connector + + + + + + + + + + + + + + + + + + + +

+ + + +
+ + + diff --git a/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server1/broker.xml b/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server1/broker.xml new file mode 100644 index 0000000000..bfe63dacda --- /dev/null +++ b/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/activemq/server1/broker.xml @@ -0,0 +1,91 @@ + + + + + + ./data/bindings + + ./data/journal + + ./data/largemessages + + ./data/paging + + exampleUser + + secret + + + + + + + + + + + + true + + + + + + + tcp://localhost:61616 + tcp://localhost:61617 + + + + + tcp://localhost:61617 + + + + + netty-connector + + netty-live-connector + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
diff --git a/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/jndi.properties b/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/jndi.properties new file mode 100644 index 0000000000..7f7a19f280 --- /dev/null +++ b/examples/features/ha/zookeeper-single-pair-failback/src/main/resources/jndi.properties @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +java.naming.factory.initial=org.apache.activemq.artemis.jndi.ActiveMQInitialContextFactory +connectionFactory.ConnectionFactory=tcp://localhost:61616?ha=true&retryInterval=1000&retryIntervalMultiplier=1.0&reconnectAttempts=-1 +queue.queue/exampleQueue=exampleQueue diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/InfiniteRedeliveryTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/InfiniteRedeliveryTest.java index a69c44ec18..50c33a9ec7 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/InfiniteRedeliveryTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/InfiniteRedeliveryTest.java @@ -77,7 +77,6 @@ public class InfiniteRedeliveryTest extends ActiveMQTestBase { Configuration backupConfig; Configuration liveConfig; - NodeManager nodeManager; protected TestableServer createTestableServer(Configuration config, NodeManager nodeManager) throws Exception { boolean isBackup = config.getHAPolicyConfiguration() instanceof ReplicaPolicyConfiguration || config.getHAPolicyConfiguration() instanceof SharedStoreSlavePolicyConfiguration; @@ -99,13 +98,11 @@ public class InfiniteRedeliveryTest extends ActiveMQTestBase { - nodeManager = new InVMNodeManager(true, backupConfig.getJournalLocation()); - - backupServer = createTestableServer(backupConfig, nodeManager); + backupServer = createTestableServer(backupConfig, new InVMNodeManager(true, backupConfig.getJournalLocation())); liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(TransportConfigurationUtils.getNettyAcceptor(true, 0)); - liveServer = createTestableServer(liveConfig, nodeManager); + liveServer = createTestableServer(liveConfig, new InVMNodeManager(false, liveConfig.getJournalLocation())); } protected void configureReplicationPair(TransportConfiguration backupConnector, diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTest.java index a13ef3dbbb..3dcf9a9d22 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTest.java +++ 
b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTest.java @@ -659,6 +659,8 @@ public class FailoverTest extends FailoverTestBase { backupServer.getServer().fail(true); + decrementActivationSequenceForForceRestartOf(liveServer); + liveServer.start(); consumer.close(); @@ -823,6 +825,7 @@ public class FailoverTest extends FailoverTestBase { Assert.assertFalse("must NOT be a backup", isBackup); adaptLiveConfigForReplicatedFailBack(liveServer); beforeRestart(liveServer); + decrementActivationSequenceForForceRestartOf(liveServer); liveServer.start(); Assert.assertTrue("live initialized...", liveServer.getServer().waitForActivation(15, TimeUnit.SECONDS)); @@ -931,12 +934,13 @@ public class FailoverTest extends FailoverTestBase { while (!backupServer.isStarted() && i++ < 100) { Thread.sleep(100); } - liveServer.getServer().waitForActivation(5, TimeUnit.SECONDS); + backupServer.getServer().waitForActivation(5, TimeUnit.SECONDS); Assert.assertTrue(backupServer.isStarted()); if (isReplicated) { FileMoveManager moveManager = new FileMoveManager(backupServer.getServer().getConfiguration().getJournalLocation(), 0); - Assert.assertEquals(1, moveManager.getNumberOfFolders()); + // backup has not had a chance to restart as a backup and cleanup + Wait.assertTrue(() -> moveManager.getNumberOfFolders() <= 2); } } else { backupServer.stop(); @@ -2427,6 +2431,10 @@ public class FailoverTest extends FailoverTestBase { // no-op } + protected void decrementActivationSequenceForForceRestartOf(TestableServer liveServer) throws Exception { + // no-op + } + protected ClientSession sendAndConsume(final ClientSessionFactory sf1, final boolean createQueue) throws Exception { ClientSession session = createSession(sf1, false, true, true); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java index 1bcd820a1b..d863cf59fb 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java @@ -46,7 +46,6 @@ import org.apache.activemq.artemis.core.remoting.impl.invm.InVMRegistry; import org.apache.activemq.artemis.core.server.NodeManager; import org.apache.activemq.artemis.core.server.cluster.ha.HAPolicy; import org.apache.activemq.artemis.core.server.cluster.ha.ReplicatedPolicy; -import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationPrimaryPolicy; import org.apache.activemq.artemis.core.server.impl.ActiveMQServerImpl; import org.apache.activemq.artemis.core.server.impl.InVMNodeManager; import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; @@ -88,6 +87,10 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { protected NodeManager nodeManager; + protected NodeManager backupNodeManager; + + protected DistributedPrimitiveManagerConfiguration managerConfiguration; + protected boolean startBackupServer = true; @Override @@ -173,6 +176,10 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { return new InVMNodeManager(false); } + protected NodeManager createNodeManager(Configuration configuration) throws Exception { + return new InVMNodeManager(false, configuration.getNodeManagerLockLocation()); + } + 
protected void createConfigs() throws Exception { nodeManager = createNodeManager(); TransportConfiguration liveConnector = getConnectorTransportConfiguration(true); @@ -211,13 +218,14 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { backupConfig.setBindingsDirectory(getBindingsDir(0, true)).setJournalDirectory(getJournalDir(0, true)).setPagingDirectory(getPageDir(0, true)).setLargeMessagesDirectory(getLargeMessagesDir(0, true)).setSecurityEnabled(false); setupHAPolicyConfiguration(); - nodeManager = createReplicatedBackupNodeManager(backupConfig); + backupNodeManager = createReplicatedBackupNodeManager(backupConfig); - backupServer = createTestableServer(backupConfig); + backupServer = createTestableServer(backupConfig, backupNodeManager); liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true)); - liveServer = createTestableServer(liveConfig); + nodeManager = createNodeManager(liveConfig); + liveServer = createTestableServer(liveConfig, nodeManager); if (supportsRetention()) { liveServer.getServer().getConfiguration().setJournalRetentionDirectory(getJournalDir(0, false) + "_retention"); @@ -233,7 +241,7 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { backupConfig = createDefaultInVMConfig(); liveConfig = createDefaultInVMConfig(); - DistributedPrimitiveManagerConfiguration managerConfiguration = + managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); @@ -242,13 +250,14 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { backupConfig.setBindingsDirectory(getBindingsDir(0, true)).setJournalDirectory(getJournalDir(0, true)).setPagingDirectory(getPageDir(0, true)).setLargeMessagesDirectory(getLargeMessagesDir(0, true)).setSecurityEnabled(false); setupHAPolicyConfiguration(); - nodeManager = createReplicatedBackupNodeManager(backupConfig); + backupNodeManager = createReplicatedBackupNodeManager(backupConfig); - backupServer = createTestableServer(backupConfig); + backupServer = createTestableServer(backupConfig, backupNodeManager); liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true)); - liveServer = createTestableServer(liveConfig); + nodeManager = createNodeManager(liveConfig); + liveServer = createTestableServer(liveConfig, nodeManager); } protected void setupHAPolicyConfiguration() { @@ -272,8 +281,6 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { HAPolicy policy = server.getServer().getHAPolicy(); if (policy instanceof ReplicatedPolicy) { ((ReplicatedPolicy) policy).setCheckForLiveServer(true); - } else if (policy instanceof ReplicationPrimaryPolicy) { - Assert.assertTrue("Adapting won't work for the current configuration", ((ReplicationPrimaryPolicy) policy).isCheckForLiveServer()); } } @@ -294,6 +301,7 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { nodeManager = null; + backupNodeManager = null; try { ServerSocket serverSocket = new ServerSocket(61616); serverSocket.close(); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java index a3f19cf412..49bd396b05 100644 --- 
a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java @@ -182,8 +182,6 @@ public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase { break; case PluggableQuorumReplication: haPolicyConfiguration = ReplicationBackupPolicyConfiguration.withDefault() - .setVoteRetries(1) - .setVoteRetryWait(1000) .setDistributedManagerConfiguration(getOrCreatePluggableQuorumConfiguration()) .setGroupName(getNodeGroupName() != null ? (getNodeGroupName() + "-" + i) : null); break; @@ -278,12 +276,14 @@ public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase { return addClientSession(sf.createSession(xa, autoCommitSends, autoCommitAcks)); } - protected void waitForDistribution(SimpleString address, ActiveMQServer server, int messageCount) throws Exception { + protected boolean waitForDistribution(SimpleString address, ActiveMQServer server, int messageCount) throws Exception { ActiveMQServerLogger.LOGGER.debug("waiting for distribution of messages on server " + server); Queue q = (Queue) server.getPostOffice().getBinding(address).getBindable(); - Wait.waitFor(() -> getMessageCount(q) >= messageCount); + return Wait.waitFor(() -> { + return getMessageCount(q) >= messageCount; + }); } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicaTimeoutTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicaTimeoutTest.java index d4a9c3d1e1..ce775368ee 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicaTimeoutTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicaTimeoutTest.java @@ -156,12 +156,13 @@ public class ReplicaTimeoutTest extends ActiveMQTestBase { liveConfig.setBindingsDirectory(getBindingsDir(0, false)).setJournalDirectory(getJournalDir(0, false)). 
setPagingDirectory(getPageDir(0, false)).setLargeMessagesDirectory(getLargeMessagesDir(0, false)).setSecurityEnabled(false); - NodeManager nodeManager = createReplicatedBackupNodeManager(backupConfig); + NodeManager replicatedBackupNodeManager = createReplicatedBackupNodeManager(backupConfig); - backupServer = createTestableServer(backupConfig, nodeManager); + backupServer = createTestableServer(backupConfig, replicatedBackupNodeManager); liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true)); + NodeManager nodeManager = createReplicatedBackupNodeManager(liveConfig); liveServer = createTestableServer(liveConfig, nodeManager); final TestableServer theBackup = backupServer; diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverExtraBackupsTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverExtraBackupsTest.java index 6712c74692..15cdd621f1 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverExtraBackupsTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/ReplicatedMultipleServerFailoverExtraBackupsTest.java @@ -75,7 +75,10 @@ public class ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated sendCrashReceive(); Wait.assertTrue(backupServers.get(0)::isActive, 5000, 10); + Wait.assertTrue(backupServers.get(1)::isActive, 5000, 10); waitForTopology(backupServers.get(0).getServer(), liveServers.size(), 2); + waitForTopology(backupServers.get(1).getServer(), liveServers.size(), 2); + sendCrashBackupReceive(); } @@ -115,6 +118,14 @@ public class ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated } protected void sendCrashBackupReceive() throws Exception { + + //make sure bindings are ready before sending messages b/c we verify strict load balancing in waitForDistribution + this.waitForBindings( backupServers.get(0).getServer(), ADDRESS.toString(), false, 1, 0, 2000); + this.waitForBindings( backupServers.get(0).getServer(), ADDRESS.toString(), false, 1, 0, 2000); + + this.waitForBindings( backupServers.get(1).getServer(), ADDRESS.toString(), false, 1, 0, 2000); + this.waitForBindings( backupServers.get(1).getServer(), ADDRESS.toString(), false, 1, 0, 2000); + ServerLocator locator0 = getBackupServerLocator(0); ServerLocator locator1 = getBackupServerLocator(1); @@ -138,8 +149,8 @@ public class ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated producer.close(); - waitForDistribution(ADDRESS, backupServers.get(0).getServer(), 100); - waitForDistribution(ADDRESS, backupServers.get(1).getServer(), 100); + assertTrue(waitForDistribution(ADDRESS, backupServers.get(0).getServer(), 100)); + assertTrue(waitForDistribution(ADDRESS, backupServers.get(1).getServer(), 100)); List toCrash = new ArrayList<>(); for (TestableServer backupServer : backupServers) { diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupAuthenticationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupAuthenticationTest.java index b1e6a94dc2..95a69262ed 100644 --- 
a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupAuthenticationTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupAuthenticationTest.java @@ -24,7 +24,6 @@ import org.apache.activemq.artemis.api.core.ActiveMQException; import org.apache.activemq.artemis.api.core.Interceptor; import org.apache.activemq.artemis.api.core.TransportConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.core.protocol.core.Packet; import org.apache.activemq.artemis.core.protocol.core.impl.PacketImpl; import org.apache.activemq.artemis.spi.core.protocol.RemotingConnection; @@ -90,7 +89,6 @@ public class PluggableQuorumBackupAuthenticationTest extends FailoverTestBase { @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupSyncJournalTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupSyncJournalTest.java index 6f620a64f2..0f10c44934 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupSyncJournalTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumBackupSyncJournalTest.java @@ -17,8 +17,7 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; -import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;; import org.apache.activemq.artemis.tests.integration.cluster.failover.BackupSyncJournalTest; public class PluggableQuorumBackupSyncJournalTest extends BackupSyncJournalTest { @@ -30,8 +29,6 @@ public class PluggableQuorumBackupSyncJournalTest extends BackupSyncJournalTest @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()) - .setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()) .setMaxSavedReplicatedJournalsSize(2) .setAllowFailBack(true); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.java index eb2f1dc2a0..274fb5acb8 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.java +++ 
b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.java @@ -29,18 +29,24 @@ import org.apache.activemq.artemis.api.core.QueueConfiguration; import org.apache.activemq.artemis.api.core.TransportConfiguration; import org.apache.activemq.artemis.api.core.client.ClientSession; import org.apache.activemq.artemis.component.WebServerComponent; +import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.core.server.ActiveMQServer; +import org.apache.activemq.artemis.core.server.NodeManager; import org.apache.activemq.artemis.core.server.ServiceComponent; import org.apache.activemq.artemis.dto.AppDTO; import org.apache.activemq.artemis.dto.WebServerDTO; +import org.apache.activemq.artemis.quorum.MutableLong; +import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTest; +import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer; import org.apache.activemq.artemis.tests.util.Wait; +import org.jboss.logging.Logger; import org.junit.Assert; import org.junit.Test; public class PluggableQuorumNettyNoGroupNameReplicatedFailoverTest extends FailoverTest { + private static final Logger log = Logger.getLogger(PluggableQuorumReplicatedLargeMessageFailoverTest.class); protected void beforeWaitForRemoteBackupSynchronization() { } @@ -170,8 +176,6 @@ public class PluggableQuorumNettyNoGroupNameReplicatedFailoverTest extends Failo @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()) - .setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()) .setMaxSavedReplicatedJournalsSize(2) .setAllowFailBack(true); @@ -210,4 +214,30 @@ public class PluggableQuorumNettyNoGroupNameReplicatedFailoverTest extends Failo } super.crash(sessions); } + + @Override + protected void decrementActivationSequenceForForceRestartOf(TestableServer testableServer) throws Exception { + doDecrementActivationSequenceForForceRestartOf(log, nodeManager, managerConfiguration); + } + + public static void doDecrementActivationSequenceForForceRestartOf(Logger log, NodeManager nodeManager, DistributedPrimitiveManagerConfiguration distributedPrimitiveManagerConfiguration) throws Exception { + nodeManager.start(); + long localActivation = nodeManager.readNodeActivationSequence(); + // file based + FileBasedPrimitiveManager fileBasedPrimitiveManager = new FileBasedPrimitiveManager(distributedPrimitiveManagerConfiguration.getProperties()); + fileBasedPrimitiveManager.start(); + try { + MutableLong mutableLong = fileBasedPrimitiveManager.getMutableLong(nodeManager.getNodeId().toString()); + + if (!mutableLong.compareAndSet(localActivation + 1, localActivation)) { + throw new Exception("Failed to decrement coordinated activation sequence to:" + localActivation + ", not +1 : " + mutableLong.get()); + } + log.warn("Intentionally decrementing coordinated activation sequence for test, may result is lost data"); + + } finally { + fileBasedPrimitiveManager.stop(); + nodeManager.stop(); + } + } + } diff --git 
a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyReplicationStopTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyReplicationStopTest.java index 1381cad3c4..a42547c6e7 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyReplicationStopTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumNettyReplicationStopTest.java @@ -17,7 +17,6 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.tests.integration.cluster.failover.NettyReplicationStopTest; public class PluggableQuorumNettyReplicationStopTest extends NettyReplicationStopTest { @@ -29,7 +28,6 @@ public class PluggableQuorumNettyReplicationStopTest extends NettyReplicationSto @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumPageCleanupWhileReplicaCatchupTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumPageCleanupWhileReplicaCatchupTest.java index b5b8b3af02..b351706e43 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumPageCleanupWhileReplicaCatchupTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumPageCleanupWhileReplicaCatchupTest.java @@ -17,7 +17,6 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.tests.integration.cluster.failover.PageCleanupWhileReplicaCatchupTest; public class PluggableQuorumPageCleanupWhileReplicaCatchupTest extends PageCleanupWhileReplicaCatchupTest { @@ -29,7 +28,6 @@ public class PluggableQuorumPageCleanupWhileReplicaCatchupTest extends PageClean @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java index 4c15d7bd3b..0718962e10 100644 --- 
a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java @@ -48,7 +48,6 @@ public class PluggableQuorumReplicaTimeoutTest extends ReplicaTimeoutTest { managerConfiguration, managerConfiguration); ReplicationPrimaryPolicyConfiguration primaryConfiguration = ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()); primaryConfiguration.setInitialReplicationSyncTimeout(1000); - primaryConfiguration.setCheckForLiveServer(true); ReplicationBackupPolicyConfiguration backupConfiguration = ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()); backupConfiguration.setInitialReplicationSyncTimeout(1000); backupConfiguration.setMaxSavedReplicatedJournalsSize(2) diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageFailoverTest.java index 9d7cb4d3b2..a675200508 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageFailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageFailoverTest.java @@ -18,11 +18,15 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; import org.apache.activemq.artemis.api.core.client.ClientSession; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.tests.integration.cluster.failover.LargeMessageFailoverTest; +import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer; +import org.jboss.logging.Logger; + +import static org.apache.activemq.artemis.tests.integration.cluster.failover.quorum.PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.doDecrementActivationSequenceForForceRestartOf; public class PluggableQuorumReplicatedLargeMessageFailoverTest extends LargeMessageFailoverTest { + private static final Logger log = Logger.getLogger(PluggableQuorumReplicatedLargeMessageFailoverTest.class); @Override protected void createConfigs() throws Exception { createPluggableReplicatedConfigs(); @@ -30,7 +34,6 @@ public class PluggableQuorumReplicatedLargeMessageFailoverTest extends LargeMess @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } @@ -58,4 +61,8 @@ public class PluggableQuorumReplicatedLargeMessageFailoverTest extends LargeMess super.crash(sessions); } + @Override + protected void decrementActivationSequenceForForceRestartOf(TestableServer liveServer) throws Exception { + doDecrementActivationSequenceForForceRestartOf(log, nodeManager, managerConfiguration); + } } diff --git 
a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest.java index 5954d8992d..a1e152e813 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest.java @@ -18,7 +18,6 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; import org.apache.activemq.artemis.api.core.client.ClientSession; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.tests.integration.cluster.util.BackupSyncDelay; import org.junit.After; import org.junit.Before; @@ -66,7 +65,6 @@ public class PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest extends @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()) .setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedPagingFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedPagingFailoverTest.java index d80460855b..9c4d60004f 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedPagingFailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicatedPagingFailoverTest.java @@ -17,7 +17,6 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.tests.integration.cluster.failover.PagingFailoverTest; public class PluggableQuorumReplicatedPagingFailoverTest extends PagingFailoverTest { @@ -29,7 +28,6 @@ public class PluggableQuorumReplicatedPagingFailoverTest extends PagingFailoverT @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/management/ActiveMQServerControlUsingCoreTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/management/ActiveMQServerControlUsingCoreTest.java index 7eb69530c5..ab060fac52 100644 --- 
a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/management/ActiveMQServerControlUsingCoreTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/management/ActiveMQServerControlUsingCoreTest.java @@ -520,6 +520,11 @@ public class ActiveMQServerControlUsingCoreTest extends ActiveMQServerControlTes return (String) proxy.retrieveAttributeValue("nodeID"); } + @Override + public long getActivationSequence() { + return (Long) proxy.retrieveAttributeValue("activationSequence"); + } + @Override public String getManagementAddress() { return (String) proxy.retrieveAttributeValue("managementAddress"); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java index 11e4380246..1a84de67f5 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java @@ -45,8 +45,6 @@ public class PluggableQuorumReplicationFlowControlTest extends SharedNothingRepl ReplicationBackupPolicyConfiguration haPolicy = ReplicationBackupPolicyConfiguration.withDefault(); haPolicy.setDistributedManagerConfiguration(managerConfiguration); haPolicy.setClusterName("cluster"); - // fail-fast in order to let the backup to quickly retry syncing with primary - haPolicy.setVoteRetries(0); return haPolicy; } @@ -54,7 +52,6 @@ public class PluggableQuorumReplicationFlowControlTest extends SharedNothingRepl protected HAPolicyConfiguration createReplicationLiveConfiguration() { ReplicationPrimaryPolicyConfiguration haPolicy = ReplicationPrimaryPolicyConfiguration.withDefault(); haPolicy.setDistributedManagerConfiguration(managerConfiguration); - haPolicy.setCheckForLiveServer(false); return haPolicy; } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationOrderTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationOrderTest.java index f5ed720792..8d71a57e4e 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationOrderTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationOrderTest.java @@ -18,7 +18,6 @@ package org.apache.activemq.artemis.tests.integration.replication; import org.apache.activemq.artemis.api.core.TransportConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; @@ -30,7 +29,7 @@ public class PluggableQuorumReplicationOrderTest extends ReplicationOrderTest { @Parameterized.Parameter public boolean useNetty; - @Parameterized.Parameters(name = "useNetty={1}") + @Parameterized.Parameters(name = "useNetty={0}") public static Iterable getParams() { return asList(new Object[][]{{false}, {true}}); } @@ -42,8 +41,6 @@ public class 
PluggableQuorumReplicationOrderTest extends ReplicationOrderTest { @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()) - .setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()) .setMaxSavedReplicatedJournalsSize(2) .setAllowFailBack(true); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java index 1d702b5b88..714b2982db 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java @@ -18,14 +18,37 @@ package org.apache.activemq.artemis.tests.integration.replication; import java.io.IOException; import java.util.Collections; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import org.apache.activemq.artemis.api.core.Message; +import org.apache.activemq.artemis.api.core.QueueConfiguration; +import org.apache.activemq.artemis.api.core.RoutingType; +import org.apache.activemq.artemis.api.core.client.ClientConsumer; +import org.apache.activemq.artemis.api.core.client.ClientMessage; +import org.apache.activemq.artemis.api.core.client.ClientProducer; +import org.apache.activemq.artemis.api.core.client.ClientSession; +import org.apache.activemq.artemis.api.core.client.ClientSessionFactory; +import org.apache.activemq.artemis.api.core.client.ServerLocator; +import org.apache.activemq.artemis.core.client.impl.ServerLocatorImpl; +import org.apache.activemq.artemis.core.config.Configuration; import org.apache.activemq.artemis.core.config.HAPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; +import org.apache.activemq.artemis.core.server.ActivateCallback; +import org.apache.activemq.artemis.core.server.ActiveMQServer; +import org.apache.activemq.artemis.core.server.ActiveMQServers; +import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager; +import org.apache.activemq.artemis.quorum.DistributedLock; +import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager; +import org.apache.activemq.artemis.quorum.MutableLong; import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; +import org.apache.activemq.artemis.tests.util.Wait; +import org.junit.Assert; import org.junit.Before; import org.junit.Rule; +import org.junit.Test; import org.junit.rules.TemporaryFolder; public class PluggableQuorumReplicationTest extends SharedNothingReplicationTest { @@ -43,7 +66,6 @@ public class PluggableQuorumReplicationTest extends SharedNothingReplicationTest protected HAPolicyConfiguration createReplicationLiveConfiguration() { ReplicationPrimaryPolicyConfiguration haPolicy = ReplicationPrimaryPolicyConfiguration.withDefault(); haPolicy.setDistributedManagerConfiguration(managerConfiguration); - haPolicy.setCheckForLiveServer(false); return haPolicy; } @@ -55,4 +77,616 @@ public class PluggableQuorumReplicationTest extends 
SharedNothingReplicationTest return haPolicy; } + @Test + public void testUnReplicatedOrderedTransition() throws Exception { + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + ServerLocator locator = ServerLocatorImpl.newLocator("(tcp://localhost:61616,tcp://localhost:61617)?ha=true"); + locator.setCallTimeout(60_000L); + locator.setConnectionTTL(60_000L); + + ClientSessionFactory csf = locator.createSessionFactory(); + ClientSession clientSession = csf.createSession(); + clientSession.createQueue(new QueueConfiguration("slow").setRoutingType(RoutingType.ANYCAST)); + clientSession.close(); + + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + Wait.waitFor(backupServer::isStarted); + + waitForTopology(liveServer, 1, 1, 30000); + waitForTopology(backupServer, 1, 1, 30000); + + liveServer.stop(); + + // backup will take over and run un replicated + + csf = locator.createSessionFactory(); + clientSession = csf.createSession(); + clientSession.createQueue(new QueueConfiguration("slow_un_replicated").setRoutingType(RoutingType.ANYCAST)); + clientSession.close(); + + waitForTopology(backupServer, 1, 0, 30000); + assertTrue(Wait.waitFor(() -> 2L == backupServer.getNodeManager().getNodeActivationSequence())); + + backupServer.stop(false); + + // now only backup should be able to start as it has run un_replicated + liveServer.start(); + Wait.assertFalse(liveServer::isActive); + liveServer.stop(); + + // restart backup + backupServer.start(); + + Wait.waitFor(backupServer::isStarted); + assertEquals(3L, backupServer.getNodeManager().getNodeActivationSequence()); + + csf = locator.createSessionFactory(); + clientSession = csf.createSession(); + clientSession.createQueue(new QueueConfiguration("backup_as_un_replicated").setRoutingType(RoutingType.ANYCAST)); + clientSession.close(); + + // verify the live restart as a backup to the restarted backupServer that has taken on the live role, no failback + liveServer.start(); + + csf = locator.createSessionFactory(); + clientSession = csf.createSession(); + clientSession.createQueue(new QueueConfiguration("backup_as_replicated").setRoutingType(RoutingType.ANYCAST)); + clientSession.close(); + + assertTrue(Wait.waitFor(liveServer::isReplicaSync)); + assertTrue(Wait.waitFor(() -> 3L == liveServer.getNodeManager().getNodeActivationSequence())); + + backupServer.stop(true); + + waitForTopology(liveServer, 1, 0, 30000); + assertTrue(Wait.waitFor(() -> 4L == liveServer.getNodeManager().getNodeActivationSequence())); + + liveServer.stop(true); + clientSession.close(); + locator.close(); + } + + @Test + public void testBackupFailoverAndPrimaryFailback() throws Exception { + final int timeout = (int) TimeUnit.SECONDS.toMillis(30); + + // start live + Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer primaryInstance = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + primaryInstance.setIdentity("PRIMARY"); + primaryInstance.start(); + + // primary initially UN REPLICATED + Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence()); + + // start backup + Configuration 
backupConfiguration = createBackupConfiguration(); + ((ReplicationBackupPolicyConfiguration)backupConfiguration.getHAPolicyConfiguration()).setAllowFailBack(true); + + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + Wait.waitFor(backupServer::isStarted); + + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout); + + // primary REPLICATED, backup matches (has replicated) activation sequence + Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence()); + Assert.assertEquals(1L, backupServer.getNodeManager().getNodeActivationSequence()); + + primaryInstance.stop(); + + // backup UN REPLICATED (new version) + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> 2L == backupServer.getNodeManager().getNodeActivationSequence(), timeout); + + // just to let the console logging breath! + TimeUnit.MILLISECONDS.sleep(100); + + // restart primary that will request failback + ActiveMQServer restartedPrimaryForFailBack = primaryInstance; //addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + restartedPrimaryForFailBack.start(); + + // first step is backup getting replicated + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout); + + // restarted primary will run un replicated (increment sequence) while backup restarts to revert to backup role. + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> { + try { + return 3L == restartedPrimaryForFailBack.getNodeManager().getNodeActivationSequence(); + } catch (NullPointerException ok) { + return false; + } + }, timeout); + + // the backup should then resume with an insync replica view of that version + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> restartedPrimaryForFailBack.isReplicaSync(), timeout); + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout); + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> 3L == backupServer.getNodeManager().getNodeActivationSequence(), timeout); + + // just to let the console logging breath! + TimeUnit.MILLISECONDS.sleep(100); + + // stop backup to verify primary goes on with new sequence as un replicated + backupServer.stop(); + + // just to let the console logging breath! 
+ TimeUnit.MILLISECONDS.sleep(100); + + // live goes un replicated + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> { + try { + return 4L == restartedPrimaryForFailBack.getNodeManager().getNodeActivationSequence(); + } catch (NullPointerException ok) { + return false; + } + }, timeout); + + restartedPrimaryForFailBack.stop(); + } + + + @Test + public void testPrimaryIncrementActivationSequenceOnUnReplicated() throws Exception { + final int timeout = (int) TimeUnit.SECONDS.toMillis(30); + + // start live + Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer primaryInstance = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + primaryInstance.setIdentity("PRIMARY"); + primaryInstance.start(); + + // primary UN REPLICATED + Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence()); + + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + Wait.waitFor(backupServer::isStarted); + + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout); + + // primary REPLICATED, backup matches (has replicated) activation sequence + Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence()); + Assert.assertEquals(1L, backupServer.getNodeManager().getNodeActivationSequence()); + + // transition to un replicated once backup goes away + backupServer.stop(); + + org.apache.activemq.artemis.utils.Wait.assertTrue(() -> 2L == primaryInstance.getNodeManager().getNodeActivationSequence(), timeout); + + // done + primaryInstance.stop(); + } + + + @Test + public void testBackupStartsFirst() throws Exception { + + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isStarted)); + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + } + + @Test + public void testBackupOutOfSequenceReleasesLock() throws Exception { + + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isStarted)); + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + + backupServer.stop(); + + TimeUnit.SECONDS.sleep(1); + + liveServer.stop(); + // backup can get lock but does not have the sequence to start, will try and be a backup + + backupServer.start(); + + // live server should be active + liveServer.start(); + 
Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isStarted)); + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + } + + + @Test + public void testBackupOutOfSequenceCheckActivationSequence() throws Exception { + + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isStarted)); + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + + backupServer.stop(); + + TimeUnit.SECONDS.sleep(1); + + final String coordinatedId = liveServer.getNodeID().toString(); + liveServer.stop(); + + // backup can get lock but does not have the sequence to start, will try and be a backup + // to verify it can short circuit with a dirty read we grab the lock for a little while + DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager.newInstanceOf( + managerConfiguration.getClassName(), + managerConfiguration.getProperties()); + distributedPrimitiveManager.start(); + final DistributedLock lock = distributedPrimitiveManager.getDistributedLock(coordinatedId); + assertTrue(lock.tryLock()); + CountDownLatch preActivate = new CountDownLatch(1); + backupServer.registerActivateCallback(new ActivateCallback() { + @Override + public void preActivate() { + ActivateCallback.super.preActivate(); + preActivate.countDown(); + } + }); + backupServer.start(); + + // it should be able to do a dirty read of the sequence id and not have to wait to get a lock + assertTrue(preActivate.await(1, TimeUnit.SECONDS)); + + // release the lock + distributedPrimitiveManager.stop(); + + // live server should be active + liveServer.start(); + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + } + + @Test + public void testSelfRepairPrimary() throws Exception { + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + + liveServer.start(); + final String coordinatedId = liveServer.getNodeID().toString(); + Wait.waitFor(liveServer::isStarted); + liveServer.stop(); + + liveServer.start(); + Wait.waitFor(liveServer::isStarted); + Assert.assertEquals(2, liveServer.getNodeManager().getNodeActivationSequence()); + liveServer.stop(); + + // backup can get lock but does not have the sequence to start, will try and be a backup + // to verify it can short circuit with a dirty read we grab the lock for a little while + DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager + .newInstanceOf(managerConfiguration.getClassName(), managerConfiguration.getProperties()); + distributedPrimitiveManager.start(); + try (DistributedLock lock = distributedPrimitiveManager.getDistributedLock(coordinatedId)) { + assertTrue(lock.tryLock()); + distributedPrimitiveManager.getMutableLong(coordinatedId).compareAndSet(2, -2); + } + 
liveServer.start(); + Wait.waitFor(liveServer::isStarted); + Assert.assertEquals(2, liveServer.getNodeManager().getNodeActivationSequence()); + Assert.assertEquals(2, distributedPrimitiveManager.getMutableLong(coordinatedId).get()); + + distributedPrimitiveManager.stop(); + + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + Wait.waitFor(backupServer::isReplicaSync); + Assert.assertEquals(2, backupServer.getNodeManager().getNodeActivationSequence()); + backupServer.stop(); + } + + @Test + public void testPrimaryPeers() throws Exception { + final String PEER_NODE_ID = "some-shared-id-001"; + + final Configuration liveConfiguration = createLiveConfiguration(); + ((ReplicationPrimaryPolicyConfiguration)liveConfiguration.getHAPolicyConfiguration()).setCoordinationId(PEER_NODE_ID); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + ServerLocator locator = ServerLocatorImpl.newLocator("(tcp://localhost:61616,tcp://localhost:61617)?ha=true"); + locator.setCallTimeout(60_000L); + locator.setConnectionTTL(60_000L); + final ClientSessionFactory keepLocatorAliveSLF = locator.createSessionFactory(); + + ClientSessionFactory csf = locator.createSessionFactory(); + sendTo(csf, "live_un_replicated"); + csf.close(); + + // start peer, will backup + Configuration peerLiveConfiguration = createBackupConfiguration(); // to get acceptors and locators ports that won't clash + peerLiveConfiguration.setHAPolicyConfiguration(createReplicationLiveConfiguration()); + ((ReplicationPrimaryPolicyConfiguration)peerLiveConfiguration.getHAPolicyConfiguration()).setCoordinationId(PEER_NODE_ID); + peerLiveConfiguration.setName("localhost::live-peer"); + + ActiveMQServer livePeerServer = addServer(ActiveMQServers.newActiveMQServer(peerLiveConfiguration)); + livePeerServer.setIdentity("LIVE-PEER"); + livePeerServer.start(); + + Wait.waitFor(livePeerServer::isStarted); + + waitForTopology(liveServer, 1, 1, 30000); + waitForTopology(livePeerServer, 1, 1, 30000); + + liveServer.stop(); + + // livePeerServer will take over and run un replicated + + csf = locator.createSessionFactory(); + receiveFrom(csf, "live_un_replicated"); + sendTo(csf, "peer_un_replicated"); + csf.close(); + + waitForTopology(livePeerServer, 1, 0, 30000); + + assertTrue(Wait.waitFor(() -> 2L == livePeerServer.getNodeManager().getNodeActivationSequence())); + + livePeerServer.stop(false); + + liveServer.start(); + + Wait.assertTrue(() -> !liveServer.isActive()); + + // restart backup + livePeerServer.start(); + + Wait.waitFor(livePeerServer::isStarted); + + assertEquals(3L, livePeerServer.getNodeManager().getNodeActivationSequence()); + + csf = locator.createSessionFactory(); + receiveFrom(csf, "peer_un_replicated"); + sendTo(csf, "backup_as_un_replicated"); + csf.close(); + + // verify the live restart as a backup to the restarted PeerLiveServer that has taken on the live role + liveServer.start(); + + csf = locator.createSessionFactory(); + receiveFrom(csf, "backup_as_un_replicated"); + sendTo(csf, "backup_as_replicated"); + csf.close(); + + assertTrue(Wait.waitFor(liveServer::isReplicaSync)); + assertTrue(Wait.waitFor(() -> 3L == liveServer.getNodeManager().getNodeActivationSequence())); + + waitForTopology(liveServer, 1, 1, 30000); + 
waitForTopology(livePeerServer, 1, 1, 30000); + + livePeerServer.stop(true); + + assertTrue(Wait.waitFor(() -> 4L == liveServer.getNodeManager().getNodeActivationSequence())); + + csf = locator.createSessionFactory(); + receiveFrom(csf, "backup_as_replicated"); + csf.close(); + + waitForTopology(liveServer, 1, 0, 30000); + + liveServer.stop(true); + keepLocatorAliveSLF.close(); + locator.close(); + } + + @Test + public void testUnavailableSelfHeal() throws Exception { + + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isStarted)); + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + + final String coordinatedId = liveServer.getNodeID().toString(); + + backupServer.stop(); + TimeUnit.MILLISECONDS.sleep(500); + liveServer.stop(); + + // some manual intervention to force an unavailable + // simulate live failing in activation local sequence update on un replicated run when backup stops. + + DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager.newInstanceOf(managerConfiguration.getClassName(), managerConfiguration.getProperties()); + distributedPrimitiveManager.start(); + final MutableLong activationSequence = distributedPrimitiveManager.getMutableLong(coordinatedId); + Assert.assertTrue(activationSequence.compareAndSet(2, -2)); + + // case: 1, the fail to write locally 2 but the write actually succeeding + // should delay pending resolution of the uncommitted claim + backupServer.start(); + + // live server should activate after self healing its outstanding claim + liveServer.start(); + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + } + + @Test + public void testUnavailableAdminIntervention() throws Exception { + // start backup + Configuration backupConfiguration = createBackupConfiguration(); + ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration)); + backupServer.setIdentity("BACKUP"); + backupServer.start(); + + // start live + final Configuration liveConfiguration = createLiveConfiguration(); + + ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration)); + liveServer.setIdentity("LIVE"); + liveServer.start(); + + Wait.waitFor(liveServer::isStarted); + + assertTrue(Wait.waitFor(backupServer::isStarted)); + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + + final String coordinatedId = liveServer.getNodeID().toString(); + + System.err.println("coodr id: " + coordinatedId); + backupServer.stop(); + TimeUnit.MILLISECONDS.sleep(500); + liveServer.stop(); + + // some manual intervention to force an unavailable + // simulate live failing in activation local sequence update on un replicated run when backup stops. 
+ + DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager.newInstanceOf( + managerConfiguration.getClassName(), + managerConfiguration.getProperties()); + distributedPrimitiveManager.start(); + final MutableLong coordinatedActivationSequence = distributedPrimitiveManager.getMutableLong(coordinatedId); + Assert.assertTrue(coordinatedActivationSequence.compareAndSet(2, -2)); + + // case: 2, the fail to write locally 2 but the write actually failing + // need to put 1 in the local activation sequence of the live + FileLockNodeManager fileLockNodeManager = new FileLockNodeManager(liveConfiguration.getNodeManagerLockLocation().getAbsoluteFile(), true); + fileLockNodeManager.start(); + assertEquals(2, fileLockNodeManager.readNodeActivationSequence()); + fileLockNodeManager.writeNodeActivationSequence(1); + fileLockNodeManager.stop(); + + // should delay pending resolution of the uncommitted claim + backupServer.start(); + CountDownLatch liveStarting = new CountDownLatch(1); + // should delay pending resolution of the uncommitted claim + // IMPORTANT: primary activation run on the start caller thread!! We need another thread here + final Thread liveServerStarterThread = new Thread(() -> { + liveStarting.countDown(); + try { + liveServer.start(); + } catch (Throwable e) { + e.printStackTrace(); + } + }); + liveServerStarterThread.start(); + liveStarting.await(); + TimeUnit.MILLISECONDS.sleep(500); + // both are candidates and one of them failed to commit the claim + // let them compete on retry + Assert.assertTrue(coordinatedActivationSequence.compareAndSet(-2, 1)); + // one of the two can activate + Wait.waitFor(() -> liveServer.isStarted() || backupServer.isStarted()); + + assertTrue(Wait.waitFor(backupServer::isReplicaSync)); + assertTrue(liveServer.isReplicaSync()); + + assertEquals(2, backupServer.getNodeManager().getNodeActivationSequence()); + assertEquals(2, liveServer.getNodeManager().getNodeActivationSequence()); + + } + + private void sendTo(ClientSessionFactory clientSessionFactory, String addr) throws Exception { + ClientSession clientSession = clientSessionFactory.createSession(true, true); + clientSession.createQueue(new QueueConfiguration(addr).setRoutingType(RoutingType.ANYCAST).setDurable(true)); + ClientProducer producer = clientSession.createProducer(addr); + ClientMessage message = clientSession.createMessage(true); + message.putStringProperty("K", addr); + message.putLongProperty("delay", 0L); // so slow interceptor does not get us + producer.send(message); + producer.close(); + clientSession.close(); + } + + private void receiveFrom(ClientSessionFactory clientSessionFactory, String addr) throws Exception { + ClientSession clientSession = clientSessionFactory.createSession(true, true); + clientSession.start(); + ClientConsumer consumer = clientSession.createConsumer(addr); + Message message = consumer.receive(4000); + assertNotNull(message); + assertTrue(message.getStringProperty("K").equals(addr)); + consumer.close(); + clientSession.close(); + } } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationTest.java index 1d3317c40e..fde20f4a87 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationTest.java +++ 
b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/SharedNothingReplicationTest.java @@ -244,7 +244,7 @@ public class SharedNothingReplicationTest extends ActiveMQTestBase { .setCheckForLiveServer(false); } - private Configuration createLiveConfiguration() throws Exception { + protected Configuration createLiveConfiguration() throws Exception { Configuration conf = new ConfigurationImpl(); conf.setName("localhost::live"); @@ -275,7 +275,7 @@ public class SharedNothingReplicationTest extends ActiveMQTestBase { return new ReplicaPolicyConfiguration().setClusterName("cluster"); } - private Configuration createBackupConfiguration() throws Exception { + protected Configuration createBackupConfiguration() throws Exception { Configuration conf = new ConfigurationImpl(); conf.setName("localhost::backup"); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java index 60f27a13c2..267f553211 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java @@ -24,7 +24,6 @@ import org.apache.activemq.artemis.api.core.client.ClientSession; import org.apache.activemq.artemis.api.core.client.ClientSessionFactory; import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration; import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration; -import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; import org.apache.activemq.artemis.spi.core.security.ActiveMQBasicSecurityManager; import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTestBase; @@ -66,9 +65,9 @@ public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTes .setLargeMessagesDirectory(getLargeMessagesDir(0, true)); setupHAPolicyConfiguration(); - nodeManager = createReplicatedBackupNodeManager(backupConfig); + backupNodeManager = createReplicatedBackupNodeManager(backupConfig); - backupServer = createTestableServer(backupConfig); + backupServer = createTestableServer(backupConfig, backupNodeManager); backupServer.getServer().setSecurityManager(new ActiveMQBasicSecurityManager()); @@ -77,7 +76,8 @@ public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTes .clearAcceptorConfigurations() .addAcceptorConfiguration(getAcceptorTransportConfiguration(true)); - liveServer = createTestableServer(liveConfig); + nodeManager = createNodeManager(liveConfig); + liveServer = createTestableServer(liveConfig, nodeManager); liveServer.getServer().setSecurityManager(new ActiveMQBasicSecurityManager()); } @@ -94,7 +94,6 @@ public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTes @Override protected void setupHAPolicyConfiguration() { - ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true); ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true); } 
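The replication tests above drive the coordinated activation sequence directly through the pluggable quorum API (DistributedPrimitiveManager, DistributedLock, MutableLong). A minimal standalone sketch of that inspect-and-repair flow follows; the class and method names are hypothetical, the class-name/properties pair is assumed to come from the broker's manager configuration, and the negative-value check mirrors the in-doubt claim the tests simulate with compareAndSet(2, -2).

import java.util.Map;

import org.apache.activemq.artemis.quorum.DistributedLock;
import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
import org.apache.activemq.artemis.quorum.MutableLong;

public final class ActivationSequenceProbe {

   // className/properties: the same values the broker's manager configuration carries (assumed inputs);
   // coordinatedId: the broker node id (or the configured coordination-id for primary peers);
   // lastCommittedSequence: the activation sequence verified on the surviving journal
   public static void repairUncommittedClaim(String className,
                                             Map<String, String> properties,
                                             String coordinatedId,
                                             long lastCommittedSequence) throws Exception {
      DistributedPrimitiveManager manager =
         DistributedPrimitiveManager.newInstanceOf(className, properties);
      manager.start();
      try (DistributedLock lock = manager.getDistributedLock(coordinatedId)) {
         if (!lock.tryLock()) {
            throw new IllegalStateException("another broker holds the lock for " + coordinatedId);
         }
         MutableLong coordinated = manager.getMutableLong(coordinatedId);
         long current = coordinated.get();
         // a negative coordinated value is the uncommitted-claim state the tests set up with compareAndSet(2, -2);
         // resetting it to the last committed sequence lets the surviving broker activate again, which is what
         // testUnavailableAdminIntervention does by hand with compareAndSet(-2, 1)
         if (current < 0) {
            coordinated.compareAndSet(current, lastCommittedSequence);
         }
      } finally {
         manager.stop();
      }
   }
}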
diff --git a/tests/smoke-tests/pom.xml b/tests/smoke-tests/pom.xml index 764fa0a396..9684932e8c 100644 --- a/tests/smoke-tests/pom.xml +++ b/tests/smoke-tests/pom.xml @@ -677,6 +677,44 @@ + + test-compile + create-zk-replication-primary-peer-a + + create + + + ${basedir}/target/classes/servers/zkReplicationPrimaryPeerA + true + admin + admin + ${basedir}/target/zkReplicationPrimaryPeerA + + + --java-options + -Djava.rmi.server.hostname=localhost + + + + + test-compile + create-zk-replication-primary-peer-b + + create + + + ${basedir}/target/classes/servers/zkReplicationPrimaryPeerB + true + admin + admin + ${basedir}/target/zkReplicationPrimaryPeerB + + + --java-options + -Djava.rmi.server.hostname=localhost + + + test-compile create-zk-replication-backup diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/broker.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/broker.xml index 2095d20830..75b9e7405f 100644 --- a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/broker.xml +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimary/broker.xml @@ -39,7 +39,6 @@ under the License. - true diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/broker.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/broker.xml new file mode 100644 index 0000000000..9c2567d509 --- /dev/null +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/broker.xml @@ -0,0 +1,138 @@ + + + + + + primary-peer-a + + ./data/bindings + + ./data/journal + + ./data/largemessages + + ./data/paging + + + + + + org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager + + + + + peer-journal-001 + + + + + + + tcp://localhost:61616 + tcp://localhost:61716 + + + + + + tcp://localhost:61616 + + + admin + + password + + + + artemis + OFF + 1 + + peer + + + + + + + + + + + + + + + + + + + + + + + + + + + DLQ + ExpiryQueue + 0 + + -1 + 10 + PAGE + true + true + true + true + + + + DLQ + ExpiryQueue + 0 + + 10MB + 1MB + + 10 + PAGE + true + true + true + true + + + + +
+ + +
+
+ + + +
+
+
+
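The zkReplicationPrimaryPeerA broker.xml above points the broker at the Zookeeper manager class org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager and gives it the shared coordination id peer-journal-001. A rough programmatic equivalent, built from the configuration API exercised elsewhere in this patch (ReplicationPrimaryPolicyConfiguration, DistributedPrimitiveManagerConfiguration), is sketched below; the helper name and the connect-string key with its localhost:2181 value are illustrative assumptions, and the two-argument constructor is assumed from the tests' use of managerConfiguration.getClassName() and getProperties().

import java.util.Collections;

import org.apache.activemq.artemis.core.config.Configuration;
import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;

public final class PrimaryPeerPolicyExample {

   // connect-string / localhost:2181 are placeholder values for the Zookeeper ensemble
   public static void configurePrimaryPeer(Configuration config) {
      DistributedPrimitiveManagerConfiguration zkManager = new DistributedPrimitiveManagerConfiguration(
         "org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager",
         Collections.singletonMap("connect-string", "localhost:2181"));

      ReplicationPrimaryPolicyConfiguration peerPolicy = ReplicationPrimaryPolicyConfiguration.withDefault();
      peerPolicy.setDistributedManagerConfiguration(zkManager);
      // both peers carry the same coordination id, matching the peer-journal-001 value in the two broker.xml files,
      // so either one can claim the live role while the other reverts to backup
      peerPolicy.setCoordinationId("peer-journal-001");

      config.setHAPolicyConfiguration(peerPolicy);
   }
}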
diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/management.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/management.xml new file mode 100644 index 0000000000..576f1e5995 --- /dev/null +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerA/management.xml @@ -0,0 +1,20 @@ + + + + + \ No newline at end of file diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/broker.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/broker.xml new file mode 100644 index 0000000000..3db6ead7a1 --- /dev/null +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/broker.xml @@ -0,0 +1,138 @@ + + + + + + primary-peer-b + + ./data/bindings + + ./data/journal + + ./data/largemessages + + ./data/paging + + + + + + org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager + + + + + peer-journal-001 + + + + + + + tcp://localhost:61716 + tcp://localhost:61616 + + + + + + tcp://localhost:61716 + + + admin + + password + + + + artemis + OFF + 1 + + peer + + + + + + + + + + + + + + + + + + + + + + + + + + + DLQ + ExpiryQueue + 0 + + -1 + 10 + PAGE + true + true + true + true + + + + DLQ + ExpiryQueue + 0 + + 10MB + 1MB + + 10 + PAGE + true + true + true + true + + + + +
+ + +
+
+ + + +
+
+
+
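zkReplicationPrimaryPeerB mirrors peer A with the same coordination id, and the smoke tests further below assert each broker's activation sequence over JMX through the ActiveMQServerControl.getActivationSequence() attribute added by this patch. A small client-side sketch of that check follows; the class name, the example JMX URL and the broker name are assumptions, and the ObjectNameBuilder.create(...) call is assumed from the existing management API, while getActiveMQServerObjectName() and getActivationSequence() are the calls the Jmx helper in this patch relies on.

import javax.management.JMX;
import javax.management.MBeanServerConnection;
import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

import org.apache.activemq.artemis.api.core.management.ActiveMQServerControl;
import org.apache.activemq.artemis.api.core.management.ObjectNameBuilder;

public final class ActivationSequenceJmxCheck {

   // example URL shape: service:jmx:rmi:///jndi/rmi://localhost:10099/jmxrmi (primary) or :10199 (backup);
   // brokerName must match the broker's configured name
   public static long readActivationSequence(String jmxUrl, String brokerName) throws Exception {
      try (JMXConnector connector = JMXConnectorFactory.connect(new JMXServiceURL(jmxUrl))) {
         MBeanServerConnection connection = connector.getMBeanServerConnection();
         ActiveMQServerControl control = JMX.newMBeanProxy(
            connection,
            ObjectNameBuilder.create(null, brokerName, true).getActiveMQServerObjectName(),
            ActiveMQServerControl.class,
            false);
         // 1 = started unreplicated, then incremented on every unreplicated takeover, as the tests assert
         return control.getActivationSequence();
      }
   }
}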
diff --git a/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/management.xml b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/management.xml new file mode 100644 index 0000000000..14bbaf2218 --- /dev/null +++ b/tests/smoke-tests/src/main/resources/servers/zkReplicationPrimaryPeerB/management.xml @@ -0,0 +1,20 @@ + + + + + \ No newline at end of file diff --git a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/PluggableQuorumSinglePairTest.java b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/PluggableQuorumSinglePairTest.java index d5948f57a1..ab6ee9237e 100644 --- a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/PluggableQuorumSinglePairTest.java +++ b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/PluggableQuorumSinglePairTest.java @@ -20,8 +20,7 @@ package org.apache.activemq.artemis.tests.smoke.quorum; import javax.management.remote.JMXServiceURL; import java.net.MalformedURLException; import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; +import java.util.LinkedList; import java.util.Objects; import java.util.Optional; import java.util.concurrent.TimeUnit; @@ -55,15 +54,15 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { private static final Logger LOGGER = Logger.getLogger(PluggableQuorumSinglePairTest.class); - private static final String JMX_SERVER_HOSTNAME = "localhost"; - private static final int JMX_PORT_PRIMARY = 10099; - private static final int JMX_PORT_BACKUP = 10199; + static final String JMX_SERVER_HOSTNAME = "localhost"; + static final int JMX_PORT_PRIMARY = 10099; + static final int JMX_PORT_BACKUP = 10199; - private static final String PRIMARY_DATA_FOLDER = "ReplicationPrimary";; - private static final String BACKUP_DATA_FOLDER = "ReplicationBackup"; + static final String PRIMARY_DATA_FOLDER = "ReplicationPrimary"; + static final String BACKUP_DATA_FOLDER = "ReplicationBackup"; - private static final int PRIMARY_PORT_OFFSET = 0; - private static final int BACKUP_PORT_OFFSET = PRIMARY_PORT_OFFSET + 100; + static final int PRIMARY_PORT_OFFSET = 0; + static final int BACKUP_PORT_OFFSET = PRIMARY_PORT_OFFSET + 100; public static class BrokerControl { @@ -73,7 +72,7 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { final JMXServiceURL jmxServiceURL; final int portID; - private BrokerControl(final String name, int jmxPort, String dataFolder, int portID) { + BrokerControl(final String name, int jmxPort, String dataFolder, int portID) { this.portID = portID; this.dataFolder = dataFolder; try { @@ -108,6 +107,14 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { public Optional listNetworkTopology() throws Exception { return Jmx.listNetworkTopology(jmxServiceURL, objectNameBuilder); } + + public Optional getActivationSequence() throws Exception { + return Jmx.getActivationSequence(jmxServiceURL, objectNameBuilder); + } + + public Optional isActive() throws Exception { + return Jmx.isActive(jmxServiceURL, objectNameBuilder); + } } @Parameterized.Parameter @@ -118,14 +125,14 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { return Arrays.asList(new Object[][]{{false}, {true}}); } - private final BrokerControl primary; - private final BrokerControl backup; - private final Collection brokers; + protected BrokerControl primary; + protected BrokerControl backup; + 
protected LinkedList brokers; public PluggableQuorumSinglePairTest(String brokerFolderPrefix) { primary = new BrokerControl("primary", JMX_PORT_PRIMARY, brokerFolderPrefix + PRIMARY_DATA_FOLDER, PRIMARY_PORT_OFFSET); backup = new BrokerControl("backup", JMX_PORT_BACKUP, brokerFolderPrefix + BACKUP_DATA_FOLDER, BACKUP_PORT_OFFSET); - brokers = Collections.unmodifiableList(Arrays.asList(primary, backup)); + brokers = new LinkedList(Arrays.asList(primary, backup)); } protected abstract boolean awaitAsyncSetupCompleted(long timeout, TimeUnit unit) throws InterruptedException; @@ -150,6 +157,10 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { Process primaryInstance = primary.startServer(this, timeout); Assert.assertTrue(awaitAsyncSetupCompleted(timeout, TimeUnit.MILLISECONDS)); Wait.assertTrue(() -> !primary.isBackup().orElse(true), timeout); + + // primary UN REPLICATED + Assert.assertEquals(1L, primary.getActivationSequence().get().longValue()); + LOGGER.info("started primary"); LOGGER.info("starting backup"); Process backupInstance = backup.startServer(this, 0); @@ -176,6 +187,11 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { Assert.assertNotNull(urlPrimary); LOGGER.infof("primary: %s", urlPrimary); Assert.assertNotEquals(urlPrimary, urlBackup); + + // primary REPLICATED, backup matches (has replicated) activation sequence + Assert.assertEquals(1L, primary.getActivationSequence().get().longValue()); + Assert.assertEquals(1L, backup.getActivationSequence().get().longValue()); + LOGGER.info("killing primary"); ServerUtil.killServer(primaryInstance, forceKill); LOGGER.info("killed primary"); @@ -188,11 +204,15 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { .and(withNodes(1))), timeout); LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get()); Assert.assertEquals(nodeID, backup.getNodeID().get()); + + // backup UN REPLICATED (new version) + Assert.assertEquals(2L, backup.getActivationSequence().get().longValue()); + // wait a bit before restarting primary LOGGER.info("waiting before starting primary"); TimeUnit.SECONDS.sleep(4); LOGGER.info("starting primary"); - primary.startServer(this, 0); + primaryInstance = primary.startServer(this, 0); LOGGER.info("started primary"); Wait.assertTrue(() -> backup.isBackup().orElse(false), timeout); Assert.assertTrue(!primary.isBackup().get()); @@ -209,6 +229,14 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { Assert.assertTrue(backup.isReplicaSync().get()); LOGGER.infof("backup is synchronized with live"); Assert.assertEquals(nodeID, primary.getNodeID().get()); + + // primary ran un replicated for a short while after failback, before backup was in sync + Assert.assertEquals(3L, primary.getActivationSequence().get().longValue()); + Assert.assertEquals(3L, backup.getActivationSequence().get().longValue()); + + LOGGER.infof("Done, killing both"); + ServerUtil.killServer(primaryInstance); + ServerUtil.killServer(backupInstance); } @Test @@ -272,5 +300,87 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase { Wait.waitFor(()-> !backupInstance.isAlive(), timeout); } + + @Test + public void testOnlyLastUnreplicatedCanStart() throws Exception { + final int timeout = (int) TimeUnit.SECONDS.toMillis(30); + LOGGER.info("starting primary"); + Process primaryInstance = primary.startServer(this, timeout); + Assert.assertTrue(awaitAsyncSetupCompleted(timeout, TimeUnit.MILLISECONDS)); + Wait.assertTrue(() -> 
!primary.isBackup().orElse(true), timeout); + LOGGER.info("started primary"); + LOGGER.info("starting backup"); + Process backupInstance = backup.startServer(this, 0); + Wait.assertTrue(() -> backup.isBackup().orElse(false), timeout); + final String nodeID = primary.getNodeID().get(); + Assert.assertNotNull(nodeID); + LOGGER.infof("NodeID: %s", nodeID); + for (BrokerControl broker : brokers) { + Wait.assertTrue(() -> validateNetworkTopology(broker.listNetworkTopology().orElse(""), + containsExactNodeIds(nodeID) + .and(withLive(nodeID, Objects::nonNull)) + .and(withBackup(nodeID, Objects::nonNull)) + .and(withMembers(1)) + .and(withNodes(2))), timeout); + } + LOGGER.infof("primary topology is: %s", primary.listNetworkTopology().get()); + LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get()); + Assert.assertTrue(backup.isReplicaSync().get()); + LOGGER.infof("backup is synchronized with live"); + final String urlBackup = backupOf(nodeID, decodeNetworkTopologyJson(backup.listNetworkTopology().get())); + Assert.assertNotNull(urlBackup); + LOGGER.infof("backup: %s", urlBackup); + final String urlPrimary = liveOf(nodeID, decodeNetworkTopologyJson(primary.listNetworkTopology().get())); + Assert.assertNotNull(urlPrimary); + LOGGER.infof("primary: %s", urlPrimary); + Assert.assertNotEquals(urlPrimary, urlBackup); + + + // verify sequence id's in sync + Assert.assertEquals(1L, primary.getActivationSequence().get().longValue()); + Assert.assertEquals(1L, backup.getActivationSequence().get().longValue()); + + LOGGER.info("killing primary"); + ServerUtil.killServer(primaryInstance, forceKill); + LOGGER.info("killed primary"); + Wait.assertTrue(() -> !backup.isBackup().orElse(true), timeout); + Wait.assertTrue(() -> validateNetworkTopology(backup.listNetworkTopology().orElse(""), + containsExactNodeIds(nodeID) + .and(withLive(nodeID, urlBackup::equals)) + .and(withBackup(nodeID, Objects::isNull)) + .and(withMembers(1)) + .and(withNodes(1))), timeout); + LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get()); + Assert.assertEquals(nodeID, backup.getNodeID().get()); + + + // backup now UNREPLICATED, it is the only node that can continue + Assert.assertEquals(2L, backup.getActivationSequence().get().longValue()); + + LOGGER.info("killing backup"); + ServerUtil.killServer(backupInstance, forceKill); + + // wait a bit before restarting primary + LOGGER.info("waiting before starting primary"); + TimeUnit.SECONDS.sleep(4); + LOGGER.info("restarting primary"); + + Process restartedPrimary = primary.startServer(this, 0); + LOGGER.info("restarted primary, " + restartedPrimary); + + Wait.assertFalse("Primary shouldn't activate", () -> primary.isActive().orElse(false), 5000); + + ServerUtil.killServer(restartedPrimary); + + LOGGER.info("restarting backup"); + + // backup can resume with data seq 3 + final Process restartedBackupInstance = backup.startServer(this, 5000); + Wait.waitFor(() -> backup.isActive().orElse(false), 5000); + assertTrue(Wait.waitFor(() -> nodeID.equals(backup.getNodeID().orElse("not set yet")))); + LOGGER.info("restarted backup"); + + Assert.assertEquals(3L, backup.getActivationSequence().get().longValue()); + } } diff --git a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumPeerTest.java b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumPeerTest.java new file mode 100644 index 0000000000..14fabd8292 --- /dev/null +++ 
b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumPeerTest.java @@ -0,0 +1,109 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.activemq.artemis.tests.smoke.quorum; + +import java.util.Arrays; +import java.util.LinkedList; +import java.util.Objects; +import java.util.concurrent.TimeUnit; + +import org.apache.activemq.artemis.util.ServerUtil; +import org.apache.activemq.artemis.utils.Wait; +import org.jboss.logging.Logger; +import org.junit.Assert; +import org.junit.Test; + +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.containsExactNodeIds; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.validateNetworkTopology; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withBackup; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withLive; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withMembers; +import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withNodes; + +public class ZookeeperPluggableQuorumPeerTest extends ZookeeperPluggableQuorumSinglePairTest { + + private static final Logger LOGGER = Logger.getLogger(ZookeeperPluggableQuorumPeerTest.class); + + public ZookeeperPluggableQuorumPeerTest() { + super(); + // accepting the primary/backup vars to reuse the test, for peers, these are interchangeable as either can take + // both roles as both wish to be primary but will revert to backup + primary = new BrokerControl("primary-peer-a", JMX_PORT_PRIMARY, "zkReplicationPrimaryPeerA", PRIMARY_PORT_OFFSET); + backup = new BrokerControl("primary-peer-b", JMX_PORT_BACKUP, "zkReplicationPrimaryPeerB", BACKUP_PORT_OFFSET); + brokers = new LinkedList(Arrays.asList(primary, backup)); + } + + @Test + @Override + public void testBackupFailoverAndPrimaryFailback() throws Exception { + // peers don't request fail back by default + // just wait for setup to avoid partial stop of zk via fast tear down with async setup + Wait.waitFor(this::ensembleHasLeader); + } + + @Test + public void testMultiPrimary_Peer() throws Exception { + + final int timeout = (int) TimeUnit.SECONDS.toMillis(30); + LOGGER.info("starting peer b primary"); + + Process backupInstance = backup.startServer(this, timeout); + + // alive as unreplicated, it has configured node id + assertTrue(Wait.waitFor(() -> 1L == backup.getActivationSequence().orElse(Long.MAX_VALUE).longValue())); + + final String nodeID = backup.getNodeID().get(); + Assert.assertNotNull(nodeID); + LOGGER.infof("NodeID: %s", nodeID); + + LOGGER.info("starting peer a primary"); + primary.startServer(this, 0); + Wait.assertTrue(() -> primary.isBackup().orElse(false), timeout); + + Wait.assertTrue(() -> !backup.isBackup().orElse(true), timeout); + + for 
(BrokerControl broker : brokers) { + Wait.assertTrue(() -> validateNetworkTopology(broker.listNetworkTopology().orElse(""), + containsExactNodeIds(nodeID) + .and(withLive(nodeID, Objects::nonNull)) + .and(withBackup(nodeID, Objects::nonNull)) + .and(withMembers(1)) + .and(withNodes(2))), timeout); + } + + LOGGER.infof("primary topology is: %s", primary.listNetworkTopology().get()); + LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get()); + Assert.assertTrue(backup.isReplicaSync().get()); + Assert.assertTrue(primary.isReplicaSync().get()); + + + LOGGER.info("killing peer-b"); + ServerUtil.killServer(backupInstance, forceKill); + + // peer-a now UNREPLICATED + Wait.assertTrue(() -> 2L == primary.getActivationSequence().get().longValue()); + + LOGGER.info("restarting peer-b"); + backup.startServer(this, 0); + + assertTrue(Wait.waitFor(() -> nodeID.equals(backup.getNodeID().orElse("not set yet")))); + // peer-b now a REPLICA + Wait.waitFor(() -> backup.isReplicaSync().get()); + Wait.assertTrue(() -> 2L == backup.getActivationSequence().get().longValue()); + } +} diff --git a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java index df2123f7d6..57f1cce866 100644 --- a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java +++ b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java @@ -1,13 +1,13 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - *

- * http://www.apache.org/licenses/LICENSE-2.0 - *

+ * + * http://www.apache.org/licenses/LICENSE-2.0 + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,6 +19,8 @@ package org.apache.activemq.artemis.tests.smoke.quorum; import java.util.List; import java.util.concurrent.TimeUnit; + +import org.apache.activemq.artemis.utils.ThreadLeakCheckRule; import org.apache.curator.test.InstanceSpec; import org.apache.curator.test.TestingCluster; import org.apache.curator.test.TestingZooKeeperServer; @@ -38,7 +40,7 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl @Rule public TemporaryFolder tmpFolder = new TemporaryFolder(); - private TestingCluster testingServer; + protected TestingCluster testingServer; private InstanceSpec[] clusterSpecs; private int nodes; @@ -60,6 +62,8 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl @Override @After public void after() throws Exception { + // zk bits that leak from servers + ThreadLeakCheckRule.addKownThread("ListenerHandler-"); try { super.after(); } finally { @@ -76,6 +80,16 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl return true; } + protected boolean ensembleHasLeader() { + return testingServer.getServers().stream().filter(ZookeeperPluggableQuorumSinglePairTest::isLeader).count() != 0; + } + + private static boolean isLeader(TestingZooKeeperServer server) { + long leaderId = server.getQuorumPeer().getLeaderId(); + long id = server.getQuorumPeer().getId(); + return id == leaderId; + } + @Override protected void stopMajority() throws Exception { List followers = testingServer.getServers(); diff --git a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/utils/Jmx.java b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/utils/Jmx.java index 079e2f75b0..5c5f044382 100644 --- a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/utils/Jmx.java +++ b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/utils/Jmx.java @@ -77,6 +77,16 @@ public class Jmx { return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::getNodeID, ActiveMQServerControl.class, throwable -> null); } + public static Optional getActivationSequence(JMXServiceURL serviceURI, ObjectNameBuilder builder) throws Exception { + return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::getActivationSequence, ActiveMQServerControl.class, throwable -> null); + + } + + public static Optional isActive(JMXServiceURL serviceURI, ObjectNameBuilder builder) throws Exception { + return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::isActive, ActiveMQServerControl.class, throwable -> null); + + } + public static Optional listNetworkTopology(JMXServiceURL serviceURI, ObjectNameBuilder builder) throws Exception { return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::listNetworkTopology, ActiveMQServerControl.class, throwable -> null); diff --git a/tests/unit-tests/src/test/java/org/apache/activemq/artemis/tests/unit/core/server/impl/FileLockTest.java b/tests/unit-tests/src/test/java/org/apache/activemq/artemis/tests/unit/core/server/impl/FileLockTest.java index 88127937bc..bfac547824 100644 --- 
a/tests/unit-tests/src/test/java/org/apache/activemq/artemis/tests/unit/core/server/impl/FileLockTest.java +++ b/tests/unit-tests/src/test/java/org/apache/activemq/artemis/tests/unit/core/server/impl/FileLockTest.java @@ -18,6 +18,7 @@ package org.apache.activemq.artemis.tests.unit.core.server.impl; import java.io.File; +import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager; import org.apache.activemq.artemis.tests.util.ActiveMQTestBase; import org.junit.Before; @@ -33,6 +34,25 @@ public class FileLockTest extends ActiveMQTestBase { file.mkdirs(); } + @Test + public void testSetNodeID() throws Exception { + FileLockNodeManager underTest = new FileLockNodeManager(getTestDirfile(), false); + ReplicationPrimaryPolicyConfiguration replicationPrimaryPolicyConfiguration = ReplicationPrimaryPolicyConfiguration.withDefault(); + String seed = ""; + for (int i = 0; i < 20; i++) { + replicationPrimaryPolicyConfiguration.setCoordinationId(seed); + if (replicationPrimaryPolicyConfiguration.getCoordinationId() != null) { + underTest.setNodeID(replicationPrimaryPolicyConfiguration.getCoordinationId()); + } + seed += String.valueOf(i); + } + + replicationPrimaryPolicyConfiguration.setCoordinationId("somme-dash-and-odd"); + if (replicationPrimaryPolicyConfiguration.getCoordinationId() != null) { + underTest.setNodeID(replicationPrimaryPolicyConfiguration.getCoordinationId()); + } + } + @Test public void testNIOLock() throws Exception { doTestLock(new FileLockNodeManager(getTestDirfile(), false), new FileLockNodeManager(getTestDirfile(), false)); From 56299433c31fa30f8efa37fed32a35651d1be119 Mon Sep 17 00:00:00 2001 From: Clebert Suconic Date: Wed, 4 Aug 2021 15:03:51 -0400 Subject: [PATCH 3/4] ARTEMIS-3340 Removing not needed TemporaryFolder usages on tests --- .../client/PluggableQuorumInfiniteRedeliveryTest.java | 7 +------ .../integration/cluster/distribution/ClusterTestBase.java | 7 +------ .../integration/cluster/failover/FailoverTestBase.java | 6 +----- .../cluster/failover/MultipleServerFailoverTestBase.java | 7 +------ .../quorum/PluggableQuorumReplicaTimeoutTest.java | 7 +------ .../PluggableQuorumReplicationFlowControlTest.java | 6 +----- .../replication/PluggableQuorumReplicationTest.java | 6 +----- .../tests/integration/replication/ReplicationTest.java | 7 +------ .../PluggableQuorumBasicSecurityManagerFailoverTest.java | 2 +- .../quorum/ZookeeperPluggableQuorumSinglePairTest.java | 8 ++------ 10 files changed, 11 insertions(+), 52 deletions(-) diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/PluggableQuorumInfiniteRedeliveryTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/PluggableQuorumInfiniteRedeliveryTest.java index 2fbacac267..9b4d154e9b 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/PluggableQuorumInfiniteRedeliveryTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/client/PluggableQuorumInfiniteRedeliveryTest.java @@ -24,14 +24,9 @@ import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfigu import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; import org.apache.activemq.artemis.tests.util.ReplicatedBackupUtils; import org.junit.Before; -import org.junit.Rule; -import org.junit.rules.TemporaryFolder; public class 
PluggableQuorumInfiniteRedeliveryTest extends InfiniteRedeliveryTest { - @Rule - public TemporaryFolder tmpFolder = new TemporaryFolder(); - private DistributedPrimitiveManagerConfiguration managerConfiguration; public PluggableQuorumInfiniteRedeliveryTest(String protocol, boolean useCLI) { @@ -43,7 +38,7 @@ public class PluggableQuorumInfiniteRedeliveryTest extends InfiniteRedeliveryTes public void setUp() throws Exception { super.setUp(); this.managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), - Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + Collections.singletonMap("locks-folder", temporaryFolder.newFolder("manager").toString())); } @Override diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/ClusterTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/ClusterTestBase.java index b364ad09eb..3c380b9655 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/ClusterTestBase.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/distribution/ClusterTestBase.java @@ -97,14 +97,9 @@ import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.ClassRule; -import org.junit.Rule; -import org.junit.rules.TemporaryFolder; public abstract class ClusterTestBase extends ActiveMQTestBase { - @Rule - public TemporaryFolder tmpFolder = new TemporaryFolder(); - private static final Logger log = Logger.getLogger(ClusterTestBase.class); private static final int[] PORTS = {TransportConstants.DEFAULT_PORT, TransportConstants.DEFAULT_PORT + 1, TransportConstants.DEFAULT_PORT + 2, TransportConstants.DEFAULT_PORT + 3, TransportConstants.DEFAULT_PORT + 4, TransportConstants.DEFAULT_PORT + 5, TransportConstants.DEFAULT_PORT + 6, TransportConstants.DEFAULT_PORT + 7, TransportConstants.DEFAULT_PORT + 8, TransportConstants.DEFAULT_PORT + 9,}; @@ -151,7 +146,7 @@ public abstract class ClusterTestBase extends ActiveMQTestBase { return pluggableQuorumConfiguration; } try { - pluggableQuorumConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + pluggableQuorumConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", temporaryFolder.newFolder("manager").toString())); } catch (IOException ioException) { log.error(ioException); return null; diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java index d863cf59fb..c1b949250d 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java @@ -56,13 +56,9 @@ import org.apache.activemq.artemis.tests.util.ReplicatedBackupUtils; import org.junit.After; import org.junit.Assert; import org.junit.Before; -import org.junit.Rule; -import org.junit.rules.TemporaryFolder; public abstract class 
FailoverTestBase extends ActiveMQTestBase { // Constants ----------------------------------------------------- - @Rule - public TemporaryFolder tmpFolder = new TemporaryFolder(); protected static final SimpleString ADDRESS = new SimpleString("FailoverTestAddress"); @@ -243,7 +239,7 @@ public abstract class FailoverTestBase extends ActiveMQTestBase { managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), - Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + Collections.singletonMap("locks-folder", temporaryFolder.newFolder("manager").toString())); ReplicatedBackupUtils.configurePluggableQuorumReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, null, managerConfiguration, managerConfiguration); diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java index 49bd396b05..75570c220f 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/MultipleServerFailoverTestBase.java @@ -48,14 +48,9 @@ import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer import org.apache.activemq.artemis.tests.util.ActiveMQTestBase; import org.apache.activemq.artemis.tests.util.TransportConfigurationUtils; import org.junit.Before; -import org.junit.Rule; -import org.junit.rules.TemporaryFolder; public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase { - @Rule - public TemporaryFolder tmpFolder = new TemporaryFolder(); - private DistributedPrimitiveManagerConfiguration pluggableQuorumConfiguration = null; private DistributedPrimitiveManagerConfiguration getOrCreatePluggableQuorumConfiguration() { @@ -63,7 +58,7 @@ public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase { return pluggableQuorumConfiguration; } try { - pluggableQuorumConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + pluggableQuorumConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", temporaryFolder.newFolder("manager").toString())); } catch (IOException ioException) { return null; } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java index 0718962e10..d3a150e2a7 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/quorum/PluggableQuorumReplicaTimeoutTest.java @@ -27,21 +27,16 @@ import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfig import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; import 
org.apache.activemq.artemis.tests.integration.cluster.failover.ReplicaTimeoutTest; import org.apache.activemq.artemis.tests.util.ReplicatedBackupUtils; -import org.junit.Rule; -import org.junit.rules.TemporaryFolder; public class PluggableQuorumReplicaTimeoutTest extends ReplicaTimeoutTest { - @Rule - public TemporaryFolder tmpFolder = new TemporaryFolder(); - @Override protected void configureReplicationPair(Configuration backupConfig, Configuration liveConfig, TransportConfiguration backupConnector, TransportConfiguration backupAcceptor, TransportConfiguration liveConnector) throws IOException { - DistributedPrimitiveManagerConfiguration managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + DistributedPrimitiveManagerConfiguration managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", temporaryFolder.newFolder("manager").toString())); ReplicatedBackupUtils.configurePluggableQuorumReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, null, diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java index 1a84de67f5..c10029cb70 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationFlowControlTest.java @@ -26,18 +26,14 @@ import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfigu import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration; import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; import org.junit.Before; -import org.junit.Rule; -import org.junit.rules.TemporaryFolder; public class PluggableQuorumReplicationFlowControlTest extends SharedNothingReplicationFlowControlTest { private DistributedPrimitiveManagerConfiguration managerConfiguration; - @Rule - public TemporaryFolder tmpFolder = new TemporaryFolder(); @Before public void init() throws IOException { - managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", temporaryFolder.newFolder("manager").toString())); } @Override diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java index 714b2982db..69eadfd539 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/PluggableQuorumReplicationTest.java @@ -47,19 +47,15 @@ import 
org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager; import org.apache.activemq.artemis.tests.util.Wait; import org.junit.Assert; import org.junit.Before; -import org.junit.Rule; import org.junit.Test; -import org.junit.rules.TemporaryFolder; public class PluggableQuorumReplicationTest extends SharedNothingReplicationTest { private DistributedPrimitiveManagerConfiguration managerConfiguration; - @Rule - public TemporaryFolder tmpFolder = new TemporaryFolder(); @Before public void init() throws IOException { - managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), Collections.singletonMap("locks-folder", temporaryFolder.newFolder("manager").toString())); } @Override diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/ReplicationTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/ReplicationTest.java index 88da15e93d..ce873c5d8c 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/ReplicationTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/replication/ReplicationTest.java @@ -100,18 +100,13 @@ import org.apache.activemq.artemis.utils.critical.EmptyCriticalAnalyzer; import org.junit.After; import org.junit.Assert; import org.junit.Before; -import org.junit.Rule; import org.junit.Test; -import org.junit.rules.TemporaryFolder; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; @RunWith(Parameterized.class) public final class ReplicationTest extends ActiveMQTestBase { - @Rule - public TemporaryFolder tmpFolder = new TemporaryFolder(); - @Parameterized.Parameter public boolean pluggableQuorum; @@ -168,7 +163,7 @@ public final class ReplicationTest extends ActiveMQTestBase { } else { DistributedPrimitiveManagerConfiguration managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), - Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString())); + Collections.singletonMap("locks-folder", temporaryFolder.newFolder("manager").toString())); ReplicatedBackupUtils.configurePluggableQuorumReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, liveAcceptor, managerConfiguration, managerConfiguration); } diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java index 267f553211..2a6259c60d 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/security/PluggableQuorumBasicSecurityManagerFailoverTest.java @@ -51,7 +51,7 @@ public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTes DistributedPrimitiveManagerConfiguration managerConfiguration = new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(), 
Collections.singletonMap("locks-folder", - tmpFolder.newFolder("manager").toString())); + temporaryFolder.newFolder("manager").toString())); ReplicatedBackupUtils.configurePluggableQuorumReplicationPair(backupConfig, backupConnector, backupAcceptor, liveConfig, liveConnector, null, diff --git a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java index 57f1cce866..b284541708 100644 --- a/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java +++ b/tests/smoke-tests/src/test/java/org/apache/activemq/artemis/tests/smoke/quorum/ZookeeperPluggableQuorumSinglePairTest.java @@ -28,8 +28,6 @@ import org.jboss.logging.Logger; import org.junit.After; import org.junit.Assert; import org.junit.Before; -import org.junit.Rule; -import org.junit.rules.TemporaryFolder; public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSinglePairTest { @@ -38,9 +36,7 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl // Beware: the server tick must be small enough that to let the session to be correctly expired private static final int SERVER_TICK_MS = 100; - @Rule - public TemporaryFolder tmpFolder = new TemporaryFolder(); - protected TestingCluster testingServer; + private TestingCluster testingServer; private InstanceSpec[] clusterSpecs; private int nodes; @@ -51,7 +47,7 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl nodes = 3; clusterSpecs = new InstanceSpec[nodes]; for (int i = 0; i < nodes; i++) { - clusterSpecs[i] = new InstanceSpec(tmpFolder.newFolder(), BASE_SERVER_PORT + i, -1, -1, true, -1, SERVER_TICK_MS, -1); + clusterSpecs[i] = new InstanceSpec(temporaryFolder.newFolder(), BASE_SERVER_PORT + i, -1, -1, true, -1, SERVER_TICK_MS, -1); } testingServer = new TestingCluster(clusterSpecs); testingServer.start(); From 3d77fcf3ee2cbfb9d272f99db3202077cfc9b82f Mon Sep 17 00:00:00 2001 From: Clebert Suconic Date: Thu, 5 Aug 2021 14:17:18 -0400 Subject: [PATCH 4/4] ARTEMIS-3340 Small edits to readme.md on zookeeper example --- .../zookeeper-single-pair-failback/readme.md | 93 ++++++++++++++++++- 1 file changed, 89 insertions(+), 4 deletions(-) diff --git a/examples/features/ha/zookeeper-single-pair-failback/readme.md b/examples/features/ha/zookeeper-single-pair-failback/readme.md index f9fc00f5b1..2d5ece4b36 100644 --- a/examples/features/ha/zookeeper-single-pair-failback/readme.md +++ b/examples/features/ha/zookeeper-single-pair-failback/readme.md @@ -1,5 +1,7 @@ # Zookeeper Single Pair Failback Example +## Configuring Zookeeper with Docker + This example demonstrates two servers coupled as a primary-backup pair for high availability (HA) using pluggable quorum vote replication Reference Implementation based on [Apache Curator](https://curator.apache.org/) to use [Apache Zookeeper](https://zookeeper.apache.org/) as external quorum service. @@ -57,7 +59,7 @@ Using config: /conf/zoo.cfg 2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - |_| 2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - ``` -Alternatively, can run +Alternatively, this command could be executed: ``` $ docker run -it --rm --network host zookeeper:3.6.3 zkCli.sh -server localhost:2181 ``` @@ -72,6 +74,58 @@ Type ``` to quit the client instance. 
+
+## Configuring Zookeeper on bare metal
+
+It is also possible to run Zookeeper on a bare-metal instance for this example.
+
+Simply download [Zookeeper](https://zookeeper.apache.org/releases.html) and use the following `zoo.cfg` under `./apache-zookeeper/conf`:
+
+```shell
+# The number of milliseconds of each tick
+tickTime=2000
+# The number of ticks that the initial
+# synchronization phase can take
+initLimit=10
+# The number of ticks that can pass between
+# sending a request and getting an acknowledgement
+syncLimit=5
+# the directory where the snapshot is stored.
+# do not use /tmp for storage, /tmp here is just
+# example sakes.
+dataDir=/tmp/datazookeeper
+# the port at which the clients will connect
+clientPort=2181
+```
+
+Then start Zookeeper with one of the provided scripts, for example:
+
+```shell
+# From the bin folder under the apache-zookeeper distribution folder
+$ ./zkServer.sh start-foreground
+```
+
+Zookeeper should then start normally:
+
+```
+2021-08-05 14:10:16,902 [myid:] - INFO [main:DigestAuthenticationProvider@47] - ACL digest algorithm is: SHA1
+2021-08-05 14:10:16,902 [myid:] - INFO [main:DigestAuthenticationProvider@61] - zookeeper.DigestAuthenticationProvider.enabled = true
+2021-08-05 14:10:16,905 [myid:] - INFO [main:FileTxnSnapLog@124] - zookeeper.snapshot.trust.empty : false
+2021-08-05 14:10:16,917 [myid:] - INFO [main:ZookeeperBanner@42] -
+2021-08-05 14:10:16,917 [myid:] - INFO [main:ZookeeperBanner@42] -   ______                  _
+2021-08-05 14:10:16,917 [myid:] - INFO [main:ZookeeperBanner@42] -  |___  /                 | |
+2021-08-05 14:10:16,917 [myid:] - INFO [main:ZookeeperBanner@42] -     / /    ___     ___   | | __   ___    ___   _ __     ___   _ __
+2021-08-05 14:10:16,917 [myid:] - INFO [main:ZookeeperBanner@42] -    / /    / _ \   / _ \  | |/ /  / _ \  / _ \ | '_ \   / _ \ | '__|
+2021-08-05 14:10:16,917 [myid:] - INFO [main:ZookeeperBanner@42] -   / /__  | (_) | | (_) | |   <  |  __/ |  __/ | |_) | |  __/ | |
+2021-08-05 14:10:16,918 [myid:] - INFO [main:ZookeeperBanner@42] -  /_____|  \___/   \___/  |_|\_\  \___|  \___| | .__/   \___| |_|
+2021-08-05 14:10:16,918 [myid:] - INFO [main:ZookeeperBanner@42] -                                               | |
+2021-08-05 14:10:16,918 [myid:] - INFO [main:ZookeeperBanner@42] -                                               |_|
+2021-08-05 14:10:16,918 [myid:] - INFO [main:ZookeeperBanner@42] -
+```
+
+
+## Configuring the brokers
+
 The 2 brokers of this example are already configured to connect to a single Zookeeper node at the mentioned address, thanks to the XML configuration of their `manager`:
 
 ```xml
@@ -86,9 +140,40 @@ The 2 brokers of this example are already configured to connect to a single Zook
 **WARNING** As already recommended on the [High Availability section](https://activemq.apache.org/components/artemis/documentation/latest/ha.html), a production environment needs >= 3 nodes to protect against network partitions.
 
-This example can be run with
-```
+
+## Running the example
+
+After Zookeeper has been started using any of the approaches described above, this example can be run with
+```shell
 $ mvn verify
 ```
-For more information on ActiveMQ Artemis failover and HA, and clustering in general, please see the clustering section of the user manual.
\ No newline at end of file
+```
+ZookeeperSinglePairFailback-primary-out:2021-08-05 14:15:50,052 INFO [org.apache.activemq.artemis.core.server] AMQ221020: Started KQUEUE Acceptor at localhost:61616 for protocols [CORE,MQTT,AMQP,HORNETQ,STOMP,OPENWIRE]
+server tcp://localhost:61616 started
+Started primary
+Got message: This is text message 20 (redelivered?: false)
+Got message: This is text message 21 (redelivered?: false)
+Got message: This is text message 22 (redelivered?: false)
+Got message: This is text message 23 (redelivered?: false)
+Got message: This is text message 24 (redelivered?: false)
+Got message: This is text message 25 (redelivered?: false)
+Got message: This is text message 26 (redelivered?: false)
+Got message: This is text message 27 (redelivered?: false)
+Got message: This is text message 28 (redelivered?: false)
+Got message: This is text message 29 (redelivered?: false)
+Acknowledged 3d third of messages
+**********************************
+Killing server java.lang.UNIXProcess@dd025d9
+**********************************
+**********************************
+Killing server java.lang.UNIXProcess@3bea478e
+**********************************
+[INFO] ------------------------------------------------------------------------
+[INFO] BUILD SUCCESS
+[INFO] ------------------------------------------------------------------------
+[INFO] Total time: 36.629 s
+[INFO] Finished at: 2021-08-05T14:15:56-04:00
+[INFO] ------------------------------------------------------------------------
+clebertsuconic@MacBook-Pro zookeeper-single-pair-failback %
+```
\ No newline at end of file
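
For reference, the `manager` XML that the readme points at lives in each broker's `broker.xml` and its body falls outside the hunks shown above. The block below is a minimal illustrative sketch only, not part of the patch: it assumes the Curator-based manager introduced by this change and the single local Zookeeper node from the Docker section, with `localhost:2181` as an assumed `connect-string` (the example brokers define their own values).

```xml
<!-- Illustrative sketch, not part of the patch: pluggable quorum manager for the primary broker. -->
<ha-policy>
   <replication>
      <primary>
         <manager>
            <class-name>org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager</class-name>
            <properties>
               <!-- Comma-separated Zookeeper servers; assumed single-node here,
                    a production ensemble would list >= 3 nodes as the readme's warning notes. -->
               <property key="connect-string" value="localhost:2181"/>
            </properties>
         </manager>
      </primary>
   </replication>
</ha-policy>
```

The backup broker would carry the same `manager` block under `<backup>` instead of `<primary>`, which is why the readme can describe both brokers as pointing at the same Zookeeper address.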