ARTEMIS-3340 Sequential activation tracking for pluggable quorum replication policies + peer

Co-authored-by: franz1981 <nigro.fra@gmail.com>
gtully 2021-07-07 20:54:49 +01:00 committed by Clebert Suconic
parent 536271485f
commit ca7a100de0
69 changed files with 2691 additions and 718 deletions

View File

@@ -19,7 +19,7 @@ package org.apache.activemq.artemis.cli.commands;
import java.io.File;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import io.airlift.airline.Command;
import io.airlift.airline.Option;
@@ -71,6 +71,7 @@ public class Run extends LockAbstract {
public Object execute(ActionContext context) throws Exception {
super.execute(context);
AtomicReference<Throwable> serverActivationFailed = new AtomicReference<>();
try {
BrokerDTO broker = getBrokerDTO();
ActiveMQSecurityManager securityManager = SecurityManagerFactory.create(broker.security);
@@ -110,8 +111,7 @@ public class Run extends LockAbstract {
server = BrokerFactory.createServer(broker.server, securityManager, activateCallback);
server.createComponents();
AtomicBoolean serverActivationFailed = new AtomicBoolean(false);
server.getServer().registerActivationFailureListener(exception -> serverActivationFailed.set(true));
server.getServer().registerActivationFailureListener(exception -> serverActivationFailed.set(exception));
server.start();
server.getServer().addExternalComponent(managementContext, false);
@@ -126,14 +126,16 @@
server.getServer().addExternalComponent(component, true);
assert component.isStarted();
}
if (serverActivationFailed.get()) {
stop();
}
} catch (Throwable t) {
t.printStackTrace();
stop();
serverActivationFailed.set(t);
}
if (serverActivationFailed.get() != null) {
stop();
return serverActivationFailed.get();
}
return new Pair<>(managementContext, server.getServer());
}

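The net effect of the Run changes above, consolidated into one fragment (the AtomicBoolean lines are the removed side of the diff): an AtomicReference captures the activation failure so the command can both stop the broker and surface the cause to the caller. A sketch of the resulting pattern, with the surrounding method elided:

AtomicReference<Throwable> serverActivationFailed = new AtomicReference<>();
server.getServer().registerActivationFailureListener(serverActivationFailed::set);
server.start();
// ... external components are started here ...
if (serverActivationFailed.get() != null) {
   stop();
   return serverActivationFailed.get(); // the cause is returned instead of a silent stop
}
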
View File

@@ -2851,4 +2851,12 @@ public interface AuditLogger extends BasicLogger {
@LogMessage(level = Logger.Level.INFO)
@Message(id = 601748, value = "User {0} is getting max retry interval on target resource: {1} {2}", format = Message.Format.MESSAGE_FORMAT)
void getMaxRetryInterval(String user, Object source, Object... args);
static void getActivationSequence(Object source) {
BASE_LOGGER.getActivationSequence(getCaller(), source);
}
@LogMessage(level = Logger.Level.INFO)
@Message(id = 601749, value = "User {0} is getting activation sequence on target resource: {1} {2}", format = Message.Format.MESSAGE_FORMAT)
void getActivationSequence(String user, Object source, Object... args);
}

View File

@@ -321,6 +321,15 @@ public interface ActiveMQServerControl {
@Attribute(desc = "Node ID of this server")
String getNodeID();
/**
* Returns the current activation sequence number of this server.
* <br>
* When replicated, peers may coordinate activation with this monotonic sequence.
*/
@Attribute(desc = "Activation sequence of this server instance")
long getActivationSequence();
/**
* Returns the management notification address of this server.
* <br>

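Since the attribute is exposed through ActiveMQServerControl, it can be read remotely. A minimal sketch over plain JMX; the service URL and the broker name inside the ObjectName are assumptions about a local setup, while the attribute name follows the getter above:

import javax.management.MBeanServerConnection;
import javax.management.ObjectName;
import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

public class ActivationSequenceProbe {
   public static void main(String[] args) throws Exception {
      // assumed local JMX endpoint and broker name
      JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:1099/jmxrmi");
      try (JMXConnector connector = JMXConnectorFactory.connect(url)) {
         MBeanServerConnection connection = connector.getMBeanServerConnection();
         ObjectName broker = new ObjectName("org.apache.activemq.artemis:broker=\"broker1\"");
         Long sequence = (Long) connection.getAttribute(broker, "ActivationSequence");
         System.out.println("activation sequence = " + sequence);
      }
   }
}
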
View File

@@ -44,6 +44,12 @@
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.curator</groupId>
@@ -59,6 +65,10 @@
<groupId>org.jboss.logging</groupId>
<artifactId>jboss-logging</artifactId>
</dependency>
<dependency>
<groupId>org.jboss.slf4j</groupId>
<artifactId>slf4j-jboss-logmanager</artifactId>
</dependency>
<dependency>
<groupId>org.apache.activemq</groupId>
<artifactId>artemis-commons</artifactId>

View File

@@ -18,16 +18,18 @@ package org.apache.activemq.artemis.quorum.file;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.activemq.artemis.quorum.DistributedLock;
import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
import org.apache.activemq.artemis.quorum.MutableLong;
import org.apache.activemq.artemis.quorum.UnavailableStateException;
/**
* This is an implementation suitable to be used just in unit tests and it won't attempt
@@ -127,8 +129,55 @@ public class FileBasedPrimitiveManager implements DistributedPrimitiveManager {
}
@Override
public MutableLong getMutableLong(String mutableLongId) throws InterruptedException, ExecutionException, TimeoutException {
// TODO
return null;
public MutableLong getMutableLong(final String mutableLongId) throws ExecutionException {
// use a lock file - but with a prefix
final FileDistributedLock fileDistributedLock = (FileDistributedLock) getDistributedLock("ML:" + mutableLongId);
return new MutableLong() {
@Override
public String getMutableLongId() {
return mutableLongId;
}
@Override
public long get() throws UnavailableStateException {
try {
return readLong(fileDistributedLock);
} catch (IOException e) {
throw new UnavailableStateException(e);
}
}
@Override
public void set(long value) throws UnavailableStateException {
try {
writeLong(fileDistributedLock, value);
} catch (IOException e) {
throw new UnavailableStateException(e);
}
}
@Override
public void close() {
fileDistributedLock.close();
}
};
}
private void writeLong(FileDistributedLock fileDistributedLock, long value) throws IOException {
ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN);
buffer.putLong(value);
buffer.flip();
if (fileDistributedLock.getChannel().position(0).write(buffer) == Long.BYTES) {
fileDistributedLock.getChannel().force(false);
}
}
private long readLong(FileDistributedLock fileDistributedLock) throws IOException {
ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN);
if (fileDistributedLock.getChannel().position(0).read(buffer, 0) != Long.BYTES) {
return 0;
}
buffer.flip();
return buffer.getLong();
}
}

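A minimal usage sketch of the MutableLong implementation above; the "locks-folder" property name is an assumption about how the file-based manager is configured, the rest follows the DistributedPrimitiveManager API used elsewhere in this commit:

import java.util.Collections;
import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
import org.apache.activemq.artemis.quorum.MutableLong;

public class MutableLongExample {
   public static void main(String[] args) throws Exception {
      DistributedPrimitiveManager manager = DistributedPrimitiveManager.newInstanceOf(
         "org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager",
         Collections.singletonMap("locks-folder", "/tmp/quorum")); // property name is an assumption
      manager.start();
      try {
         MutableLong sequence = manager.getMutableLong("activation-sequence");
         // backed by the lock file "ML:activation-sequence"; the value is an 8-byte big-endian long
         sequence.set(sequence.get() + 1);
         sequence.close();
      } finally {
         manager.stop();
      }
   }
}
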
View File

@@ -138,4 +138,8 @@ final class FileDistributedLock implements DistributedLock {
public void close() {
close(true);
}
public FileChannel getChannel() {
return channel;
}
}

View File

@@ -178,6 +178,7 @@ public abstract class DistributedLockTest {
ownerManager.getDistributedLock("a").unlock();
Assert.assertFalse(observerManager.getDistributedLock("a").isHeldByCaller());
Assert.assertFalse(ownerManager.getDistributedLock("a").isHeldByCaller());
Assert.assertTrue(observerManager.getDistributedLock("a").tryLock());
}
@Test

View File

@@ -34,13 +34,6 @@ public class ReplicationBackupPolicyConfiguration implements HAPolicyConfigurati
private long initialReplicationSyncTimeout = ActiveMQDefaultConfiguration.getDefaultInitialReplicationSyncTimeout();
private int voteRetries = ActiveMQDefaultConfiguration.getDefaultVoteRetries();
/**
* TODO: move if into {@link ActiveMQDefaultConfiguration} when the configuration is stable.
*/
private long voteRetryWait = 2000;
private long retryReplicationWait = ActiveMQDefaultConfiguration.getDefaultRetryReplicationWait();
private DistributedPrimitiveManagerConfiguration distributedManagerConfiguration = null;
@@ -102,24 +95,6 @@ public class ReplicationBackupPolicyConfigurati
return this;
}
public int getVoteRetries() {
return voteRetries;
}
public ReplicationBackupPolicyConfiguration setVoteRetries(int voteRetries) {
this.voteRetries = voteRetries;
return this;
}
public ReplicationBackupPolicyConfiguration setVoteRetryWait(long voteRetryWait) {
this.voteRetryWait = voteRetryWait;
return this;
}
public long getVoteRetryWait() {
return voteRetryWait;
}
public long getRetryReplicationWait() {
return retryReplicationWait;
}

View File

@@ -21,25 +21,18 @@ import org.apache.activemq.artemis.core.config.HAPolicyConfiguration;
public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfiguration {
private boolean checkForLiveServer = ActiveMQDefaultConfiguration.isDefaultCheckForLiveServer();
private String groupName = null;
private String clusterName = null;
private long initialReplicationSyncTimeout = ActiveMQDefaultConfiguration.getDefaultInitialReplicationSyncTimeout();
private int voteRetries = ActiveMQDefaultConfiguration.getDefaultVoteRetries();
/**
* TODO: move if into {@link ActiveMQDefaultConfiguration} when the configuration is stable.
*/
private long voteRetryWait = 2000;
private Long retryReplicationWait = ActiveMQDefaultConfiguration.getDefaultRetryReplicationWait();
private DistributedPrimitiveManagerConfiguration distributedManagerConfiguration = null;
private String coordinationId = null;
public static ReplicationPrimaryPolicyConfiguration withDefault() {
return new ReplicationPrimaryPolicyConfiguration();
}
@@ -52,15 +45,6 @@ public class ReplicationPrimaryPolicyConfigurat
return TYPE.PRIMARY;
}
public boolean isCheckForLiveServer() {
return checkForLiveServer;
}
public ReplicationPrimaryPolicyConfiguration setCheckForLiveServer(boolean checkForLiveServer) {
this.checkForLiveServer = checkForLiveServer;
return this;
}
public String getGroupName() {
return groupName;
}
@@ -88,24 +72,6 @@ public class ReplicationPrimaryPolicyConfigurat
return this;
}
public int getVoteRetries() {
return voteRetries;
}
public ReplicationPrimaryPolicyConfiguration setVoteRetries(int voteRetries) {
this.voteRetries = voteRetries;
return this;
}
public ReplicationPrimaryPolicyConfiguration setVoteRetryWait(long voteRetryWait) {
this.voteRetryWait = voteRetryWait;
return this;
}
public long getVoteRetryWait() {
return voteRetryWait;
}
public void setRetryReplicationWait(Long retryReplicationWait) {
this.retryReplicationWait = retryReplicationWait;
}
@@ -122,4 +88,27 @@ public class ReplicationPrimaryPolicyConfigurat
public DistributedPrimitiveManagerConfiguration getDistributedManagerConfiguration() {
return distributedManagerConfiguration;
}
public String getCoordinationId() {
return coordinationId;
}
public void setCoordinationId(String newCoordinationId) {
if (newCoordinationId == null) {
return;
}
final int len = newCoordinationId.length();
if (len >= 16) {
this.coordinationId = newCoordinationId.substring(0, 16);
} else if (len % 2 != 0) {
// must be even for conversion to uuid, extend to next even
this.coordinationId = newCoordinationId + "+";
} else if (len > 0) {
// run with it
this.coordinationId = newCoordinationId;
}
if (this.coordinationId != null) {
this.coordinationId = this.coordinationId.replace('-', '.');
}
}
}

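The setter above normalizes a user-supplied coordination id into a form that can later be converted to a UUID-based NodeID: ids of 16 or more characters are truncated to 16, odd lengths are padded with '+', and '-' is mapped to '.'. A short sketch of the observable behavior:

import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;

public class CoordinationIdExample {
   public static void main(String[] args) {
      ReplicationPrimaryPolicyConfiguration config = ReplicationPrimaryPolicyConfiguration.withDefault();
      config.setCoordinationId("peer-journal-001"); // exactly 16 chars: kept, '-' mapped to '.'
      System.out.println(config.getCoordinationId()); // peer.journal.001
      config.setCoordinationId("abc"); // odd length: padded to stay convertible to a uuid
      System.out.println(config.getCoordinationId()); // abc+
   }
}
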
View File

@@ -1715,22 +1715,18 @@ public final class FileConfigurationParser extends XMLConfigurationUtil {
private ReplicationPrimaryPolicyConfiguration createReplicationPrimaryHaPolicy(Element policyNode, Configuration config) {
ReplicationPrimaryPolicyConfiguration configuration = ReplicationPrimaryPolicyConfiguration.withDefault();
configuration.setCheckForLiveServer(getBoolean(policyNode, "check-for-live-server", configuration.isCheckForLiveServer()));
configuration.setGroupName(getString(policyNode, "group-name", configuration.getGroupName(), Validators.NO_CHECK));
configuration.setClusterName(getString(policyNode, "cluster-name", configuration.getClusterName(), Validators.NO_CHECK));
configuration.setInitialReplicationSyncTimeout(getLong(policyNode, "initial-replication-sync-timeout", configuration.getInitialReplicationSyncTimeout(), Validators.GT_ZERO));
configuration.setVoteRetries(getInteger(policyNode, "vote-retries", configuration.getVoteRetries(), Validators.MINUS_ONE_OR_GE_ZERO));
configuration.setVoteRetryWait(getLong(policyNode, "vote-retry-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO));
configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO));
configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getRetryReplicationWait(), Validators.GT_ZERO));
configuration.setDistributedManagerConfiguration(createDistributedPrimitiveManagerConfiguration(policyNode, config));
configuration.setCoordinationId(getString(policyNode, "coordination-id", configuration.getCoordinationId(), Validators.NOT_NULL_OR_EMPTY));
return configuration;
}
@@ -1748,11 +1744,7 @@ public final class FileConfigurationParser extends XMLConfigurationUtil {
configuration.setMaxSavedReplicatedJournalsSize(getInteger(policyNode, "max-saved-replicated-journals-size", configuration.getMaxSavedReplicatedJournalsSize(), Validators.MINUS_ONE_OR_GE_ZERO));
configuration.setVoteRetries(getInteger(policyNode, "vote-retries", configuration.getVoteRetries(), Validators.MINUS_ONE_OR_GE_ZERO));
configuration.setVoteRetryWait(getLong(policyNode, "vote-retry-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO));
configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO));
configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getRetryReplicationWait(), Validators.GT_ZERO));
configuration.setDistributedManagerConfiguration(createDistributedPrimitiveManagerConfiguration(policyNode, config));

View File

@@ -4204,6 +4204,17 @@ public class ActiveMQServerControlImpl extends AbstractControl implements Active
return server.getNodeID() == null ? null : server.getNodeID().toString();
}
@Override
public long getActivationSequence() {
if (AuditLogger.isBaseLoggingEnabled()) {
AuditLogger.getActivationSequence(this.server);
}
if (server.getNodeManager() != null) {
return server.getNodeManager().getNodeActivationSequence();
}
return 0;
}
@Override
public String getManagementNotificationAddress() {
if (AuditLogger.isBaseLoggingEnabled()) {

View File

@@ -43,7 +43,8 @@ public class ReplicationStartSyncMessage extends PacketImpl {
public enum SyncDataType {
JournalBindings(AbstractJournalStorageManager.JournalContent.BINDINGS.typeByte),
JournalMessages(AbstractJournalStorageManager.JournalContent.MESSAGES.typeByte),
LargeMessages((byte) 2);
LargeMessages((byte) 2),
ActivationSequence((byte) 3);
private byte code;
@@ -62,6 +63,9 @@ public class ReplicationStartSyncMessage extends PacketImpl {
return JournalMessages;
if (code == LargeMessages.code)
return LargeMessages;
if (code == ActivationSequence.code)
return ActivationSequence;
throw new InvalidParameterException("invalid byte: " + code);
}
}
@@ -80,6 +84,14 @@ public class ReplicationStartSyncMessage extends PacketImpl {
nodeID = ""; // this value will be ignored
}
public ReplicationStartSyncMessage(String nodeID, long nodeDataVersion) {
this(nodeID);
ids = new long[1];
ids[0] = nodeDataVersion;
dataType = SyncDataType.ActivationSequence;
}
public ReplicationStartSyncMessage(String nodeID) {
this();
synchronizationIsFinished = true;
@@ -118,10 +130,6 @@ public class ReplicationStartSyncMessage extends PacketImpl {
DataConstants.SIZE_BOOLEAN + // buffer.writeBoolean(allowsAutoFailBack);
nodeID.length() * 3; // buffer.writeString(nodeID); -- an estimate
if (synchronizationIsFinished) {
return size;
}
size += DataConstants.SIZE_BYTE + // buffer.writeByte(dataType.code);
DataConstants.SIZE_INT + // buffer.writeInt(ids.length);
DataConstants.SIZE_LONG * ids.length; // the write loop
@@ -135,8 +143,6 @@ public class ReplicationStartSyncMessage extends PacketImpl {
buffer.writeBoolean(synchronizationIsFinished);
buffer.writeBoolean(allowsAutoFailBack);
buffer.writeString(nodeID);
if (synchronizationIsFinished)
return;
buffer.writeByte(dataType.code);
buffer.writeInt(ids.length);
for (long id : ids) {
@@ -149,9 +155,6 @@ public class ReplicationStartSyncMessage extends PacketImpl {
synchronizationIsFinished = buffer.readBoolean();
allowsAutoFailBack = buffer.readBoolean();
nodeID = buffer.readString();
if (synchronizationIsFinished) {
return;
}
dataType = SyncDataType.getDataType(buffer.readByte());
int length = buffer.readInt();
ids = new long[length];

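With the new constructor, a live broker piggybacks its activation sequence on the synchronization packet as a single-element ids array tagged SyncDataType.ActivationSequence (see the ReplicationManager change further down). A hedged construction sketch, with an illustrative node id and sequence:

import org.apache.activemq.artemis.core.protocol.core.impl.wireformat.ReplicationStartSyncMessage;

public class SyncDonePacketExample {
   public static void main(String[] args) {
      long activationSequence = 7; // normally NodeManager::getNodeActivationSequence
      ReplicationStartSyncMessage packet =
         new ReplicationStartSyncMessage("7c1e4b2a-node-id", activationSequence);
      // ids[0] carries the sequence, so the backup can persist it before
      // acknowledging that synchronization is finished
      System.out.println(packet);
   }
}
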
View File

@@ -483,7 +483,7 @@ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQCompon
logger.trace("Backup is synchronized / BACKUP-SYNC-DONE");
}
ActiveMQServerLogger.LOGGER.backupServerSynched(server);
ActiveMQServerLogger.LOGGER.backupServerSynchronized(server, liveID);
return;
}
@@ -560,6 +560,11 @@ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQCompon
return replicationResponseMessage;
if (packet.isSynchronizationFinished()) {
if (packet.getFileIds() != null && packet.getFileIds().length == 1) {
// this is the version sequence of the data we are replicating
// it will be verified when we activate with this data
server.getNodeManager().writeNodeActivationSequence(packet.getFileIds()[0]);
}
finishSynchronization(packet.getNodeID());
replicationResponseMessage.setSynchronizationIsFinishedAcknowledgement(true);
return replicationResponseMessage;

View File

@@ -821,7 +821,7 @@ public final class ReplicationManager implements ActiveMQComponent {
}
synchronizationIsFinishedAcknowledgement.countUp();
sendReplicatePacket(new ReplicationStartSyncMessage(nodeID));
sendReplicatePacket(new ReplicationStartSyncMessage(nodeID, server.getNodeManager().getNodeActivationSequence()));
try {
if (!synchronizationIsFinishedAcknowledgement.await(initialReplicationSyncTimeout)) {
ActiveMQReplicationTimeooutException exception = ActiveMQMessageBundle.BUNDLE.replicationSynchronizationTimeout(initialReplicationSyncTimeout);

View File

@@ -190,8 +190,8 @@ public interface ActiveMQServerLogger extends BasicLogger {
void errorStoppingConnectorService(@Cause Throwable e, String name);
@LogMessage(level = Logger.Level.INFO)
@Message(id = 221024, value = "Backup server {0} is synchronized with live-server.", format = Message.Format.MESSAGE_FORMAT)
void backupServerSynched(ActiveMQServerImpl server);
@Message(id = 221024, value = "Backup server {0} is synchronized with live server, nodeID={1}.", format = Message.Format.MESSAGE_FORMAT)
void backupServerSynchronized(ActiveMQServerImpl server, String liveID);
@LogMessage(level = Logger.Level.INFO)
@Message(id = 221025, value = "Replication: sending {0} (size={1}) to replica.", format = Message.Format.MESSAGE_FORMAT)

View File

@@ -39,6 +39,7 @@ public abstract class NodeManager implements ActiveMQComponent {
private UUID uuid;
private boolean isStarted = false;
private final Set<FileLockNodeManager.LockListener> lockListeners;
protected long nodeActivationSequence; // local version of a coordinated sequence, tracking state transitions of ownership
public NodeManager(final boolean replicatedBackup) {
this.replicatedBackup = replicatedBackup;
@@ -79,18 +80,30 @@
}
}
public long readDataVersion() throws NodeManagerException {
public long readNodeActivationSequence() throws NodeManagerException {
// TODO make it abstract
throw new UnsupportedOperationException("TODO");
}
public void writeDataVersion(long version) throws NodeManagerException {
public void writeNodeActivationSequence(long version) throws NodeManagerException {
// TODO make it abstract
throw new UnsupportedOperationException("TODO");
}
public abstract SimpleString readNodeId() throws NodeManagerException;
public long getNodeActivationSequence() {
synchronized (nodeIDGuard) {
return nodeActivationSequence;
}
}
public void setNodeActivationSequence(long activationSequence) {
synchronized (nodeIDGuard) {
nodeActivationSequence = activationSequence;
}
}
public UUID getUUID() {
synchronized (nodeIDGuard) {
return uuid;

View File

@@ -32,10 +32,8 @@ public class ReplicationBackupPolicy implements HAPolicy<ReplicationBackupActiva
private final String groupName;
private final String clusterName;
private final int maxSavedReplicatedJournalsSize;
private final int voteRetries;
private final long voteRetryWait;
private final long retryReplicationWait;
private final DistributedPrimitiveManagerConfiguration distributedManagerConfiguration;
private final DistributedPrimitiveManagerConfiguration managerConfiguration;
private final boolean tryFailback;
private ReplicationBackupPolicy(ReplicationBackupPolicyConfiguration configuration,
@@ -44,10 +42,8 @@ public class ReplicationBackupPolicy implements HAPolicy<ReplicationBackupActiva
this.clusterName = configuration.getClusterName();
this.maxSavedReplicatedJournalsSize = configuration.getMaxSavedReplicatedJournalsSize();
this.groupName = configuration.getGroupName();
this.voteRetries = configuration.getVoteRetries();
this.voteRetryWait = configuration.getVoteRetryWait();
this.retryReplicationWait = configuration.getRetryReplicationWait();
this.distributedManagerConfiguration = configuration.getDistributedManagerConfiguration();
this.managerConfiguration = configuration.getDistributedManagerConfiguration();
this.tryFailback = true;
this.livePolicy = livePolicy;
}
@@ -56,10 +52,8 @@ public class ReplicationBackupPolicy implements HAPolicy<ReplicationBackupActiva
this.clusterName = configuration.getClusterName();
this.maxSavedReplicatedJournalsSize = configuration.getMaxSavedReplicatedJournalsSize();
this.groupName = configuration.getGroupName();
this.voteRetries = configuration.getVoteRetries();
this.voteRetryWait = configuration.getVoteRetryWait();
this.retryReplicationWait = configuration.getRetryReplicationWait();
this.distributedManagerConfiguration = configuration.getDistributedManagerConfiguration();
this.managerConfiguration = configuration.getDistributedManagerConfiguration();
this.tryFailback = false;
livePolicy = ReplicationPrimaryPolicy.failoverPolicy(
configuration.getInitialReplicationSyncTimeout(),
@@ -84,16 +78,12 @@ public class ReplicationBackupPolicy implements HAPolicy<ReplicationBackupActiva
/**
* It creates a companion backup policy for a natural-born primary: it would cause the broker to try failback.
*/
static ReplicationBackupPolicy failback(int voteRetries,
long voteRetryWait,
long retryReplicationWait,
static ReplicationBackupPolicy failback(long retryReplicationWait,
String clusterName,
String groupName,
ReplicationPrimaryPolicy livePolicy,
DistributedPrimitiveManagerConfiguration distributedManagerConfiguration) {
return new ReplicationBackupPolicy(ReplicationBackupPolicyConfiguration.withDefault()
.setVoteRetries(voteRetries)
.setVoteRetryWait(voteRetryWait)
.setRetryReplicationWait(retryReplicationWait)
.setClusterName(clusterName)
.setGroupName(groupName)
@@ -106,9 +96,8 @@ public class ReplicationBackupPolicy implements HAPolicy<ReplicationBackupActiva
boolean wasLive,
Map<String, Object> activationParams,
IOCriticalErrorListener shutdownOnCriticalIO) throws Exception {
return new ReplicationBackupActivation(server, wasLive, DistributedPrimitiveManager.newInstanceOf(
distributedManagerConfiguration.getClassName(),
distributedManagerConfiguration.getProperties()), this);
return new ReplicationBackupActivation(server, DistributedPrimitiveManager.newInstanceOf(
managerConfiguration.getClassName(), managerConfiguration.getProperties()), this);
}
@Override
@@ -157,14 +146,6 @@ public class ReplicationBackupPolicy implements HAPolicy<ReplicationBackupActiva
return maxSavedReplicatedJournalsSize;
}
public int getVoteRetries() {
return voteRetries;
}
public long getVoteRetryWait() {
return voteRetryWait;
}
public long getRetryReplicationWait() {
return retryReplicationWait;
}

View File

@@ -31,10 +31,10 @@ public class ReplicationPrimaryPolicy implements HAPolicy<ReplicationPrimaryActi
private final ReplicationBackupPolicy backupPolicy;
private final String clusterName;
private final String groupName;
private final boolean checkForLiveServer;
private final long initialReplicationSyncTimeout;
private final DistributedPrimitiveManagerConfiguration distributedManagerConfiguration;
private final boolean allowAutoFailBack;
private final String coordinationId;
private ReplicationPrimaryPolicy(ReplicationPrimaryPolicyConfiguration configuration,
ReplicationBackupPolicy backupPolicy,
@@ -42,9 +42,9 @@ public class ReplicationPrimaryPolicy implements HAPolicy<ReplicationPrimaryActi
Objects.requireNonNull(backupPolicy);
clusterName = configuration.getClusterName();
groupName = configuration.getGroupName();
checkForLiveServer = configuration.isCheckForLiveServer();
initialReplicationSyncTimeout = configuration.getInitialReplicationSyncTimeout();
distributedManagerConfiguration = configuration.getDistributedManagerConfiguration();
coordinationId = configuration.getCoordinationId();
this.allowAutoFailBack = allowAutoFailBack;
this.backupPolicy = backupPolicy;
}
@@ -52,12 +52,11 @@ public class ReplicationPrimaryPolicy implements HAPolicy<ReplicationPrimaryActi
private ReplicationPrimaryPolicy(ReplicationPrimaryPolicyConfiguration config) {
clusterName = config.getClusterName();
groupName = config.getGroupName();
checkForLiveServer = config.isCheckForLiveServer();
coordinationId = config.getCoordinationId();
initialReplicationSyncTimeout = config.getInitialReplicationSyncTimeout();
distributedManagerConfiguration = config.getDistributedManagerConfiguration();
this.allowAutoFailBack = false;
backupPolicy = ReplicationBackupPolicy.failback(config.getVoteRetries(), config.getVoteRetryWait(),
config.getRetryReplicationWait(), config.getClusterName(),
backupPolicy = ReplicationBackupPolicy.failback(config.getRetryReplicationWait(), config.getClusterName(),
config.getGroupName(), this,
config.getDistributedManagerConfiguration());
}
@@ -73,7 +72,6 @@ public class ReplicationPrimaryPolicy implements HAPolicy<ReplicationPrimaryActi
boolean allowAutoFailback,
DistributedPrimitiveManagerConfiguration distributedManagerConfiguration) {
return new ReplicationPrimaryPolicy(ReplicationPrimaryPolicyConfiguration.withDefault()
.setCheckForLiveServer(false)
.setInitialReplicationSyncTimeout(initialReplicationSyncTimeout)
.setGroupName(groupName)
.setClusterName(clusterName)
@@ -139,10 +137,6 @@ public class ReplicationPrimaryPolicy implements HAPolicy<ReplicationPrimaryActi
return null;
}
public boolean isCheckForLiveServer() {
return checkForLiveServer;
}
public boolean isAllowAutoFailBack() {
return allowAutoFailBack;
}
@@ -163,4 +157,8 @@ public class ReplicationPrimaryPolicy implements HAPolicy<ReplicationPrimaryActi
public boolean useQuorumManager() {
return false;
}
public String getCoordinationId() {
return coordinationId;
}
}

View File

@@ -659,7 +659,7 @@ public class ActiveMQServerImpl implements ActiveMQServer {
afterActivationCreated.run();
} catch (Throwable e) {
logger.warn(e.getMessage(), e); // just debug, this is not supposed to happen, and if it does
// it will be embedeed code from tests
// it will be embedded code from tests
}
afterActivationCreated = null;
}
@@ -2877,6 +2877,8 @@ public class ActiveMQServerImpl implements ActiveMQServer {
public String toString() {
if (identity != null) {
return "ActiveMQServerImpl::" + identity;
} else if (configuration != null && configuration.getName() != null) {
return "ActiveMQServerImpl::" + "name=" + configuration.getName();
}
return "ActiveMQServerImpl::" + (nodeManager != null ? "serverUUID=" + nodeManager.getUUID() : "");
}

View File

@@ -1,160 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.activemq.artemis.core.server.impl;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.activemq.artemis.api.core.ActiveMQException;
import org.apache.activemq.artemis.api.core.ActiveMQExceptionType;
import org.apache.activemq.artemis.api.core.DiscoveryGroupConfiguration;
import org.apache.activemq.artemis.api.core.TransportConfiguration;
import org.apache.activemq.artemis.api.core.client.ActiveMQClient;
import org.apache.activemq.artemis.api.core.client.ClientSession;
import org.apache.activemq.artemis.api.core.client.ClientSessionFactory;
import org.apache.activemq.artemis.api.core.client.ClusterTopologyListener;
import org.apache.activemq.artemis.api.core.client.ServerLocator;
import org.apache.activemq.artemis.api.core.client.TopologyMember;
import org.apache.activemq.artemis.core.client.impl.ClientSessionFactoryInternal;
import org.apache.activemq.artemis.core.client.impl.ServerLocatorInternal;
import org.apache.activemq.artemis.core.config.ClusterConnectionConfiguration;
import org.apache.activemq.artemis.core.config.Configuration;
import org.apache.activemq.artemis.core.config.ConfigurationUtils;
import org.apache.activemq.artemis.core.server.ActiveMQMessageBundle;
import org.apache.activemq.artemis.core.server.ActiveMQServerLogger;
import org.jboss.logging.Logger;
/**
* This class contains some utils to allow a broker to check presence and role of another broker in the cluster.
*/
final class ClusterTopologySearch {
private ClusterTopologySearch() {
}
/**
* Determines whether there is a live server already running with nodeID.<br>
* This search isn't filtering the caller broker transport and is meant to be used
* when the broker acceptors aren't running yet.
*/
public static boolean searchActiveLiveNodeId(String clusterName,
String nodeId,
long timeout,
TimeUnit unit,
Configuration serverConfiguration) throws ActiveMQException {
if (serverConfiguration.getClusterConfigurations().isEmpty())
return false;
final ClusterConnectionConfiguration clusterConnectionConfiguration = ConfigurationUtils.getReplicationClusterConfiguration(serverConfiguration, clusterName);
final LiveNodeIdListener liveNodeIdListener = new LiveNodeIdListener(nodeId, serverConfiguration.getClusterUser(), serverConfiguration.getClusterPassword());
try (ServerLocatorInternal locator = createLocator(serverConfiguration, clusterConnectionConfiguration)) {
// if would like to filter out a transport configuration:
// locator.setClusterTransportConfiguration(callerBrokerTransportConfiguration)
locator.addClusterTopologyListener(liveNodeIdListener);
locator.setReconnectAttempts(0);
try (ClientSessionFactoryInternal ignored = locator.connectNoWarnings()) {
return liveNodeIdListener.awaitNodePresent(timeout, unit);
} catch (Exception notConnected) {
if (!(notConnected instanceof ActiveMQException) || ActiveMQExceptionType.INTERNAL_ERROR.equals(((ActiveMQException) notConnected).getType())) {
// report all exceptions that aren't ActiveMQException and all INTERNAL_ERRORs
ActiveMQServerLogger.LOGGER.failedConnectingToCluster(notConnected);
}
return false;
}
}
}
private static final class LiveNodeIdListener implements ClusterTopologyListener {
private static final Logger logger = Logger.getLogger(LiveNodeIdListener.class);
private final String nodeId;
private final String user;
private final String password;
private final CountDownLatch searchCompleted;
private boolean isNodePresent = false;
LiveNodeIdListener(String nodeId, String user, String password) {
this.nodeId = nodeId;
this.user = user;
this.password = password;
this.searchCompleted = new CountDownLatch(1);
}
@Override
public void nodeUP(TopologyMember topologyMember, boolean last) {
boolean isOurNodeId = nodeId != null && nodeId.equals(topologyMember.getNodeId());
if (isOurNodeId && isActive(topologyMember.getLive())) {
isNodePresent = true;
}
if (isOurNodeId || last) {
searchCompleted.countDown();
}
}
public boolean awaitNodePresent(long timeout, TimeUnit unit) throws InterruptedException {
searchCompleted.await(timeout, unit);
return isNodePresent;
}
/**
* In a cluster of replicated live/backup pairs if a backup crashes and then its live crashes the cluster will
* retain the topology information of the live such that when the live server restarts it will check the
* cluster to see if its nodeID is present (which it will be) and then it will activate as a backup rather than
* a live. To prevent this situation an additional check is necessary to see if the server with the matching
* nodeID is actually active or not which is done by attempting to make a connection to it.
*
* @param transportConfiguration
* @return
*/
private boolean isActive(TransportConfiguration transportConfiguration) {
try (ServerLocator serverLocator = ActiveMQClient.createServerLocator(false, transportConfiguration);
ClientSessionFactory clientSessionFactory = serverLocator.createSessionFactory();
ClientSession clientSession = clientSessionFactory.createSession(user, password, false, false, false, false, 0)) {
return true;
} catch (Exception e) {
logger.debug("isActive check failed", e);
return false;
}
}
@Override
public void nodeDown(long eventUID, String nodeID) {
// no-op
}
}
private static ServerLocatorInternal createLocator(Configuration configuration,
ClusterConnectionConfiguration config) throws ActiveMQException {
final ServerLocatorInternal locator;
if (config.getDiscoveryGroupName() != null) {
DiscoveryGroupConfiguration dg = configuration.getDiscoveryGroupConfigurations().get(config.getDiscoveryGroupName());
if (dg == null) {
throw ActiveMQMessageBundle.BUNDLE.noDiscoveryGroupFound(null);
}
locator = (ServerLocatorInternal) ActiveMQClient.createServerLocatorWithHA(dg);
} else {
TransportConfiguration[] tcConfigs = config.getStaticConnectors() != null ? configuration.getTransportConfigurations(config.getStaticConnectors()) : null;
locator = (ServerLocatorInternal) ActiveMQClient.createServerLocatorWithHA(tcConfigs);
}
return locator;
}
}

View File

@@ -28,7 +28,7 @@ import org.apache.activemq.artemis.core.server.NodeManager;
import org.apache.activemq.artemis.utils.UUID;
import org.apache.activemq.artemis.utils.UUIDGenerator;
import static java.nio.file.StandardOpenOption.CREATE_NEW;
import static java.nio.file.StandardOpenOption.CREATE;
import static java.nio.file.StandardOpenOption.READ;
import static java.nio.file.StandardOpenOption.WRITE;
@@ -36,33 +36,36 @@ public abstract class FileBasedNodeManager extends NodeManager {
protected static final byte FIRST_TIME_START = '0';
public static final String SERVER_LOCK_NAME = "server.lock";
public static final String DATA_VERSION_NAME = "server.data.version";
public static final String SERVER_ACTIVATION_SEQUENCE_NAME = "server.activation.sequence";
private static final String ACCESS_MODE = "rw";
private final File directory;
protected FileChannel channel;
protected FileChannel dataVersionChannel;
protected FileChannel activationSequenceChannel;
public FileBasedNodeManager(boolean replicatedBackup, File directory) {
super(replicatedBackup);
this.directory = directory;
if (directory != null) {
directory.mkdirs();
}
}
protected void useDataVersionChannel() throws IOException {
if (dataVersionChannel != null) {
protected void useActivationSequenceChannel() throws IOException {
if (activationSequenceChannel != null) {
return;
}
dataVersionChannel = FileChannel.open(newFile(DATA_VERSION_NAME).toPath(), READ, WRITE, CREATE_NEW);
activationSequenceChannel = FileChannel.open(newFile(SERVER_ACTIVATION_SEQUENCE_NAME).toPath(), READ, WRITE, CREATE);
}
@Override
public long readDataVersion() throws NodeManagerException {
public long readNodeActivationSequence() throws NodeManagerException {
if (!isStarted()) {
throw new NodeManagerException(new IllegalStateException("node manager must be started first"));
}
try {
useDataVersionChannel();
useActivationSequenceChannel();
ByteBuffer tmpBuffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN);
if (dataVersionChannel.read(tmpBuffer, 0) != Long.BYTES) {
if (activationSequenceChannel.read(tmpBuffer, 0) != Long.BYTES) {
return 0;
}
tmpBuffer.flip();
@@ -73,16 +76,17 @@ public abstract class FileBasedNodeManager extends NodeManager {
}
@Override
public void writeDataVersion(long version) throws NodeManagerException {
public void writeNodeActivationSequence(long version) throws NodeManagerException {
if (!isStarted()) {
throw new NodeManagerException(new IllegalStateException("node manager must be started first"));
}
try {
useDataVersionChannel();
useActivationSequenceChannel();
ByteBuffer tmpBuffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN);
tmpBuffer.putLong(0, version);
dataVersionChannel.write(tmpBuffer, 0);
dataVersionChannel.force(false);
activationSequenceChannel.write(tmpBuffer, 0);
activationSequenceChannel.force(false);
setNodeActivationSequence(version);
} catch (IOException ie) {
throw new NodeManagerException(ie);
}
@@ -149,12 +153,8 @@ public abstract class FileBasedNodeManager extends NodeManager {
createNodeId();
}
/**
* @return
*/
protected final File newFile(final String fileName) {
File file = new File(directory, fileName);
return file;
return new File(directory, fileName);
}
protected final synchronized void createNodeId() throws IOException {
@@ -190,8 +190,8 @@ public abstract class FileBasedNodeManager extends NodeManager {
channelCopy.close();
} finally {
try {
FileChannel dataVersionChannel = this.dataVersionChannel;
this.dataVersionChannel = null;
FileChannel dataVersionChannel = this.activationSequenceChannel;
this.activationSequenceChannel = null;
if (dataVersionChannel != null) {
dataVersionChannel.close();
}

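The on-disk format introduced above is a single 8-byte big-endian long stored in server.activation.sequence (note the switch from CREATE_NEW to CREATE, so an existing file is reopened instead of failing the open). A standalone sketch for inspecting the file, not part of the broker API:

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.Paths;
import static java.nio.file.StandardOpenOption.READ;

public final class ActivationSequenceDump {
   public static void main(String[] args) throws IOException {
      Path file = Paths.get(args[0], "server.activation.sequence"); // arg: broker data directory
      try (FileChannel channel = FileChannel.open(file, READ)) {
         ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN);
         if (channel.read(buffer, 0) != Long.BYTES) {
            System.out.println(0); // missing or partial content reads as sequence 0, like the broker
            return;
         }
         buffer.flip();
         System.out.println(buffer.getLong());
      }
   }
}
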
View File

@@ -19,9 +19,6 @@ package org.apache.activemq.artemis.core.server.impl;
import javax.annotation.concurrent.GuardedBy;
import java.util.Objects;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer;
@@ -35,7 +32,7 @@ import org.apache.activemq.artemis.core.replication.ReplicationEndpoint;
import org.apache.activemq.artemis.core.server.ActiveMQServer;
import org.apache.activemq.artemis.core.server.ActiveMQServerLogger;
import org.apache.activemq.artemis.core.server.LiveNodeLocator;
import org.apache.activemq.artemis.core.server.NodeManager;;
import org.apache.activemq.artemis.core.server.NodeManager;
import org.apache.activemq.artemis.core.server.cluster.ClusterControl;
import org.apache.activemq.artemis.core.server.cluster.ClusterController;
import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationBackupPolicy;
@@ -45,6 +42,8 @@ import org.apache.activemq.artemis.quorum.UnavailableStateException;
import org.jboss.logging.Logger;
import static org.apache.activemq.artemis.core.server.impl.ReplicationObserver.ReplicationFailure;
import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.ensureSequentialAccessToNodeData;
import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.tryActivate;
/**
* This activation can be used by a primary while trying to fail-back ie {@code failback == true} or
@@ -54,7 +53,6 @@ public final class ReplicationBackupActivation extends Activation implements Dis
private static final Logger LOGGER = Logger.getLogger(ReplicationBackupActivation.class);
private final boolean wasLive;
private final ReplicationBackupPolicy policy;
private final ActiveMQServerImpl activeMQServer;
// This field is != null iff this node is a primary during a fail-back ie acting as a backup in order to become live again.
@@ -72,10 +70,8 @@ public final class ReplicationBackupActivation extends Activation implements Dis
private final AtomicBoolean stopping;
public ReplicationBackupActivation(final ActiveMQServerImpl activeMQServer,
final boolean wasLive,
final DistributedPrimitiveManager distributedManager,
final ReplicationBackupPolicy policy) {
this.wasLive = wasLive;
this.activeMQServer = activeMQServer;
if (policy.isTryFailback()) {
final SimpleString serverNodeID = activeMQServer.getNodeID();
@@ -146,14 +142,35 @@ public final class ReplicationBackupActivation extends Activation implements Dis
}
}
try {
LOGGER.info("Trying to reach majority of quorum service nodes");
distributedManager.start();
LOGGER.info("Quorum service available: starting broker");
final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence();
// only a backup with a positive local activation sequence could contain valuable data
if (nodeActivationSequence > 0) {
final String nodeId = activeMQServer.getNodeManager().getNodeId().toString();
DistributedLock liveLockWithInSyncReplica;
while (true) {
distributedManager.start();
try {
liveLockWithInSyncReplica = tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER);
break;
} catch (UnavailableStateException canRecoverEx) {
distributedManager.stop();
}
}
if (liveLockWithInSyncReplica != null) {
// retain state and start as live
if (!activeMQServer.initialisePart1(false)) {
return;
}
activeMQServer.setState(ActiveMQServerImpl.SERVER_STATE.STARTED);
startAsLive(liveLockWithInSyncReplica);
return;
}
}
distributedManager.addUnavailableManagerListener(this);
// Stop the previous node manager and create a new one with NodeManager::replicatedBackup == true:
// NodeManager::start skips setting up the lock file with the NodeID, until NodeManager::stopBackup is called.
activeMQServer.resetNodeManager();
activeMQServer.getNodeManager().stop();
// A primary needs to preserve its NodeID across runs
activeMQServer.moveServerData(policy.getMaxSavedReplicatedJournalsSize(), policy.isTryFailback());
activeMQServer.getNodeManager().start();
@@ -164,11 +181,15 @@ public final class ReplicationBackupActivation extends Activation implements Dis
if (closed)
return;
}
final ClusterController clusterController = activeMQServer.getClusterManager().getClusterController();
LOGGER.infof("Apache ActiveMQ Artemis Backup Server version %s [%s] started, awaiting connection to a live cluster member to start replication", activeMQServer.getVersion().getFullVersion(),
activeMQServer.toString());
clusterController.awaitConnectionToReplicationCluster();
activeMQServer.getBackupManager().start();
ActiveMQServerLogger.LOGGER.backupServerStarted(activeMQServer.getVersion().getFullVersion(),
activeMQServer.getNodeManager().getNodeId());
activeMQServer.setState(ActiveMQServerImpl.SERVER_STATE.STARTED);
final DistributedLock liveLock = replicateAndFailover(clusterController);
if (liveLock == null) {
@ -192,6 +213,14 @@ public final class ReplicationBackupActivation extends Activation implements Dis
liveLock.close();
return;
}
try {
ensureSequentialAccessToNodeData(activeMQServer, distributedManager, LOGGER);
} catch (Throwable fatal) {
LOGGER.warn(fatal);
// the policy is already the live one, but there's no activation yet: we can just stop
asyncRestartServer(activeMQServer, false, false);
throw new ActiveMQIllegalStateException("This server cannot ensure sequential access to broker data: activation is failed");
}
ActiveMQServerLogger.LOGGER.becomingLive(activeMQServer);
// stopBackup is going to write the NodeID previously set on the NodeManager,
// because activeMQServer.resetNodeManager() has created a NodeManager with replicatedBackup == true.
@@ -260,28 +289,38 @@ public final class ReplicationBackupActivation extends Activation implements Dis
return null;
}
LOGGER.debugf("ReplicationFailure = %s", failure);
boolean voluntaryFailOver = false;
switch (failure) {
case VoluntaryFailOver:
voluntaryFailOver = true;
case NonVoluntaryFailover:
final DistributedLock liveLock = tryAcquireLiveLock();
// from now on we're meant to stop:
// - due to failover
// - due to restart/stop
assert stopping.get();
if (liveLock != null) {
return liveLock;
if (!stopping.compareAndSet(false, true)) {
return null;
}
// no longer interested in these events: handling them manually from here
distributedManager.removeUnavailableManagerListener(this);
final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence();
final String nodeId = activeMQServer.getNodeManager().getNodeId().toString();
DistributedLock liveLockWithInSyncReplica = null;
if (nodeActivationSequence > 0) {
try {
liveLockWithInSyncReplica = tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER);
} catch (Throwable error) {
// no need to retry here: we can just restart as a backup, which will run the more resilient tryActivate
LOGGER.warn("Errored while attempting failover", error);
liveLockWithInSyncReplica = null;
}
boolean restart = true;
if (voluntaryFailOver && isFirstFailbackAttempt()) {
restart = false;
LOGGER.error("Failed to fail-back: stopping broker based on quorum results");
} else {
ActiveMQServerLogger.LOGGER.restartingAsBackupBasedOnQuorumVoteResults();
LOGGER.warnf("We expect local activation sequence for NodeID = %s to be > 0 on a fail-over, while is %d", nodeId, nodeActivationSequence);
}
// let's ignore the stopping flag here, we're in control of it
asyncRestartServer(activeMQServer, restart, false);
assert stopping.get();
if (liveLockWithInSyncReplica != null) {
return liveLockWithInSyncReplica;
}
ActiveMQServerLogger.LOGGER.restartingAsBackupBasedOnQuorumVoteResults();
// let's ignore the stopping flag here, we're already in control of it
asyncRestartServer(activeMQServer, true, false);
return null;
case RegistrationError:
LOGGER.error("Stopping broker because of critical registration error");
@@ -307,73 +346,7 @@ public final class ReplicationBackupActivation extends Activation implements Dis
}
}
} finally {
silentExecution("Errored on cluster topology listener for replication cleanup", () -> clusterController.removeClusterTopologyListenerForReplication(nodeLocator));
}
}
/**
* {@code wasLive} is {code true} only while transitioning from primary to backup.<br>
* If a natural born backup become live and allows failback, while transitioning to back again
* {@code wasLive} is still {@code false}.<br>
* The check on {@link ReplicationBackupPolicy#isTryFailback()} is redundant but still useful for correctness.
* <p>
* In case of fail-back, any event that's going to restart this broker as backup (eg quorum service unavailable
* or some replication failures) will cause {@code wasLive} to be {@code false}, because the HA policy set isn't
* a primary anymore.
*/
private boolean isFirstFailbackAttempt() {
return wasLive && policy.isTryFailback();
}
private DistributedLock tryAcquireLiveLock() throws InterruptedException {
// disable quorum service unavailability handling and just treat this imperatively
if (!stopping.compareAndSet(false, true)) {
// already unavailable quorum service: fail fast
return null;
}
distributedManager.removeUnavailableManagerListener(this);
assert activeMQServer.getNodeManager().getNodeId() != null;
final String liveID = activeMQServer.getNodeManager().getNodeId().toString();
final int voteRetries = policy.getVoteRetries();
final long maxAttempts = voteRetries >= 0 ? (voteRetries + 1) : -1;
if (maxAttempts == -1) {
LOGGER.error("It's not safe to retry an infinite amount of time to acquire a live lock: please consider setting a vote-retries value");
}
final long voteRetryWait = policy.getVoteRetryWait();
final DistributedLock liveLock = getLock(distributedManager, liveID);
if (liveLock == null) {
return null;
}
for (long attempt = 0; maxAttempts >= 0 ? (attempt < maxAttempts) : true; attempt++) {
try {
if (liveLock.tryLock(voteRetryWait, TimeUnit.MILLISECONDS)) {
LOGGER.debugf("%s live lock acquired after %d attempts.", liveID, (attempt + 1));
return liveLock;
}
} catch (UnavailableStateException e) {
LOGGER.warnf(e, "Failed to acquire live lock %s because of unavailable quorum service: stop trying", liveID);
distributedManager.stop();
return null;
}
}
LOGGER.warnf("Failed to acquire live lock %s after %d tries", liveID, maxAttempts);
distributedManager.stop();
return null;
}
private DistributedLock getLock(final DistributedPrimitiveManager manager,
final String lockId) throws InterruptedException {
if (!manager.isStarted()) {
return null;
}
try {
return manager.getDistributedLock(lockId);
} catch (ExecutionException e) {
LOGGER.warnf(e, "Errored while getting lock %s", lockId);
return null;
} catch (TimeoutException te) {
LOGGER.warnf(te, "Timeout while getting lock %s", lockId);
return null;
silentExecution("Error on cluster topology listener for replication cleanup", () -> clusterController.removeClusterTopologyListenerForReplication(nodeLocator));
}
}
@@ -405,7 +378,6 @@ public final class ReplicationBackupActivation extends Activation implements Dis
return ReplicationFailure.RegistrationError;
}
this.replicationEndpoint = replicationEndpoint;
assert replicationEndpoint != null;
try {
return replicationObserver.awaitReplicationFailure();
} finally {
@@ -414,11 +386,11 @@ public final class ReplicationBackupActivation extends Activation implements Dis
closeChannelOf(replicationEndpoint);
}
} finally {
silentExecution("Errored on live control close", liveControl::close);
silentExecution("Error on live control close", liveControl::close);
}
} finally {
silentExecution("Errored on cluster topology listener cleanup", () -> clusterController.removeClusterTopologyListener(replicationObserver));
silentExecution("Errored while removing incoming interceptor for replication", () -> clusterController.removeIncomingInterceptorForReplication(replicationError));
silentExecution("Error on cluster topology listener cleanup", () -> clusterController.removeClusterTopologyListener(replicationObserver));
silentExecution("Error while removing incoming interceptor for replication", () -> clusterController.removeIncomingInterceptorForReplication(replicationError));
}
} finally {
this.replicationObserver = null;
@@ -438,7 +410,7 @@ public final class ReplicationBackupActivation extends Activation implements Dis
return;
}
if (replicationEndpoint.getChannel() != null) {
silentExecution("Errored while closing replication endpoint channel", () -> replicationEndpoint.getChannel().close());
silentExecution("Error while closing replication endpoint channel", () -> replicationEndpoint.getChannel().close());
replicationEndpoint.setChannel(null);
}
}

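Both activations now share the same quorum restart pattern around tryActivate. In isolation, with everything beyond the retry elided and names taken from the diff, the pattern reads as below; a null lock means a peer coordinated a higher activation sequence, so this node must proceed as a backup:

DistributedLock liveLock;
while (true) {
   distributedManager.start();
   try {
      liveLock = ActivationSequenceStateMachine.tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER);
      break; // either a held live lock, or null: defer to the peer with newer data
   } catch (UnavailableStateException canRecoverEx) {
      distributedManager.stop(); // quorum service unavailable: reset the manager and retry
   }
}
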
View File

@@ -19,12 +19,14 @@ package org.apache.activemq.artemis.core.server.impl;
import javax.annotation.concurrent.GuardedBy;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.activemq.artemis.api.core.ActiveMQAlreadyReplicatingException;
import org.apache.activemq.artemis.api.core.ActiveMQException;
import org.apache.activemq.artemis.api.core.ActiveMQIllegalStateException;
import org.apache.activemq.artemis.api.core.Pair;
import org.apache.activemq.artemis.api.core.TransportConfiguration;
import org.apache.activemq.artemis.core.persistence.StorageManager;
import org.apache.activemq.artemis.core.protocol.core.Channel;
import org.apache.activemq.artemis.core.protocol.core.ChannelHandler;
import org.apache.activemq.artemis.core.protocol.core.CoreRemotingConnection;
@@ -46,7 +48,10 @@ import org.apache.activemq.artemis.quorum.UnavailableStateException;
import org.apache.activemq.artemis.spi.core.remoting.Acceptor;
import org.jboss.logging.Logger;
import static org.apache.activemq.artemis.core.server.impl.ClusterTopologySearch.searchActiveLiveNodeId;
import static org.apache.activemq.artemis.core.server.ActiveMQServer.SERVER_STATE.STARTED;
import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.awaitNextCommittedActivationSequence;
import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.ensureSequentialAccessToNodeData;
import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.tryActivate;
/**
* This is going to be {@link #run()} just by natural born primary, at the first start.
@@ -55,8 +60,9 @@ import static org.apache.activemq.artemis.core.server.impl.ClusterTopologySearch
public class ReplicationPrimaryActivation extends LiveActivation implements DistributedLock.UnavailableLockListener {
private static final Logger LOGGER = Logger.getLogger(ReplicationPrimaryActivation.class);
private static final long DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS = 20_000;
private static final long BLOCKING_CALLS_TIMEOUT_MILLIS = 5_000;
// This is the time we expect a replica to become live from the quorum point of view,
// ie the time to execute tryActivate and ensureSequentialAccessToNodeData
private static final long FAILBACK_TIMEOUT_MILLIS = 4_000;
private final ReplicationPrimaryPolicy policy;
@@ -69,7 +75,7 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
private final DistributedPrimitiveManager distributedManager;
private volatile boolean stoppingServer;
private final AtomicBoolean stoppingServer;
public ReplicationPrimaryActivation(final ActiveMQServerImpl activeMQServer,
final DistributedPrimitiveManager distributedManager,
@@ -78,6 +84,7 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
this.policy = policy;
this.replicationLock = new Object();
this.distributedManager = distributedManager;
this.stoppingServer = new AtomicBoolean();
}
/**
@@ -101,19 +108,36 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
@Override
public void run() {
try {
final NodeManager nodeManager = activeMQServer.getNodeManager();
final String nodeId = nodeManager.readNodeId().toString();
final long dataVersion = nodeManager.readDataVersion();
final DistributedLock liveLock = searchLiveOrAcquireLiveLock(nodeId, BLOCKING_CALLS_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS);
// we have a common nodeId that we can share and coordinate with between peers
if (policy.getCoordinationId() != null) {
LOGGER.infof("Applying shared peer NodeID=%s to enable coordinated live activation", policy.getCoordinationId());
// REVISIT: this is quite clunky, also in backup activation, we just need new nodeID persisted!
activeMQServer.resetNodeManager();
activeMQServer.getNodeManager().start();
activeMQServer.getNodeManager().setNodeID(policy.getCoordinationId());
activeMQServer.getNodeManager().stopBackup();
}
final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence();
final String nodeId = activeMQServer.getNodeManager().readNodeId().toString();
DistributedLock liveLock;
while (true) {
distributedManager.start();
try {
liveLock = tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER);
break;
} catch (UnavailableStateException canRecoverEx) {
distributedManager.stop();
}
}
if (liveLock == null) {
distributedManager.stop();
LOGGER.infof("This broker cannot become a live server with NodeID = %s: restarting as backup", nodeId);
activeMQServer.setHAPolicy(policy.getBackupPolicy());
return;
}
ensureSequentialAccessToNodeData(activeMQServer, distributedManager, LOGGER);
activeMQServer.initialisePart1(false);
activeMQServer.initialisePart2(false);
@@ -142,73 +166,9 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
}
}
private DistributedLock searchLiveOrAcquireLiveLock(final String nodeId,
final long blockingCallTimeout,
final TimeUnit unit) throws ActiveMQException, InterruptedException {
if (policy.isCheckForLiveServer()) {
LOGGER.infof("Searching a live server with NodeID = %s", nodeId);
if (searchActiveLiveNodeId(policy.getClusterName(), nodeId, blockingCallTimeout, unit, activeMQServer.getConfiguration())) {
LOGGER.infof("Found a live server with NodeID = %s: restarting as backup", nodeId);
activeMQServer.setHAPolicy(policy.getBackupPolicy());
return null;
}
}
startDistributedPrimitiveManager();
return acquireDistributeLock(getDistributeLock(nodeId), blockingCallTimeout, unit);
}
private void startDistributedPrimitiveManager() throws InterruptedException, ActiveMQException {
LOGGER.infof("Trying to reach the majority of quorum nodes in %d ms.", DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS);
try {
if (distributedManager.start(DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS)) {
return;
}
} catch (InterruptedException ie) {
throw ie;
} catch (Throwable t) {
LOGGER.debug(t);
}
assert !distributedManager.isStarted();
throw new ActiveMQException("Cannot reach the majority of quorum nodes");
}
private DistributedLock getDistributeLock(final String nodeId) throws InterruptedException, ActiveMQException {
try {
return distributedManager.getDistributedLock(nodeId);
} catch (Throwable t) {
try {
distributedManager.stop();
} catch (Throwable ignore) {
// don't care
}
if (t instanceof InterruptedException) {
throw (InterruptedException) t;
}
throw new ActiveMQException("Cannot obtain a live lock instance");
}
}
private DistributedLock acquireDistributeLock(final DistributedLock liveLock,
final long acquireLockTimeout,
final TimeUnit unit) throws InterruptedException, ActiveMQException {
try {
if (liveLock.tryLock(acquireLockTimeout, unit)) {
return liveLock;
}
} catch (UnavailableStateException e) {
LOGGER.debug(e);
}
try {
distributedManager.stop();
} catch (Throwable ignore) {
// don't care
}
throw new ActiveMQException("Failed to become live");
}
@Override
public ChannelHandler getActivationChannelHandler(final Channel channel, final Acceptor acceptorUsed) {
if (stoppingServer) {
if (stoppingServer.get()) {
return null;
}
return packet -> {
@ -268,7 +228,7 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
awaitBackupAnnouncementOnFailbackRequest(clusterConnection);
}
} catch (Exception e) {
if (activeMQServer.getState() == ActiveMQServerImpl.SERVER_STATE.STARTED) {
if (activeMQServer.getState() == STARTED) {
/*
* The reasoning here is that the exception was either caused by (1) the
* (interaction with) the backup, or (2) by an IO Error at the storage. If (1), we
@ -283,11 +243,13 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
ActiveMQServerLogger.LOGGER.errorStoppingReplication(amqe);
} finally {
synchronized (replicationLock) {
if (this.replicationManager == replicationManager) {
this.replicationManager = null;
}
}
}
}
}
/**
* This handles awaiting the backup announcement before trying to failover.
@ -308,35 +270,55 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
}
}
/**
* If {@link #asyncStopServer()} happens before this call, the restart just won't happen.
* If {@link #asyncStopServer()} happens after this call, it will make the server stop right after being restarted.
*/
private void restartAsBackupAfterFailback() throws Exception {
if (stoppingServer) {
if (stoppingServer.get()) {
return;
}
synchronized (this) {
if (stoppingServer) {
final String coordinatedLockAndNodeId;
final long inSyncReplicaActivation;
synchronized (replicationLock) {
if (stoppingServer.get()) {
return;
}
distributedManager.stop();
final ReplicationManager replicationManager = this.replicationManager;
if (replicationManager == null) {
LOGGER.warnf("Failback interrupted");
// we got a disconnection from the replica *before* stopping acceptors: better not failback!
return;
}
// IMPORTANT: this is going to save server::fail from issuing a replica connection failure (with failed == false)
// because onReplicationConnectionClose fails fast when stopping == true.
if (!stoppingServer.compareAndSet(false, true)) {
LOGGER.infof("Failback interrupted: server is already stopping");
return;
}
coordinatedLockAndNodeId = activeMQServer.getNodeManager().getNodeId().toString();
inSyncReplicaActivation = activeMQServer.getNodeManager().getNodeActivationSequence();
// no one can notice a concurrent drop of the replica connection here: awaitNextCommittedActivationSequence defensively
// waits FAILBACK_TIMEOUT_MILLIS, then proceeds as backup and competes to become live again
activeMQServer.fail(true);
}
try {
distributedManager.start();
if (!awaitNextCommittedActivationSequence(distributedManager, coordinatedLockAndNodeId, inSyncReplicaActivation, FAILBACK_TIMEOUT_MILLIS, LOGGER)) {
LOGGER.warnf("Timed out waiting for failback server activation with NodeID = %s: and sequence > %d: after %dms",
coordinatedLockAndNodeId, inSyncReplicaActivation, FAILBACK_TIMEOUT_MILLIS);
}
} catch (UnavailableStateException ignored) {
LOGGER.debug("Unavailable distributed manager while awaiting failback activation sequence: ignored", ignored);
} finally {
distributedManager.stop();
}
ActiveMQServerLogger.LOGGER.restartingReplicatedBackupAfterFailback();
activeMQServer.setHAPolicy(policy.getBackupPolicy());
activeMQServer.start();
}
}
private void asyncStopServer() {
if (stoppingServer) {
if (stoppingServer.get()) {
return;
}
synchronized (this) {
if (stoppingServer) {
return;
}
stoppingServer = true;
if (stoppingServer.compareAndSet(false, true)) {
new Thread(() -> {
try {
activeMQServer.stop();
@ -374,19 +356,28 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
private void onReplicationConnectionClose() {
ExecutorService executorService = activeMQServer.getThreadPool();
if (executorService != null) {
synchronized (replicationLock) {
if (replicationManager == null) {
if (stoppingServer.get()) {
return;
}
}
executorService.execute(() -> {
synchronized (replicationLock) {
if (replicationManager == null) {
return;
}
// we increment only if we are staying alive
if (!stoppingServer.get() && STARTED.equals(activeMQServer.getState())) {
try {
ensureSequentialAccessToNodeData(activeMQServer, distributedManager, LOGGER);
} catch (Throwable fatal) {
LOGGER.errorf(fatal, "Unexpected exception: %s on attempted activation sequence increment; stopping server async", fatal.getLocalizedMessage());
asyncStopServer();
}
}
// this is going to stop the replication manager
activeMQServer.getStorageManager().stopReplication();
assert !replicationManager.isStarted();
final StorageManager storageManager = activeMQServer.getStorageManager();
if (storageManager != null) {
storageManager.stopReplication();
}
replicationManager = null;
}
});

View File

@ -0,0 +1,312 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.activemq.artemis.core.server.impl.quorum;
import java.util.Objects;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.activemq.artemis.api.core.ActiveMQException;
import org.apache.activemq.artemis.core.server.ActiveMQServer;
import org.apache.activemq.artemis.core.server.NodeManager;
import org.apache.activemq.artemis.quorum.DistributedLock;
import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
import org.apache.activemq.artemis.quorum.MutableLong;
import org.apache.activemq.artemis.quorum.UnavailableStateException;
import org.jboss.logging.Logger;
/**
* This class contains the activation sequence logic of the pluggable quorum vote:
* it should be used by {@link org.apache.activemq.artemis.core.server.impl.ReplicationBackupActivation}
* and {@link org.apache.activemq.artemis.core.server.impl.ReplicationPrimaryActivation} to coordinate
* for replication.
*/
public final class ActivationSequenceStateMachine {
private static final long CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS = 200;
private static final long CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS = 2000;
private static final long LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS = 2000;
private ActivationSequenceStateMachine() {
}
/**
* It loops while the broker's data is still valuable but the broker cannot become live.
* It loops (temporarily) if data is in sync or can self-heal, but the live lock cannot yet be acquired.
* <p>
* It stops looping and returns:
* <p><ul>
* <li>{@code null}: if data is stale (and there are no rights to become live)
* <li>{@code !=null}: if data is in sync and the {@link DistributedLock} is correctly acquired
* </ul><p>
* <p>
* After this method returns successfully, ie with a non-null return value, a broker should use
* {@link #ensureSequentialAccessToNodeData(ActiveMQServer, DistributedPrimitiveManager, Logger)} to complete
* the activation and guarantee the initial not-replicated ownership of data.
*/
public static DistributedLock tryActivate(final String nodeId,
final long nodeActivationSequence,
final DistributedPrimitiveManager distributedManager,
final Logger logger) throws InterruptedException, ExecutionException, TimeoutException, UnavailableStateException {
final DistributedLock activationLock = distributedManager.getDistributedLock(nodeId);
try (MutableLong coordinatedNodeSequence = distributedManager.getMutableLong(nodeId)) {
while (true) {
// dirty read is sufficient to know if we are *not* an in sync replica
// typically the lock owner will increment to signal our data is stale and we are happy without any
// further coordination at this point
switch (validateActivationSequence(coordinatedNodeSequence, activationLock, nodeId, nodeActivationSequence, logger)) {
case Stale:
activationLock.close();
return null;
case SelfRepair:
case InSync:
break;
case MaybeInSync:
if (activationLock.tryLock()) {
// BAD: where's the broker that should commit it?
activationLock.unlock();
logger.warnf("Cannot assume live role for NodeID = %s: claimed activation sequence need to be repaired",
nodeId);
TimeUnit.MILLISECONDS.sleep(CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS);
continue;
}
// quick path while data is still valuable: wait until something change (commit/repair)
TimeUnit.MILLISECONDS.sleep(CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS);
continue;
}
// SelfRepair, InSync
if (!activationLock.tryLock(LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS)) {
logger.debugf("Candidate for Node ID = %s, with local activation sequence: %d, cannot acquire live lock within %dms; retrying",
nodeId, nodeActivationSequence, LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS);
continue;
}
switch (validateActivationSequence(coordinatedNodeSequence, activationLock, nodeId, nodeActivationSequence, logger)) {
case Stale:
activationLock.close();
return null;
case SelfRepair:
// Self-repair sequence, ie we were the only one with the most up-to-date data.
// NOTE: We cannot move the sequence now, let's delay it on ensureSequentialAccessToNodeData
logger.infof("Assuming live role for NodeID = %s: local activation sequence %d matches claimed coordinated activation sequence %d. Repairing sequence", nodeId, nodeActivationSequence, nodeActivationSequence);
return activationLock;
case InSync:
// we are an in_sync_replica, good to go live as UNREPLICATED
logger.infof("Assuming live role for NodeID = %s, local activation sequence %d matches current coordinated activation sequence %d", nodeId, nodeActivationSequence, nodeActivationSequence);
return activationLock;
case MaybeInSync:
activationLock.unlock();
logger.warnf("Cannot assume live role for NodeID = %s: claimed activation sequence need to be repaired", nodeId);
TimeUnit.MILLISECONDS.sleep(CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS);
continue;
}
}
}
}
private enum ValidationResult {
/**
* coordinated activation sequence (claimed/committed) is far beyond the local one: data is not valuable anymore
**/
Stale,
/**
* coordinated activation sequence is the same as local one: data is in sync
**/
InSync,
/**
* next coordinated activation sequence is not committed yet: maybe data is in sync
**/
MaybeInSync,
/**
* next coordinated activation sequence is not committed yet, but this broker can self-repair: data is in sync
**/
SelfRepair
}
private static ValidationResult validateActivationSequence(final MutableLong coordinatedNodeSequence,
final DistributedLock activationLock,
final String lockAndLongId,
final long nodeActivationSequence,
final Logger logger) throws UnavailableStateException {
assert coordinatedNodeSequence.getMutableLongId().equals(lockAndLongId);
assert activationLock.getLockId().equals(lockAndLongId);
final long currentCoordinatedNodeSequence = coordinatedNodeSequence.get();
if (nodeActivationSequence == currentCoordinatedNodeSequence) {
return ValidationResult.InSync;
}
if (currentCoordinatedNodeSequence > 0) {
logger.infof("Not a candidate for NodeID = %s activation, local activation sequence %d does not match coordinated activation sequence %d",
lockAndLongId, nodeActivationSequence, currentCoordinatedNodeSequence);
return ValidationResult.Stale;
}
// claimed activation sequence
final long claimedCoordinatedNodeSequence = -currentCoordinatedNodeSequence;
final long sequenceGap = claimedCoordinatedNodeSequence - nodeActivationSequence;
if (sequenceGap == 0) {
return ValidationResult.SelfRepair;
}
if (sequenceGap == 1) {
// maybe data is still valuable
return ValidationResult.MaybeInSync;
}
assert sequenceGap > 1;
// sequence is moved so much that data is no longer valuable
logger.infof("Not a candidate for NodeID = %s activation, local activation sequence %d does not match coordinated activation sequence %d",
lockAndLongId, nodeActivationSequence, claimedCoordinatedNodeSequence);
return ValidationResult.Stale;
}
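// Worked example of the sign encoding above (illustrative only), assuming a local activation sequence of 5:
// - coordinated  5 -> InSync      (the committed sequence matches the local one)
// - coordinated  7 -> Stale       (another broker committed ahead of this one)
// - coordinated -5 -> SelfRepair  (this broker claimed 5 and persisted it locally, but never committed it)
// - coordinated -6 -> MaybeInSync (someone claimed 6 but has not committed it yet: gap == 1)
// - coordinated -8 -> Stale       (the claim is too far ahead: gap > 1)
// The lock holder advances the sequence in two steps (see ensureSequentialAccessToNodeData below):
// claim with compareAndSet(5, -6), persist 6 locally, then commit with compareAndSet(-6, 6).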
/**
* It waits until {@code timeoutMills} ms have passed or the coordinated activation sequence has progressed enough
*/
public static boolean awaitNextCommittedActivationSequence(final DistributedPrimitiveManager distributedManager,
final String coordinatedLockAndNodeId,
final long activationSequence,
final long timeoutMills,
final Logger logger)
throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException {
Objects.requireNonNull(distributedManager);
Objects.requireNonNull(logger);
Objects.requireNonNull(coordinatedLockAndNodeId);
if (activationSequence < 0) {
throw new IllegalArgumentException("activationSequence must be >= 0, but is " + activationSequence);
}
if (!distributedManager.isStarted()) {
throw new IllegalStateException("manager must be started");
}
final MutableLong coordinatedActivationSequence = distributedManager.getMutableLong(coordinatedLockAndNodeId);
// wait for the live to activate and run unreplicated with a sequence > inSyncReplicaActivation
// this read can be dirty b/c we are just looking for an increment.
boolean anyNext = false;
final long timeoutNs = TimeUnit.MILLISECONDS.toNanos(timeoutMills);
final long started = System.nanoTime();
long elapsedNs;
do {
final long coordinatedValue = coordinatedActivationSequence.get();
if (coordinatedValue > activationSequence) {
// all good, some activation has gone ahead
logger.infof("Detected a new activation sequence with NodeID = %s: and sequence: %d", coordinatedLockAndNodeId, coordinatedValue);
anyNext = true;
break;
}
if (coordinatedValue < 0) {
// commit claim
final long claimedSequence = -coordinatedValue;
final long activationsGap = claimedSequence - activationSequence;
if (activationsGap > 1) {
// all good, some activation has gone ahead
logger.infof("Detected furthers sequential server activations from sequence %d, with NodeID = %s: and claimed sequence: %d", activationSequence, coordinatedLockAndNodeId, claimedSequence);
anyNext = true;
break;
}
// activation is still in progress
logger.debugf("Detected claiming of activation sequence = %d for NodeID = %s", claimedSequence, coordinatedLockAndNodeId);
}
try {
TimeUnit.MILLISECONDS.sleep(CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS);
} catch (InterruptedException ignored) {
}
elapsedNs = System.nanoTime() - started;
}
while (elapsedNs < timeoutNs);
return anyNext;
}
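// Illustrative readings while waiting from activationSequence = 5:
// - coordinated  6 -> done: a newer sequence has been committed
// - coordinated -7 -> done: a claim two steps ahead implies a completed activation in between
// - coordinated -6 -> keep waiting: the next activation has been claimed but not committed yet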
/**
* This is going to increment the coordinated activation sequence while holding the live lock, failing with some exception otherwise.<br>
* <p>
* The acceptable states are {@link ValidationResult#InSync} and {@link ValidationResult#SelfRepair}; any other state results in an exception.
* <p>
* This must be used while holding the live lock, to ensure not-replicated ownership of data; ie it can be used both
* while losing connectivity with a replica and after a successful {@link #tryActivate(String, long, DistributedPrimitiveManager, Logger)}.
*/
public static void ensureSequentialAccessToNodeData(ActiveMQServer activeMQServer,
DistributedPrimitiveManager distributedPrimitiveManager,
final Logger logger) throws ActiveMQException, InterruptedException, UnavailableStateException, ExecutionException, TimeoutException {
final NodeManager nodeManager = activeMQServer.getNodeManager();
final String lockAndLongId = nodeManager.getNodeId().toString();
final DistributedLock liveLock = distributedPrimitiveManager.getDistributedLock(lockAndLongId);
if (!liveLock.isHeldByCaller()) {
final String message = String.format("Server [%s], live lock for NodeID = %s, not held, activation sequence cannot be safely changed",
activeMQServer, lockAndLongId);
logger.info(message);
throw new UnavailableStateException(message);
}
final long nodeActivationSequence = nodeManager.readNodeActivationSequence();
final MutableLong coordinatedNodeActivationSequence = distributedPrimitiveManager.getMutableLong(lockAndLongId);
final long currentCoordinatedActivationSequence = coordinatedNodeActivationSequence.get();
final long nextActivationSequence;
if (currentCoordinatedActivationSequence < 0) {
// Check Self-Repair
if (nodeActivationSequence != -currentCoordinatedActivationSequence) {
final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, local activation sequence %d does not match current claimed coordinated sequence %d: need repair",
activeMQServer, lockAndLongId, nodeActivationSequence, -currentCoordinatedActivationSequence);
logger.info(message);
throw new ActiveMQException(message);
}
// auto-repair: this is the same server that failed to commit its claimed sequence
nextActivationSequence = nodeActivationSequence;
} else {
// Check InSync
if (nodeActivationSequence != currentCoordinatedActivationSequence) {
final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, local activation sequence %d does not match current coordinated sequence %d",
activeMQServer, lockAndLongId, nodeActivationSequence, currentCoordinatedActivationSequence);
logger.info(message);
throw new ActiveMQException(message);
}
nextActivationSequence = nodeActivationSequence + 1;
}
// UN_REPLICATED STATE ENTER: auto-repair doesn't need to claim and write locally
if (nodeActivationSequence != nextActivationSequence) {
// claim
if (!coordinatedNodeActivationSequence.compareAndSet(nodeActivationSequence, -nextActivationSequence)) {
final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, activation sequence claim failed, local activation sequence %d no longer matches current coordinated sequence %d",
activeMQServer, lockAndLongId, nodeActivationSequence, coordinatedNodeActivationSequence.get());
logger.infof(message);
throw new ActiveMQException(message);
}
// claim success: write locally
try {
nodeManager.writeNodeActivationSequence(nextActivationSequence);
} catch (NodeManager.NodeManagerException fatal) {
logger.errorf("Server [%s] failed to set local activation sequence to: %d for NodeId =%s. Cannot continue committing coordinated activation sequence: REQUIRES ADMIN INTERVENTION",
activeMQServer, nextActivationSequence, lockAndLongId);
throw new UnavailableStateException(fatal);
}
logger.infof("Server [%s], incremented local activation sequence to: %d for NodeId = %s",
activeMQServer, nextActivationSequence, lockAndLongId);
} else {
// self-heal need to update the in-memory sequence, because no writes will do it
nodeManager.setNodeActivationSequence(nextActivationSequence);
}
// commit
if (!coordinatedNodeActivationSequence.compareAndSet(-nextActivationSequence, nextActivationSequence)) {
final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, activation sequence commit failed, local activation sequence %d no longer matches current coordinated sequence %d",
activeMQServer, lockAndLongId, nodeActivationSequence, coordinatedNodeActivationSequence.get());
logger.infof(message);
throw new ActiveMQException(message);
}
logger.infof("Server [%s], incremented coordinated activation sequence to: %d for NodeId = %s",
activeMQServer, nextActivationSequence, lockAndLongId);
}
}

View File

@ -244,8 +244,10 @@ public class ManagementServiceImpl implements ManagementService {
ObjectName objectName = objectNameBuilder.getActiveMQServerObjectName();
unregisterFromJMX(objectName);
unregisterFromRegistry(ResourceNames.BROKER);
if (messagingServer != null) {
unregisterMeters(ResourceNames.BROKER + "." + messagingServer.getConfiguration().getName());
}
}
@Override
public void registerAddress(AddressInfo addressInfo) throws Exception {

View File

@ -3189,13 +3189,12 @@
</xsd:documentation>
</xsd:annotation>
</xsd:element>
<xsd:element name="check-for-live-server" type="xsd:boolean" default="false" maxOccurs="1" minOccurs="0">
<xsd:element name="coordination-id" type="xsd:string" maxOccurs="1" minOccurs="0">
<xsd:annotation>
<xsd:documentation>
Whether to check the cluster for a (live) server using our own server ID when starting
up. This option is only necessary for performing 'fail-back' on replicating
servers. Strictly speaking this setting only applies to live servers and not to
backups.
The common identity to use for coordination that is shared across instances that will replicate.
The value will be used as the internal server nodeId and as the identity of entities in the
distributed-primitive-manager.
</xsd:documentation>
</xsd:annotation>
</xsd:element>
@ -3208,21 +3207,6 @@
</xsd:documentation>
</xsd:annotation>
</xsd:element>
<xsd:element name="vote-retries" type="xsd:integer" default="12" minOccurs="0" maxOccurs="1">
<xsd:annotation>
<xsd:documentation>
If we start as a replica and lose connection to the master, how many times should we attempt to vote
for quorum before restarting
</xsd:documentation>
</xsd:annotation>
</xsd:element>
<xsd:element name="vote-retry-wait" type="xsd:long" default="2000" minOccurs="0" maxOccurs="1">
<xsd:annotation>
<xsd:documentation>
How long to wait (in milliseconds) between each vote
</xsd:documentation>
</xsd:annotation>
</xsd:element>
<xsd:element name="retry-replication-wait" type="xsd:long" default="2000" minOccurs="0" maxOccurs="1">
<xsd:annotation>
<xsd:documentation>
@ -3288,20 +3272,6 @@
</xsd:documentation>
</xsd:annotation>
</xsd:element>
<xsd:element name="vote-retries" type="xsd:integer" default="12" minOccurs="0" maxOccurs="1">
<xsd:annotation>
<xsd:documentation>
If we lose connection to the master, how many times should we attempt to vote for quorum before restarting
</xsd:documentation>
</xsd:annotation>
</xsd:element>
<xsd:element name="vote-retry-wait" type="xsd:long" default="2000" minOccurs="0" maxOccurs="1">
<xsd:annotation>
<xsd:documentation>
How long to wait (in milliseconds) between each vote
</xsd:documentation>
</xsd:annotation>
</xsd:element>
<xsd:element name="retry-replication-wait" type="xsd:long" default="2000" minOccurs="0" maxOccurs="1">
<xsd:annotation>
<xsd:documentation>

View File

@ -283,7 +283,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase {
assertFalse(policy.canScaleDown());
assertFalse(policy.isBackup());
assertFalse(policy.isSharedStore());
assertTrue(policy.isCheckForLiveServer());
assertTrue(policy.isWaitForActivation());
assertEquals("purple", policy.getGroupName());
assertEquals("purple", policy.getBackupGroupName());
@ -297,8 +296,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase {
assertEquals(policy.getBackupGroupName(), failbackPolicy.getBackupGroupName());
assertEquals(policy.getClusterName(), failbackPolicy.getClusterName());
assertEquals(failbackPolicy.getMaxSavedReplicatedJournalsSize(), ActiveMQDefaultConfiguration.getDefaultMaxSavedReplicatedJournalsSize());
assertEquals(1, failbackPolicy.getVoteRetries());
assertEquals(1000, failbackPolicy.getVoteRetryWait());
assertTrue(failbackPolicy.isTryFailback());
assertTrue(failbackPolicy.isBackup());
assertFalse(failbackPolicy.isSharedStore());
@ -337,8 +334,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase {
assertEquals("tiddles", policy.getBackupGroupName());
assertEquals("33rrrrr", policy.getClusterName());
assertEquals(22, policy.getMaxSavedReplicatedJournalsSize());
assertEquals(1, policy.getVoteRetries());
assertEquals(1000, policy.getVoteRetryWait());
assertFalse(policy.isTryFailback());
assertTrue(policy.isBackup());
assertFalse(policy.isSharedStore());
@ -358,7 +353,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase {
assertFalse(failoverLivePolicy.canScaleDown());
assertFalse(failoverLivePolicy.isBackup());
assertFalse(failoverLivePolicy.isSharedStore());
assertFalse(failoverLivePolicy.isCheckForLiveServer());
assertTrue(failoverLivePolicy.isWaitForActivation());
assertEquals(policy.getGroupName(), failoverLivePolicy.getGroupName());
assertEquals(policy.getClusterName(), failoverLivePolicy.getClusterName());

View File

@ -30,8 +30,6 @@
<cluster-name>33rrrrr</cluster-name>
<initial-replication-sync-timeout>9876</initial-replication-sync-timeout>
<retry-replication-wait>12345</retry-replication-wait>
<vote-retries>1</vote-retries>
<vote-retry-wait>1000</vote-retry-wait>
<allow-failback>false</allow-failback>
<manager>
<class-name>

View File

@ -27,9 +27,6 @@
<cluster-name>abcdefg</cluster-name>
<initial-replication-sync-timeout>9876</initial-replication-sync-timeout>
<retry-replication-wait>12345</retry-replication-wait>
<check-for-live-server>true</check-for-live-server>
<vote-retries>1</vote-retries>
<vote-retry-wait>1000</vote-retry-wait>
<manager>
<class-name>
org.apache.activemq.artemis.core.config.impl.HAPolicyConfigurationTest$FakeDistributedPrimitiveManager

View File

@ -65,8 +65,8 @@ which we will cover in a later chapter.
> message data will not be available after failover.
The `ha-policy` type configures which strategy a cluster should use to
provide the backing up of a servers data. Within this configuration
element is configured how a server should behave within the cluster,
provide the backing up of a server's data. Within this configuration
element we configure how a server should behave within the cluster,
either as a master (live), slave (backup) or colocated (both live and
backup). This would look something like:
@ -98,7 +98,7 @@ or
</ha-policy>
```
*Replication* allows too to configure 2 new roles to enable *pluggable quorum* provider configuration, by using:
*Replication* allows the configuration of two new roles to enable *pluggable quorum* provider configuration, by using:
```xml
<ha-policy>
<replication>
@ -116,17 +116,14 @@ to configure the classic *master* role, and
```
for the classic *slave* one.
If *replication* is configured using such new roles some additional element is required to complete configuration, detailed later.
If *replication* is configured using such new roles, some additional elements are required to complete the configuration, as detailed later.
### IMPORTANT NOTE ON PLUGGABLE QUORUM VOTE FEATURE
This feature is still **EXPERIMENTAL** and not meant to be run in production yet.
This feature is still **EXPERIMENTAL**. Extra testing should be done before running it in production. Please report any issues you find to the ActiveMQ Artemis mailing lists.
It means:
- its configuration can change until declared as **officially stable**
- it has to solve yet an inherent data misalignment issue with replication (it can happen with `classic` replication as well)
More info about this issue are on [ARTEMIS-3340](https://issues.apache.org/jira/browse/ARTEMIS-3340).
- its configuration can change until it is declared **officially stable**
### Data Replication
@ -226,12 +223,12 @@ changes and repeats the process.
> live server by changing `slave` to `master`.
Much like in the shared-store case, when the live server stops or
crashes, its replicating backup will become active and take over its
crashes, its replicating backup will become active and take over its
duties. Specifically, the backup will become active when it loses
connection to its live server. This can be problematic because this can
also happen because of a temporary network problem.
connection to its live server. This can be problematic because it can
also happen as the result of a temporary network problem.
This issue is solved in 2 different ways depending on which replication roles are configured:
The issue can be solved in two different ways, depending on which replication roles are configured:
- **classic replication** (`master`/`slave` roles): backup will try to determine whether it still can
connect to the other servers in the cluster. If it can connect to more
than half the servers, it will become active, if more than half the
@ -275,7 +272,7 @@ The backup server must be similarly configured but as a `slave`
</ha-policy>
```
To configure a pluggable quorum replication's primary and backup instead:
To configure a pluggable quorum replication's primary and backup use:
```xml
<ha-policy>
@ -391,7 +388,6 @@ For `primary`:
<property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
</properties>
</manager>
<check-for-live-server>true</check-for-live-server>
</primary>
</replication>
</ha-policy>
@ -418,16 +414,16 @@ The configuration of `class-name` as follows
```
isn't really needed, because Apache Curator is the default provider, but has been shown for completeness.
The `properties` element, instead
The `properties` element:
```xml
<properties>
<property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
</properties>
```
Can specify a list of `property` elements in the form of key-value pairs, depending the ones
accepted by the specified `class-name` provider.
can specify a list of `property` elements in the form of key-value pairs, appropriate to what is
supported by the specified `class-name` provider.
Apache Curator's provider allow to configure these properties:
Apache Curator's provider allows the following properties:
- [`connect-string`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#connectString(java.lang.String)): (no default)
- [`session-ms`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#sessionTimeoutMs(int)): (default is 18000 ms)
@ -438,36 +434,25 @@ Apache Curator's provider allow to configure these properties:
- [`retries-ms`](https://curator.apache.org/apidocs/org/apache/curator/retry/RetryNTimes.html#%3Cinit%3E(int,int)): (default is 1000 ms)
- [`namespace`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#namespace(java.lang.String)): (no default)
Configuration of the [Apache Zookeeper](https://zookeeper.apache.org/) nodes is left to the user, but there are few
Configuration of the [Apache Zookeeper](https://zookeeper.apache.org/) ensemble is the responsibility of the user, but there are a few
**suggestions to improve the reliability of the quorum service**:
- broker `session_ms` must be `>= 2 * server tick time` and `<= 20 * server tick time`, as per the
[Zookeeper 3.6.3 admin guide](https://zookeeper.apache.org/doc/r3.6.3/zookeeperAdmin.html): it directly impacts how fast a backup
can failover to an isolated/killed/unresponsive live; the higher, the slower (see the sketch after this list)
- GC on broker machine should allow keeping GC pauses within 1/3 of `session_ms` in order to let the Zookeeper heartbeat protocol
to work reliably: if it's not possible, better increase `session_ms` accepting a slower failover
work reliably. If that is not possible, it is better to increase `session_ms`, accepting a slower failover.
- Zookeeper must have enough resources to keep GC (and OS) pauses much smaller than server tick time: please consider carefully if
broker and Zookeeper node should share the same physical machine, depending on the expected load of the broker
- network isolation protection requires configuring >=3 Zookeeper nodes
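As an illustrative sketch (assumed values, not part of any shipped configuration): Zookeeper's stock sample config uses a `tickTime` of 2000 ms, which per the first suggestion above bounds the broker `session-ms` to the `[4000, 40000]` ms range, so the default `session-ms` of 18000 ms fits comfortably:

```
# zoo.cfg sketch (assumed values)
tickTime=2000   # broker session-ms must stay within [2*2000, 20*2000] ms
dataDir=/var/lib/zookeeper
clientPort=2181
```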
#### *Important*: Notes on pluggable quorum replication configuration
The first `classic` replication configuration that won't apply to the pluggable quorum replication
is `vote-on-replication-failure`; configuring it produces a startup error: pluggable quorum replication
always behaves as if `vote-on-replication-failure` were `true`, ie shutting down a live broker (and its JVM) in case of quorum loss.
The second deprecated `classic` replication configuration is `quorum-vote-wait`: given that the pluggable quorum vote replication
requires the backup to have an always-on, reliable quorum service, there's no need to specify a timeout to reach
the majority of quorum nodes. A backup remains inactive (ie JVM still up, console too, unable to sync with live, to failover, etc)
until the majority of quorum nodes is reachable again, re-activating when that happens.
The only exception is a primary failing-back to an existing live backup using `<allow-failback>true</allow-failback>`:
if the quorum service isn't immediately available the primary (and its JVM) just stops, allowing a fail-fast fail-back.
There are a few *semantic differences* in other existing properties:
- `vote-retry-wait`: in `classic` replication it means how long to wait between each quorum vote try, while with pluggable quorum replication
it means how long each failover request attempt lasts
- `vote-retries`: differently from `classic`, the number of vote attempts is `1 + vote-retries` (with classic it is just `vote-retries`).
Setting `0` means no retries, leaving the backup to still perform an initial attempt.
Some `classic` replication configurations are no longer needed:
- `vote-on-replication-failure`
- `quorum-vote-wait`
- `vote-retries`
- `vote-retry-wait`
- `check-for-live-server`
**Notes on replication configuration with [Apache curator](https://curator.apache.org/) quorum provider**
@ -479,34 +464,34 @@ For the former case (session expiration with live no longer present), the backup
1. cluster connection PINGs (affected by [connection-ttl](connection-ttl.md) tuning)
2. closed TCP connection notification (depends on TCP configuration and networking stack/topology)
These 2 cases have different failover durations, depending on different factors:
1. `connection-ttl` affects how much of the expiring `session-ms` is spent just detecting a missing live broker: the higher `connection-ttl`,
the slower it reacts; the backup can attempt to failover for the remaining `session-ms - connection-ttl`
2. `session-ms` expiration is immediately detected: the backup must try to failover for >= `session-ms` to be sure to catch
the session expiration and complete failover
The suggestion is to tune `connection-ttl` low enough to attempt failover as soon as possible, while taking into consideration that
the whole fail-over duration cannot last less than the configured `session-ms`.
The previous comments are meant to suggest to the careful reader that the minimum time to attempt failover
cannot be below the full `session-ms` expiry.
In short, it means
```
total failover attempt time > session-ms
```
with
```
total failover attempt time = vote-retry-wait * (vote-retries + 1)
```
and by consequence:
```
vote-retry-wait * (vote-retries + 1) > session-ms
```
For example with `session-ms = 18000 ms`, safe values for failover timeout are:
##### Peer or Multi Primary
With coordination delegated to the quorum service, roles are less important. It is possible to have two peer servers compete
for activation; the winner activating as live, the loser taking up a backup role. On restart, 'any' peer server
with the most up-to-date journal can activate.
The instances need to know in advance what identity they will coordinate on.
In the replication 'primary' ha policy we can explicitly set the 'coordination-id' to a common value for all peers in a cluster.
For `multi primary`:
```xml
<vote-retries>11</vote-retries>
<vote-retry-wait>2000</vote-retry-wait>
<ha-policy>
<replication>
<primary>
<manager>
<class-name>org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager</class-name>
<properties>
<property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
</properties>
</manager>
<coordination-id>peer-journal-001</coordination-id>
</primary>
</replication>
</ha-policy>
```
Because `11 * 2000 = 22000 ms` is bigger than `18000 ms`,
there's no risk that a backup broker will stop attempting to failover too early, losing its chance to become live.
Note: the string value provided will be converted internally into a 16 byte UUID, so it may not be immediately recognisable or human-readable;
however, it will ensure that all 'peers' coordinate.
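For illustration only, a name-based derivation in the same spirit (the broker's actual conversion routine is not shown here, so treat `nameUUIDFromBytes` as a hypothetical stand-in, not the real algorithm):

```java
import java.nio.charset.StandardCharsets;
import java.util.UUID;

public class CoordinationIdSketch {
   public static void main(String[] args) {
      // Hypothetical: derive a stable 16 byte UUID from the configured coordination-id;
      // every peer configured with the same string ends up with the same identity.
      UUID id = UUID.nameUUIDFromBytes("peer-journal-001".getBytes(StandardCharsets.UTF_8));
      System.out.println(id); // not human-readable, but identical on all peers
   }
}
```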
### Shared Store
@ -637,10 +622,10 @@ another server using its nodeID. If it finds one, it will contact this
server and try to "fail-back". Since this is a remote replication
scenario, the "starting live" will have to synchronize its data with the
server running with its ID, once they are in sync, it will request the
other server (which it assumes it is a back that has assumed its duties)
to shutdown for it to take over. This is necessary because otherwise the
other server (which it assumes is a backup that has assumed its duties)
to shut down, for it to take over. This is necessary because otherwise the
live server has no means to know whether there was a fail-over or not,
and if there was if the server that took its duties is still running or
and if there was, if the server that took its duties is still running or
not. To configure this option, edit your `broker.xml`
configuration file as follows, for classic replication:
@ -663,21 +648,21 @@ And pluggable quorum replication:
<!-- some meaningful configuration -->
</manager>
<primary>
<check-for-live-server>true</check-for-live-server>
<!-- no need to check-for-live-server anymore -->
</primary>
</replication>
</ha-policy>
```
The key difference from classic replication is that if `master` cannot reach any
live server with its same nodeID, it's going straight to become live, while `primary`
request it to the quorum provider, searching again for any existing live if
the quorum provider is not available (eg connectivity loss, consensus absence) or
if there's another live broker with the same nodeID alive, in an endless loop.
live server with its nodeID, it activates unilaterally.
With `primary`, the responsibilities of coordination are delegated to the quorum provider;
there are no unilateral decisions. The `primary` will only activate when
it knows that it has the most up to date version of the journal identified by its nodeID.
In short: a started `primary` cannot become live without consensus.
> **Warning**
> **Warning for classic replication**
>
> Be aware that if you restart a live server after failover has
> occurred then `check-for-live-server` must be set to `true`. If not, the live server

View File

@ -62,6 +62,7 @@ under the License.
<module>scale-down</module>
<module>stop-server-failover</module>
<module>transaction-failover</module>
<module>zookeeper-single-pair-failback</module>
</modules>
</profile>
<profile>
@ -81,6 +82,7 @@ under the License.
<module>replicated-transaction-failover</module>
<module>scale-down</module>
<module>transaction-failover</module>
<module>zookeeper-single-pair-failback</module>
</modules>
</profile>
</profiles>

View File

@ -0,0 +1,110 @@
<?xml version='1.0'?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.activemq.examples.failover</groupId>
<artifactId>broker-failover</artifactId>
<version>2.18.0-SNAPSHOT</version>
</parent>
<artifactId>zookeeper-single-pair-ordered-failback</artifactId>
<packaging>jar</packaging>
<name>ActiveMQ Artemis Zookeeper Single Pair Ordered Failback Example</name>
<properties>
<activemq.basedir>${project.basedir}/../../../..</activemq.basedir>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.activemq</groupId>
<artifactId>artemis-cli</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>jakarta.jms</groupId>
<artifactId>jakarta.jms-api</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.activemq</groupId>
<artifactId>artemis-maven-plugin</artifactId>
<executions>
<execution>
<id>create0</id>
<goals>
<goal>create</goal>
</goals>
<configuration>
<!-- this makes it easier in certain envs -->
<javaOptions>-Djava.net.preferIPv4Stack=true</javaOptions>
<instance>${basedir}/target/server0</instance>
<configuration>${basedir}/target/classes/activemq/server0</configuration>
<javaOptions>-Dudp-address=${udp-address}</javaOptions>
</configuration>
</execution>
<execution>
<id>create1</id>
<goals>
<goal>create</goal>
</goals>
<configuration>
<!-- this makes it easier in certain envs -->
<javaOptions>-Djava.net.preferIPv4Stack=true</javaOptions>
<instance>${basedir}/target/server1</instance>
<configuration>${basedir}/target/classes/activemq/server1</configuration>
<javaOptions>-Dudp-address=${udp-address}</javaOptions>
</configuration>
</execution>
<execution>
<id>runClient</id>
<goals>
<goal>runClient</goal>
</goals>
<configuration>
<clientClass>org.apache.activemq.artemis.jms.example.ZookeeperSinglePairFailback</clientClass>
<args>
<param>${basedir}/target/server0</param>
<param>${basedir}/target/server1</param>
</args>
</configuration>
</execution>
</executions>
<dependencies>
<dependency>
<groupId>org.apache.activemq.examples.failover</groupId>
<artifactId>zookeeper-single-pair-ordered-failback</artifactId>
<version>2.18.0-SNAPSHOT</version>
</dependency>
</dependencies>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-clean-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,94 @@
# Zookeeper Single Pair Failback Example
This example demonstrates two servers coupled as a primary-backup pair for high availability (HA), using the
pluggable quorum vote replication reference implementation based on [Apache Curator](https://curator.apache.org/), with
[Apache Zookeeper](https://zookeeper.apache.org/) as the external quorum service.
The example shows a client connection failing over from live to backup when the live broker crashes, and
then back to the original live when it is restarted (i.e. "failback").
To run the example, simply type **mvn verify** from this directory after running a Zookeeper node at `localhost:2181`.
If no Zookeeper node is configured, you can use the commands below (see the [Official Zookeeper Docker Image Site](https://hub.docker.com/_/zookeeper)
for more details on how to configure it).
Run Zookeeper `3.6.3` with:
```
$ docker run --name artemis-zk --network host --restart always -d zookeeper:3.6.3
```
By default, the official docker image exposes `2181 2888 3888 8080` as client, follower, election and AdminServer ports.
Verify the Zookeeper server started correctly by running:
```
$ docker logs --follow artemis-zk
```
It should print the Zookeeper welcome ASCII logs:
```
ZooKeeper JMX enabled by default
Using config: /conf/zoo.cfg
2021-08-05 14:29:29,431 [myid:] - INFO [main:QuorumPeerConfig@174] - Reading configuration from: /conf/zoo.cfg
2021-08-05 14:29:29,434 [myid:] - INFO [main:QuorumPeerConfig@451] - clientPort is not set
2021-08-05 14:29:29,434 [myid:] - INFO [main:QuorumPeerConfig@464] - secureClientPort is not set
2021-08-05 14:29:29,434 [myid:] - INFO [main:QuorumPeerConfig@480] - observerMasterPort is not set
2021-08-05 14:29:29,435 [myid:] - INFO [main:QuorumPeerConfig@497] - metricsProvider.className is org.apache.zookeeper.metrics.impl.DefaultMetricsProvider
2021-08-05 14:29:29,438 [myid:] - ERROR [main:QuorumPeerConfig@722] - Invalid configuration, only one server specified (ignoring)
2021-08-05 14:29:29,441 [myid:1] - INFO [main:DatadirCleanupManager@78] - autopurge.snapRetainCount set to 3
2021-08-05 14:29:29,441 [myid:1] - INFO [main:DatadirCleanupManager@79] - autopurge.purgeInterval set to 0
2021-08-05 14:29:29,441 [myid:1] - INFO [main:DatadirCleanupManager@101] - Purge task is not scheduled.
2021-08-05 14:29:29,441 [myid:1] - WARN [main:QuorumPeerMain@138] - Either no config or no quorum defined in config, running in standalone mode
2021-08-05 14:29:29,444 [myid:1] - INFO [main:ManagedUtil@44] - Log4j 1.2 jmx support found and enabled.
2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@174] - Reading configuration from: /conf/zoo.cfg
2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@451] - clientPort is not set
2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@464] - secureClientPort is not set
2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@480] - observerMasterPort is not set
2021-08-05 14:29:29,450 [myid:1] - INFO [main:QuorumPeerConfig@497] - metricsProvider.className is org.apache.zookeeper.metrics.impl.DefaultMetricsProvider
2021-08-05 14:29:29,450 [myid:1] - ERROR [main:QuorumPeerConfig@722] - Invalid configuration, only one server specified (ignoring)
2021-08-05 14:29:29,451 [myid:1] - INFO [main:ZooKeeperServerMain@122] - Starting server
2021-08-05 14:29:29,459 [myid:1] - INFO [main:ServerMetrics@62] - ServerMetrics initialized with provider org.apache.zookeeper.metrics.impl.DefaultMetricsProvider@525f1e4e
2021-08-05 14:29:29,461 [myid:1] - INFO [main:FileTxnSnapLog@124] - zookeeper.snapshot.trust.empty : false
2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] -
2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - ______ _
2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - |___ / | |
2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - / / ___ ___ | | __ ___ ___ _ __ ___ _ __
2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - / / / _ \ / _ \ | |/ / / _ \ / _ \ | '_ \ / _ \ | '__|
2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - / /__ | (_) | | (_) | | < | __/ | __/ | |_) | | __/ | |
2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - /_____| \___/ \___/ |_|\_\ \___| \___| | .__/ \___| |_|
2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - | |
2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - |_|
2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] -
```
Alternatively, you can run
```
$ docker run -it --rm --network host zookeeper:3.6.3 zkCli.sh -server localhost:2181
```
The Zookeeper server can be reached using localhost:2181 if the client outputs something like:
```
2021-08-05 14:56:03,739 [myid:localhost:2181] - INFO [main-SendThread(localhost:2181):ClientCnxn$SendThread@1448] - Session establishment complete on server localhost/0:0:0:0:0:0:0:1:2181, session id = 0x100078b8cfc0002, negotiated timeout = 30000
```
Type
```
[zk: localhost:2181(CONNECTED) 0] quit
```
to quit the client instance.
The 2 brokers of this example are already configured to connect to a single Zookeeper node at the mentioned address, thanks to the XML configuration of their `manager`:
```xml
<manager>
<properties>
<property key="connect-string" value="localhost:2181"/>
<property key="namespace" value="examples"/>
<property key="session-ms" value="18000"/>
</properties>
</manager>
```
**NOTE** the `namespace` parameter is used to separate the pair information from others if the Zookeeper node is shared with other applications.
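Assuming the Curator provider keeps its coordination primitives under the configured `namespace` (an assumption, not something this example verifies), the `zkCli.sh` client shown earlier can list them:

```
[zk: localhost:2181(CONNECTED) 0] ls /examples
```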
**WARNING** As already recommended in the [High Availability section](https://activemq.apache.org/components/artemis/documentation/latest/ha.html), a production environment needs >= 3 nodes to protect against network partitions.
This example can be run with
```
$ mvn verify
```
For more information on ActiveMQ Artemis failover and HA, and clustering in general, please see the clustering section of the user manual.

View File

@ -0,0 +1,157 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.activemq.artemis.jms.example;
import javax.jms.Connection;
import javax.jms.ConnectionFactory;
import javax.jms.JMSException;
import javax.jms.MessageConsumer;
import javax.jms.MessageProducer;
import javax.jms.Queue;
import javax.jms.Session;
import javax.jms.TextMessage;
import javax.naming.InitialContext;
import org.apache.activemq.artemis.util.ServerUtil;
/**
* Example of live and replicating backup pair using Zookeeper as the quorum provider.
* <p>
* After both servers are started, the live server is killed and the backup becomes active ("fails-over").
* <p>
* Later the live server is restarted and takes back its position by asking the backup to stop ("fail-back").
*/
public class ZookeeperSinglePairFailback {
private static Process server0;
private static Process server1;
public static void main(final String[] args) throws Exception {
// Step 0. Prepare the Zookeeper environment as shown in readme.md
final int numMessages = 30;
Connection connection = null;
InitialContext initialContext = null;
try {
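// Step 1. Start the primary (server0) and the backup (server1) brokers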
server0 = ServerUtil.startServer(args[0], ZookeeperSinglePairFailback.class.getSimpleName() + "-primary", 0, 30000);
server1 = ServerUtil.startServer(args[1], ZookeeperSinglePairFailback.class.getSimpleName() + "-backup", 1, 10000);
// Step 2. Get an initial context for looking up JNDI from the server #1
initialContext = new InitialContext();
// Step 3. Look up the JMS resources from JNDI
Queue queue = (Queue) initialContext.lookup("queue/exampleQueue");
ConnectionFactory connectionFactory = (ConnectionFactory) initialContext.lookup("ConnectionFactory");
// Step 4. Create a JMS Connection
connection = connectionFactory.createConnection();
// Step 5. Create a *non-transacted* JMS Session with client acknowledgement
Session session = connection.createSession(false, Session.CLIENT_ACKNOWLEDGE);
// Step 6. Start the connection to ensure delivery occurs
connection.start();
// Step 7. Create a JMS MessageProducer and a MessageConsumer
MessageProducer producer = session.createProducer(queue);
MessageConsumer consumer = session.createConsumer(queue);
// Step 8. Send some messages to server #0, the live server
for (int i = 0; i < numMessages; i++) {
TextMessage message = session.createTextMessage("This is text message " + i);
producer.send(message);
System.out.println("Sent message: " + message.getText());
}
// Step 9. Receive and acknowledge a third of the sent messages
TextMessage message0 = null;
for (int i = 0; i < numMessages / 3; i++) {
message0 = (TextMessage) consumer.receive(5000);
System.out.println("Got message: " + message0.getText());
}
message0.acknowledge();
System.out.println("Received and acknowledged a third of the sent messages");
// Step 10. Receive the remaining two thirds of the sent messages but *do not* acknowledge them yet
for (int i = numMessages / 3; i < numMessages; i++) {
message0 = (TextMessage) consumer.receive(5000);
System.out.println("Got message: " + message0.getText());
}
System.out.println("Received without acknowledged the rest of the sent messages");
Thread.sleep(2000);
// Step 11. Crash server #0, the live server, and wait a little while to make sure
// it has really crashed
ServerUtil.killServer(server0);
System.out.println("Killed primary");
Thread.sleep(2000);
// Step 12. Acknowledging the received messages will fail as failover to the backup server has occurred
try {
message0.acknowledge();
} catch (JMSException e) {
System.out.println("Got (the expected) exception while acknowledging message: " + e.getMessage());
}
// Step 13. Consume the 2nd third of the messages again. Note that they are not considered as redelivered.
for (int i = numMessages / 3; i < (numMessages / 3) * 2; i++) {
message0 = (TextMessage) consumer.receive(5000);
System.out.printf("Got message: %s (redelivered?: %s)\n", message0.getText(), message0.getJMSRedelivered());
}
// Step 14. Acknowledging them on the failed-over broker works fine
message0.acknowledge();
System.out.println("Acknowledged 2n third of messages");
// Step 15. Restarting primary
server0 = ServerUtil.startServer(args[0], ZookeeperSinglePairFailback.class.getSimpleName() + "-primary", 0, 10000);
System.out.println("Started primary");
// await fail-back to complete
Thread.sleep(4000);
// Step 16. Consuming the 3rd third of the messages. Note that they are not considered as redelivered.
for (int i = (numMessages / 3) * 2; i < numMessages; i++) {
message0 = (TextMessage) consumer.receive(5000);
System.out.printf("Got message: %s (redelivered?: %s)\n", message0.getText(), message0.getJMSRedelivered());
}
message0.acknowledge();
System.out.println("Acknowledged 3d third of messages");
} finally {
// Step 17. Be sure to close our resources!
if (connection != null) {
connection.close();
}
if (initialContext != null) {
initialContext.close();
}
ServerUtil.killServer(server0);
ServerUtil.killServer(server1);
// Step 18. Stop the ZK server
}
}
}

View File

@ -0,0 +1,90 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<configuration xmlns="urn:activemq" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:activemq /schema/artemis-configuration.xsd">
<core xmlns="urn:activemq:core">
<bindings-directory>./data/bindings</bindings-directory>
<journal-directory>./data/journal</journal-directory>
<large-messages-directory>./data/largemessages</large-messages-directory>
<paging-directory>./data/paging</paging-directory>
<cluster-user>exampleUser</cluster-user>
<cluster-password>secret</cluster-password>
<ha-policy>
<replication>
<primary>
<manager>
<properties>
<property key="connect-string" value="localhost:2181"/>
<property key="namespace" value="examples"/>
<property key="session-ms" value="18000"/>
</properties>
</manager>
</primary>
</replication>
</ha-policy>
<connectors>
<connector name="netty-connector">tcp://localhost:61616</connector>
<connector name="netty-backup-connector">tcp://localhost:61617</connector>
</connectors>
<!-- Acceptors -->
<acceptors>
<acceptor name="netty-acceptor">tcp://localhost:61616</acceptor>
</acceptors>
<cluster-connections>
<cluster-connection name="my-cluster">
<connector-ref>netty-connector</connector-ref>
<static-connectors>
<connector-ref>netty-backup-connector</connector-ref>
</static-connectors>
</cluster-connection>
</cluster-connections>
<!-- Other config -->
<security-settings>
<!--security for example queue-->
<security-setting match="exampleQueue">
<permission roles="guest" type="createDurableQueue"/>
<permission roles="guest" type="deleteDurableQueue"/>
<permission roles="guest" type="createNonDurableQueue"/>
<permission roles="guest" type="deleteNonDurableQueue"/>
<permission roles="guest" type="consume"/>
<permission roles="guest" type="send"/>
</security-setting>
</security-settings>
<addresses>
<address name="exampleQueue">
<anycast>
<queue name="exampleQueue"/>
</anycast>
</address>
</addresses>
</core>
</configuration>
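The same primary policy can be assembled programmatically. A minimal sketch using the configuration classes exercised by this commit's tests; the ZooKeeper manager class name is spelled out here as an assumption, since the XML above leaves it to the default:

import java.util.HashMap;
import java.util.Map;

import org.apache.activemq.artemis.core.config.Configuration;
import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.core.config.impl.ConfigurationImpl;

public final class PrimaryZkPolicy {
   public static Configuration build() {
      // mirrors the <manager><properties> block above
      Map<String, String> props = new HashMap<>();
      props.put("connect-string", "localhost:2181");
      props.put("namespace", "examples");
      props.put("session-ms", "18000");

      ReplicationPrimaryPolicyConfiguration haPolicy = ReplicationPrimaryPolicyConfiguration.withDefault();
      haPolicy.setDistributedManagerConfiguration(new DistributedPrimitiveManagerConfiguration(
            "org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager", props));

      Configuration config = new ConfigurationImpl();
      config.setHAPolicyConfiguration(haPolicy);
      return config;
   }
}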

View File

@ -0,0 +1,91 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<configuration xmlns="urn:activemq" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:activemq /schema/artemis-configuration.xsd">
<core xmlns="urn:activemq:core">
<bindings-directory>./data/bindings</bindings-directory>
<journal-directory>./data/journal</journal-directory>
<large-messages-directory>./data/largemessages</large-messages-directory>
<paging-directory>./data/paging</paging-directory>
<cluster-user>exampleUser</cluster-user>
<cluster-password>secret</cluster-password>
<ha-policy>
<replication>
<backup>
<manager>
<properties>
<property key="connect-string" value="localhost:2181"/>
<property key="namespace" value="examples"/>
<property key="session-ms" value="18000"/>
</properties>
</manager>
<allow-failback>true</allow-failback>
</backup>
</replication>
</ha-policy>
<!-- Connectors -->
<connectors>
<connector name="netty-live-connector">tcp://localhost:61616</connector>
<connector name="netty-connector">tcp://localhost:61617</connector>
</connectors>
<!-- Acceptors -->
<acceptors>
<acceptor name="netty-acceptor">tcp://localhost:61617</acceptor>
</acceptors>
<cluster-connections>
<cluster-connection name="my-cluster">
<connector-ref>netty-connector</connector-ref>
<static-connectors>
<connector-ref>netty-live-connector</connector-ref>
</static-connectors>
</cluster-connection>
</cluster-connections>
<!-- Other config -->
<security-settings>
<!--security for example queue-->
<security-setting match="exampleQueue">
<permission roles="guest" type="createDurableQueue"/>
<permission roles="guest" type="deleteDurableQueue"/>
<permission roles="guest" type="createNonDurableQueue"/>
<permission roles="guest" type="deleteNonDurableQueue"/>
<permission roles="guest" type="consume"/>
<permission roles="guest" type="send"/>
</security-setting>
</security-settings>
<addresses>
<address name="exampleQueue">
<anycast>
<queue name="exampleQueue"/>
</anycast>
</address>
</addresses>
</core>
</configuration>
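And the backup side, as a minimal programmatic sketch with the same builder calls the tests in this commit use; the manager class name is an assumption, as above:

import java.util.HashMap;
import java.util.Map;

import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;

public final class BackupZkPolicy {
   public static ReplicationBackupPolicyConfiguration build() {
      Map<String, String> props = new HashMap<>();
      props.put("connect-string", "localhost:2181");
      props.put("namespace", "examples");
      props.put("session-ms", "18000");

      // withDefault() plus allow-failback matches the <backup> block above
      return ReplicationBackupPolicyConfiguration.withDefault()
         .setDistributedManagerConfiguration(new DistributedPrimitiveManagerConfiguration(
            "org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager", props))
         .setAllowFailBack(true);
   }
}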

View File

@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
java.naming.factory.initial=org.apache.activemq.artemis.jndi.ActiveMQInitialContextFactory
connectionFactory.ConnectionFactory=tcp://localhost:61616?ha=true&retryInterval=1000&retryIntervalMultiplier=1.0&reconnectAttempts=-1
queue.queue/exampleQueue=exampleQueue
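A minimal client-side sketch of how these JNDI entries are consumed; standard JMS, with the lookup names taken from the keys above:

import javax.jms.Connection;
import javax.jms.ConnectionFactory;
import javax.jms.Queue;
import javax.naming.InitialContext;

public final class JndiLookupExample {
   public static void main(String[] args) throws Exception {
      // reads jndi.properties from the classpath
      InitialContext ctx = new InitialContext();
      ConnectionFactory cf = (ConnectionFactory) ctx.lookup("ConnectionFactory");
      Queue queue = (Queue) ctx.lookup("queue/exampleQueue");
      try (Connection connection = cf.createConnection()) {
         connection.start();
         System.out.println("Connected; queue = " + queue.getQueueName());
      }
      ctx.close();
   }
}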

View File

@ -77,7 +77,6 @@ public class InfiniteRedeliveryTest extends ActiveMQTestBase {
Configuration backupConfig;
Configuration liveConfig;
NodeManager nodeManager;
protected TestableServer createTestableServer(Configuration config, NodeManager nodeManager) throws Exception {
boolean isBackup = config.getHAPolicyConfiguration() instanceof ReplicaPolicyConfiguration || config.getHAPolicyConfiguration() instanceof SharedStoreSlavePolicyConfiguration;
@ -99,13 +98,11 @@ public class InfiniteRedeliveryTest extends ActiveMQTestBase {
nodeManager = new InVMNodeManager(true, backupConfig.getJournalLocation());
backupServer = createTestableServer(backupConfig, nodeManager);
backupServer = createTestableServer(backupConfig, new InVMNodeManager(true, backupConfig.getJournalLocation()));
liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(TransportConfigurationUtils.getNettyAcceptor(true, 0));
liveServer = createTestableServer(liveConfig, nodeManager);
liveServer = createTestableServer(liveConfig, new InVMNodeManager(false, liveConfig.getJournalLocation()));
}
protected void configureReplicationPair(TransportConfiguration backupConnector,

View File

@ -659,6 +659,8 @@ public class FailoverTest extends FailoverTestBase {
backupServer.getServer().fail(true);
decrementActivationSequenceForForceRestartOf(liveServer);
liveServer.start();
consumer.close();
@ -823,6 +825,7 @@ public class FailoverTest extends FailoverTestBase {
Assert.assertFalse("must NOT be a backup", isBackup);
adaptLiveConfigForReplicatedFailBack(liveServer);
beforeRestart(liveServer);
decrementActivationSequenceForForceRestartOf(liveServer);
liveServer.start();
Assert.assertTrue("live initialized...", liveServer.getServer().waitForActivation(15, TimeUnit.SECONDS));
@ -931,12 +934,13 @@ public class FailoverTest extends FailoverTestBase {
while (!backupServer.isStarted() && i++ < 100) {
Thread.sleep(100);
}
liveServer.getServer().waitForActivation(5, TimeUnit.SECONDS);
backupServer.getServer().waitForActivation(5, TimeUnit.SECONDS);
Assert.assertTrue(backupServer.isStarted());
if (isReplicated) {
FileMoveManager moveManager = new FileMoveManager(backupServer.getServer().getConfiguration().getJournalLocation(), 0);
Assert.assertEquals(1, moveManager.getNumberOfFolders());
// backup has not had a chance to restart as a backup and clean up
Wait.assertTrue(() -> moveManager.getNumberOfFolders() <= 2);
}
} else {
backupServer.stop();
@ -2427,6 +2431,10 @@ public class FailoverTest extends FailoverTestBase {
// no-op
}
protected void decrementActivationSequenceForForceRestartOf(TestableServer liveServer) throws Exception {
// no-op
}
protected ClientSession sendAndConsume(final ClientSessionFactory sf1, final boolean createQueue) throws Exception {
ClientSession session = createSession(sf1, false, true, true);

View File

@ -46,7 +46,6 @@ import org.apache.activemq.artemis.core.remoting.impl.invm.InVMRegistry;
import org.apache.activemq.artemis.core.server.NodeManager;
import org.apache.activemq.artemis.core.server.cluster.ha.HAPolicy;
import org.apache.activemq.artemis.core.server.cluster.ha.ReplicatedPolicy;
import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationPrimaryPolicy;
import org.apache.activemq.artemis.core.server.impl.ActiveMQServerImpl;
import org.apache.activemq.artemis.core.server.impl.InVMNodeManager;
import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager;
@ -88,6 +87,10 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
protected NodeManager nodeManager;
protected NodeManager backupNodeManager;
protected DistributedPrimitiveManagerConfiguration managerConfiguration;
protected boolean startBackupServer = true;
@Override
@ -173,6 +176,10 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
return new InVMNodeManager(false);
}
protected NodeManager createNodeManager(Configuration configuration) throws Exception {
return new InVMNodeManager(false, configuration.getNodeManagerLockLocation());
}
protected void createConfigs() throws Exception {
nodeManager = createNodeManager();
TransportConfiguration liveConnector = getConnectorTransportConfiguration(true);
@ -211,13 +218,14 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
backupConfig.setBindingsDirectory(getBindingsDir(0, true)).setJournalDirectory(getJournalDir(0, true)).setPagingDirectory(getPageDir(0, true)).setLargeMessagesDirectory(getLargeMessagesDir(0, true)).setSecurityEnabled(false);
setupHAPolicyConfiguration();
nodeManager = createReplicatedBackupNodeManager(backupConfig);
backupNodeManager = createReplicatedBackupNodeManager(backupConfig);
backupServer = createTestableServer(backupConfig);
backupServer = createTestableServer(backupConfig, backupNodeManager);
liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true));
liveServer = createTestableServer(liveConfig);
nodeManager = createNodeManager(liveConfig);
liveServer = createTestableServer(liveConfig, nodeManager);
if (supportsRetention()) {
liveServer.getServer().getConfiguration().setJournalRetentionDirectory(getJournalDir(0, false) + "_retention");
@ -233,7 +241,7 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
backupConfig = createDefaultInVMConfig();
liveConfig = createDefaultInVMConfig();
DistributedPrimitiveManagerConfiguration managerConfiguration =
managerConfiguration =
new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(),
Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString()));
@ -242,13 +250,14 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
backupConfig.setBindingsDirectory(getBindingsDir(0, true)).setJournalDirectory(getJournalDir(0, true)).setPagingDirectory(getPageDir(0, true)).setLargeMessagesDirectory(getLargeMessagesDir(0, true)).setSecurityEnabled(false);
setupHAPolicyConfiguration();
nodeManager = createReplicatedBackupNodeManager(backupConfig);
backupNodeManager = createReplicatedBackupNodeManager(backupConfig);
backupServer = createTestableServer(backupConfig);
backupServer = createTestableServer(backupConfig, backupNodeManager);
liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true));
liveServer = createTestableServer(liveConfig);
nodeManager = createNodeManager(liveConfig);
liveServer = createTestableServer(liveConfig, nodeManager);
}
protected void setupHAPolicyConfiguration() {
@ -272,8 +281,6 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
HAPolicy policy = server.getServer().getHAPolicy();
if (policy instanceof ReplicatedPolicy) {
((ReplicatedPolicy) policy).setCheckForLiveServer(true);
} else if (policy instanceof ReplicationPrimaryPolicy) {
Assert.assertTrue("Adapting won't work for the current configuration", ((ReplicationPrimaryPolicy) policy).isCheckForLiveServer());
}
}
@ -294,6 +301,7 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
nodeManager = null;
backupNodeManager = null;
try {
ServerSocket serverSocket = new ServerSocket(61616);
serverSocket.close();

View File

@ -182,8 +182,6 @@ public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase {
break;
case PluggableQuorumReplication:
haPolicyConfiguration = ReplicationBackupPolicyConfiguration.withDefault()
.setVoteRetries(1)
.setVoteRetryWait(1000)
.setDistributedManagerConfiguration(getOrCreatePluggableQuorumConfiguration())
.setGroupName(getNodeGroupName() != null ? (getNodeGroupName() + "-" + i) : null);
break;
@ -278,12 +276,14 @@ public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase {
return addClientSession(sf.createSession(xa, autoCommitSends, autoCommitAcks));
}
protected void waitForDistribution(SimpleString address, ActiveMQServer server, int messageCount) throws Exception {
protected boolean waitForDistribution(SimpleString address, ActiveMQServer server, int messageCount) throws Exception {
ActiveMQServerLogger.LOGGER.debug("waiting for distribution of messages on server " + server);
Queue q = (Queue) server.getPostOffice().getBinding(address).getBindable();
Wait.waitFor(() -> getMessageCount(q) >= messageCount);
return Wait.waitFor(() -> getMessageCount(q) >= messageCount);
}
}

View File

@ -156,12 +156,13 @@ public class ReplicaTimeoutTest extends ActiveMQTestBase {
liveConfig.setBindingsDirectory(getBindingsDir(0, false)).setJournalDirectory(getJournalDir(0, false)).
setPagingDirectory(getPageDir(0, false)).setLargeMessagesDirectory(getLargeMessagesDir(0, false)).setSecurityEnabled(false);
NodeManager nodeManager = createReplicatedBackupNodeManager(backupConfig);
NodeManager replicatedBackupNodeManager = createReplicatedBackupNodeManager(backupConfig);
backupServer = createTestableServer(backupConfig, nodeManager);
backupServer = createTestableServer(backupConfig, replicatedBackupNodeManager);
liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true));
NodeManager nodeManager = createReplicatedBackupNodeManager(liveConfig);
liveServer = createTestableServer(liveConfig, nodeManager);
final TestableServer theBackup = backupServer;

View File

@ -75,7 +75,10 @@ public class ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated
sendCrashReceive();
Wait.assertTrue(backupServers.get(0)::isActive, 5000, 10);
Wait.assertTrue(backupServers.get(1)::isActive, 5000, 10);
waitForTopology(backupServers.get(0).getServer(), liveServers.size(), 2);
waitForTopology(backupServers.get(1).getServer(), liveServers.size(), 2);
sendCrashBackupReceive();
}
@ -115,6 +118,14 @@ public class ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated
}
protected void sendCrashBackupReceive() throws Exception {
// make sure bindings are ready before sending messages because we verify strict load balancing in waitForDistribution
this.waitForBindings(backupServers.get(0).getServer(), ADDRESS.toString(), false, 1, 0, 2000);
this.waitForBindings(backupServers.get(1).getServer(), ADDRESS.toString(), false, 1, 0, 2000);
ServerLocator locator0 = getBackupServerLocator(0);
ServerLocator locator1 = getBackupServerLocator(1);
@ -138,8 +149,8 @@ public class ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated
producer.close();
waitForDistribution(ADDRESS, backupServers.get(0).getServer(), 100);
waitForDistribution(ADDRESS, backupServers.get(1).getServer(), 100);
assertTrue(waitForDistribution(ADDRESS, backupServers.get(0).getServer(), 100));
assertTrue(waitForDistribution(ADDRESS, backupServers.get(1).getServer(), 100));
List<TestableServer> toCrash = new ArrayList<>();
for (TestableServer backupServer : backupServers) {

View File

@ -24,7 +24,6 @@ import org.apache.activemq.artemis.api.core.ActiveMQException;
import org.apache.activemq.artemis.api.core.Interceptor;
import org.apache.activemq.artemis.api.core.TransportConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.core.protocol.core.Packet;
import org.apache.activemq.artemis.core.protocol.core.impl.PacketImpl;
import org.apache.activemq.artemis.spi.core.protocol.RemotingConnection;
@ -90,7 +89,6 @@ public class PluggableQuorumBackupAuthenticationTest extends FailoverTestBase {
@Override
protected void setupHAPolicyConfiguration() {
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
}

View File

@ -17,8 +17,7 @@
package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.tests.integration.cluster.failover.BackupSyncJournalTest;
public class PluggableQuorumBackupSyncJournalTest extends BackupSyncJournalTest {
@ -30,8 +29,6 @@ public class PluggableQuorumBackupSyncJournalTest extends BackupSyncJournalTest
@Override
protected void setupHAPolicyConfiguration() {
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration())
.setCheckForLiveServer(true);
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration())
.setMaxSavedReplicatedJournalsSize(2)
.setAllowFailBack(true);

View File

@ -29,18 +29,24 @@ import org.apache.activemq.artemis.api.core.QueueConfiguration;
import org.apache.activemq.artemis.api.core.TransportConfiguration;
import org.apache.activemq.artemis.api.core.client.ClientSession;
import org.apache.activemq.artemis.component.WebServerComponent;
import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.core.server.ActiveMQServer;
import org.apache.activemq.artemis.core.server.NodeManager;
import org.apache.activemq.artemis.core.server.ServiceComponent;
import org.apache.activemq.artemis.dto.AppDTO;
import org.apache.activemq.artemis.dto.WebServerDTO;
import org.apache.activemq.artemis.quorum.MutableLong;
import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager;
import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTest;
import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer;
import org.apache.activemq.artemis.tests.util.Wait;
import org.jboss.logging.Logger;
import org.junit.Assert;
import org.junit.Test;
public class PluggableQuorumNettyNoGroupNameReplicatedFailoverTest extends FailoverTest {
private static final Logger log = Logger.getLogger(PluggableQuorumReplicatedLargeMessageFailoverTest.class);
protected void beforeWaitForRemoteBackupSynchronization() {
}
@ -170,8 +176,6 @@ public class PluggableQuorumNettyNoGroupNameReplicatedFailoverTest extends Failo
@Override
protected void setupHAPolicyConfiguration() {
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration())
.setCheckForLiveServer(true);
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration())
.setMaxSavedReplicatedJournalsSize(2)
.setAllowFailBack(true);
@ -210,4 +214,30 @@ public class PluggableQuorumNettyNoGroupNameReplicatedFailoverTest extends Failo
}
super.crash(sessions);
}
@Override
protected void decrementActivationSequenceForForceRestartOf(TestableServer testableServer) throws Exception {
doDecrementActivationSequenceForForceRestartOf(log, nodeManager, managerConfiguration);
}
public static void doDecrementActivationSequenceForForceRestartOf(Logger log, NodeManager nodeManager, DistributedPrimitiveManagerConfiguration distributedPrimitiveManagerConfiguration) throws Exception {
nodeManager.start();
long localActivation = nodeManager.readNodeActivationSequence();
// file based
FileBasedPrimitiveManager fileBasedPrimitiveManager = new FileBasedPrimitiveManager(distributedPrimitiveManagerConfiguration.getProperties());
fileBasedPrimitiveManager.start();
try {
MutableLong mutableLong = fileBasedPrimitiveManager.getMutableLong(nodeManager.getNodeId().toString());
if (!mutableLong.compareAndSet(localActivation + 1, localActivation)) {
throw new Exception("Failed to decrement coordinated activation sequence to:" + localActivation + ", not +1 : " + mutableLong.get());
}
log.warn("Intentionally decrementing coordinated activation sequence for test, may result is lost data");
} finally {
fileBasedPrimitiveManager.stop();
nodeManager.stop();
}
}
}
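For orientation, the coordinated counter that this helper rewinds can also be read back directly. A minimal sketch against the same DistributedPrimitiveManager API, where the caller supplies the manager class name, its properties, and the broker's node id:

import java.util.Map;

import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
import org.apache.activemq.artemis.quorum.MutableLong;

public final class ReadCoordinatedSequence {
   public static long read(String className, Map<String, String> properties, String nodeId) throws Exception {
      DistributedPrimitiveManager manager = DistributedPrimitiveManager.newInstanceOf(className, properties);
      manager.start();
      try {
         MutableLong sequence = manager.getMutableLong(nodeId);
         return sequence.get(); // a negative value marks an uncommitted claim
      } finally {
         manager.stop();
      }
   }
}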

View File

@ -17,7 +17,6 @@
package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.tests.integration.cluster.failover.NettyReplicationStopTest;
public class PluggableQuorumNettyReplicationStopTest extends NettyReplicationStopTest {
@ -29,7 +28,6 @@ public class PluggableQuorumNettyReplicationStopTest extends NettyReplicationSto
@Override
protected void setupHAPolicyConfiguration() {
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
}

View File

@ -17,7 +17,6 @@
package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.tests.integration.cluster.failover.PageCleanupWhileReplicaCatchupTest;
public class PluggableQuorumPageCleanupWhileReplicaCatchupTest extends PageCleanupWhileReplicaCatchupTest {
@ -29,7 +28,6 @@ public class PluggableQuorumPageCleanupWhileReplicaCatchupTest extends PageClean
@Override
protected void setupHAPolicyConfiguration() {
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
}

View File

@ -48,7 +48,6 @@ public class PluggableQuorumReplicaTimeoutTest extends ReplicaTimeoutTest {
managerConfiguration, managerConfiguration);
ReplicationPrimaryPolicyConfiguration primaryConfiguration = ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration());
primaryConfiguration.setInitialReplicationSyncTimeout(1000);
primaryConfiguration.setCheckForLiveServer(true);
ReplicationBackupPolicyConfiguration backupConfiguration = ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration());
backupConfiguration.setInitialReplicationSyncTimeout(1000);
backupConfiguration.setMaxSavedReplicatedJournalsSize(2)

View File

@ -18,11 +18,15 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum;
import org.apache.activemq.artemis.api.core.client.ClientSession;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.tests.integration.cluster.failover.LargeMessageFailoverTest;
import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer;
import org.jboss.logging.Logger;
import static org.apache.activemq.artemis.tests.integration.cluster.failover.quorum.PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.doDecrementActivationSequenceForForceRestartOf;
public class PluggableQuorumReplicatedLargeMessageFailoverTest extends LargeMessageFailoverTest {
private static final Logger log = Logger.getLogger(PluggableQuorumReplicatedLargeMessageFailoverTest.class);
@Override
protected void createConfigs() throws Exception {
createPluggableReplicatedConfigs();
@ -30,7 +34,6 @@ public class PluggableQuorumReplicatedLargeMessageFailoverTest extends LargeMess
@Override
protected void setupHAPolicyConfiguration() {
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
}
@ -58,4 +61,8 @@ public class PluggableQuorumReplicatedLargeMessageFailoverTest extends LargeMess
super.crash(sessions);
}
@Override
protected void decrementActivationSequenceForForceRestartOf(TestableServer liveServer) throws Exception {
doDecrementActivationSequenceForForceRestartOf(log, nodeManager, managerConfiguration);
}
}

View File

@ -18,7 +18,6 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum;
import org.apache.activemq.artemis.api.core.client.ClientSession;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.tests.integration.cluster.util.BackupSyncDelay;
import org.junit.After;
import org.junit.Before;
@ -66,7 +65,6 @@ public class PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest extends
@Override
protected void setupHAPolicyConfiguration() {
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration())
.setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
}

View File

@ -17,7 +17,6 @@
package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.tests.integration.cluster.failover.PagingFailoverTest;
public class PluggableQuorumReplicatedPagingFailoverTest extends PagingFailoverTest {
@ -29,7 +28,6 @@ public class PluggableQuorumReplicatedPagingFailoverTest extends PagingFailoverT
@Override
protected void setupHAPolicyConfiguration() {
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
}
}

View File

@ -520,6 +520,11 @@ public class ActiveMQServerControlUsingCoreTest extends ActiveMQServerControlTes
return (String) proxy.retrieveAttributeValue("nodeID");
}
@Override
public long getActivationSequence() {
return (Long) proxy.retrieveAttributeValue("activationSequence");
}
@Override
public String getManagementAddress() {
return (String) proxy.retrieveAttributeValue("managementAddress");
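The new attribute is also reachable over JMX. A hedged sketch; the service URL, the broker ObjectName, and the capitalized attribute name are assumptions about the deployment, not taken from this commit:

import javax.management.MBeanServerConnection;
import javax.management.ObjectName;
import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

public final class ActivationSequenceJmx {
   public static void main(String[] args) throws Exception {
      // illustrative endpoint and ObjectName; adjust to the running instance
      JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:1099/jmxrmi");
      try (JMXConnector connector = JMXConnectorFactory.connect(url)) {
         MBeanServerConnection connection = connector.getMBeanServerConnection();
         ObjectName broker = new ObjectName("org.apache.activemq.artemis:broker=\"localhost\"");
         Long sequence = (Long) connection.getAttribute(broker, "ActivationSequence");
         System.out.println("activation sequence = " + sequence);
      }
   }
}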

View File

@ -45,8 +45,6 @@ public class PluggableQuorumReplicationFlowControlTest extends SharedNothingRepl
ReplicationBackupPolicyConfiguration haPolicy = ReplicationBackupPolicyConfiguration.withDefault();
haPolicy.setDistributedManagerConfiguration(managerConfiguration);
haPolicy.setClusterName("cluster");
// fail fast in order to let the backup quickly retry syncing with the primary
haPolicy.setVoteRetries(0);
return haPolicy;
}
@ -54,7 +52,6 @@ public class PluggableQuorumReplicationFlowControlTest extends SharedNothingRepl
protected HAPolicyConfiguration createReplicationLiveConfiguration() {
ReplicationPrimaryPolicyConfiguration haPolicy = ReplicationPrimaryPolicyConfiguration.withDefault();
haPolicy.setDistributedManagerConfiguration(managerConfiguration);
haPolicy.setCheckForLiveServer(false);
return haPolicy;
}
}

View File

@ -18,7 +18,6 @@ package org.apache.activemq.artemis.tests.integration.replication;
import org.apache.activemq.artemis.api.core.TransportConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
@ -30,7 +29,7 @@ public class PluggableQuorumReplicationOrderTest extends ReplicationOrderTest {
@Parameterized.Parameter
public boolean useNetty;
@Parameterized.Parameters(name = "useNetty={1}")
@Parameterized.Parameters(name = "useNetty={0}")
public static Iterable<Object[]> getParams() {
return asList(new Object[][]{{false}, {true}});
}
@ -42,8 +41,6 @@ public class PluggableQuorumReplicationOrderTest extends ReplicationOrderTest {
@Override
protected void setupHAPolicyConfiguration() {
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration())
.setCheckForLiveServer(true);
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration())
.setMaxSavedReplicatedJournalsSize(2)
.setAllowFailBack(true);

View File

@ -18,14 +18,37 @@ package org.apache.activemq.artemis.tests.integration.replication;
import java.io.IOException;
import java.util.Collections;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.activemq.artemis.api.core.Message;
import org.apache.activemq.artemis.api.core.QueueConfiguration;
import org.apache.activemq.artemis.api.core.RoutingType;
import org.apache.activemq.artemis.api.core.client.ClientConsumer;
import org.apache.activemq.artemis.api.core.client.ClientMessage;
import org.apache.activemq.artemis.api.core.client.ClientProducer;
import org.apache.activemq.artemis.api.core.client.ClientSession;
import org.apache.activemq.artemis.api.core.client.ClientSessionFactory;
import org.apache.activemq.artemis.api.core.client.ServerLocator;
import org.apache.activemq.artemis.core.client.impl.ServerLocatorImpl;
import org.apache.activemq.artemis.core.config.Configuration;
import org.apache.activemq.artemis.core.config.HAPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.core.server.ActivateCallback;
import org.apache.activemq.artemis.core.server.ActiveMQServer;
import org.apache.activemq.artemis.core.server.ActiveMQServers;
import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager;
import org.apache.activemq.artemis.quorum.DistributedLock;
import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
import org.apache.activemq.artemis.quorum.MutableLong;
import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager;
import org.apache.activemq.artemis.tests.util.Wait;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
public class PluggableQuorumReplicationTest extends SharedNothingReplicationTest {
@ -43,7 +66,6 @@ public class PluggableQuorumReplicationTest extends SharedNothingReplicationTest
protected HAPolicyConfiguration createReplicationLiveConfiguration() {
ReplicationPrimaryPolicyConfiguration haPolicy = ReplicationPrimaryPolicyConfiguration.withDefault();
haPolicy.setDistributedManagerConfiguration(managerConfiguration);
haPolicy.setCheckForLiveServer(false);
return haPolicy;
}
@ -55,4 +77,616 @@ public class PluggableQuorumReplicationTest extends SharedNothingReplicationTest
return haPolicy;
}
@Test
public void testUnReplicatedOrderedTransition() throws Exception {
// start live
final Configuration liveConfiguration = createLiveConfiguration();
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
liveServer.setIdentity("LIVE");
liveServer.start();
Wait.waitFor(liveServer::isStarted);
ServerLocator locator = ServerLocatorImpl.newLocator("(tcp://localhost:61616,tcp://localhost:61617)?ha=true");
locator.setCallTimeout(60_000L);
locator.setConnectionTTL(60_000L);
ClientSessionFactory csf = locator.createSessionFactory();
ClientSession clientSession = csf.createSession();
clientSession.createQueue(new QueueConfiguration("slow").setRoutingType(RoutingType.ANYCAST));
clientSession.close();
// start backup
Configuration backupConfiguration = createBackupConfiguration();
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
backupServer.setIdentity("BACKUP");
backupServer.start();
Wait.waitFor(backupServer::isStarted);
waitForTopology(liveServer, 1, 1, 30000);
waitForTopology(backupServer, 1, 1, 30000);
liveServer.stop();
// backup will take over and run un replicated
csf = locator.createSessionFactory();
clientSession = csf.createSession();
clientSession.createQueue(new QueueConfiguration("slow_un_replicated").setRoutingType(RoutingType.ANYCAST));
clientSession.close();
waitForTopology(backupServer, 1, 0, 30000);
assertTrue(Wait.waitFor(() -> 2L == backupServer.getNodeManager().getNodeActivationSequence()));
backupServer.stop(false);
// now only the backup should be able to start, as it has run un replicated
liveServer.start();
Wait.assertFalse(liveServer::isActive);
liveServer.stop();
// restart backup
backupServer.start();
Wait.waitFor(backupServer::isStarted);
assertEquals(3L, backupServer.getNodeManager().getNodeActivationSequence());
csf = locator.createSessionFactory();
clientSession = csf.createSession();
clientSession.createQueue(new QueueConfiguration("backup_as_un_replicated").setRoutingType(RoutingType.ANYCAST));
clientSession.close();
// verify the live restarts as a backup to the restarted backupServer that has taken on the live role; no fail-back
liveServer.start();
csf = locator.createSessionFactory();
clientSession = csf.createSession();
clientSession.createQueue(new QueueConfiguration("backup_as_replicated").setRoutingType(RoutingType.ANYCAST));
clientSession.close();
assertTrue(Wait.waitFor(liveServer::isReplicaSync));
assertTrue(Wait.waitFor(() -> 3L == liveServer.getNodeManager().getNodeActivationSequence()));
backupServer.stop(true);
waitForTopology(liveServer, 1, 0, 30000);
assertTrue(Wait.waitFor(() -> 4L == liveServer.getNodeManager().getNodeActivationSequence()));
liveServer.stop(true);
clientSession.close();
locator.close();
}
@Test
public void testBackupFailoverAndPrimaryFailback() throws Exception {
final int timeout = (int) TimeUnit.SECONDS.toMillis(30);
// start live
Configuration liveConfiguration = createLiveConfiguration();
ActiveMQServer primaryInstance = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
primaryInstance.setIdentity("PRIMARY");
primaryInstance.start();
// primary initially UN REPLICATED
Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence());
// start backup
Configuration backupConfiguration = createBackupConfiguration();
((ReplicationBackupPolicyConfiguration)backupConfiguration.getHAPolicyConfiguration()).setAllowFailBack(true);
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
backupServer.setIdentity("BACKUP");
backupServer.start();
Wait.waitFor(backupServer::isStarted);
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout);
// primary REPLICATED, backup matches (has replicated) activation sequence
Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence());
Assert.assertEquals(1L, backupServer.getNodeManager().getNodeActivationSequence());
primaryInstance.stop();
// backup UN REPLICATED (new version)
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> 2L == backupServer.getNodeManager().getNodeActivationSequence(), timeout);
// just to let the console logging breathe!
TimeUnit.MILLISECONDS.sleep(100);
// restart primary that will request failback
ActiveMQServer restartedPrimaryForFailBack = primaryInstance; //addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
restartedPrimaryForFailBack.start();
// first step is backup getting replicated
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout);
// the restarted primary will run un replicated (incrementing the sequence) while the backup restarts to revert to the backup role.
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> {
try {
return 3L == restartedPrimaryForFailBack.getNodeManager().getNodeActivationSequence();
} catch (NullPointerException ok) {
return false;
}
}, timeout);
// the backup should then resume with an insync replica view of that version
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> restartedPrimaryForFailBack.isReplicaSync(), timeout);
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout);
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> 3L == backupServer.getNodeManager().getNodeActivationSequence(), timeout);
// just to let the console logging breathe!
TimeUnit.MILLISECONDS.sleep(100);
// stop backup to verify primary goes on with new sequence as un replicated
backupServer.stop();
// just to let the console logging breathe!
TimeUnit.MILLISECONDS.sleep(100);
// live goes un replicated
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> {
try {
return 4L == restartedPrimaryForFailBack.getNodeManager().getNodeActivationSequence();
} catch (NullPointerException ok) {
return false;
}
}, timeout);
restartedPrimaryForFailBack.stop();
}
@Test
public void testPrimaryIncrementActivationSequenceOnUnReplicated() throws Exception {
final int timeout = (int) TimeUnit.SECONDS.toMillis(30);
// start live
Configuration liveConfiguration = createLiveConfiguration();
ActiveMQServer primaryInstance = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
primaryInstance.setIdentity("PRIMARY");
primaryInstance.start();
// primary UN REPLICATED
Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence());
// start backup
Configuration backupConfiguration = createBackupConfiguration();
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
backupServer.setIdentity("BACKUP");
backupServer.start();
Wait.waitFor(backupServer::isStarted);
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout);
// primary REPLICATED, backup matches (has replicated) activation sequence
Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence());
Assert.assertEquals(1L, backupServer.getNodeManager().getNodeActivationSequence());
// transition to un replicated once backup goes away
backupServer.stop();
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> 2L == primaryInstance.getNodeManager().getNodeActivationSequence(), timeout);
// done
primaryInstance.stop();
}
@Test
public void testBackupStartsFirst() throws Exception {
// start backup
Configuration backupConfiguration = createBackupConfiguration();
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
backupServer.setIdentity("BACKUP");
backupServer.start();
// start live
final Configuration liveConfiguration = createLiveConfiguration();
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
liveServer.setIdentity("LIVE");
liveServer.start();
Wait.waitFor(liveServer::isStarted);
assertTrue(Wait.waitFor(backupServer::isStarted));
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
assertTrue(liveServer.isReplicaSync());
}
@Test
public void testBackupOutOfSequenceReleasesLock() throws Exception {
// start backup
Configuration backupConfiguration = createBackupConfiguration();
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
backupServer.setIdentity("BACKUP");
backupServer.start();
// start live
final Configuration liveConfiguration = createLiveConfiguration();
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
liveServer.setIdentity("LIVE");
liveServer.start();
Wait.waitFor(liveServer::isStarted);
assertTrue(Wait.waitFor(backupServer::isStarted));
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
assertTrue(liveServer.isReplicaSync());
backupServer.stop();
TimeUnit.SECONDS.sleep(1);
liveServer.stop();
// backup can get the lock but does not have the sequence to start; it will try to be a backup
backupServer.start();
// live server should be active
liveServer.start();
Wait.waitFor(liveServer::isStarted);
assertTrue(Wait.waitFor(backupServer::isStarted));
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
assertTrue(liveServer.isReplicaSync());
}
@Test
public void testBackupOutOfSequenceCheckActivationSequence() throws Exception {
// start backup
Configuration backupConfiguration = createBackupConfiguration();
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
backupServer.setIdentity("BACKUP");
backupServer.start();
// start live
final Configuration liveConfiguration = createLiveConfiguration();
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
liveServer.setIdentity("LIVE");
liveServer.start();
Wait.waitFor(liveServer::isStarted);
assertTrue(Wait.waitFor(backupServer::isStarted));
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
assertTrue(liveServer.isReplicaSync());
backupServer.stop();
TimeUnit.SECONDS.sleep(1);
final String coordinatedId = liveServer.getNodeID().toString();
liveServer.stop();
// backup can get the lock but does not have the sequence to start; it will try to be a backup
// to verify it can short-circuit with a dirty read, we grab the lock for a little while
DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager.newInstanceOf(
managerConfiguration.getClassName(),
managerConfiguration.getProperties());
distributedPrimitiveManager.start();
final DistributedLock lock = distributedPrimitiveManager.getDistributedLock(coordinatedId);
assertTrue(lock.tryLock());
CountDownLatch preActivate = new CountDownLatch(1);
backupServer.registerActivateCallback(new ActivateCallback() {
@Override
public void preActivate() {
ActivateCallback.super.preActivate();
preActivate.countDown();
}
});
backupServer.start();
// it should be able to do a dirty read of the sequence id and not have to wait to get a lock
assertTrue(preActivate.await(1, TimeUnit.SECONDS));
// release the lock
distributedPrimitiveManager.stop();
// live server should be active
liveServer.start();
Wait.waitFor(liveServer::isStarted);
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
assertTrue(liveServer.isReplicaSync());
}
@Test
public void testSelfRepairPrimary() throws Exception {
// start live
final Configuration liveConfiguration = createLiveConfiguration();
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
liveServer.setIdentity("LIVE");
liveServer.start();
final String coordinatedId = liveServer.getNodeID().toString();
Wait.waitFor(liveServer::isStarted);
liveServer.stop();
liveServer.start();
Wait.waitFor(liveServer::isStarted);
Assert.assertEquals(2, liveServer.getNodeManager().getNodeActivationSequence());
liveServer.stop();
// backup can get the lock but does not have the sequence to start; it will try to be a backup
// to verify it can short-circuit with a dirty read, we grab the lock for a little while
DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager
.newInstanceOf(managerConfiguration.getClassName(), managerConfiguration.getProperties());
distributedPrimitiveManager.start();
try (DistributedLock lock = distributedPrimitiveManager.getDistributedLock(coordinatedId)) {
assertTrue(lock.tryLock());
distributedPrimitiveManager.getMutableLong(coordinatedId).compareAndSet(2, -2);
}
liveServer.start();
Wait.waitFor(liveServer::isStarted);
Assert.assertEquals(2, liveServer.getNodeManager().getNodeActivationSequence());
Assert.assertEquals(2, distributedPrimitiveManager.getMutableLong(coordinatedId).get());
distributedPrimitiveManager.stop();
Configuration backupConfiguration = createBackupConfiguration();
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
backupServer.setIdentity("BACKUP");
backupServer.start();
Wait.waitFor(backupServer::isReplicaSync);
Assert.assertEquals(2, backupServer.getNodeManager().getNodeActivationSequence());
backupServer.stop();
}
@Test
public void testPrimaryPeers() throws Exception {
final String PEER_NODE_ID = "some-shared-id-001";
final Configuration liveConfiguration = createLiveConfiguration();
((ReplicationPrimaryPolicyConfiguration)liveConfiguration.getHAPolicyConfiguration()).setCoordinationId(PEER_NODE_ID);
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
liveServer.setIdentity("LIVE");
liveServer.start();
Wait.waitFor(liveServer::isStarted);
ServerLocator locator = ServerLocatorImpl.newLocator("(tcp://localhost:61616,tcp://localhost:61617)?ha=true");
locator.setCallTimeout(60_000L);
locator.setConnectionTTL(60_000L);
final ClientSessionFactory keepLocatorAliveSLF = locator.createSessionFactory();
ClientSessionFactory csf = locator.createSessionFactory();
sendTo(csf, "live_un_replicated");
csf.close();
// start peer; it will act as backup
Configuration peerLiveConfiguration = createBackupConfiguration(); // to get acceptor and locator ports that won't clash
peerLiveConfiguration.setHAPolicyConfiguration(createReplicationLiveConfiguration());
((ReplicationPrimaryPolicyConfiguration)peerLiveConfiguration.getHAPolicyConfiguration()).setCoordinationId(PEER_NODE_ID);
peerLiveConfiguration.setName("localhost::live-peer");
ActiveMQServer livePeerServer = addServer(ActiveMQServers.newActiveMQServer(peerLiveConfiguration));
livePeerServer.setIdentity("LIVE-PEER");
livePeerServer.start();
Wait.waitFor(livePeerServer::isStarted);
waitForTopology(liveServer, 1, 1, 30000);
waitForTopology(livePeerServer, 1, 1, 30000);
liveServer.stop();
// livePeerServer will take over and run un replicated
csf = locator.createSessionFactory();
receiveFrom(csf, "live_un_replicated");
sendTo(csf, "peer_un_replicated");
csf.close();
waitForTopology(livePeerServer, 1, 0, 30000);
assertTrue(Wait.waitFor(() -> 2L == livePeerServer.getNodeManager().getNodeActivationSequence()));
livePeerServer.stop(false);
liveServer.start();
Wait.assertTrue(() -> !liveServer.isActive());
// restart backup
livePeerServer.start();
Wait.waitFor(livePeerServer::isStarted);
assertEquals(3L, livePeerServer.getNodeManager().getNodeActivationSequence());
csf = locator.createSessionFactory();
receiveFrom(csf, "peer_un_replicated");
sendTo(csf, "backup_as_un_replicated");
csf.close();
// verify the live restarts as a backup to the restarted livePeerServer that has taken on the live role
liveServer.start();
csf = locator.createSessionFactory();
receiveFrom(csf, "backup_as_un_replicated");
sendTo(csf, "backup_as_replicated");
csf.close();
assertTrue(Wait.waitFor(liveServer::isReplicaSync));
assertTrue(Wait.waitFor(() -> 3L == liveServer.getNodeManager().getNodeActivationSequence()));
waitForTopology(liveServer, 1, 1, 30000);
waitForTopology(livePeerServer, 1, 1, 30000);
livePeerServer.stop(true);
assertTrue(Wait.waitFor(() -> 4L == liveServer.getNodeManager().getNodeActivationSequence()));
csf = locator.createSessionFactory();
receiveFrom(csf, "backup_as_replicated");
csf.close();
waitForTopology(liveServer, 1, 0, 30000);
liveServer.stop(true);
keepLocatorAliveSLF.close();
locator.close();
}
@Test
public void testUnavailableSelfHeal() throws Exception {
// start backup
Configuration backupConfiguration = createBackupConfiguration();
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
backupServer.setIdentity("BACKUP");
backupServer.start();
// start live
final Configuration liveConfiguration = createLiveConfiguration();
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
liveServer.setIdentity("LIVE");
liveServer.start();
Wait.waitFor(liveServer::isStarted);
assertTrue(Wait.waitFor(backupServer::isStarted));
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
assertTrue(liveServer.isReplicaSync());
final String coordinatedId = liveServer.getNodeID().toString();
backupServer.stop();
TimeUnit.MILLISECONDS.sleep(500);
liveServer.stop();
// some manual intervention to force an unavailable state
// simulate the live failing its local activation sequence update on an un replicated run when the backup stops
DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager.newInstanceOf(managerConfiguration.getClassName(), managerConfiguration.getProperties());
distributedPrimitiveManager.start();
final MutableLong activationSequence = distributedPrimitiveManager.getMutableLong(coordinatedId);
Assert.assertTrue(activationSequence.compareAndSet(2, -2));
// case 1: the live appeared to fail writing 2 locally, but the local write actually succeeded
// activation should delay pending resolution of the uncommitted claim
backupServer.start();
// live server should activate after self healing its outstanding claim
liveServer.start();
Wait.waitFor(liveServer::isStarted);
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
assertTrue(liveServer.isReplicaSync());
}
@Test
public void testUnavailableAdminIntervention() throws Exception {
// start backup
Configuration backupConfiguration = createBackupConfiguration();
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
backupServer.setIdentity("BACKUP");
backupServer.start();
// start live
final Configuration liveConfiguration = createLiveConfiguration();
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
liveServer.setIdentity("LIVE");
liveServer.start();
Wait.waitFor(liveServer::isStarted);
assertTrue(Wait.waitFor(backupServer::isStarted));
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
assertTrue(liveServer.isReplicaSync());
final String coordinatedId = liveServer.getNodeID().toString();
System.err.println("coodr id: " + coordinatedId);
backupServer.stop();
TimeUnit.MILLISECONDS.sleep(500);
liveServer.stop();
// some manual intervention to force an unavailable state
// simulate the live failing its local activation sequence update on an un replicated run when the backup stops
DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager.newInstanceOf(
managerConfiguration.getClassName(),
managerConfiguration.getProperties());
distributedPrimitiveManager.start();
final MutableLong coordinatedActivationSequence = distributedPrimitiveManager.getMutableLong(coordinatedId);
Assert.assertTrue(coordinatedActivationSequence.compareAndSet(2, -2));
// case 2: the live appeared to fail writing 2 locally, and the write really did fail;
// put 1 back in the local activation sequence of the live to simulate it
FileLockNodeManager fileLockNodeManager = new FileLockNodeManager(liveConfiguration.getNodeManagerLockLocation().getAbsoluteFile(), true);
fileLockNodeManager.start();
assertEquals(2, fileLockNodeManager.readNodeActivationSequence());
fileLockNodeManager.writeNodeActivationSequence(1);
fileLockNodeManager.stop();
// should delay pending resolution of the uncommitted claim
backupServer.start();
CountDownLatch liveStarting = new CountDownLatch(1);
// should delay pending resolution of the uncommitted claim
// IMPORTANT: primary activation runs on the caller's start thread, so we need another thread here
final Thread liveServerStarterThread = new Thread(() -> {
liveStarting.countDown();
try {
liveServer.start();
} catch (Throwable e) {
e.printStackTrace();
}
});
liveServerStarterThread.start();
liveStarting.await();
TimeUnit.MILLISECONDS.sleep(500);
// both are candidates and one of them failed to commit the claim
// let them compete on retry
Assert.assertTrue(coordinatedActivationSequence.compareAndSet(-2, 1));
// one of the two can activate
Wait.waitFor(() -> liveServer.isStarted() || backupServer.isStarted());
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
assertTrue(liveServer.isReplicaSync());
assertEquals(2, backupServer.getNodeManager().getNodeActivationSequence());
assertEquals(2, liveServer.getNodeManager().getNodeActivationSequence());
}
private void sendTo(ClientSessionFactory clientSessionFactory, String addr) throws Exception {
ClientSession clientSession = clientSessionFactory.createSession(true, true);
clientSession.createQueue(new QueueConfiguration(addr).setRoutingType(RoutingType.ANYCAST).setDurable(true));
ClientProducer producer = clientSession.createProducer(addr);
ClientMessage message = clientSession.createMessage(true);
message.putStringProperty("K", addr);
message.putLongProperty("delay", 0L); // so slow interceptor does not get us
producer.send(message);
producer.close();
clientSession.close();
}
private void receiveFrom(ClientSessionFactory clientSessionFactory, String addr) throws Exception {
ClientSession clientSession = clientSessionFactory.createSession(true, true);
clientSession.start();
ClientConsumer consumer = clientSession.createConsumer(addr);
Message message = consumer.receive(4000);
assertNotNull(message);
assertEquals(addr, message.getStringProperty("K"));
consumer.close();
clientSession.close();
}
}
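Taken together, these tests exercise a claim-then-commit pattern over the coordinated sequence: a negative coordinated value marks an in-flight claim, and matching local and coordinated values mark a committed activation. A condensed, illustrative sketch using the same NodeManager and MutableLong calls seen above; this is not the broker's internal code:

import org.apache.activemq.artemis.core.server.NodeManager;
import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
import org.apache.activemq.artemis.quorum.MutableLong;

public final class ActivationClaimSketch {
   // illustrative only: the broker performs this claim/commit internally on activation
   public static void claimAndCommit(NodeManager nodeManager, DistributedPrimitiveManager manager) throws Exception {
      long local = nodeManager.readNodeActivationSequence(); // e.g. 1 after the first activation
      MutableLong coordinated = manager.getMutableLong(nodeManager.getNodeId().toString());
      // claim: park the coordinated sequence at a negative marker of the next value
      if (coordinated.compareAndSet(local, -(local + 1))) {
         nodeManager.writeNodeActivationSequence(local + 1); // commit locally
         coordinated.compareAndSet(-(local + 1), local + 1); // commit the claim
      }
      // a negative value left behind is the "uncommitted claim" that
      // testUnavailableSelfHeal and testUnavailableAdminIntervention repair
   }
}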

View File

@ -244,7 +244,7 @@ public class SharedNothingReplicationTest extends ActiveMQTestBase {
.setCheckForLiveServer(false);
}
private Configuration createLiveConfiguration() throws Exception {
protected Configuration createLiveConfiguration() throws Exception {
Configuration conf = new ConfigurationImpl();
conf.setName("localhost::live");
@ -275,7 +275,7 @@ public class SharedNothingReplicationTest extends ActiveMQTestBase {
return new ReplicaPolicyConfiguration().setClusterName("cluster");
}
private Configuration createBackupConfiguration() throws Exception {
protected Configuration createBackupConfiguration() throws Exception {
Configuration conf = new ConfigurationImpl();
conf.setName("localhost::backup");

View File

@ -24,7 +24,6 @@ import org.apache.activemq.artemis.api.core.client.ClientSession;
import org.apache.activemq.artemis.api.core.client.ClientSessionFactory;
import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager;
import org.apache.activemq.artemis.spi.core.security.ActiveMQBasicSecurityManager;
import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTestBase;
@ -66,9 +65,9 @@ public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTes
.setLargeMessagesDirectory(getLargeMessagesDir(0, true));
setupHAPolicyConfiguration();
nodeManager = createReplicatedBackupNodeManager(backupConfig);
backupNodeManager = createReplicatedBackupNodeManager(backupConfig);
backupServer = createTestableServer(backupConfig);
backupServer = createTestableServer(backupConfig, backupNodeManager);
backupServer.getServer().setSecurityManager(new ActiveMQBasicSecurityManager());
@ -77,7 +76,8 @@ public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTes
.clearAcceptorConfigurations()
.addAcceptorConfiguration(getAcceptorTransportConfiguration(true));
liveServer = createTestableServer(liveConfig);
nodeManager = createNodeManager(liveConfig);
liveServer = createTestableServer(liveConfig, nodeManager);
liveServer.getServer().setSecurityManager(new ActiveMQBasicSecurityManager());
}
@ -94,7 +94,6 @@ public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTes
@Override
protected void setupHAPolicyConfiguration() {
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
}

View File

@ -677,6 +677,44 @@
</args>
</configuration>
</execution>
<execution>
<phase>test-compile</phase>
<id>create-zk-replication-primary-peer-a</id>
<goals>
<goal>create</goal>
</goals>
<configuration>
<configuration>${basedir}/target/classes/servers/zkReplicationPrimaryPeerA</configuration>
<allowAnonymous>true</allowAnonymous>
<user>admin</user>
<password>admin</password>
<instance>${basedir}/target/zkReplicationPrimaryPeerA</instance>
<args>
<!-- this is needed to run the server remotely -->
<arg>--java-options</arg>
<arg>-Djava.rmi.server.hostname=localhost</arg>
</args>
</configuration>
</execution>
<execution>
<phase>test-compile</phase>
<id>create-zk-replication-primary-peer-b</id>
<goals>
<goal>create</goal>
</goals>
<configuration>
<configuration>${basedir}/target/classes/servers/zkReplicationPrimaryPeerB</configuration>
<allowAnonymous>true</allowAnonymous>
<user>admin</user>
<password>admin</password>
<instance>${basedir}/target/zkReplicationPrimaryPeerB</instance>
<args>
<!-- this is needed to run the server remotely -->
<arg>--java-options</arg>
<arg>-Djava.rmi.server.hostname=localhost</arg>
</args>
</configuration>
</execution>
<execution>
<phase>test-compile</phase>
<id>create-zk-replication-backup</id>

View File

@ -39,7 +39,6 @@ under the License.
<property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
</properties>
</manager>
<check-for-live-server>true</check-for-live-server>
</primary>
</replication>
</ha-policy>

View File

@ -0,0 +1,138 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
--><configuration xmlns="urn:activemq" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:activemq /schema/artemis-server.xsd">
<core xmlns="urn:activemq:core">
<name>primary-peer-a</name>
<bindings-directory>./data/bindings</bindings-directory>
<journal-directory>./data/journal</journal-directory>
<large-messages-directory>./data/largemessages</large-messages-directory>
<paging-directory>./data/paging</paging-directory>
<ha-policy>
<replication>
<primary>
<manager>
<class-name>org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager</class-name>
<properties>
<property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
</properties>
</manager>
<coordination-id>peer-journal-001</coordination-id>
</primary>
</replication>
</ha-policy>
<connectors>
<!-- Connector used to be announced through cluster connections and notifications -->
<connector name="artemis">tcp://localhost:61616</connector>
<connector name="peer">tcp://localhost:61716</connector>
</connectors>
<!-- Acceptors -->
<acceptors>
<acceptor name="artemis">tcp://localhost:61616</acceptor>
</acceptors>
<cluster-user>admin</cluster-user>
<cluster-password>password</cluster-password>
<cluster-connections>
<cluster-connection name="my-cluster">
<connector-ref>artemis</connector-ref>
<message-load-balancing>OFF</message-load-balancing>
<max-hops>1</max-hops>
<static-connectors>
<connector-ref>peer</connector-ref>
</static-connectors>
</cluster-connection>
</cluster-connections>
<!-- Other config -->
<security-settings>
<!--security for example queue-->
<security-setting match="#">
<permission type="createNonDurableQueue" roles="amq, guest"/>
<permission type="deleteNonDurableQueue" roles="amq, guest"/>
<permission type="createDurableQueue" roles="amq, guest"/>
<permission type="deleteDurableQueue" roles="amq, guest"/>
<permission type="createAddress" roles="amq, guest"/>
<permission type="deleteAddress" roles="amq, guest"/>
<permission type="consume" roles="amq, guest"/>
<permission type="browse" roles="amq, guest"/>
<permission type="send" roles="amq, guest"/>
<!-- we need this otherwise ./artemis data imp wouldn't work -->
<permission type="manage" roles="amq"/>
</security-setting>
</security-settings>
<address-settings>
<!-- if you define auto-create on certain queues, management has to be auto-create -->
<address-setting match="activemq.management#">
<dead-letter-address>DLQ</dead-letter-address>
<expiry-address>ExpiryQueue</expiry-address>
<redelivery-delay>0</redelivery-delay>
<!-- with -1 only the global-max-size is in use for limiting -->
<max-size-bytes>-1</max-size-bytes>
<message-counter-history-day-limit>10</message-counter-history-day-limit>
<address-full-policy>PAGE</address-full-policy>
<auto-create-queues>true</auto-create-queues>
<auto-create-addresses>true</auto-create-addresses>
<auto-create-jms-queues>true</auto-create-jms-queues>
<auto-create-jms-topics>true</auto-create-jms-topics>
</address-setting>
<!--default for catch all-->
<address-setting match="#">
<dead-letter-address>DLQ</dead-letter-address>
<expiry-address>ExpiryQueue</expiry-address>
<redelivery-delay>0</redelivery-delay>
<!-- with -1 only the global-max-size is in use for limiting -->
<max-size-bytes>10MB</max-size-bytes>
<page-size-bytes>1MB</page-size-bytes>
<message-counter-history-day-limit>10</message-counter-history-day-limit>
<address-full-policy>PAGE</address-full-policy>
<auto-create-queues>true</auto-create-queues>
<auto-create-addresses>true</auto-create-addresses>
<auto-create-jms-queues>true</auto-create-jms-queues>
<auto-create-jms-topics>true</auto-create-jms-topics>
</address-setting>
</address-settings>
<addresses>
<address name="exampleTopic">
<multicast>
</multicast>
</address>
<address name="exampleQueue">
<anycast>
<queue name="exampleQueue"/>
</anycast>
</address>
</addresses>
</core>
</configuration>
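
The coordination-id element above is what makes the two peers interchangeable: both brokers announce the same journal identity to ZooKeeper and compete for the same primary role. A programmatic sketch of the same policy, assuming the setter names match the configuration classes imported elsewhere in this commit:

import java.util.HashMap;
import java.util.Map;
import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;

// build the <primary> policy above in code (setter names are assumptions)
Map<String, String> props = new HashMap<>();
props.put("connect-string", "127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668");
ReplicationPrimaryPolicyConfiguration policy = ReplicationPrimaryPolicyConfiguration.withDefault();
policy.setCoordinationId("peer-journal-001"); // shared by peer-a and peer-b
policy.setDistributedManagerConfiguration(new DistributedPrimitiveManagerConfiguration(
      "org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager", props));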

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<management-context xmlns="http://activemq.org/schema">
<connector connector-port="10099" connector-host="localhost"/>
</management-context>
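
The connector above is the endpoint the smoke tests dial into (port 10099 for this peer, 10199 for the other). A minimal sketch of reaching it over JMX, assuming the standard RMI URL layout:

import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

// connect to the management connector declared above (URL layout assumed)
JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:10099/jmxrmi");
try (JMXConnector connector = JMXConnectorFactory.connect(url)) {
   connector.getMBeanServerConnection(); // broker MBeans are queried from here
}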

View File

@ -0,0 +1,138 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
--><configuration xmlns="urn:activemq" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:activemq /schema/artemis-server.xsd">
<core xmlns="urn:activemq:core">
<name>primary-peer-b</name>
<bindings-directory>./data/bindings</bindings-directory>
<journal-directory>./data/journal</journal-directory>
<large-messages-directory>./data/largemessages</large-messages-directory>
<paging-directory>./data/paging</paging-directory>
<ha-policy>
<replication>
<primary>
<manager>
<class-name>org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager</class-name>
<properties>
<property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
</properties>
</manager>
<coordination-id>peer-journal-001</coordination-id>
</primary>
</replication>
</ha-policy>
<connectors>
<!-- Connector used to be announced through cluster connections and notifications -->
<connector name="artemis">tcp://localhost:61716</connector>
<connector name="peer">tcp://localhost:61616</connector>
</connectors>
<!-- Acceptors -->
<acceptors>
<acceptor name="artemis">tcp://localhost:61716</acceptor>
</acceptors>
<cluster-user>admin</cluster-user>
<cluster-password>password</cluster-password>
<cluster-connections>
<cluster-connection name="my-cluster">
<connector-ref>artemis</connector-ref>
<message-load-balancing>OFF</message-load-balancing>
<max-hops>1</max-hops>
<static-connectors>
<connector-ref>peer</connector-ref>
</static-connectors>
</cluster-connection>
</cluster-connections>
<!-- Other config -->
<security-settings>
<!--security for example queue-->
<security-setting match="#">
<permission type="createNonDurableQueue" roles="amq, guest"/>
<permission type="deleteNonDurableQueue" roles="amq, guest"/>
<permission type="createDurableQueue" roles="amq, guest"/>
<permission type="deleteDurableQueue" roles="amq, guest"/>
<permission type="createAddress" roles="amq, guest"/>
<permission type="deleteAddress" roles="amq, guest"/>
<permission type="consume" roles="amq, guest"/>
<permission type="browse" roles="amq, guest"/>
<permission type="send" roles="amq, guest"/>
<!-- we need this otherwise ./artemis data imp wouldn't work -->
<permission type="manage" roles="amq"/>
</security-setting>
</security-settings>
<address-settings>
<!-- if you define auto-create on certain queues, management has to be auto-create -->
<address-setting match="activemq.management#">
<dead-letter-address>DLQ</dead-letter-address>
<expiry-address>ExpiryQueue</expiry-address>
<redelivery-delay>0</redelivery-delay>
<!-- with -1 only the global-max-size is in use for limiting -->
<max-size-bytes>-1</max-size-bytes>
<message-counter-history-day-limit>10</message-counter-history-day-limit>
<address-full-policy>PAGE</address-full-policy>
<auto-create-queues>true</auto-create-queues>
<auto-create-addresses>true</auto-create-addresses>
<auto-create-jms-queues>true</auto-create-jms-queues>
<auto-create-jms-topics>true</auto-create-jms-topics>
</address-setting>
<!--default for catch all-->
<address-setting match="#">
<dead-letter-address>DLQ</dead-letter-address>
<expiry-address>ExpiryQueue</expiry-address>
<redelivery-delay>0</redelivery-delay>
<!-- with -1 only the global-max-size is in use for limiting -->
<max-size-bytes>10MB</max-size-bytes>
<page-size-bytes>1MB</page-size-bytes>
<message-counter-history-day-limit>10</message-counter-history-day-limit>
<address-full-policy>PAGE</address-full-policy>
<auto-create-queues>true</auto-create-queues>
<auto-create-addresses>true</auto-create-addresses>
<auto-create-jms-queues>true</auto-create-jms-queues>
<auto-create-jms-topics>true</auto-create-jms-topics>
</address-setting>
</address-settings>
<addresses>
<address name="exampleTopic">
<multicast>
</multicast>
</address>
<address name="exampleQueue">
<anycast>
<queue name="exampleQueue"/>
</anycast>
</address>
</addresses>
</core>
</configuration>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<management-context xmlns="http://activemq.org/schema">
<connector connector-port="10199" connector-host="localhost"/>
</management-context>

View File

@ -20,8 +20,7 @@ package org.apache.activemq.artemis.tests.smoke.quorum;
import javax.management.remote.JMXServiceURL;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
@ -55,15 +54,15 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
private static final Logger LOGGER = Logger.getLogger(PluggableQuorumSinglePairTest.class);
private static final String JMX_SERVER_HOSTNAME = "localhost";
private static final int JMX_PORT_PRIMARY = 10099;
private static final int JMX_PORT_BACKUP = 10199;
static final String JMX_SERVER_HOSTNAME = "localhost";
static final int JMX_PORT_PRIMARY = 10099;
static final int JMX_PORT_BACKUP = 10199;
private static final String PRIMARY_DATA_FOLDER = "ReplicationPrimary";;
private static final String BACKUP_DATA_FOLDER = "ReplicationBackup";
static final String PRIMARY_DATA_FOLDER = "ReplicationPrimary";
static final String BACKUP_DATA_FOLDER = "ReplicationBackup";
private static final int PRIMARY_PORT_OFFSET = 0;
private static final int BACKUP_PORT_OFFSET = PRIMARY_PORT_OFFSET + 100;
static final int PRIMARY_PORT_OFFSET = 0;
static final int BACKUP_PORT_OFFSET = PRIMARY_PORT_OFFSET + 100;
public static class BrokerControl {
@ -73,7 +72,7 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
final JMXServiceURL jmxServiceURL;
final int portID;
private BrokerControl(final String name, int jmxPort, String dataFolder, int portID) {
BrokerControl(final String name, int jmxPort, String dataFolder, int portID) {
this.portID = portID;
this.dataFolder = dataFolder;
try {
@ -108,6 +107,14 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
public Optional<String> listNetworkTopology() throws Exception {
return Jmx.listNetworkTopology(jmxServiceURL, objectNameBuilder);
}
public Optional<Long> getActivationSequence() throws Exception {
return Jmx.getActivationSequence(jmxServiceURL, objectNameBuilder);
}
public Optional<Boolean> isActive() throws Exception {
return Jmx.isActive(jmxServiceURL, objectNameBuilder);
}
}
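
Both new accessors return Optional so pollers can treat a JMX failure as "no answer yet" rather than an error. A usage sketch in the style of the tests below (primary and timeout come from the surrounding test):

// poll until the broker reports the expected activation sequence
Wait.assertTrue(() -> 1L == primary.getActivationSequence().orElse(-1L), timeout);
Wait.assertTrue(() -> primary.isActive().orElse(false), timeout);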
@Parameterized.Parameter
@ -118,14 +125,14 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
return Arrays.asList(new Object[][]{{false}, {true}});
}
private final BrokerControl primary;
private final BrokerControl backup;
private final Collection<BrokerControl> brokers;
protected BrokerControl primary;
protected BrokerControl backup;
protected LinkedList<BrokerControl> brokers;
public PluggableQuorumSinglePairTest(String brokerFolderPrefix) {
primary = new BrokerControl("primary", JMX_PORT_PRIMARY, brokerFolderPrefix + PRIMARY_DATA_FOLDER, PRIMARY_PORT_OFFSET);
backup = new BrokerControl("backup", JMX_PORT_BACKUP, brokerFolderPrefix + BACKUP_DATA_FOLDER, BACKUP_PORT_OFFSET);
brokers = Collections.unmodifiableList(Arrays.asList(primary, backup));
brokers = new LinkedList<>(Arrays.asList(primary, backup));
}
protected abstract boolean awaitAsyncSetupCompleted(long timeout, TimeUnit unit) throws InterruptedException;
@ -150,6 +157,10 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
Process primaryInstance = primary.startServer(this, timeout);
Assert.assertTrue(awaitAsyncSetupCompleted(timeout, TimeUnit.MILLISECONDS));
Wait.assertTrue(() -> !primary.isBackup().orElse(true), timeout);
// primary UNREPLICATED
Assert.assertEquals(1L, primary.getActivationSequence().get().longValue());
LOGGER.info("started primary");
LOGGER.info("starting backup");
Process backupInstance = backup.startServer(this, 0);
@ -176,6 +187,11 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
Assert.assertNotNull(urlPrimary);
LOGGER.infof("primary: %s", urlPrimary);
Assert.assertNotEquals(urlPrimary, urlBackup);
// primary REPLICATED; backup has replicated and now matches the activation sequence
Assert.assertEquals(1L, primary.getActivationSequence().get().longValue());
Assert.assertEquals(1L, backup.getActivationSequence().get().longValue());
LOGGER.info("killing primary");
ServerUtil.killServer(primaryInstance, forceKill);
LOGGER.info("killed primary");
@ -188,11 +204,15 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
.and(withNodes(1))), timeout);
LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get());
Assert.assertEquals(nodeID, backup.getNodeID().get());
// backup UNREPLICATED (new activation sequence)
Assert.assertEquals(2L, backup.getActivationSequence().get().longValue());
// wait a bit before restarting primary
LOGGER.info("waiting before starting primary");
TimeUnit.SECONDS.sleep(4);
LOGGER.info("starting primary");
primary.startServer(this, 0);
primaryInstance = primary.startServer(this, 0);
LOGGER.info("started primary");
Wait.assertTrue(() -> backup.isBackup().orElse(false), timeout);
Assert.assertTrue(!primary.isBackup().get());
@ -209,6 +229,14 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
Assert.assertTrue(backup.isReplicaSync().get());
LOGGER.infof("backup is synchronized with live");
Assert.assertEquals(nodeID, primary.getNodeID().get());
// primary ran unreplicated for a short while during failback, before the backup was in sync
Assert.assertEquals(3L, primary.getActivationSequence().get().longValue());
Assert.assertEquals(3L, backup.getActivationSequence().get().longValue());
LOGGER.infof("Done, killing both");
ServerUtil.killServer(primaryInstance);
ServerUtil.killServer(backupInstance);
}
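
The asserts above trace the sequence through its lifecycle: 1 on the first unreplicated activation, still 1 once the backup is in sync, 2 when the backup activates alone, and 3 after the primary briefly runs unreplicated during failback. A toy model of that rule, inferred from the asserts rather than lifted from the broker internals:

// toy model of the monotonic activation sequence (an illustration only,
// not the actual broker implementation)
final class ActivationSequenceModel {
   private long value; // persisted with the journal in the real broker

   long onUnreplicatedActivation() { // a broker claims live with no replica in sync
      return ++value;
   }

   void onReplicaInSync(long liveSequence) { // an in-sync replica adopts live's sequence
      value = liveSequence;
   }
}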
@Test
@ -272,5 +300,87 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
Wait.waitFor(()-> !backupInstance.isAlive(), timeout);
}
@Test
public void testOnlyLastUnreplicatedCanStart() throws Exception {
final int timeout = (int) TimeUnit.SECONDS.toMillis(30);
LOGGER.info("starting primary");
Process primaryInstance = primary.startServer(this, timeout);
Assert.assertTrue(awaitAsyncSetupCompleted(timeout, TimeUnit.MILLISECONDS));
Wait.assertTrue(() -> !primary.isBackup().orElse(true), timeout);
LOGGER.info("started primary");
LOGGER.info("starting backup");
Process backupInstance = backup.startServer(this, 0);
Wait.assertTrue(() -> backup.isBackup().orElse(false), timeout);
final String nodeID = primary.getNodeID().get();
Assert.assertNotNull(nodeID);
LOGGER.infof("NodeID: %s", nodeID);
for (BrokerControl broker : brokers) {
Wait.assertTrue(() -> validateNetworkTopology(broker.listNetworkTopology().orElse(""),
containsExactNodeIds(nodeID)
.and(withLive(nodeID, Objects::nonNull))
.and(withBackup(nodeID, Objects::nonNull))
.and(withMembers(1))
.and(withNodes(2))), timeout);
}
LOGGER.infof("primary topology is: %s", primary.listNetworkTopology().get());
LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get());
Assert.assertTrue(backup.isReplicaSync().get());
LOGGER.infof("backup is synchronized with live");
final String urlBackup = backupOf(nodeID, decodeNetworkTopologyJson(backup.listNetworkTopology().get()));
Assert.assertNotNull(urlBackup);
LOGGER.infof("backup: %s", urlBackup);
final String urlPrimary = liveOf(nodeID, decodeNetworkTopologyJson(primary.listNetworkTopology().get()));
Assert.assertNotNull(urlPrimary);
LOGGER.infof("primary: %s", urlPrimary);
Assert.assertNotEquals(urlPrimary, urlBackup);
// verify activation sequences are in sync
Assert.assertEquals(1L, primary.getActivationSequence().get().longValue());
Assert.assertEquals(1L, backup.getActivationSequence().get().longValue());
LOGGER.info("killing primary");
ServerUtil.killServer(primaryInstance, forceKill);
LOGGER.info("killed primary");
Wait.assertTrue(() -> !backup.isBackup().orElse(true), timeout);
Wait.assertTrue(() -> validateNetworkTopology(backup.listNetworkTopology().orElse(""),
containsExactNodeIds(nodeID)
.and(withLive(nodeID, urlBackup::equals))
.and(withBackup(nodeID, Objects::isNull))
.and(withMembers(1))
.and(withNodes(1))), timeout);
LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get());
Assert.assertEquals(nodeID, backup.getNodeID().get());
// backup is now UNREPLICATED; it is the only node that can continue
Assert.assertEquals(2L, backup.getActivationSequence().get().longValue());
LOGGER.info("killing backup");
ServerUtil.killServer(backupInstance, forceKill);
// wait a bit before restarting primary
LOGGER.info("waiting before starting primary");
TimeUnit.SECONDS.sleep(4);
LOGGER.info("restarting primary");
Process restartedPrimary = primary.startServer(this, 0);
LOGGER.info("restarted primary, " + restartedPrimary);
Wait.assertFalse("Primary shouldn't activate", () -> primary.isActive().orElse(false), 5000);
ServerUtil.killServer(restartedPrimary);
LOGGER.info("restarting backup");
// backup can resume with its journal at activation sequence 3
final Process restartedBackupInstance = backup.startServer(this, 5000);
Wait.waitFor(() -> backup.isActive().orElse(false), 5000);
assertTrue(Wait.waitFor(() -> nodeID.equals(backup.getNodeID().orElse("not set yet"))));
LOGGER.info("restarted backup");
Assert.assertEquals(3L, backup.getActivationSequence().get().longValue());
}
}
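
testOnlyLastUnreplicatedCanStart pins down the safety rule behind the sequence: once the backup has activated unreplicated (sequence 2), the stale primary (journal still at 1) must not activate. A sketch of that guard, assuming the coordination service records the highest claimed sequence:

// assumed activation guard (the real check lives in the broker's pluggable
// quorum activation, not shown in this commit excerpt)
static boolean canActivate(long localJournalSequence, long coordinatedSequence) {
   // only the node whose journal carries the latest recorded sequence may go live
   return localJournalSequence == coordinatedSequence;
}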

View File

@ -0,0 +1,109 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.activemq.artemis.tests.smoke.quorum;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import org.apache.activemq.artemis.util.ServerUtil;
import org.apache.activemq.artemis.utils.Wait;
import org.jboss.logging.Logger;
import org.junit.Assert;
import org.junit.Test;
import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.containsExactNodeIds;
import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.validateNetworkTopology;
import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withBackup;
import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withLive;
import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withMembers;
import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withNodes;
public class ZookeeperPluggableQuorumPeerTest extends ZookeeperPluggableQuorumSinglePairTest {
private static final Logger LOGGER = Logger.getLogger(ZookeeperPluggableQuorumPeerTest.class);
public ZookeeperPluggableQuorumPeerTest() {
super();
// reuse the base test's primary/backup fields; for peers these are interchangeable, as either can take
// both roles: each wants to be primary but will revert to backup
primary = new BrokerControl("primary-peer-a", JMX_PORT_PRIMARY, "zkReplicationPrimaryPeerA", PRIMARY_PORT_OFFSET);
backup = new BrokerControl("primary-peer-b", JMX_PORT_BACKUP, "zkReplicationPrimaryPeerB", BACKUP_PORT_OFFSET);
brokers = new LinkedList<>(Arrays.asList(primary, backup));
}
@Test
@Override
public void testBackupFailoverAndPrimaryFailback() throws Exception {
// peers don't request fail back by default
// just wait for setup to complete, so a fast teardown does not stop ZooKeeper while async setup is still running
Wait.waitFor(this::ensembleHasLeader);
}
@Test
public void testMultiPrimary_Peer() throws Exception {
final int timeout = (int) TimeUnit.SECONDS.toMillis(30);
LOGGER.info("starting peer b primary");
Process backupInstance = backup.startServer(this, timeout);
// alive as unreplicated; it has the configured node id
assertTrue(Wait.waitFor(() -> 1L == backup.getActivationSequence().orElse(Long.MAX_VALUE).longValue()));
final String nodeID = backup.getNodeID().get();
Assert.assertNotNull(nodeID);
LOGGER.infof("NodeID: %s", nodeID);
LOGGER.info("starting peer a primary");
primary.startServer(this, 0);
Wait.assertTrue(() -> primary.isBackup().orElse(false), timeout);
Wait.assertTrue(() -> !backup.isBackup().orElse(true), timeout);
for (BrokerControl broker : brokers) {
Wait.assertTrue(() -> validateNetworkTopology(broker.listNetworkTopology().orElse(""),
containsExactNodeIds(nodeID)
.and(withLive(nodeID, Objects::nonNull))
.and(withBackup(nodeID, Objects::nonNull))
.and(withMembers(1))
.and(withNodes(2))), timeout);
}
LOGGER.infof("primary topology is: %s", primary.listNetworkTopology().get());
LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get());
Assert.assertTrue(backup.isReplicaSync().get());
Assert.assertTrue(primary.isReplicaSync().get());
LOGGER.info("killing peer-b");
ServerUtil.killServer(backupInstance, forceKill);
// peer-a now UNREPLICATED
Wait.assertTrue(() -> 2L == primary.getActivationSequence().get().longValue());
LOGGER.info("restarting peer-b");
backup.startServer(this, 0);
assertTrue(Wait.waitFor(() -> nodeID.equals(backup.getNodeID().orElse("not set yet"))));
// peer-b now a REPLICA
Wait.waitFor(() -> backup.isReplicaSync().get());
Wait.assertTrue(() -> 2L == backup.getActivationSequence().get().longValue());
}
}

View File

@ -1,13 +1,13 @@
/**
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* <p>
*
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -19,6 +19,8 @@ package org.apache.activemq.artemis.tests.smoke.quorum;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.activemq.artemis.utils.ThreadLeakCheckRule;
import org.apache.curator.test.InstanceSpec;
import org.apache.curator.test.TestingCluster;
import org.apache.curator.test.TestingZooKeeperServer;
@ -38,7 +40,7 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl
@Rule
public TemporaryFolder tmpFolder = new TemporaryFolder();
private TestingCluster testingServer;
protected TestingCluster testingServer;
private InstanceSpec[] clusterSpecs;
private int nodes;
@ -60,6 +62,8 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl
@Override
@After
public void after() throws Exception {
// known ZooKeeper threads that leak from the test servers
ThreadLeakCheckRule.addKownThread("ListenerHandler-");
try {
super.after();
} finally {
@ -76,6 +80,16 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl
return true;
}
protected boolean ensembleHasLeader() {
return testingServer.getServers().stream().anyMatch(ZookeeperPluggableQuorumSinglePairTest::isLeader);
}
private static boolean isLeader(TestingZooKeeperServer server) {
long leaderId = server.getQuorumPeer().getLeaderId();
long id = server.getQuorumPeer().getId();
return id == leaderId;
}
@Override
protected void stopMajority() throws Exception {
List<TestingZooKeeperServer> followers = testingServer.getServers();

View File

@ -77,6 +77,16 @@ public class Jmx {
return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::getNodeID, ActiveMQServerControl.class, throwable -> null);
}
public static Optional<Long> getActivationSequence(JMXServiceURL serviceURI, ObjectNameBuilder builder) throws Exception {
return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::getActivationSequence, ActiveMQServerControl.class, throwable -> null);
}
public static Optional<Boolean> isActive(JMXServiceURL serviceURI, ObjectNameBuilder builder) throws Exception {
return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::isActive, ActiveMQServerControl.class, throwable -> null);
}
public static Optional<String> listNetworkTopology(JMXServiceURL serviceURI,
ObjectNameBuilder builder) throws Exception {
return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::listNetworkTopology, ActiveMQServerControl.class, throwable -> null);
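
A usage sketch of the two new helpers, assuming a broker named "primary" registered under the default JMX domain (the ObjectNameBuilder arguments are illustrative):

import java.util.Optional;
import javax.management.remote.JMXServiceURL;
import org.apache.activemq.artemis.api.config.ActiveMQDefaultConfiguration;
import org.apache.activemq.artemis.api.core.management.ObjectNameBuilder;

JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:10099/jmxrmi");
ObjectNameBuilder builder = ObjectNameBuilder.create(
      ActiveMQDefaultConfiguration.getDefaultJmxDomain(), "primary", true);
Optional<Long> sequence = Jmx.getActivationSequence(url, builder);   // e.g. Optional[1]
Optional<Boolean> active = Jmx.isActive(url, builder);               // e.g. Optional[true]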

View File

@ -18,6 +18,7 @@ package org.apache.activemq.artemis.tests.unit.core.server.impl;
import java.io.File;
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager;
import org.apache.activemq.artemis.tests.util.ActiveMQTestBase;
import org.junit.Before;
@ -33,6 +34,25 @@ public class FileLockTest extends ActiveMQTestBase {
file.mkdirs();
}
@Test
public void testSetNodeID() throws Exception {
FileLockNodeManager underTest = new FileLockNodeManager(getTestDirfile(), false);
ReplicationPrimaryPolicyConfiguration replicationPrimaryPolicyConfiguration = ReplicationPrimaryPolicyConfiguration.withDefault();
String seed = "";
for (int i = 0; i < 20; i++) {
replicationPrimaryPolicyConfiguration.setCoordinationId(seed);
if (replicationPrimaryPolicyConfiguration.getCoordinationId() != null) {
underTest.setNodeID(replicationPrimaryPolicyConfiguration.getCoordinationId());
}
seed += String.valueOf(i);
}
replicationPrimaryPolicyConfiguration.setCoordinationId("somme-dash-and-odd");
if (replicationPrimaryPolicyConfiguration.getCoordinationId() != null) {
underTest.setNodeID(replicationPrimaryPolicyConfiguration.getCoordinationId());
}
}
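
The null guard matters: the loop above implies getCoordinationId() can return null when the configured string cannot be turned into a usable node id. A condensed sketch of the same pattern with the id used by the peer broker.xml files earlier in this commit:

// condensed usage sketch (assumes the null-on-unusable-id behavior implied above)
ReplicationPrimaryPolicyConfiguration policy = ReplicationPrimaryPolicyConfiguration.withDefault();
policy.setCoordinationId("peer-journal-001"); // the id shared by both peers
if (policy.getCoordinationId() != null) {
   new FileLockNodeManager(getTestDirfile(), false).setNodeID(policy.getCoordinationId());
}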
@Test
public void testNIOLock() throws Exception {
doTestLock(new FileLockNodeManager(getTestDirfile(), false), new FileLockNodeManager(getTestDirfile(), false));