ARTEMIS-3340 Sequential activation tracking for pluggable quorum replication policies + peer
Co-authored-by: franz1981 <nigro.fra@gmail.com>
parent 536271485f
commit ca7a100de0
@ -19,7 +19,7 @@ package org.apache.activemq.artemis.cli.commands;
import java.io.File;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;

import io.airlift.airline.Command;
import io.airlift.airline.Option;
@ -71,6 +71,7 @@ public class Run extends LockAbstract {
public Object execute(ActionContext context) throws Exception {
super.execute(context);

AtomicReference<Throwable> serverActivationFailed = new AtomicReference<>();
try {
BrokerDTO broker = getBrokerDTO();
ActiveMQSecurityManager securityManager = SecurityManagerFactory.create(broker.security);
@ -110,8 +111,7 @@ public class Run extends LockAbstract {
server = BrokerFactory.createServer(broker.server, securityManager, activateCallback);

server.createComponents();
AtomicBoolean serverActivationFailed = new AtomicBoolean(false);
server.getServer().registerActivationFailureListener(exception -> serverActivationFailed.set(true));
server.getServer().registerActivationFailureListener(exception -> serverActivationFailed.set(exception));
server.start();
server.getServer().addExternalComponent(managementContext, false);

@ -126,14 +126,16 @@ public class Run extends LockAbstract {
server.getServer().addExternalComponent(component, true);
assert component.isStarted();
}

if (serverActivationFailed.get()) {
stop();
}
} catch (Throwable t) {
t.printStackTrace();
stop();
serverActivationFailed.set(t);
}

if (serverActivationFailed.get() != null) {
stop();
return serverActivationFailed.get();
}

return new Pair<>(managementContext, server.getServer());
}

@ -2851,4 +2851,12 @@ public interface AuditLogger extends BasicLogger {
@LogMessage(level = Logger.Level.INFO)
@Message(id = 601748, value = "User {0} is getting max retry interval on target resource: {1} {2}", format = Message.Format.MESSAGE_FORMAT)
void getMaxRetryInterval(String user, Object source, Object... args);

static void getActivationSequence(Object source) {
BASE_LOGGER.getActivationSequence(getCaller(), source);
}

@LogMessage(level = Logger.Level.INFO)
@Message(id = 601749, value = "User {0} is getting activation sequence on target resource: {1} {2}", format = Message.Format.MESSAGE_FORMAT)
void getActivationSequence(String user, Object source, Object... args);
}

@ -321,6 +321,15 @@ public interface ActiveMQServerControl {
@Attribute(desc = "Node ID of this server")
String getNodeID();


/**
* Returns the current activation sequence number of this server.
* <br>
* When replicated, peers may coordinate activation with this monotonic sequence
*/
@Attribute(desc = "Activation sequence of this server instance")
long getActivationSequence();

/**
* Returns the management notification address of this server.
* <br>
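A minimal sketch of reading the new attribute over JMX, assuming a remote JMX connector is enabled; the service URL and broker object name below are assumptions and must be adjusted to the actual broker setup:

import javax.management.MBeanServerConnection;
import javax.management.ObjectName;
import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

public class ActivationSequenceProbe {
   public static void main(String[] args) throws Exception {
      // Hypothetical service URL and broker name; both depend on the broker configuration.
      JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:1099/jmxrmi");
      try (JMXConnector connector = JMXConnectorFactory.connect(url)) {
         MBeanServerConnection connection = connector.getMBeanServerConnection();
         ObjectName broker = new ObjectName("org.apache.activemq.artemis:broker=\"broker1\"");
         // Attribute name follows the getActivationSequence() accessor declared above.
         Long sequence = (Long) connection.getAttribute(broker, "ActivationSequence");
         System.out.println("activation sequence = " + sequence);
      }
   }
}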
@ -44,6 +44,12 @@
|
|||
<dependency>
|
||||
<groupId>org.apache.zookeeper</groupId>
|
||||
<artifactId>zookeeper</artifactId>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.curator</groupId>
|
||||
|
@ -59,6 +65,10 @@
|
|||
<groupId>org.jboss.logging</groupId>
|
||||
<artifactId>jboss-logging</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.jboss.slf4j</groupId>
|
||||
<artifactId>slf4j-jboss-logmanager</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.activemq</groupId>
|
||||
<artifactId>artemis-commons</artifactId>
|
||||
|
|
|
@ -18,16 +18,18 @@ package org.apache.activemq.artemis.quorum.file;

import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.activemq.artemis.quorum.DistributedLock;
import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
import org.apache.activemq.artemis.quorum.MutableLong;
import org.apache.activemq.artemis.quorum.UnavailableStateException;

/**
* This is an implementation suitable to be used just on unit tests and it won't attempt
@ -127,8 +129,55 @@ public class FileBasedPrimitiveManager implements DistributedPrimitiveManager {
}

@Override
public MutableLong getMutableLong(String mutableLongId) throws InterruptedException, ExecutionException, TimeoutException {
// TODO
return null;
public MutableLong getMutableLong(final String mutableLongId) throws ExecutionException {
// use a lock file - but with a prefix
final FileDistributedLock fileDistributedLock = (FileDistributedLock) getDistributedLock("ML:" + mutableLongId);
return new MutableLong() {
@Override
public String getMutableLongId() {
return mutableLongId;
}

@Override
public long get() throws UnavailableStateException {
try {
return readLong(fileDistributedLock);
} catch (IOException e) {
throw new UnavailableStateException(e);
}
}

@Override
public void set(long value) throws UnavailableStateException {
try {
writeLong(fileDistributedLock, value);
} catch (IOException e) {
throw new UnavailableStateException(e);
}
}

@Override
public void close() {
fileDistributedLock.close();
}
};
}

private void writeLong(FileDistributedLock fileDistributedLock, long value) throws IOException {
ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN);
buffer.putLong(value);
buffer.flip();
if (fileDistributedLock.getChannel().position(0).write(buffer) == Long.BYTES) {
fileDistributedLock.getChannel().force(false);
}
}

private long readLong(FileDistributedLock fileDistributedLock) throws IOException {
ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN);
if (fileDistributedLock.getChannel().position(0).read(buffer, 0) != Long.BYTES) {
return 0;
}
buffer.flip();
return buffer.getLong();
}
}
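A minimal usage sketch of the MutableLong primitive added above, in the way the activation-sequence coordination is expected to use it; the manager class name, node id and empty properties map are placeholder assumptions (a real file-based manager also needs its lock directory supplied through its properties):

import java.util.Collections;

import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
import org.apache.activemq.artemis.quorum.MutableLong;

public class ActivationSequenceSketch {
   public static void main(String[] args) throws Exception {
      DistributedPrimitiveManager manager = DistributedPrimitiveManager.newInstanceOf(
         "org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager",
         Collections.emptyMap()); // properties normally come from broker.xml; an empty map is a placeholder
      manager.start();
      try {
         MutableLong coordinatedSequence = manager.getMutableLong("my-node-id");
         long current = coordinatedSequence.get();  // throws UnavailableStateException if the quorum service is unavailable
         coordinatedSequence.set(current + 1);      // bump the sequence on a new activation
         coordinatedSequence.close();
      } finally {
         manager.stop();
      }
   }
}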
@ -138,4 +138,8 @@ final class FileDistributedLock implements DistributedLock {
|
|||
public void close() {
|
||||
close(true);
|
||||
}
|
||||
|
||||
public FileChannel getChannel() {
|
||||
return channel;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -178,6 +178,7 @@ public abstract class DistributedLockTest {
|
|||
ownerManager.getDistributedLock("a").unlock();
|
||||
Assert.assertFalse(observerManager.getDistributedLock("a").isHeldByCaller());
|
||||
Assert.assertFalse(ownerManager.getDistributedLock("a").isHeldByCaller());
|
||||
Assert.assertTrue(observerManager.getDistributedLock("a").tryLock());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -34,13 +34,6 @@ public class ReplicationBackupPolicyConfiguration implements HAPolicyConfigurati
|
|||
|
||||
private long initialReplicationSyncTimeout = ActiveMQDefaultConfiguration.getDefaultInitialReplicationSyncTimeout();
|
||||
|
||||
private int voteRetries = ActiveMQDefaultConfiguration.getDefaultVoteRetries();
|
||||
|
||||
/**
|
||||
* TODO: move if into {@link ActiveMQDefaultConfiguration} when the configuration is stable.
|
||||
*/
|
||||
private long voteRetryWait = 2000;
|
||||
|
||||
private long retryReplicationWait = ActiveMQDefaultConfiguration.getDefaultRetryReplicationWait();
|
||||
|
||||
private DistributedPrimitiveManagerConfiguration distributedManagerConfiguration = null;
|
||||
|
@ -102,24 +95,6 @@ public class ReplicationBackupPolicyConfiguration implements HAPolicyConfigurati
|
|||
return this;
|
||||
}
|
||||
|
||||
public int getVoteRetries() {
|
||||
return voteRetries;
|
||||
}
|
||||
|
||||
public ReplicationBackupPolicyConfiguration setVoteRetries(int voteRetries) {
|
||||
this.voteRetries = voteRetries;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ReplicationBackupPolicyConfiguration setVoteRetryWait(long voteRetryWait) {
|
||||
this.voteRetryWait = voteRetryWait;
|
||||
return this;
|
||||
}
|
||||
|
||||
public long getVoteRetryWait() {
|
||||
return voteRetryWait;
|
||||
}
|
||||
|
||||
public long getRetryReplicationWait() {
|
||||
return retryReplicationWait;
|
||||
}
|
||||
|
|
|
@ -21,25 +21,18 @@ import org.apache.activemq.artemis.core.config.HAPolicyConfiguration;
|
|||
|
||||
public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfiguration {
|
||||
|
||||
private boolean checkForLiveServer = ActiveMQDefaultConfiguration.isDefaultCheckForLiveServer();
|
||||
|
||||
private String groupName = null;
|
||||
|
||||
private String clusterName = null;
|
||||
|
||||
private long initialReplicationSyncTimeout = ActiveMQDefaultConfiguration.getDefaultInitialReplicationSyncTimeout();
|
||||
|
||||
private int voteRetries = ActiveMQDefaultConfiguration.getDefaultVoteRetries();
|
||||
|
||||
/**
|
||||
* TODO: move if into {@link ActiveMQDefaultConfiguration} when the configuration is stable.
|
||||
*/
|
||||
private long voteRetryWait = 2000;
|
||||
|
||||
private Long retryReplicationWait = ActiveMQDefaultConfiguration.getDefaultRetryReplicationWait();
|
||||
|
||||
private DistributedPrimitiveManagerConfiguration distributedManagerConfiguration = null;
|
||||
|
||||
private String coordinationId = null;
|
||||
|
||||
public static ReplicationPrimaryPolicyConfiguration withDefault() {
|
||||
return new ReplicationPrimaryPolicyConfiguration();
|
||||
}
|
||||
|
@ -52,15 +45,6 @@ public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfigurat
|
|||
return TYPE.PRIMARY;
|
||||
}
|
||||
|
||||
public boolean isCheckForLiveServer() {
|
||||
return checkForLiveServer;
|
||||
}
|
||||
|
||||
public ReplicationPrimaryPolicyConfiguration setCheckForLiveServer(boolean checkForLiveServer) {
|
||||
this.checkForLiveServer = checkForLiveServer;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getGroupName() {
|
||||
return groupName;
|
||||
}
|
||||
|
@ -88,24 +72,6 @@ public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfigurat
|
|||
return this;
|
||||
}
|
||||
|
||||
public int getVoteRetries() {
|
||||
return voteRetries;
|
||||
}
|
||||
|
||||
public ReplicationPrimaryPolicyConfiguration setVoteRetries(int voteRetries) {
|
||||
this.voteRetries = voteRetries;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ReplicationPrimaryPolicyConfiguration setVoteRetryWait(long voteRetryWait) {
|
||||
this.voteRetryWait = voteRetryWait;
|
||||
return this;
|
||||
}
|
||||
|
||||
public long getVoteRetryWait() {
|
||||
return voteRetryWait;
|
||||
}
|
||||
|
||||
public void setRetryReplicationWait(Long retryReplicationWait) {
|
||||
this.retryReplicationWait = retryReplicationWait;
|
||||
}
|
||||
|
@ -122,4 +88,27 @@ public class ReplicationPrimaryPolicyConfiguration implements HAPolicyConfigurat
public DistributedPrimitiveManagerConfiguration getDistributedManagerConfiguration() {
return distributedManagerConfiguration;
}

public String getCoordinationId() {
return coordinationId;
}

public void setCoordinationId(String newCoordinationId) {
if (newCoordinationId == null) {
return;
}
final int len = newCoordinationId.length();
if (len >= 16) {
this.coordinationId = newCoordinationId.substring(0, 16);
} else if (len % 2 != 0) {
// must be even for conversion to uuid, extend to next even
this.coordinationId = newCoordinationId + "+";
} else if (len > 0 ) {
// run with it
this.coordinationId = newCoordinationId;
}
if (this.coordinationId != null) {
this.coordinationId = this.coordinationId.replace('-', '.');
}
}
}
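A short illustration of the normalization rules implemented by setCoordinationId above, using hypothetical input values; the import path is inferred from the configuration class shown in this diff and should be treated as an assumption:

import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;

public class CoordinationIdExample {
   public static void main(String[] args) {
      ReplicationPrimaryPolicyConfiguration config = ReplicationPrimaryPolicyConfiguration.withDefault();
      config.setCoordinationId("peer-journal-0001-extra"); // 23 chars: truncated to 16 and '-' -> '.', i.e. "peer.journal.000"
      System.out.println(config.getCoordinationId());
      config.setCoordinationId("abc");                     // odd length: padded to the next even length, i.e. "abc+"
      System.out.println(config.getCoordinationId());
      config.setCoordinationId("peer-a");                  // even length: kept, '-' -> '.', i.e. "peer.a"
      System.out.println(config.getCoordinationId());
   }
}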
@ -1715,22 +1715,18 @@ public final class FileConfigurationParser extends XMLConfigurationUtil {
|
|||
private ReplicationPrimaryPolicyConfiguration createReplicationPrimaryHaPolicy(Element policyNode, Configuration config) {
|
||||
ReplicationPrimaryPolicyConfiguration configuration = ReplicationPrimaryPolicyConfiguration.withDefault();
|
||||
|
||||
configuration.setCheckForLiveServer(getBoolean(policyNode, "check-for-live-server", configuration.isCheckForLiveServer()));
|
||||
|
||||
configuration.setGroupName(getString(policyNode, "group-name", configuration.getGroupName(), Validators.NO_CHECK));
|
||||
|
||||
configuration.setClusterName(getString(policyNode, "cluster-name", configuration.getClusterName(), Validators.NO_CHECK));
|
||||
|
||||
configuration.setInitialReplicationSyncTimeout(getLong(policyNode, "initial-replication-sync-timeout", configuration.getInitialReplicationSyncTimeout(), Validators.GT_ZERO));
|
||||
|
||||
configuration.setVoteRetries(getInteger(policyNode, "vote-retries", configuration.getVoteRetries(), Validators.MINUS_ONE_OR_GE_ZERO));
|
||||
|
||||
configuration.setVoteRetryWait(getLong(policyNode, "vote-retry-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO));
|
||||
|
||||
configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO));
|
||||
configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getRetryReplicationWait(), Validators.GT_ZERO));
|
||||
|
||||
configuration.setDistributedManagerConfiguration(createDistributedPrimitiveManagerConfiguration(policyNode, config));
|
||||
|
||||
configuration.setCoordinationId(getString(policyNode, "coordination-id", configuration.getCoordinationId(), Validators.NOT_NULL_OR_EMPTY));
|
||||
|
||||
return configuration;
|
||||
}
|
||||
|
||||
|
@ -1748,11 +1744,7 @@ public final class FileConfigurationParser extends XMLConfigurationUtil {
|
|||
|
||||
configuration.setMaxSavedReplicatedJournalsSize(getInteger(policyNode, "max-saved-replicated-journals-size", configuration.getMaxSavedReplicatedJournalsSize(), Validators.MINUS_ONE_OR_GE_ZERO));
|
||||
|
||||
configuration.setVoteRetries(getInteger(policyNode, "vote-retries", configuration.getVoteRetries(), Validators.MINUS_ONE_OR_GE_ZERO));
|
||||
|
||||
configuration.setVoteRetryWait(getLong(policyNode, "vote-retry-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO));
|
||||
|
||||
configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getVoteRetryWait(), Validators.GT_ZERO));
|
||||
configuration.setRetryReplicationWait(getLong(policyNode, "retry-replication-wait", configuration.getRetryReplicationWait(), Validators.GT_ZERO));
|
||||
|
||||
configuration.setDistributedManagerConfiguration(createDistributedPrimitiveManagerConfiguration(policyNode, config));
|
||||
|
||||
|
|
|
@ -4204,6 +4204,17 @@ public class ActiveMQServerControlImpl extends AbstractControl implements Active
|
|||
return server.getNodeID() == null ? null : server.getNodeID().toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getActivationSequence() {
|
||||
if (AuditLogger.isBaseLoggingEnabled()) {
|
||||
AuditLogger.getActivationSequence(this.server);
|
||||
}
|
||||
if (server.getNodeManager() != null) {
|
||||
return server.getNodeManager().getNodeActivationSequence();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getManagementNotificationAddress() {
|
||||
if (AuditLogger.isBaseLoggingEnabled()) {
|
||||
|
|
|
@ -43,7 +43,8 @@ public class ReplicationStartSyncMessage extends PacketImpl {
|
|||
public enum SyncDataType {
|
||||
JournalBindings(AbstractJournalStorageManager.JournalContent.BINDINGS.typeByte),
|
||||
JournalMessages(AbstractJournalStorageManager.JournalContent.MESSAGES.typeByte),
|
||||
LargeMessages((byte) 2);
|
||||
LargeMessages((byte) 2),
|
||||
ActivationSequence((byte) 3);
|
||||
|
||||
private byte code;
|
||||
|
||||
|
@ -62,6 +63,9 @@ public class ReplicationStartSyncMessage extends PacketImpl {
|
|||
return JournalMessages;
|
||||
if (code == LargeMessages.code)
|
||||
return LargeMessages;
|
||||
if (code == ActivationSequence.code)
|
||||
return ActivationSequence;
|
||||
|
||||
throw new InvalidParameterException("invalid byte: " + code);
|
||||
}
|
||||
}
|
||||
|
@ -80,6 +84,14 @@ public class ReplicationStartSyncMessage extends PacketImpl {
|
|||
nodeID = ""; // this value will be ignored
|
||||
}
|
||||
|
||||
|
||||
public ReplicationStartSyncMessage(String nodeID, long nodeDataVersion) {
|
||||
this(nodeID);
|
||||
ids = new long[1];
|
||||
ids[0] = nodeDataVersion;
|
||||
dataType = SyncDataType.ActivationSequence;
|
||||
}
|
||||
|
||||
public ReplicationStartSyncMessage(String nodeID) {
|
||||
this();
|
||||
synchronizationIsFinished = true;
|
||||
|
@ -118,10 +130,6 @@ public class ReplicationStartSyncMessage extends PacketImpl {
|
|||
DataConstants.SIZE_BOOLEAN + // buffer.writeBoolean(allowsAutoFailBack);
|
||||
nodeID.length() * 3; // buffer.writeString(nodeID); -- an estimate
|
||||
|
||||
|
||||
if (synchronizationIsFinished) {
|
||||
return size;
|
||||
}
|
||||
size += DataConstants.SIZE_BYTE + // buffer.writeByte(dataType.code);
|
||||
DataConstants.SIZE_INT + // buffer.writeInt(ids.length);
|
||||
DataConstants.SIZE_LONG * ids.length; // the write loop
|
||||
|
@ -135,8 +143,6 @@ public class ReplicationStartSyncMessage extends PacketImpl {
|
|||
buffer.writeBoolean(synchronizationIsFinished);
|
||||
buffer.writeBoolean(allowsAutoFailBack);
|
||||
buffer.writeString(nodeID);
|
||||
if (synchronizationIsFinished)
|
||||
return;
|
||||
buffer.writeByte(dataType.code);
|
||||
buffer.writeInt(ids.length);
|
||||
for (long id : ids) {
|
||||
|
@ -149,9 +155,6 @@ public class ReplicationStartSyncMessage extends PacketImpl {
|
|||
synchronizationIsFinished = buffer.readBoolean();
|
||||
allowsAutoFailBack = buffer.readBoolean();
|
||||
nodeID = buffer.readString();
|
||||
if (synchronizationIsFinished) {
|
||||
return;
|
||||
}
|
||||
dataType = SyncDataType.getDataType(buffer.readByte());
|
||||
int length = buffer.readInt();
|
||||
ids = new long[length];
|
||||
|
|
|
@ -483,7 +483,7 @@ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQCompon
|
|||
logger.trace("Backup is synchronized / BACKUP-SYNC-DONE");
|
||||
}
|
||||
|
||||
ActiveMQServerLogger.LOGGER.backupServerSynched(server);
|
||||
ActiveMQServerLogger.LOGGER.backupServerSynchronized(server, liveID);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -560,6 +560,11 @@ public final class ReplicationEndpoint implements ChannelHandler, ActiveMQCompon
|
|||
return replicationResponseMessage;
|
||||
|
||||
if (packet.isSynchronizationFinished()) {
|
||||
if (packet.getFileIds() != null && packet.getFileIds().length == 1) {
|
||||
// this is the version sequence of the data we are replicating
|
||||
// verified if we activate with this data
|
||||
server.getNodeManager().writeNodeActivationSequence(packet.getFileIds()[0]);
|
||||
}
|
||||
finishSynchronization(packet.getNodeID());
|
||||
replicationResponseMessage.setSynchronizationIsFinishedAcknowledgement(true);
|
||||
return replicationResponseMessage;
|
||||
|
|
|
@ -821,7 +821,7 @@ public final class ReplicationManager implements ActiveMQComponent {
|
|||
}
|
||||
|
||||
synchronizationIsFinishedAcknowledgement.countUp();
|
||||
sendReplicatePacket(new ReplicationStartSyncMessage(nodeID));
|
||||
sendReplicatePacket(new ReplicationStartSyncMessage(nodeID, server.getNodeManager().getNodeActivationSequence()));
|
||||
try {
|
||||
if (!synchronizationIsFinishedAcknowledgement.await(initialReplicationSyncTimeout)) {
|
||||
ActiveMQReplicationTimeooutException exception = ActiveMQMessageBundle.BUNDLE.replicationSynchronizationTimeout(initialReplicationSyncTimeout);
|
||||
|
|
|
@ -190,8 +190,8 @@ public interface ActiveMQServerLogger extends BasicLogger {
|
|||
void errorStoppingConnectorService(@Cause Throwable e, String name);
|
||||
|
||||
@LogMessage(level = Logger.Level.INFO)
|
||||
@Message(id = 221024, value = "Backup server {0} is synchronized with live-server.", format = Message.Format.MESSAGE_FORMAT)
|
||||
void backupServerSynched(ActiveMQServerImpl server);
|
||||
@Message(id = 221024, value = "Backup server {0} is synchronized with live server, nodeID={1}.", format = Message.Format.MESSAGE_FORMAT)
|
||||
void backupServerSynchronized(ActiveMQServerImpl server, String liveID);
|
||||
|
||||
@LogMessage(level = Logger.Level.INFO)
|
||||
@Message(id = 221025, value = "Replication: sending {0} (size={1}) to replica.", format = Message.Format.MESSAGE_FORMAT)
|
||||
|
|
|
@ -39,6 +39,7 @@ public abstract class NodeManager implements ActiveMQComponent {
|
|||
private UUID uuid;
|
||||
private boolean isStarted = false;
|
||||
private final Set<FileLockNodeManager.LockListener> lockListeners;
|
||||
protected long nodeActivationSequence; // local version of a coordinated sequence, tracking state transitions of ownership
|
||||
|
||||
public NodeManager(final boolean replicatedBackup) {
|
||||
this.replicatedBackup = replicatedBackup;
|
||||
|
@ -79,18 +80,30 @@ public abstract class NodeManager implements ActiveMQComponent {
|
|||
}
|
||||
}
|
||||
|
||||
public long readDataVersion() throws NodeManagerException {
|
||||
public long readNodeActivationSequence() throws NodeManagerException {
|
||||
// TODO make it abstract
|
||||
throw new UnsupportedOperationException("TODO");
|
||||
}
|
||||
|
||||
public void writeDataVersion(long version) throws NodeManagerException {
|
||||
public void writeNodeActivationSequence(long version) throws NodeManagerException {
|
||||
// TODO make it abstract
|
||||
throw new UnsupportedOperationException("TODO");
|
||||
}
|
||||
|
||||
public abstract SimpleString readNodeId() throws NodeManagerException;
|
||||
|
||||
public long getNodeActivationSequence() {
|
||||
synchronized (nodeIDGuard) {
|
||||
return nodeActivationSequence;
|
||||
}
|
||||
}
|
||||
|
||||
public void setNodeActivationSequence(long activationSequence) {
|
||||
synchronized (nodeIDGuard) {
|
||||
nodeActivationSequence = activationSequence;
|
||||
}
|
||||
}
|
||||
|
||||
public UUID getUUID() {
|
||||
synchronized (nodeIDGuard) {
|
||||
return uuid;
|
||||
|
|
|
@ -32,10 +32,8 @@ public class ReplicationBackupPolicy implements HAPolicy<ReplicationBackupActiva
|
|||
private final String groupName;
|
||||
private final String clusterName;
|
||||
private final int maxSavedReplicatedJournalsSize;
|
||||
private final int voteRetries;
|
||||
private final long voteRetryWait;
|
||||
private final long retryReplicationWait;
|
||||
private final DistributedPrimitiveManagerConfiguration distributedManagerConfiguration;
|
||||
private final DistributedPrimitiveManagerConfiguration managerConfiguration;
|
||||
private final boolean tryFailback;
|
||||
|
||||
private ReplicationBackupPolicy(ReplicationBackupPolicyConfiguration configuration,
|
||||
|
@ -44,10 +42,8 @@ public class ReplicationBackupPolicy implements HAPolicy<ReplicationBackupActiva
|
|||
this.clusterName = configuration.getClusterName();
|
||||
this.maxSavedReplicatedJournalsSize = configuration.getMaxSavedReplicatedJournalsSize();
|
||||
this.groupName = configuration.getGroupName();
|
||||
this.voteRetries = configuration.getVoteRetries();
|
||||
this.voteRetryWait = configuration.getVoteRetryWait();
|
||||
this.retryReplicationWait = configuration.getRetryReplicationWait();
|
||||
this.distributedManagerConfiguration = configuration.getDistributedManagerConfiguration();
|
||||
this.managerConfiguration = configuration.getDistributedManagerConfiguration();
|
||||
this.tryFailback = true;
|
||||
this.livePolicy = livePolicy;
|
||||
}
|
||||
|
@ -56,10 +52,8 @@ public class ReplicationBackupPolicy implements HAPolicy<ReplicationBackupActiva
|
|||
this.clusterName = configuration.getClusterName();
|
||||
this.maxSavedReplicatedJournalsSize = configuration.getMaxSavedReplicatedJournalsSize();
|
||||
this.groupName = configuration.getGroupName();
|
||||
this.voteRetries = configuration.getVoteRetries();
|
||||
this.voteRetryWait = configuration.getVoteRetryWait();
|
||||
this.retryReplicationWait = configuration.getRetryReplicationWait();
|
||||
this.distributedManagerConfiguration = configuration.getDistributedManagerConfiguration();
|
||||
this.managerConfiguration = configuration.getDistributedManagerConfiguration();
|
||||
this.tryFailback = false;
|
||||
livePolicy = ReplicationPrimaryPolicy.failoverPolicy(
|
||||
configuration.getInitialReplicationSyncTimeout(),
|
||||
|
@ -84,16 +78,12 @@ public class ReplicationBackupPolicy implements HAPolicy<ReplicationBackupActiva
|
|||
/**
|
||||
* It creates a companion backup policy for a natural-born primary: it would cause the broker to try failback.
|
||||
*/
|
||||
static ReplicationBackupPolicy failback(int voteRetries,
|
||||
long voteRetryWait,
|
||||
long retryReplicationWait,
|
||||
static ReplicationBackupPolicy failback(long retryReplicationWait,
|
||||
String clusterName,
|
||||
String groupName,
|
||||
ReplicationPrimaryPolicy livePolicy,
|
||||
DistributedPrimitiveManagerConfiguration distributedManagerConfiguration) {
|
||||
return new ReplicationBackupPolicy(ReplicationBackupPolicyConfiguration.withDefault()
|
||||
.setVoteRetries(voteRetries)
|
||||
.setVoteRetryWait(voteRetryWait)
|
||||
.setRetryReplicationWait(retryReplicationWait)
|
||||
.setClusterName(clusterName)
|
||||
.setGroupName(groupName)
|
||||
|
@ -106,9 +96,8 @@ public class ReplicationBackupPolicy implements HAPolicy<ReplicationBackupActiva
|
|||
boolean wasLive,
|
||||
Map<String, Object> activationParams,
|
||||
IOCriticalErrorListener shutdownOnCriticalIO) throws Exception {
|
||||
return new ReplicationBackupActivation(server, wasLive, DistributedPrimitiveManager.newInstanceOf(
|
||||
distributedManagerConfiguration.getClassName(),
|
||||
distributedManagerConfiguration.getProperties()), this);
|
||||
return new ReplicationBackupActivation(server, DistributedPrimitiveManager.newInstanceOf(
|
||||
managerConfiguration.getClassName(), managerConfiguration.getProperties()), this);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -157,14 +146,6 @@ public class ReplicationBackupPolicy implements HAPolicy<ReplicationBackupActiva
|
|||
return maxSavedReplicatedJournalsSize;
|
||||
}
|
||||
|
||||
public int getVoteRetries() {
|
||||
return voteRetries;
|
||||
}
|
||||
|
||||
public long getVoteRetryWait() {
|
||||
return voteRetryWait;
|
||||
}
|
||||
|
||||
public long getRetryReplicationWait() {
|
||||
return retryReplicationWait;
|
||||
}
|
||||
|
|
|
@ -31,10 +31,10 @@ public class ReplicationPrimaryPolicy implements HAPolicy<ReplicationPrimaryActi
|
|||
private final ReplicationBackupPolicy backupPolicy;
|
||||
private final String clusterName;
|
||||
private final String groupName;
|
||||
private final boolean checkForLiveServer;
|
||||
private final long initialReplicationSyncTimeout;
|
||||
private final DistributedPrimitiveManagerConfiguration distributedManagerConfiguration;
|
||||
private final boolean allowAutoFailBack;
|
||||
private final String coordinationId;
|
||||
|
||||
private ReplicationPrimaryPolicy(ReplicationPrimaryPolicyConfiguration configuration,
|
||||
ReplicationBackupPolicy backupPolicy,
|
||||
|
@ -42,9 +42,9 @@ public class ReplicationPrimaryPolicy implements HAPolicy<ReplicationPrimaryActi
|
|||
Objects.requireNonNull(backupPolicy);
|
||||
clusterName = configuration.getClusterName();
|
||||
groupName = configuration.getGroupName();
|
||||
checkForLiveServer = configuration.isCheckForLiveServer();
|
||||
initialReplicationSyncTimeout = configuration.getInitialReplicationSyncTimeout();
|
||||
distributedManagerConfiguration = configuration.getDistributedManagerConfiguration();
|
||||
coordinationId = configuration.getCoordinationId();
|
||||
this.allowAutoFailBack = allowAutoFailBack;
|
||||
this.backupPolicy = backupPolicy;
|
||||
}
|
||||
|
@ -52,12 +52,11 @@ public class ReplicationPrimaryPolicy implements HAPolicy<ReplicationPrimaryActi
|
|||
private ReplicationPrimaryPolicy(ReplicationPrimaryPolicyConfiguration config) {
|
||||
clusterName = config.getClusterName();
|
||||
groupName = config.getGroupName();
|
||||
checkForLiveServer = config.isCheckForLiveServer();
|
||||
coordinationId = config.getCoordinationId();
|
||||
initialReplicationSyncTimeout = config.getInitialReplicationSyncTimeout();
|
||||
distributedManagerConfiguration = config.getDistributedManagerConfiguration();
|
||||
this.allowAutoFailBack = false;
|
||||
backupPolicy = ReplicationBackupPolicy.failback(config.getVoteRetries(), config.getVoteRetryWait(),
|
||||
config.getRetryReplicationWait(), config.getClusterName(),
|
||||
backupPolicy = ReplicationBackupPolicy.failback(config.getRetryReplicationWait(), config.getClusterName(),
|
||||
config.getGroupName(), this,
|
||||
config.getDistributedManagerConfiguration());
|
||||
}
|
||||
|
@ -73,7 +72,6 @@ public class ReplicationPrimaryPolicy implements HAPolicy<ReplicationPrimaryActi
|
|||
boolean allowAutoFailback,
|
||||
DistributedPrimitiveManagerConfiguration distributedManagerConfiguration) {
|
||||
return new ReplicationPrimaryPolicy(ReplicationPrimaryPolicyConfiguration.withDefault()
|
||||
.setCheckForLiveServer(false)
|
||||
.setInitialReplicationSyncTimeout(initialReplicationSyncTimeout)
|
||||
.setGroupName(groupName)
|
||||
.setClusterName(clusterName)
|
||||
|
@ -139,10 +137,6 @@ public class ReplicationPrimaryPolicy implements HAPolicy<ReplicationPrimaryActi
|
|||
return null;
|
||||
}
|
||||
|
||||
public boolean isCheckForLiveServer() {
|
||||
return checkForLiveServer;
|
||||
}
|
||||
|
||||
public boolean isAllowAutoFailBack() {
|
||||
return allowAutoFailBack;
|
||||
}
|
||||
|
@ -163,4 +157,8 @@ public class ReplicationPrimaryPolicy implements HAPolicy<ReplicationPrimaryActi
|
|||
public boolean useQuorumManager() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public String getCoordinationId() {
|
||||
return coordinationId;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -659,7 +659,7 @@ public class ActiveMQServerImpl implements ActiveMQServer {
|
|||
afterActivationCreated.run();
|
||||
} catch (Throwable e) {
|
||||
logger.warn(e.getMessage(), e); // just debug, this is not supposed to happen, and if it does
|
||||
// it will be embedeed code from tests
|
||||
// it will be embedded code from tests
|
||||
}
|
||||
afterActivationCreated = null;
|
||||
}
|
||||
|
@ -2877,6 +2877,8 @@ public class ActiveMQServerImpl implements ActiveMQServer {
|
|||
public String toString() {
|
||||
if (identity != null) {
|
||||
return "ActiveMQServerImpl::" + identity;
|
||||
} else if (configuration != null && configuration.getName() != null) {
|
||||
return "ActiveMQServerImpl::" + "name=" + configuration.getName();
|
||||
}
|
||||
return "ActiveMQServerImpl::" + (nodeManager != null ? "serverUUID=" + nodeManager.getUUID() : "");
|
||||
}
|
||||
|
|
|
@ -1,160 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.activemq.artemis.core.server.impl;
|
||||
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.activemq.artemis.api.core.ActiveMQException;
|
||||
import org.apache.activemq.artemis.api.core.ActiveMQExceptionType;
|
||||
import org.apache.activemq.artemis.api.core.DiscoveryGroupConfiguration;
|
||||
import org.apache.activemq.artemis.api.core.TransportConfiguration;
|
||||
import org.apache.activemq.artemis.api.core.client.ActiveMQClient;
|
||||
import org.apache.activemq.artemis.api.core.client.ClientSession;
|
||||
import org.apache.activemq.artemis.api.core.client.ClientSessionFactory;
|
||||
import org.apache.activemq.artemis.api.core.client.ClusterTopologyListener;
|
||||
import org.apache.activemq.artemis.api.core.client.ServerLocator;
|
||||
import org.apache.activemq.artemis.api.core.client.TopologyMember;
|
||||
import org.apache.activemq.artemis.core.client.impl.ClientSessionFactoryInternal;
|
||||
import org.apache.activemq.artemis.core.client.impl.ServerLocatorInternal;
|
||||
import org.apache.activemq.artemis.core.config.ClusterConnectionConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.Configuration;
|
||||
import org.apache.activemq.artemis.core.config.ConfigurationUtils;
|
||||
import org.apache.activemq.artemis.core.server.ActiveMQMessageBundle;
|
||||
import org.apache.activemq.artemis.core.server.ActiveMQServerLogger;
|
||||
import org.jboss.logging.Logger;
|
||||
|
||||
/**
|
||||
* This class contains some utils to allow a broker to check presence and role of another broker in the cluster.
|
||||
*/
|
||||
final class ClusterTopologySearch {
|
||||
|
||||
private ClusterTopologySearch() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether there is a live server already running with nodeID.<br>
|
||||
* This search isn't filtering the caller broker transport and is meant to be used
|
||||
* when the broker acceptors aren't running yet.
|
||||
*/
|
||||
public static boolean searchActiveLiveNodeId(String clusterName,
|
||||
String nodeId,
|
||||
long timeout,
|
||||
TimeUnit unit,
|
||||
Configuration serverConfiguration) throws ActiveMQException {
|
||||
if (serverConfiguration.getClusterConfigurations().isEmpty())
|
||||
return false;
|
||||
final ClusterConnectionConfiguration clusterConnectionConfiguration = ConfigurationUtils.getReplicationClusterConfiguration(serverConfiguration, clusterName);
|
||||
|
||||
final LiveNodeIdListener liveNodeIdListener = new LiveNodeIdListener(nodeId, serverConfiguration.getClusterUser(), serverConfiguration.getClusterPassword());
|
||||
|
||||
try (ServerLocatorInternal locator = createLocator(serverConfiguration, clusterConnectionConfiguration)) {
|
||||
// if would like to filter out a transport configuration:
|
||||
// locator.setClusterTransportConfiguration(callerBrokerTransportConfiguration)
|
||||
locator.addClusterTopologyListener(liveNodeIdListener);
|
||||
locator.setReconnectAttempts(0);
|
||||
try (ClientSessionFactoryInternal ignored = locator.connectNoWarnings()) {
|
||||
return liveNodeIdListener.awaitNodePresent(timeout, unit);
|
||||
} catch (Exception notConnected) {
|
||||
if (!(notConnected instanceof ActiveMQException) || ActiveMQExceptionType.INTERNAL_ERROR.equals(((ActiveMQException) notConnected).getType())) {
|
||||
// report all exceptions that aren't ActiveMQException and all INTERNAL_ERRORs
|
||||
ActiveMQServerLogger.LOGGER.failedConnectingToCluster(notConnected);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static final class LiveNodeIdListener implements ClusterTopologyListener {
|
||||
|
||||
private static final Logger logger = Logger.getLogger(LiveNodeIdListener.class);
|
||||
private final String nodeId;
|
||||
private final String user;
|
||||
private final String password;
|
||||
private final CountDownLatch searchCompleted;
|
||||
private boolean isNodePresent = false;
|
||||
|
||||
LiveNodeIdListener(String nodeId, String user, String password) {
|
||||
this.nodeId = nodeId;
|
||||
this.user = user;
|
||||
this.password = password;
|
||||
this.searchCompleted = new CountDownLatch(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void nodeUP(TopologyMember topologyMember, boolean last) {
|
||||
boolean isOurNodeId = nodeId != null && nodeId.equals(topologyMember.getNodeId());
|
||||
if (isOurNodeId && isActive(topologyMember.getLive())) {
|
||||
isNodePresent = true;
|
||||
}
|
||||
if (isOurNodeId || last) {
|
||||
searchCompleted.countDown();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean awaitNodePresent(long timeout, TimeUnit unit) throws InterruptedException {
|
||||
searchCompleted.await(timeout, unit);
|
||||
return isNodePresent;
|
||||
}
|
||||
|
||||
/**
|
||||
* In a cluster of replicated live/backup pairs if a backup crashes and then its live crashes the cluster will
|
||||
* retain the topology information of the live such that when the live server restarts it will check the
|
||||
* cluster to see if its nodeID is present (which it will be) and then it will activate as a backup rather than
|
||||
* a live. To prevent this situation an additional check is necessary to see if the server with the matching
|
||||
* nodeID is actually active or not which is done by attempting to make a connection to it.
|
||||
*
|
||||
* @param transportConfiguration
|
||||
* @return
|
||||
*/
|
||||
private boolean isActive(TransportConfiguration transportConfiguration) {
|
||||
try (ServerLocator serverLocator = ActiveMQClient.createServerLocator(false, transportConfiguration);
|
||||
ClientSessionFactory clientSessionFactory = serverLocator.createSessionFactory();
|
||||
ClientSession clientSession = clientSessionFactory.createSession(user, password, false, false, false, false, 0)) {
|
||||
return true;
|
||||
} catch (Exception e) {
|
||||
logger.debug("isActive check failed", e);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void nodeDown(long eventUID, String nodeID) {
|
||||
// no-op
|
||||
}
|
||||
}
|
||||
|
||||
private static ServerLocatorInternal createLocator(Configuration configuration,
|
||||
ClusterConnectionConfiguration config) throws ActiveMQException {
|
||||
final ServerLocatorInternal locator;
|
||||
if (config.getDiscoveryGroupName() != null) {
|
||||
DiscoveryGroupConfiguration dg = configuration.getDiscoveryGroupConfigurations().get(config.getDiscoveryGroupName());
|
||||
|
||||
if (dg == null) {
|
||||
throw ActiveMQMessageBundle.BUNDLE.noDiscoveryGroupFound(null);
|
||||
}
|
||||
locator = (ServerLocatorInternal) ActiveMQClient.createServerLocatorWithHA(dg);
|
||||
} else {
|
||||
TransportConfiguration[] tcConfigs = config.getStaticConnectors() != null ? configuration.getTransportConfigurations(config.getStaticConnectors()) : null;
|
||||
|
||||
locator = (ServerLocatorInternal) ActiveMQClient.createServerLocatorWithHA(tcConfigs);
|
||||
}
|
||||
return locator;
|
||||
}
|
||||
|
||||
}
|
|
@ -28,7 +28,7 @@ import org.apache.activemq.artemis.core.server.NodeManager;
|
|||
import org.apache.activemq.artemis.utils.UUID;
|
||||
import org.apache.activemq.artemis.utils.UUIDGenerator;
|
||||
|
||||
import static java.nio.file.StandardOpenOption.CREATE_NEW;
|
||||
import static java.nio.file.StandardOpenOption.CREATE;
|
||||
import static java.nio.file.StandardOpenOption.READ;
|
||||
import static java.nio.file.StandardOpenOption.WRITE;
|
||||
|
||||
|
@ -36,33 +36,36 @@ public abstract class FileBasedNodeManager extends NodeManager {
|
|||
|
||||
protected static final byte FIRST_TIME_START = '0';
|
||||
public static final String SERVER_LOCK_NAME = "server.lock";
|
||||
public static final String DATA_VERSION_NAME = "server.data.version";
|
||||
public static final String SERVER_ACTIVATION_SEQUENCE_NAME = "server.activation.sequence";
|
||||
private static final String ACCESS_MODE = "rw";
|
||||
private final File directory;
|
||||
protected FileChannel channel;
|
||||
protected FileChannel dataVersionChannel;
|
||||
protected FileChannel activationSequenceChannel;
|
||||
|
||||
public FileBasedNodeManager(boolean replicatedBackup, File directory) {
|
||||
super(replicatedBackup);
|
||||
this.directory = directory;
|
||||
if (directory != null) {
|
||||
directory.mkdirs();
|
||||
}
|
||||
}
|
||||
|
||||
protected void useDataVersionChannel() throws IOException {
|
||||
if (dataVersionChannel != null) {
|
||||
protected void useActivationSequenceChannel() throws IOException {
|
||||
if (activationSequenceChannel != null) {
|
||||
return;
|
||||
}
|
||||
dataVersionChannel = FileChannel.open(newFile(DATA_VERSION_NAME).toPath(), READ, WRITE, CREATE_NEW);
|
||||
activationSequenceChannel = FileChannel.open(newFile(SERVER_ACTIVATION_SEQUENCE_NAME).toPath(), READ, WRITE, CREATE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long readDataVersion() throws NodeManagerException {
|
||||
public long readNodeActivationSequence() throws NodeManagerException {
|
||||
if (!isStarted()) {
|
||||
throw new NodeManagerException(new IllegalStateException("node manager must be started first"));
|
||||
}
|
||||
try {
|
||||
useDataVersionChannel();
|
||||
useActivationSequenceChannel();
|
||||
ByteBuffer tmpBuffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN);
|
||||
if (dataVersionChannel.read(tmpBuffer, 0) != Long.BYTES) {
|
||||
if (activationSequenceChannel.read(tmpBuffer, 0) != Long.BYTES) {
|
||||
return 0;
|
||||
}
|
||||
tmpBuffer.flip();
|
||||
|
@ -73,16 +76,17 @@ public abstract class FileBasedNodeManager extends NodeManager {
}

@Override
public void writeDataVersion(long version) throws NodeManagerException {
public void writeNodeActivationSequence(long version) throws NodeManagerException {
if (!isStarted()) {
throw new NodeManagerException(new IllegalStateException("node manager must be started first"));
}
try {
useDataVersionChannel();
useActivationSequenceChannel();
ByteBuffer tmpBuffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN);
tmpBuffer.putLong(0, version);
dataVersionChannel.write(tmpBuffer, 0);
dataVersionChannel.force(false);
activationSequenceChannel.write(tmpBuffer, 0);
activationSequenceChannel.force(false);
setNodeActivationSequence(version);
} catch (IOException ie) {
throw new NodeManagerException(ie);
}
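Since the activation sequence is persisted as a single big-endian long at offset 0 of the server.activation.sequence file (per the readNodeActivationSequence/writeNodeActivationSequence methods above), a small standalone reader can be sketched for inspection; the file location passed as an argument is an assumption and depends on the broker's data directory layout:

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;

public class PrintActivationSequence {
   public static void main(String[] args) throws IOException {
      // args[0]: path to the broker's server.activation.sequence file
      try (FileChannel channel = FileChannel.open(Paths.get(args[0]), StandardOpenOption.READ)) {
         ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES).order(ByteOrder.BIG_ENDIAN);
         if (channel.read(buffer, 0) != Long.BYTES) {
            System.out.println("no activation sequence recorded yet (treated as 0)");
            return;
         }
         buffer.flip();
         System.out.println("activation sequence = " + buffer.getLong());
      }
   }
}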
@ -149,12 +153,8 @@ public abstract class FileBasedNodeManager extends NodeManager {
|
|||
createNodeId();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return
|
||||
*/
|
||||
protected final File newFile(final String fileName) {
|
||||
File file = new File(directory, fileName);
|
||||
return file;
|
||||
return new File(directory, fileName);
|
||||
}
|
||||
|
||||
protected final synchronized void createNodeId() throws IOException {
|
||||
|
@ -190,8 +190,8 @@ public abstract class FileBasedNodeManager extends NodeManager {
|
|||
channelCopy.close();
|
||||
} finally {
|
||||
try {
|
||||
FileChannel dataVersionChannel = this.dataVersionChannel;
|
||||
this.dataVersionChannel = null;
|
||||
FileChannel dataVersionChannel = this.activationSequenceChannel;
|
||||
this.activationSequenceChannel = null;
|
||||
if (dataVersionChannel != null) {
|
||||
dataVersionChannel.close();
|
||||
}
|
||||
|
|
|
@ -19,9 +19,6 @@ package org.apache.activemq.artemis.core.server.impl;
|
|||
import javax.annotation.concurrent.GuardedBy;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
|
@ -35,7 +32,7 @@ import org.apache.activemq.artemis.core.replication.ReplicationEndpoint;
|
|||
import org.apache.activemq.artemis.core.server.ActiveMQServer;
|
||||
import org.apache.activemq.artemis.core.server.ActiveMQServerLogger;
|
||||
import org.apache.activemq.artemis.core.server.LiveNodeLocator;
|
||||
import org.apache.activemq.artemis.core.server.NodeManager;;
|
||||
import org.apache.activemq.artemis.core.server.NodeManager;
|
||||
import org.apache.activemq.artemis.core.server.cluster.ClusterControl;
|
||||
import org.apache.activemq.artemis.core.server.cluster.ClusterController;
|
||||
import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationBackupPolicy;
|
||||
|
@ -45,6 +42,8 @@ import org.apache.activemq.artemis.quorum.UnavailableStateException;
|
|||
import org.jboss.logging.Logger;
|
||||
|
||||
import static org.apache.activemq.artemis.core.server.impl.ReplicationObserver.ReplicationFailure;
|
||||
import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.ensureSequentialAccessToNodeData;
|
||||
import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.tryActivate;
|
||||
|
||||
/**
|
||||
* This activation can be used by a primary while trying to fail-back ie {@code failback == true} or
|
||||
|
@ -54,7 +53,6 @@ public final class ReplicationBackupActivation extends Activation implements Dis
|
|||
|
||||
private static final Logger LOGGER = Logger.getLogger(ReplicationBackupActivation.class);
|
||||
|
||||
private final boolean wasLive;
|
||||
private final ReplicationBackupPolicy policy;
|
||||
private final ActiveMQServerImpl activeMQServer;
|
||||
// This field is != null iff this node is a primary during a fail-back ie acting as a backup in order to become live again.
|
||||
|
@ -72,10 +70,8 @@ public final class ReplicationBackupActivation extends Activation implements Dis
|
|||
private final AtomicBoolean stopping;
|
||||
|
||||
public ReplicationBackupActivation(final ActiveMQServerImpl activeMQServer,
|
||||
final boolean wasLive,
|
||||
final DistributedPrimitiveManager distributedManager,
|
||||
final ReplicationBackupPolicy policy) {
|
||||
this.wasLive = wasLive;
|
||||
this.activeMQServer = activeMQServer;
|
||||
if (policy.isTryFailback()) {
|
||||
final SimpleString serverNodeID = activeMQServer.getNodeID();
|
||||
|
@ -146,14 +142,35 @@ public final class ReplicationBackupActivation extends Activation implements Dis
|
|||
}
|
||||
}
|
||||
try {
|
||||
LOGGER.info("Trying to reach majority of quorum service nodes");
|
||||
distributedManager.start();
|
||||
LOGGER.info("Quorum service available: starting broker");
|
||||
final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence();
|
||||
// only a backup with positive local activation sequence could contain valuable data
|
||||
if (nodeActivationSequence > 0) {
|
||||
final String nodeId = activeMQServer.getNodeManager().getNodeId().toString();
|
||||
DistributedLock liveLockWithInSyncReplica;
|
||||
while (true) {
|
||||
distributedManager.start();
|
||||
try {
|
||||
liveLockWithInSyncReplica = tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER);
|
||||
break;
|
||||
} catch (UnavailableStateException canRecoverEx) {
|
||||
distributedManager.stop();
|
||||
}
|
||||
}
|
||||
if (liveLockWithInSyncReplica != null) {
|
||||
// retain state and start as live
|
||||
if (!activeMQServer.initialisePart1(false)) {
|
||||
return;
|
||||
}
|
||||
activeMQServer.setState(ActiveMQServerImpl.SERVER_STATE.STARTED);
|
||||
startAsLive(liveLockWithInSyncReplica);
|
||||
return;
|
||||
}
|
||||
}
|
||||
distributedManager.addUnavailableManagerListener(this);
|
||||
// Stop the previous node manager and create a new one with NodeManager::replicatedBackup == true:
|
||||
// NodeManager::start skip setup lock file with NodeID, until NodeManager::stopBackup is called.
|
||||
activeMQServer.resetNodeManager();
|
||||
activeMQServer.getNodeManager().stop();
|
||||
// A primary need to preserve NodeID across runs
|
||||
activeMQServer.moveServerData(policy.getMaxSavedReplicatedJournalsSize(), policy.isTryFailback());
|
||||
activeMQServer.getNodeManager().start();
|
||||
|
@ -164,11 +181,15 @@ public final class ReplicationBackupActivation extends Activation implements Dis
|
|||
if (closed)
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
final ClusterController clusterController = activeMQServer.getClusterManager().getClusterController();
|
||||
|
||||
LOGGER.infof("Apache ActiveMQ Artemis Backup Server version %s [%s] started, awaiting connection to a live cluster member to start replication", activeMQServer.getVersion().getFullVersion(),
|
||||
activeMQServer.toString());
|
||||
|
||||
clusterController.awaitConnectionToReplicationCluster();
|
||||
activeMQServer.getBackupManager().start();
|
||||
ActiveMQServerLogger.LOGGER.backupServerStarted(activeMQServer.getVersion().getFullVersion(),
|
||||
activeMQServer.getNodeManager().getNodeId());
|
||||
activeMQServer.setState(ActiveMQServerImpl.SERVER_STATE.STARTED);
|
||||
final DistributedLock liveLock = replicateAndFailover(clusterController);
|
||||
if (liveLock == null) {
|
||||
|
@ -192,6 +213,14 @@ public final class ReplicationBackupActivation extends Activation implements Dis
|
|||
liveLock.close();
|
||||
return;
|
||||
}
|
||||
try {
|
||||
ensureSequentialAccessToNodeData(activeMQServer, distributedManager, LOGGER);
|
||||
} catch (Throwable fatal) {
|
||||
LOGGER.warn(fatal);
|
||||
// policy is already live one, but there's no activation yet: we can just stop
|
||||
asyncRestartServer(activeMQServer, false, false);
|
||||
throw new ActiveMQIllegalStateException("This server cannot ensure sequential access to broker data: activation is failed");
|
||||
}
|
||||
ActiveMQServerLogger.LOGGER.becomingLive(activeMQServer);
|
||||
// stopBackup is going to write the NodeID previously set on the NodeManager,
|
||||
// because activeMQServer.resetNodeManager() has created a NodeManager with replicatedBackup == true.
|
||||
|
@ -260,28 +289,38 @@ public final class ReplicationBackupActivation extends Activation implements Dis
|
|||
return null;
|
||||
}
|
||||
LOGGER.debugf("ReplicationFailure = %s", failure);
|
||||
boolean voluntaryFailOver = false;
|
||||
switch (failure) {
|
||||
case VoluntaryFailOver:
|
||||
voluntaryFailOver = true;
|
||||
case NonVoluntaryFailover:
|
||||
final DistributedLock liveLock = tryAcquireLiveLock();
|
||||
// from now on we're meant to stop:
|
||||
// - due to failover
|
||||
// - due to restart/stop
|
||||
assert stopping.get();
|
||||
if (liveLock != null) {
|
||||
return liveLock;
|
||||
if (!stopping.compareAndSet(false, true)) {
|
||||
return null;
|
||||
}
|
||||
boolean restart = true;
|
||||
if (voluntaryFailOver && isFirstFailbackAttempt()) {
|
||||
restart = false;
|
||||
LOGGER.error("Failed to fail-back: stopping broker based on quorum results");
|
||||
// no more interested into these events: handling it manually from here
|
||||
distributedManager.removeUnavailableManagerListener(this);
|
||||
final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence();
|
||||
final String nodeId = activeMQServer.getNodeManager().getNodeId().toString();
|
||||
DistributedLock liveLockWithInSyncReplica = null;
|
||||
if (nodeActivationSequence > 0) {
|
||||
try {
|
||||
liveLockWithInSyncReplica = tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER);
|
||||
} catch (Throwable error) {
|
||||
// no need to retry here, can just restart as backup that will handle a more resilient tryActivate
|
||||
LOGGER.warn("Errored while attempting failover", error);
|
||||
liveLockWithInSyncReplica = null;
|
||||
}
|
||||
} else {
|
||||
ActiveMQServerLogger.LOGGER.restartingAsBackupBasedOnQuorumVoteResults();
|
||||
LOGGER.warnf("We expect local activation sequence for NodeID = %s to be > 0 on a fail-over, while is %d", nodeId, nodeActivationSequence);
|
||||
}
|
||||
// let's ignore the stopping flag here, we're in control of it
|
||||
asyncRestartServer(activeMQServer, restart, false);
|
||||
assert stopping.get();
|
||||
if (liveLockWithInSyncReplica != null) {
|
||||
return liveLockWithInSyncReplica;
|
||||
}
|
||||
ActiveMQServerLogger.LOGGER.restartingAsBackupBasedOnQuorumVoteResults();
|
||||
// let's ignore the stopping flag here, we're already in control of it
|
||||
asyncRestartServer(activeMQServer, true, false);
|
||||
return null;
|
||||
case RegistrationError:
|
||||
LOGGER.error("Stopping broker because of critical registration error");
|
||||
|
@ -307,73 +346,7 @@ public final class ReplicationBackupActivation extends Activation implements Dis
|
|||
}
|
||||
}
|
||||
} finally {
|
||||
silentExecution("Errored on cluster topology listener for replication cleanup", () -> clusterController.removeClusterTopologyListenerForReplication(nodeLocator));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* {@code wasLive} is {code true} only while transitioning from primary to backup.<br>
|
||||
* If a natural born backup become live and allows failback, while transitioning to back again
|
||||
* {@code wasLive} is still {@code false}.<br>
|
||||
* The check on {@link ReplicationBackupPolicy#isTryFailback()} is redundant but still useful for correctness.
|
||||
* <p>
|
||||
* In case of fail-back, any event that's going to restart this broker as backup (eg quorum service unavailable
|
||||
* or some replication failures) will cause {@code wasLive} to be {@code false}, because the HA policy set isn't
|
||||
* a primary anymore.
|
||||
*/
|
||||
private boolean isFirstFailbackAttempt() {
|
||||
return wasLive && policy.isTryFailback();
|
||||
}
|
||||
|
||||
private DistributedLock tryAcquireLiveLock() throws InterruptedException {
|
||||
// disable quorum service unavailability handling and just treat this imperatively
|
||||
if (!stopping.compareAndSet(false, true)) {
|
||||
// already unavailable quorum service: fail fast
|
||||
return null;
|
||||
}
|
||||
distributedManager.removeUnavailableManagerListener(this);
|
||||
assert activeMQServer.getNodeManager().getNodeId() != null;
|
||||
final String liveID = activeMQServer.getNodeManager().getNodeId().toString();
|
||||
final int voteRetries = policy.getVoteRetries();
|
||||
final long maxAttempts = voteRetries >= 0 ? (voteRetries + 1) : -1;
|
||||
if (maxAttempts == -1) {
|
||||
LOGGER.error("It's not safe to retry an infinite amount of time to acquire a live lock: please consider setting a vote-retries value");
|
||||
}
|
||||
final long voteRetryWait = policy.getVoteRetryWait();
|
||||
final DistributedLock liveLock = getLock(distributedManager, liveID);
|
||||
if (liveLock == null) {
|
||||
return null;
|
||||
}
|
||||
for (long attempt = 0; maxAttempts >= 0 ? (attempt < maxAttempts) : true; attempt++) {
|
||||
try {
|
||||
if (liveLock.tryLock(voteRetryWait, TimeUnit.MILLISECONDS)) {
|
||||
LOGGER.debugf("%s live lock acquired after %d attempts.", liveID, (attempt + 1));
|
||||
return liveLock;
|
||||
}
|
||||
} catch (UnavailableStateException e) {
|
||||
LOGGER.warnf(e, "Failed to acquire live lock %s because of unavailable quorum service: stop trying", liveID);
|
||||
distributedManager.stop();
|
||||
return null;
|
||||
}
|
||||
}
|
||||
LOGGER.warnf("Failed to acquire live lock %s after %d tries", liveID, maxAttempts);
|
||||
distributedManager.stop();
|
||||
return null;
|
||||
}
|
||||
|
||||
private DistributedLock getLock(final DistributedPrimitiveManager manager,
|
||||
final String lockId) throws InterruptedException {
|
||||
if (!manager.isStarted()) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
return manager.getDistributedLock(lockId);
|
||||
} catch (ExecutionException e) {
|
||||
LOGGER.warnf(e, "Errored while getting lock %s", lockId);
|
||||
return null;
|
||||
} catch (TimeoutException te) {
|
||||
LOGGER.warnf(te, "Timeout while getting lock %s", lockId);
|
||||
return null;
|
||||
silentExecution("Error on cluster topology listener for replication cleanup", () -> clusterController.removeClusterTopologyListenerForReplication(nodeLocator));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -405,7 +378,6 @@ public final class ReplicationBackupActivation extends Activation implements Dis
|
|||
return ReplicationFailure.RegistrationError;
|
||||
}
|
||||
this.replicationEndpoint = replicationEndpoint;
|
||||
assert replicationEndpoint != null;
|
||||
try {
|
||||
return replicationObserver.awaitReplicationFailure();
|
||||
} finally {
|
||||
|
@ -414,11 +386,11 @@ public final class ReplicationBackupActivation extends Activation implements Dis
|
|||
closeChannelOf(replicationEndpoint);
|
||||
}
|
||||
} finally {
|
||||
silentExecution("Errored on live control close", liveControl::close);
|
||||
silentExecution("Error on live control close", liveControl::close);
|
||||
}
|
||||
} finally {
|
||||
silentExecution("Errored on cluster topology listener cleanup", () -> clusterController.removeClusterTopologyListener(replicationObserver));
|
||||
silentExecution("Errored while removing incoming interceptor for replication", () -> clusterController.removeIncomingInterceptorForReplication(replicationError));
|
||||
silentExecution("Error on cluster topology listener cleanup", () -> clusterController.removeClusterTopologyListener(replicationObserver));
|
||||
silentExecution("Error while removing incoming interceptor for replication", () -> clusterController.removeIncomingInterceptorForReplication(replicationError));
|
||||
}
|
||||
} finally {
|
||||
this.replicationObserver = null;
|
||||
|
@ -438,7 +410,7 @@ public final class ReplicationBackupActivation extends Activation implements Dis
|
|||
return;
|
||||
}
|
||||
if (replicationEndpoint.getChannel() != null) {
|
||||
silentExecution("Errored while closing replication endpoint channel", () -> replicationEndpoint.getChannel().close());
|
||||
silentExecution("Error while closing replication endpoint channel", () -> replicationEndpoint.getChannel().close());
|
||||
replicationEndpoint.setChannel(null);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,12 +19,14 @@ package org.apache.activemq.artemis.core.server.impl;
|
|||
import javax.annotation.concurrent.GuardedBy;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import org.apache.activemq.artemis.api.core.ActiveMQAlreadyReplicatingException;
|
||||
import org.apache.activemq.artemis.api.core.ActiveMQException;
|
||||
import org.apache.activemq.artemis.api.core.ActiveMQIllegalStateException;
|
||||
import org.apache.activemq.artemis.api.core.Pair;
|
||||
import org.apache.activemq.artemis.api.core.TransportConfiguration;
|
||||
import org.apache.activemq.artemis.core.persistence.StorageManager;
|
||||
import org.apache.activemq.artemis.core.protocol.core.Channel;
|
||||
import org.apache.activemq.artemis.core.protocol.core.ChannelHandler;
|
||||
import org.apache.activemq.artemis.core.protocol.core.CoreRemotingConnection;
|
||||
|
@ -46,7 +48,10 @@ import org.apache.activemq.artemis.quorum.UnavailableStateException;
|
|||
import org.apache.activemq.artemis.spi.core.remoting.Acceptor;
|
||||
import org.jboss.logging.Logger;
|
||||
|
||||
import static org.apache.activemq.artemis.core.server.impl.ClusterTopologySearch.searchActiveLiveNodeId;
|
||||
import static org.apache.activemq.artemis.core.server.ActiveMQServer.SERVER_STATE.STARTED;
|
||||
import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.awaitNextCommittedActivationSequence;
|
||||
import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.ensureSequentialAccessToNodeData;
|
||||
import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.tryActivate;
|
||||
|
||||
/**
|
||||
* This is going to be {@link #run()} just by natural born primary, at the first start.
|
||||
|
@ -55,8 +60,9 @@ import static org.apache.activemq.artemis.core.server.impl.ClusterTopologySearch
|
|||
public class ReplicationPrimaryActivation extends LiveActivation implements DistributedLock.UnavailableLockListener {
|
||||
|
||||
private static final Logger LOGGER = Logger.getLogger(ReplicationPrimaryActivation.class);
|
||||
private static final long DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS = 20_000;
|
||||
private static final long BLOCKING_CALLS_TIMEOUT_MILLIS = 5_000;
|
||||
// This is the time we expect a replica to become a live from the quorum pov
|
||||
// ie time to execute tryActivate and ensureSequentialAccessToNodeData
|
||||
private static final long FAILBACK_TIMEOUT_MILLIS = 4_000;
|
||||
|
||||
private final ReplicationPrimaryPolicy policy;
|
||||
|
||||
|
@ -69,7 +75,7 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
|
|||
|
||||
private final DistributedPrimitiveManager distributedManager;
|
||||
|
||||
private volatile boolean stoppingServer;
|
||||
private final AtomicBoolean stoppingServer;
|
||||
|
||||
public ReplicationPrimaryActivation(final ActiveMQServerImpl activeMQServer,
|
||||
final DistributedPrimitiveManager distributedManager,
|
||||
|
@ -78,6 +84,7 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
|
|||
this.policy = policy;
|
||||
this.replicationLock = new Object();
|
||||
this.distributedManager = distributedManager;
|
||||
this.stoppingServer = new AtomicBoolean();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -101,19 +108,36 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
|
|||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
|
||||
final NodeManager nodeManager = activeMQServer.getNodeManager();
|
||||
|
||||
final String nodeId = nodeManager.readNodeId().toString();
|
||||
|
||||
final long dataVersion = nodeManager.readDataVersion();
|
||||
|
||||
final DistributedLock liveLock = searchLiveOrAcquireLiveLock(nodeId, BLOCKING_CALLS_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS);
|
||||
|
||||
// we have a common nodeId that we can share and coordinate with between peers
|
||||
if (policy.getCoordinationId() != null) {
|
||||
LOGGER.infof("Applying shared peer NodeID=%s to enable coordinated live activation", policy.getCoordinationId());
|
||||
// REVISIT: this is quite clunky, also in backup activation, we just need new nodeID persisted!
|
||||
activeMQServer.resetNodeManager();
|
||||
activeMQServer.getNodeManager().start();
|
||||
activeMQServer.getNodeManager().setNodeID(policy.getCoordinationId());
|
||||
activeMQServer.getNodeManager().stopBackup();
|
||||
}
|
||||
final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence();
|
||||
final String nodeId = activeMQServer.getNodeManager().readNodeId().toString();
|
||||
DistributedLock liveLock;
|
||||
while (true) {
|
||||
distributedManager.start();
|
||||
try {
|
||||
liveLock = tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER);
|
||||
break;
|
||||
} catch (UnavailableStateException canRecoverEx) {
|
||||
distributedManager.stop();
|
||||
}
|
||||
}
|
||||
if (liveLock == null) {
|
||||
distributedManager.stop();
|
||||
LOGGER.infof("This broker cannot become a live server with NodeID = %s: restarting as backup", nodeId);
|
||||
activeMQServer.setHAPolicy(policy.getBackupPolicy());
|
||||
return;
|
||||
}
|
||||
|
||||
ensureSequentialAccessToNodeData(activeMQServer, distributedManager, LOGGER);
|
||||
|
||||
activeMQServer.initialisePart1(false);
|
||||
|
||||
activeMQServer.initialisePart2(false);
|
||||
|
@ -142,73 +166,9 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
|
|||
}
|
||||
}
|
||||
|
||||
private DistributedLock searchLiveOrAcquireLiveLock(final String nodeId,
|
||||
final long blockingCallTimeout,
|
||||
final TimeUnit unit) throws ActiveMQException, InterruptedException {
|
||||
if (policy.isCheckForLiveServer()) {
|
||||
LOGGER.infof("Searching a live server with NodeID = %s", nodeId);
|
||||
if (searchActiveLiveNodeId(policy.getClusterName(), nodeId, blockingCallTimeout, unit, activeMQServer.getConfiguration())) {
|
||||
LOGGER.infof("Found a live server with NodeID = %s: restarting as backup", nodeId);
|
||||
activeMQServer.setHAPolicy(policy.getBackupPolicy());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
startDistributedPrimitiveManager();
|
||||
return acquireDistributeLock(getDistributeLock(nodeId), blockingCallTimeout, unit);
|
||||
}
|
||||
|
||||
private void startDistributedPrimitiveManager() throws InterruptedException, ActiveMQException {
|
||||
LOGGER.infof("Trying to reach the majority of quorum nodes in %d ms.", DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS);
|
||||
try {
|
||||
if (distributedManager.start(DISTRIBUTED_MANAGER_START_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS)) {
|
||||
return;
|
||||
}
|
||||
} catch (InterruptedException ie) {
|
||||
throw ie;
|
||||
} catch (Throwable t) {
|
||||
LOGGER.debug(t);
|
||||
}
|
||||
assert !distributedManager.isStarted();
|
||||
throw new ActiveMQException("Cannot reach the majority of quorum nodes");
|
||||
}
|
||||
|
||||
private DistributedLock getDistributeLock(final String nodeId) throws InterruptedException, ActiveMQException {
|
||||
try {
|
||||
return distributedManager.getDistributedLock(nodeId);
|
||||
} catch (Throwable t) {
|
||||
try {
|
||||
distributedManager.stop();
|
||||
} catch (Throwable ignore) {
|
||||
// don't care
|
||||
}
|
||||
if (t instanceof InterruptedException) {
|
||||
throw (InterruptedException) t;
|
||||
}
|
||||
throw new ActiveMQException("Cannot obtain a live lock instance");
|
||||
}
|
||||
}
|
||||
|
||||
private DistributedLock acquireDistributeLock(final DistributedLock liveLock,
|
||||
final long acquireLockTimeout,
|
||||
final TimeUnit unit) throws InterruptedException, ActiveMQException {
|
||||
try {
|
||||
if (liveLock.tryLock(acquireLockTimeout, unit)) {
|
||||
return liveLock;
|
||||
}
|
||||
} catch (UnavailableStateException e) {
|
||||
LOGGER.debug(e);
|
||||
}
|
||||
try {
|
||||
distributedManager.stop();
|
||||
} catch (Throwable ignore) {
|
||||
// don't care
|
||||
}
|
||||
throw new ActiveMQException("Failed to become live");
|
||||
}
|
||||
|
||||
@Override
|
||||
public ChannelHandler getActivationChannelHandler(final Channel channel, final Acceptor acceptorUsed) {
|
||||
if (stoppingServer) {
|
||||
if (stoppingServer.get()) {
|
||||
return null;
|
||||
}
|
||||
return packet -> {
|
||||
|
@ -268,7 +228,7 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
|
|||
awaitBackupAnnouncementOnFailbackRequest(clusterConnection);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (activeMQServer.getState() == ActiveMQServerImpl.SERVER_STATE.STARTED) {
|
||||
if (activeMQServer.getState() == STARTED) {
|
||||
/*
|
||||
* The reasoning here is that the exception was either caused by (1) the
|
||||
* (interaction with) the backup, or (2) by an IO Error at the storage. If (1), we
|
||||
|
@ -283,7 +243,9 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
|
|||
ActiveMQServerLogger.LOGGER.errorStoppingReplication(amqe);
|
||||
} finally {
|
||||
synchronized (replicationLock) {
|
||||
this.replicationManager = null;
|
||||
if (this.replicationManager == replicationManager) {
|
||||
this.replicationManager = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -308,35 +270,55 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* If {@link #asyncStopServer()} happens before this call, the restart just won't happen.
|
||||
* If {@link #asyncStopServer()} happens after this call, it will make the server stop right after being restarted.
|
||||
*/
|
||||
private void restartAsBackupAfterFailback() throws Exception {
|
||||
if (stoppingServer) {
|
||||
if (stoppingServer.get()) {
|
||||
return;
|
||||
}
|
||||
synchronized (this) {
|
||||
if (stoppingServer) {
|
||||
final String coordinatedLockAndNodeId;
|
||||
final long inSyncReplicaActivation;
|
||||
synchronized (replicationLock) {
|
||||
if (stoppingServer.get()) {
|
||||
return;
|
||||
}
|
||||
distributedManager.stop();
|
||||
final ReplicationManager replicationManager = this.replicationManager;
|
||||
if (replicationManager == null) {
|
||||
LOGGER.warnf("Failback interrupted");
|
||||
// we got a disconnection from the replica *before* stopping acceptors: better not failback!
|
||||
return;
|
||||
}
|
||||
// IMPORTANT: this is going to save server::fail to issue a replica connection failure (with failed == false)
|
||||
// because onReplicationConnectionClose fail-fast on stopping == true.
|
||||
if (!stoppingServer.compareAndSet(false, true)) {
|
||||
LOGGER.infof("Failback interrupted: server is already stopping");
|
||||
return;
|
||||
}
|
||||
coordinatedLockAndNodeId = activeMQServer.getNodeManager().getNodeId().toString();
|
||||
inSyncReplicaActivation = activeMQServer.getNodeManager().getNodeActivationSequence();
|
||||
// none can notice a concurrent drop of replica connection here: awaitNextCommittedActivationSequence defensively
|
||||
// wait FAILBACK_TIMEOUT_MILLIS, proceed as backup and compete to become live again
|
||||
activeMQServer.fail(true);
|
||||
ActiveMQServerLogger.LOGGER.restartingReplicatedBackupAfterFailback();
|
||||
activeMQServer.setHAPolicy(policy.getBackupPolicy());
|
||||
activeMQServer.start();
|
||||
}
|
||||
try {
|
||||
distributedManager.start();
|
||||
if (!awaitNextCommittedActivationSequence(distributedManager, coordinatedLockAndNodeId, inSyncReplicaActivation, FAILBACK_TIMEOUT_MILLIS, LOGGER)) {
|
||||
LOGGER.warnf("Timed out waiting for failback server activation with NodeID = %s: and sequence > %d: after %dms",
|
||||
coordinatedLockAndNodeId, inSyncReplicaActivation, FAILBACK_TIMEOUT_MILLIS);
|
||||
}
|
||||
} catch (UnavailableStateException ignored) {
|
||||
LOGGER.debug("Unavailable distributed manager while awaiting failback activation sequence: ignored", ignored);
|
||||
} finally {
|
||||
distributedManager.stop();
|
||||
}
|
||||
ActiveMQServerLogger.LOGGER.restartingReplicatedBackupAfterFailback();
|
||||
activeMQServer.setHAPolicy(policy.getBackupPolicy());
|
||||
activeMQServer.start();
|
||||
}
|
||||
|
||||
private void asyncStopServer() {
|
||||
if (stoppingServer) {
|
||||
if (stoppingServer.get()) {
|
||||
return;
|
||||
}
|
||||
synchronized (this) {
|
||||
if (stoppingServer) {
|
||||
return;
|
||||
}
|
||||
stoppingServer = true;
|
||||
if (stoppingServer.compareAndSet(false, true)) {
|
||||
new Thread(() -> {
|
||||
try {
|
||||
activeMQServer.stop();
|
||||
|
@ -374,19 +356,28 @@ public class ReplicationPrimaryActivation extends LiveActivation implements Dist
|
|||
private void onReplicationConnectionClose() {
|
||||
ExecutorService executorService = activeMQServer.getThreadPool();
|
||||
if (executorService != null) {
|
||||
synchronized (replicationLock) {
|
||||
if (replicationManager == null) {
|
||||
return;
|
||||
}
|
||||
if (stoppingServer.get()) {
|
||||
return;
|
||||
}
|
||||
executorService.execute(() -> {
|
||||
synchronized (replicationLock) {
|
||||
if (replicationManager == null) {
|
||||
return;
|
||||
}
|
||||
// we increment only if we are staying alive
|
||||
if (!stoppingServer.get() && STARTED.equals(activeMQServer.getState())) {
|
||||
try {
|
||||
ensureSequentialAccessToNodeData(activeMQServer, distributedManager, LOGGER);
|
||||
} catch (Throwable fatal) {
|
||||
LOGGER.errorf(fatal, "Unexpected exception: %s on attempted activation sequence increment; stopping server async", fatal.getLocalizedMessage());
|
||||
asyncStopServer();
|
||||
}
|
||||
}
|
||||
// this is going to stop the replication manager
|
||||
activeMQServer.getStorageManager().stopReplication();
|
||||
assert !replicationManager.isStarted();
|
||||
final StorageManager storageManager = activeMQServer.getStorageManager();
|
||||
if (storageManager != null) {
|
||||
storageManager.stopReplication();
|
||||
}
|
||||
replicationManager = null;
|
||||
}
|
||||
});
|
||||
|
|
|
@ -0,0 +1,312 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.activemq.artemis.core.server.impl.quorum;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
import org.apache.activemq.artemis.api.core.ActiveMQException;
|
||||
import org.apache.activemq.artemis.core.server.ActiveMQServer;
|
||||
import org.apache.activemq.artemis.core.server.NodeManager;
|
||||
import org.apache.activemq.artemis.quorum.DistributedLock;
|
||||
import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
|
||||
import org.apache.activemq.artemis.quorum.MutableLong;
|
||||
import org.apache.activemq.artemis.quorum.UnavailableStateException;
|
||||
import org.jboss.logging.Logger;
|
||||
|
||||
/**
|
||||
* This class contains the activation sequence logic of the pluggable quorum vote:
|
||||
* it should be used by {@link org.apache.activemq.artemis.core.server.impl.ReplicationBackupActivation}
|
||||
* and {@link org.apache.activemq.artemis.core.server.impl.ReplicationPrimaryActivation} to coordinate
|
||||
* for replication.
|
||||
*/
|
||||
public final class ActivationSequenceStateMachine {
|
||||
|
||||
private static final long CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS = 200;
|
||||
private static final long CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS = 2000;
|
||||
private static final long LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS = 2000;
|
||||
|
||||
private ActivationSequenceStateMachine() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* It loops if the data of the broker is still valuable, but cannot become live.
|
||||
* It loops (temporarily) if data is in sync or can self-heal, but cannot yet acquire the live lock.
|
||||
* <p>
|
||||
* It stops looping and returns:
|
||||
* <p><ul>
|
||||
* <li>{@code null}: if data is stale (and there are no rights to become live)
|
||||
* <li>{@code !=null}: if data is in sync and the {@link DistributedLock} is correctly acquired
|
||||
* </ul><p>
|
||||
* <p>
|
||||
* After this method returns successfully, i.e. with a non-null return value, a broker should use
|
||||
* {@link #ensureSequentialAccessToNodeData(ActiveMQServer, DistributedPrimitiveManager, Logger)} to complete
|
||||
* the activation and guarantee the initial not-replicated ownership of data.
|
||||
*/
|
||||
public static DistributedLock tryActivate(final String nodeId,
|
||||
final long nodeActivationSequence,
|
||||
final DistributedPrimitiveManager distributedManager,
|
||||
final Logger logger) throws InterruptedException, ExecutionException, TimeoutException, UnavailableStateException {
|
||||
final DistributedLock activationLock = distributedManager.getDistributedLock(nodeId);
|
||||
try (MutableLong coordinatedNodeSequence = distributedManager.getMutableLong(nodeId)) {
|
||||
while (true) {
|
||||
// dirty read is sufficient to know if we are *not* an in sync replica
|
||||
// typically the lock owner will increment to signal our data is stale and we are happy without any
|
||||
// further coordination at this point
|
||||
switch (validateActivationSequence(coordinatedNodeSequence, activationLock, nodeId, nodeActivationSequence, logger)) {
|
||||
|
||||
case Stale:
|
||||
activationLock.close();
|
||||
return null;
|
||||
case SelfRepair:
|
||||
case InSync:
|
||||
break;
|
||||
case MaybeInSync:
|
||||
if (activationLock.tryLock()) {
|
||||
// BAD: where's the broker that should commit it?
|
||||
activationLock.unlock();
|
||||
logger.warnf("Cannot assume live role for NodeID = %s: claimed activation sequence need to be repaired",
|
||||
nodeId);
|
||||
TimeUnit.MILLISECONDS.sleep(CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS);
|
||||
continue;
|
||||
}
|
||||
// quick path while data is still valuable: wait until something changes (commit/repair)
|
||||
TimeUnit.MILLISECONDS.sleep(CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS);
|
||||
continue;
|
||||
}
|
||||
// SelfRepair, InSync
|
||||
if (!activationLock.tryLock(LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS)) {
|
||||
logger.debugf("Candidate for Node ID = %s, with local activation sequence: %d, cannot acquire live lock within %dms; retrying",
|
||||
nodeId, nodeActivationSequence, LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS);
|
||||
continue;
|
||||
}
|
||||
switch (validateActivationSequence(coordinatedNodeSequence, activationLock, nodeId, nodeActivationSequence, logger)) {
|
||||
|
||||
case Stale:
|
||||
activationLock.close();
|
||||
return null;
|
||||
case SelfRepair:
|
||||
// Self-repair sequence, i.e. we were the only one with the most up-to-date data.
|
||||
// NOTE: We cannot move the sequence now, let's delay it on ensureSequentialAccessToNodeData
|
||||
logger.infof("Assuming live role for NodeID = %s: local activation sequence %d matches claimed coordinated activation sequence %d. Repairing sequence", nodeId, nodeActivationSequence, nodeActivationSequence);
|
||||
return activationLock;
|
||||
case InSync:
|
||||
// we are an in_sync_replica, good to go live as UNREPLICATED
|
||||
logger.infof("Assuming live role for NodeID = %s, local activation sequence %d matches current coordinated activation sequence %d", nodeId, nodeActivationSequence, nodeActivationSequence);
|
||||
return activationLock;
|
||||
case MaybeInSync:
|
||||
activationLock.unlock();
|
||||
logger.warnf("Cannot assume live role for NodeID = %s: claimed activation sequence need to be repaired", nodeId);
|
||||
TimeUnit.MILLISECONDS.sleep(CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private enum ValidationResult {
|
||||
/**
|
||||
* coordinated activation sequence (claimed/committed) is far beyond the local one: data is not valuable anymore
|
||||
**/
|
||||
Stale,
|
||||
/**
|
||||
* coordinated activation sequence is the same as local one: data is in sync
|
||||
**/
|
||||
InSync,
|
||||
/**
|
||||
* next coordinated activation sequence is not committed yet: maybe data is in sync
|
||||
**/
|
||||
MaybeInSync,
|
||||
/**
|
||||
* next coordinated activation sequence is not committed yet, but this broker can self-repair: data is in sync
|
||||
**/
|
||||
SelfRepair
|
||||
}
|
||||
|
||||
private static ValidationResult validateActivationSequence(final MutableLong coordinatedNodeSequence,
|
||||
final DistributedLock activationLock,
|
||||
final String lockAndLongId,
|
||||
final long nodeActivationSequence,
|
||||
final Logger logger) throws UnavailableStateException {
|
||||
assert coordinatedNodeSequence.getMutableLongId().equals(lockAndLongId);
|
||||
assert activationLock.getLockId().equals(lockAndLongId);
|
||||
final long currentCoordinatedNodeSequence = coordinatedNodeSequence.get();
|
||||
if (nodeActivationSequence == currentCoordinatedNodeSequence) {
|
||||
return ValidationResult.InSync;
|
||||
}
|
||||
if (currentCoordinatedNodeSequence > 0) {
|
||||
logger.infof("Not a candidate for NodeID = %s activation, local activation sequence %d does not match coordinated activation sequence %d",
|
||||
lockAndLongId, nodeActivationSequence, currentCoordinatedNodeSequence);
|
||||
return ValidationResult.Stale;
|
||||
}
|
||||
// claimed activation sequence
|
||||
final long claimedCoordinatedNodeSequence = -currentCoordinatedNodeSequence;
|
||||
final long sequenceGap = claimedCoordinatedNodeSequence - nodeActivationSequence;
|
||||
if (sequenceGap == 0) {
|
||||
return ValidationResult.SelfRepair;
|
||||
}
|
||||
if (sequenceGap == 1) {
|
||||
// maybe data is still valuable
|
||||
return ValidationResult.MaybeInSync;
|
||||
}
|
||||
assert sequenceGap > 1;
|
||||
// sequence is moved so much that data is no longer valuable
|
||||
logger.infof("Not a candidate for NodeID = %s activation, local activation sequence %d does not match coordinated activation sequence %d",
|
||||
lockAndLongId, nodeActivationSequence, claimedCoordinatedNodeSequence);
|
||||
return ValidationResult.Stale;
|
||||
}
|
||||
|
||||
/**
|
||||
* It waits until {@code timeoutMillis} milliseconds have passed or the coordinated activation sequence has progressed enough
|
||||
*/
|
||||
public static boolean awaitNextCommittedActivationSequence(final DistributedPrimitiveManager distributedManager,
|
||||
final String coordinatedLockAndNodeId,
|
||||
final long activationSequence,
|
||||
final long timeoutMills,
|
||||
final Logger logger)
|
||||
throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException {
|
||||
Objects.requireNonNull(distributedManager);
|
||||
Objects.requireNonNull(logger);
|
||||
Objects.requireNonNull(coordinatedLockAndNodeId);
|
||||
if (activationSequence < 0) {
|
||||
throw new IllegalArgumentException("activationSequence must be >= 0, while is " + activationSequence);
|
||||
}
|
||||
if (!distributedManager.isStarted()) {
|
||||
throw new IllegalStateException("manager must be started");
|
||||
}
|
||||
final MutableLong coordinatedActivationSequence = distributedManager.getMutableLong(coordinatedLockAndNodeId);
|
||||
// wait for the live to activate and run unreplicated with a sequence > inSyncReplicaActivation
|
||||
// this read can be dirty b/c we are just looking for an increment.
|
||||
boolean anyNext = false;
|
||||
final long timeoutNs = TimeUnit.MILLISECONDS.toNanos(timeoutMills);
|
||||
final long started = System.nanoTime();
|
||||
long elapsedNs;
|
||||
do {
|
||||
final long coordinatedValue = coordinatedActivationSequence.get();
|
||||
if (coordinatedValue > activationSequence) {
|
||||
// all good, some activation has gone ahead
|
||||
logger.infof("Detected a new activation sequence with NodeID = %s: and sequence: %d", coordinatedLockAndNodeId, coordinatedValue);
|
||||
anyNext = true;
|
||||
break;
|
||||
}
|
||||
if (coordinatedValue < 0) {
|
||||
// commit claim
|
||||
final long claimedSequence = -coordinatedValue;
|
||||
final long activationsGap = claimedSequence - activationSequence;
|
||||
if (activationsGap > 1) {
|
||||
// all good, some activation has gone ahead
|
||||
logger.infof("Detected furthers sequential server activations from sequence %d, with NodeID = %s: and claimed sequence: %d", activationSequence, coordinatedLockAndNodeId, claimedSequence);
|
||||
anyNext = true;
|
||||
break;
|
||||
}
|
||||
// activation is still in progress
|
||||
logger.debugf("Detected claiming of activation sequence = %d for NodeID = %s", claimedSequence, coordinatedLockAndNodeId);
|
||||
}
|
||||
try {
|
||||
TimeUnit.MILLISECONDS.sleep(CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS);
|
||||
} catch (InterruptedException ignored) {
|
||||
}
|
||||
elapsedNs = System.nanoTime() - started;
|
||||
}
|
||||
while (elapsedNs < timeoutNs);
|
||||
return anyNext;
|
||||
}
|
||||
|
||||
/**
|
||||
* This is going to increment the coordinated activation sequence while holding the live lock, failing with some exception otherwise.<br>
|
||||
* <p>
|
||||
* The acceptable states are {@link ValidationResult#InSync} and {@link ValidationResult#SelfRepair}, throwing some exception otherwise.
|
||||
* <p>
|
||||
* This must be used while holding a live lock to ensure not-exclusive ownership of data, i.e. it can be used both
|
||||
* while losing connectivity with a replica or after a successful {@link #tryActivate(String, long, DistributedPrimitiveManager, Logger)}.
|
||||
*/
|
||||
public static void ensureSequentialAccessToNodeData(ActiveMQServer activeMQServer,
|
||||
DistributedPrimitiveManager distributedPrimitiveManager,
|
||||
final Logger logger) throws ActiveMQException, InterruptedException, UnavailableStateException, ExecutionException, TimeoutException {
|
||||
|
||||
final NodeManager nodeManager = activeMQServer.getNodeManager();
|
||||
final String lockAndLongId = nodeManager.getNodeId().toString();
|
||||
final DistributedLock liveLock = distributedPrimitiveManager.getDistributedLock(lockAndLongId);
|
||||
if (!liveLock.isHeldByCaller()) {
|
||||
final String message = String.format("Server [%s], live lock for NodeID = %s, not held, activation sequence cannot be safely changed",
|
||||
activeMQServer, lockAndLongId);
|
||||
logger.info(message);
|
||||
throw new UnavailableStateException(message);
|
||||
}
|
||||
final long nodeActivationSequence = nodeManager.readNodeActivationSequence();
|
||||
final MutableLong coordinatedNodeActivationSequence = distributedPrimitiveManager.getMutableLong(lockAndLongId);
|
||||
final long currentCoordinatedActivationSequence = coordinatedNodeActivationSequence.get();
|
||||
final long nextActivationSequence;
|
||||
if (currentCoordinatedActivationSequence < 0) {
|
||||
// Check Self-Repair
|
||||
if (nodeActivationSequence != -currentCoordinatedActivationSequence) {
|
||||
final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, local activation sequence %d does not match current claimed coordinated sequence %d: need repair",
|
||||
activeMQServer, lockAndLongId, nodeActivationSequence, -currentCoordinatedActivationSequence);
|
||||
logger.info(message);
|
||||
throw new ActiveMQException(message);
|
||||
}
|
||||
// auto-repair: this is the same server that failed to commit its claimed sequence
|
||||
nextActivationSequence = nodeActivationSequence;
|
||||
} else {
|
||||
// Check InSync
|
||||
if (nodeActivationSequence != currentCoordinatedActivationSequence) {
|
||||
final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, local activation sequence %d does not match current coordinated sequence %d",
|
||||
activeMQServer, lockAndLongId, nodeActivationSequence, currentCoordinatedActivationSequence);
|
||||
logger.info(message);
|
||||
throw new ActiveMQException(message);
|
||||
}
|
||||
nextActivationSequence = nodeActivationSequence + 1;
|
||||
}
|
||||
// UN_REPLICATED STATE ENTER: auto-repair doesn't need to claim and write locally
|
||||
if (nodeActivationSequence != nextActivationSequence) {
|
||||
// claim
|
||||
if (!coordinatedNodeActivationSequence.compareAndSet(nodeActivationSequence, -nextActivationSequence)) {
|
||||
final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, activation sequence claim failed, local activation sequence %d no longer matches current coordinated sequence %d",
|
||||
activeMQServer, lockAndLongId, nodeActivationSequence, coordinatedNodeActivationSequence.get());
|
||||
logger.infof(message);
|
||||
throw new ActiveMQException(message);
|
||||
}
|
||||
// claim success: write locally
|
||||
try {
|
||||
nodeManager.writeNodeActivationSequence(nextActivationSequence);
|
||||
} catch (NodeManager.NodeManagerException fatal) {
|
||||
logger.errorf("Server [%s] failed to set local activation sequence to: %d for NodeId =%s. Cannot continue committing coordinated activation sequence: REQUIRES ADMIN INTERVENTION",
|
||||
activeMQServer, nextActivationSequence, lockAndLongId);
|
||||
throw new UnavailableStateException(fatal);
|
||||
}
|
||||
logger.infof("Server [%s], incremented local activation sequence to: %d for NodeId = %s",
|
||||
activeMQServer, nextActivationSequence, lockAndLongId);
|
||||
} else {
|
||||
// self-heal needs to update the in-memory sequence, because no writes will do it
|
||||
nodeManager.setNodeActivationSequence(nextActivationSequence);
|
||||
}
|
||||
// commit
|
||||
if (!coordinatedNodeActivationSequence.compareAndSet(-nextActivationSequence, nextActivationSequence)) {
|
||||
final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, activation sequence commit failed, local activation sequence %d no longer matches current coordinated sequence %d",
|
||||
activeMQServer, lockAndLongId, nodeActivationSequence, coordinatedNodeActivationSequence.get());
|
||||
logger.infof(message);
|
||||
throw new ActiveMQException(message);
|
||||
}
|
||||
logger.infof("Server [%s], incremented coordinated activation sequence to: %d for NodeId = %s",
|
||||
activeMQServer, nextActivationSequence, lockAndLongId);
|
||||
}
|
||||
|
||||
}
|
|
@ -244,7 +244,9 @@ public class ManagementServiceImpl implements ManagementService {
|
|||
ObjectName objectName = objectNameBuilder.getActiveMQServerObjectName();
|
||||
unregisterFromJMX(objectName);
|
||||
unregisterFromRegistry(ResourceNames.BROKER);
|
||||
unregisterMeters(ResourceNames.BROKER + "." + messagingServer.getConfiguration().getName());
|
||||
if (messagingServer != null) {
|
||||
unregisterMeters(ResourceNames.BROKER + "." + messagingServer.getConfiguration().getName());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -3189,13 +3189,12 @@
|
|||
</xsd:documentation>
|
||||
</xsd:annotation>
|
||||
</xsd:element>
|
||||
<xsd:element name="check-for-live-server" type="xsd:boolean" default="false" maxOccurs="1" minOccurs="0">
|
||||
<xsd:element name="coordination-id" type="xsd:string" maxOccurs="1" minOccurs="0">
|
||||
<xsd:annotation>
|
||||
<xsd:documentation>
|
||||
Whether to check the cluster for a (live) server using our own server ID when starting
|
||||
up. This option is only necessary for performing 'fail-back' on replicating
|
||||
servers. Strictly speaking this setting only applies to live servers and not to
|
||||
backups.
|
||||
The common identity to use for coordination that is shared across instances that will replicate.
|
||||
The value will be used as the internal server nodeId and as the identity of entities in the
|
||||
distributed-primitive-manager.
|
||||
</xsd:documentation>
|
||||
</xsd:annotation>
|
||||
</xsd:element>
|
||||
|
@ -3208,21 +3207,6 @@
|
|||
</xsd:documentation>
|
||||
</xsd:annotation>
|
||||
</xsd:element>
|
||||
<xsd:element name="vote-retries" type="xsd:integer" default="12" minOccurs="0" maxOccurs="1">
|
||||
<xsd:annotation>
|
||||
<xsd:documentation>
|
||||
If we start as a replica and lose connection to the master, how many times should we attempt to vote
|
||||
for quorum before restarting
|
||||
</xsd:documentation>
|
||||
</xsd:annotation>
|
||||
</xsd:element>
|
||||
<xsd:element name="vote-retry-wait" type="xsd:long" default="2000" minOccurs="0" maxOccurs="1">
|
||||
<xsd:annotation>
|
||||
<xsd:documentation>
|
||||
How long to wait (in milliseconds) between each vote
|
||||
</xsd:documentation>
|
||||
</xsd:annotation>
|
||||
</xsd:element>
|
||||
<xsd:element name="retry-replication-wait" type="xsd:long" default="2000" minOccurs="0" maxOccurs="1">
|
||||
<xsd:annotation>
|
||||
<xsd:documentation>
|
||||
|
@ -3288,20 +3272,6 @@
|
|||
</xsd:documentation>
|
||||
</xsd:annotation>
|
||||
</xsd:element>
|
||||
<xsd:element name="vote-retries" type="xsd:integer" default="12" minOccurs="0" maxOccurs="1">
|
||||
<xsd:annotation>
|
||||
<xsd:documentation>
|
||||
If we lose connection to the master, how many times should we attempt to vote for quorum before restarting
|
||||
</xsd:documentation>
|
||||
</xsd:annotation>
|
||||
</xsd:element>
|
||||
<xsd:element name="vote-retry-wait" type="xsd:long" default="2000" minOccurs="0" maxOccurs="1">
|
||||
<xsd:annotation>
|
||||
<xsd:documentation>
|
||||
How long to wait (in milliseconds) between each vote
|
||||
</xsd:documentation>
|
||||
</xsd:annotation>
|
||||
</xsd:element>
|
||||
<xsd:element name="retry-replication-wait" type="xsd:long" default="2000" minOccurs="0" maxOccurs="1">
|
||||
<xsd:annotation>
|
||||
<xsd:documentation>
|
||||
|
|
|
@ -283,7 +283,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase {
|
|||
assertFalse(policy.canScaleDown());
|
||||
assertFalse(policy.isBackup());
|
||||
assertFalse(policy.isSharedStore());
|
||||
assertTrue(policy.isCheckForLiveServer());
|
||||
assertTrue(policy.isWaitForActivation());
|
||||
assertEquals("purple", policy.getGroupName());
|
||||
assertEquals("purple", policy.getBackupGroupName());
|
||||
|
@ -297,8 +296,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase {
|
|||
assertEquals(policy.getBackupGroupName(), failbackPolicy.getBackupGroupName());
|
||||
assertEquals(policy.getClusterName(), failbackPolicy.getClusterName());
|
||||
assertEquals(failbackPolicy.getMaxSavedReplicatedJournalsSize(), ActiveMQDefaultConfiguration.getDefaultMaxSavedReplicatedJournalsSize());
|
||||
assertEquals(1, failbackPolicy.getVoteRetries());
|
||||
assertEquals(1000, failbackPolicy.getVoteRetryWait());
|
||||
assertTrue(failbackPolicy.isTryFailback());
|
||||
assertTrue(failbackPolicy.isBackup());
|
||||
assertFalse(failbackPolicy.isSharedStore());
|
||||
|
@ -337,8 +334,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase {
|
|||
assertEquals("tiddles", policy.getBackupGroupName());
|
||||
assertEquals("33rrrrr", policy.getClusterName());
|
||||
assertEquals(22, policy.getMaxSavedReplicatedJournalsSize());
|
||||
assertEquals(1, policy.getVoteRetries());
|
||||
assertEquals(1000, policy.getVoteRetryWait());
|
||||
assertFalse(policy.isTryFailback());
|
||||
assertTrue(policy.isBackup());
|
||||
assertFalse(policy.isSharedStore());
|
||||
|
@ -358,7 +353,6 @@ public class HAPolicyConfigurationTest extends ActiveMQTestBase {
|
|||
assertFalse(failoverLivePolicy.canScaleDown());
|
||||
assertFalse(failoverLivePolicy.isBackup());
|
||||
assertFalse(failoverLivePolicy.isSharedStore());
|
||||
assertFalse(failoverLivePolicy.isCheckForLiveServer());
|
||||
assertTrue(failoverLivePolicy.isWaitForActivation());
|
||||
assertEquals(policy.getGroupName(), failoverLivePolicy.getGroupName());
|
||||
assertEquals(policy.getClusterName(), failoverLivePolicy.getClusterName());
|
||||
|
|
|
@ -30,8 +30,6 @@
|
|||
<cluster-name>33rrrrr</cluster-name>
|
||||
<initial-replication-sync-timeout>9876</initial-replication-sync-timeout>
|
||||
<retry-replication-wait>12345</retry-replication-wait>
|
||||
<vote-retries>1</vote-retries>
|
||||
<vote-retry-wait>1000</vote-retry-wait>
|
||||
<allow-failback>false</allow-failback>
|
||||
<manager>
|
||||
<class-name>
|
||||
|
|
|
@ -27,9 +27,6 @@
|
|||
<cluster-name>abcdefg</cluster-name>
|
||||
<initial-replication-sync-timeout>9876</initial-replication-sync-timeout>
|
||||
<retry-replication-wait>12345</retry-replication-wait>
|
||||
<check-for-live-server>true</check-for-live-server>
|
||||
<vote-retries>1</vote-retries>
|
||||
<vote-retry-wait>1000</vote-retry-wait>
|
||||
<manager>
|
||||
<class-name>
|
||||
org.apache.activemq.artemis.core.config.impl.HAPolicyConfigurationTest$FakeDistributedPrimitiveManager
|
||||
|
|
|
@ -65,8 +65,8 @@ which we will cover in a later chapter.
|
|||
> message data will not be available after failover.
|
||||
|
||||
The `ha-policy` type configures which strategy a cluster should use to
|
||||
provide the backing up of a servers data. Within this configuration
|
||||
element is configured how a server should behave within the cluster,
|
||||
provide the backing up of a server's data. Within this configuration
|
||||
element we configure how a server should behave within the cluster,
|
||||
either as a master (live), slave (backup) or colocated (both live and
|
||||
backup). This would look something like:
|
||||
|
||||
|
@ -98,7 +98,7 @@ or
|
|||
</ha-policy>
|
||||
```
|
||||
|
||||
*Replication* allows too to configure 2 new roles to enable *pluggable quorum* provider configuration, by using:
|
||||
*Replication* allows the configuration of two new roles to enable *pluggable quorum* provider configuration, by using:
|
||||
```xml
|
||||
<ha-policy>
|
||||
<replication>
|
||||
|
@ -116,17 +116,14 @@ to configure the classic *master* role, and
|
|||
```
|
||||
for the classic *slave* one.
|
||||
|
||||
If *replication* is configured using such new roles some additional element is required to complete configuration, detailed later.
|
||||
If *replication* is configured using these new roles, some additional elements are required to complete the configuration, as detailed later.
|
||||
|
||||
### IMPORTANT NOTE ON PLUGGABLE QUORUM VOTE FEATURE
|
||||
|
||||
This feature is still **EXPERIMENTAL** and not meant to be run in production yet.
|
||||
This feature is still **EXPERIMENTAL**. Extra testing should be done before running this feature in production. Please raise any issues you find on the ActiveMQ Artemis mailing lists.
|
||||
|
||||
It means:
|
||||
- its configuration can change until declared as **officially stable**
|
||||
- it has to solve yet an inherent data misalignment issue with replication (it can happen with `classic` replication as well)
|
||||
|
||||
More info about this issue are on [ARTEMIS-3340](https://issues.apache.org/jira/browse/ARTEMIS-3340).
|
||||
- its configuration can change until declared as **officially stable**
|
||||
|
||||
### Data Replication
|
||||
|
||||
|
@ -226,12 +223,12 @@ changes and repeats the process.
|
|||
> live server by changing `slave` to `master`.
|
||||
|
||||
Much like in the shared-store case, when the live server stops or
|
||||
crashes, its replicating backup will become active and take over its
|
||||
crashes, its replicating backup will become active and take over its
|
||||
duties. Specifically, the backup will become active when it loses
|
||||
connection to its live server. This can be problematic because this can
|
||||
also happen because of a temporary network problem.
|
||||
connection to its live server. This can be problematic because it can
|
||||
also happen as the result of a temporary network problem.
|
||||
|
||||
This issue is solved in 2 different ways depending on which replication roles are configured:
|
||||
The issue can be solved in two different ways, depending on which replication roles are configured:
|
||||
- **classic replication** (`master`/`slave` roles): backup will try to determine whether it still can
|
||||
connect to the other servers in the cluster. If it can connect to more
|
||||
than half the servers, it will become active, if more than half the
|
||||
|
@ -275,7 +272,7 @@ The backup server must be similarly configured but as a `slave`
|
|||
</ha-policy>
|
||||
```
|
||||
|
||||
To configure a pluggable quorum replication's primary and backup instead:
|
||||
To configure a pluggable quorum replication's primary and backup use:
|
||||
|
||||
```xml
|
||||
<ha-policy>
|
||||
|
@ -391,7 +388,6 @@ For `primary`:
|
|||
<property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
|
||||
</properties>
|
||||
</manager>
|
||||
<check-for-live-server>true</check-for-live-server>
|
||||
</primary>
|
||||
</replication>
|
||||
</ha-policy>
|
||||
|
@ -418,16 +414,16 @@ The configuration of `class-name` as follows
|
|||
```
|
||||
isn't really needed, because Apache Curator is the default provider, but has been shown for completeness.
|
||||
|
||||
The `properties` element, instead
|
||||
The `properties` element:
|
||||
```xml
|
||||
<properties>
|
||||
<property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
|
||||
</properties>
|
||||
```
|
||||
Can specify a list of `property` elements in the form of key-value pairs, depending the ones
|
||||
accepted by the specified `class-name` provider.
|
||||
can specify a list of `property` elements in the form of key-value pairs, appropriate to what is
|
||||
supported by the specified `class-name` provider.
|
||||
|
||||
Apache Curator's provider allow to configure these properties:
|
||||
Apache Curator's provider allows the following properties:
|
||||
|
||||
- [`connect-string`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#connectString(java.lang.String)): (no default)
|
||||
- [`session-ms`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#sessionTimeoutMs(int)): (default is 18000 ms)
|
||||
|
@ -438,36 +434,25 @@ Apache Curator's provider allow to configure these properties:
|
|||
- [`retries-ms`](https://curator.apache.org/apidocs/org/apache/curator/retry/RetryNTimes.html#%3Cinit%3E(int,int)): (default is 1000 ms)
|
||||
- [`namespace`](https://curator.apache.org/apidocs/org/apache/curator/framework/CuratorFrameworkFactory.Builder.html#namespace(java.lang.String)): (no default)
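For illustration only, a `manager` combining several of these properties could look like the following sketch; the connect-string addresses and the `namespace` value are placeholders, while `18000` simply repeats the documented default to show the syntax:
```xml
<manager>
   <class-name>org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager</class-name>
   <properties>
      <!-- placeholder ZooKeeper ensemble addresses -->
      <property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
      <!-- 18000 ms is the documented default, repeated only to show the syntax -->
      <property key="session-ms" value="18000"/>
      <!-- illustrative namespace; there is no default -->
      <property key="namespace" value="artemis-quorum"/>
   </properties>
</manager>
```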
|
||||
|
||||
Configuration of the [Apache Zookeeper](https://zookeeper.apache.org/) nodes is left to the user, but there are few
|
||||
Configuration of the [Apache Zookeeper](https://zookeeper.apache.org/) ensemble is the responsibility of the user, but there are a few
|
||||
**suggestions to improve the reliability of the quorum service**:
|
||||
- broker `session_ms` must be `>= 2 * server tick time` and `<= 20 * server tick time` as per the
|
||||
[Zookeeper 3.6.3 admin guide](https://zookeeper.apache.org/doc/r3.6.3/zookeeperAdmin.html): it directly impacts how fast a backup
|
||||
can failover to an isolated/killed/unresponsive live; the higher, the slower.
|
||||
- GC on broker machine should allow keeping GC pauses within 1/3 of `session_ms` in order to let the Zookeeper heartbeat protocol
|
||||
to work reliably: if it's not possible, better increase `session_ms` accepting a slower failover
|
||||
work reliably. If that is not possible, it is better to increase `session_ms`, accepting a slower failover.
|
||||
- Zookeeper must have enough resources to keep GC (and OS) pauses much smaller than server tick time: please consider carefully if
|
||||
broker and Zookeeper node should share the same physical machine, depending on the expected load of the broker
|
||||
- network isolation protection requires configuring >=3 Zookeeper nodes
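As a rough sizing sketch for the first suggestion above, assuming the ZooKeeper ensemble runs with the common `tickTime` of 2000 ms, `session-ms` has to stay between `2 * 2000 = 4000` ms and `20 * 2000 = 40000` ms; the 18000 ms default fits that window:
```xml
<!-- assuming tickTime=2000 ms on the ZooKeeper side: 4000 ms <= session-ms <= 40000 ms -->
<property key="session-ms" value="18000"/>
```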
|
||||
|
||||
#### *Important*: Notes on pluggable quorum replication configuration
|
||||
|
||||
The first `classic` replication configuration that won't apply to the pluggable quorum replication
|
||||
is `vote-on-replication-failure` and configure it produces a startup error: pluggable quorum replication
|
||||
always behave like `vote-on-replication-failure` `true` ie shutting down a live broker (and its JVM) in case of quorum loss.
|
||||
|
||||
The second deprecated `classic` replication configuration is `quorum-vote-wait`: given that the pluggable quorum vote replication
|
||||
requires backup to have an always-on reliable quorum service, there's no need to specify the timeout to reach
|
||||
the majority of quorum nodes. A backup remains inactive (ie JVM still up, console too, unable to sync with live, to failover etc etc)
|
||||
until the majority of quorum nodes is reachable again, re-activating if happens.
|
||||
|
||||
The only exception is with primary failing-back to an existing live backup using `<allow-failback>true</allow-failback>`:
|
||||
if the quorum service isn't immediately available the primary (and its JVM) just stop, allowing fail-fast failing-back.
|
||||
|
||||
There are few *semantic differences* of other existing properties:
|
||||
- `vote-retry-wait`: in `classic` replication means how long to wait between each quorum vote try, while with pluggable quorum replication
|
||||
means how long request to failover for each attempt
|
||||
- `vote-retries`: differently from `classic`, the amount of vote attempt is `1 + vote-retries` (with classic is just `vote-retries`).
|
||||
Setting `0` means no retries, leaving backup to still perform an initial attempt.
|
||||
There are some no longer needed `classic` replication configurations:
|
||||
- `vote-on-replication-failure`
|
||||
- `quorum-vote-wait`
|
||||
- `vote-retries`
|
||||
- `vote-retry-wait`
|
||||
- `check-for-live-server`
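For reference, a minimal pluggable quorum `backup` therefore carries none of the elements above; a sketch (manager class and connect string are illustrative, other tuning elements remain optional):
```xml
<ha-policy>
   <replication>
      <backup>
         <manager>
            <class-name>org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager</class-name>
            <properties>
               <property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
            </properties>
         </manager>
         <allow-failback>true</allow-failback>
      </backup>
   </replication>
</ha-policy>
```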
|
||||
|
||||
**Notes on replication configuration with [Apache curator](https://curator.apache.org/) quorum provider**
|
||||
|
||||
|
@ -479,34 +464,34 @@ For the former case (session expiration with live no longer present), the backup
|
|||
1. cluster connection PINGs (affected by [connection-ttl](connection-ttl.md) tuning)
|
||||
2. closed TCP connection notification (depends by TCP configuration and networking stack/topology)
|
||||
|
||||
These 2 cases have 2 different failover duration depending on different factors:
|
||||
1. `connection-ttl` affect how much time of the expiring `session-ms` is used to just detect a missing live broker: the higher `connection-tt`,
|
||||
the slower it reacts; backup can attempt to failover for the remaining `session-ms - connection-ttl`
|
||||
2. `session-ms` expiration is immediately detected: backup must try to failover for >=`session-ms` to be sure to catch
|
||||
the session expiration and complete failover
|
||||
The suggestion is to tune `connection-ttl` low enough to attempt failover as soon as possible, while taking in consideration that
|
||||
the whole fail-over duration cannot last less than the configured `session-ms`.
|
||||
|
||||
The previous comments are meant to suggest to the careful reader that the minimum time to attempt to failover
|
||||
cannot be below the full `session-ms` expires.
|
||||
In short, it means
|
||||
```
|
||||
total failover attempt time > session-ms
|
||||
```
|
||||
with
|
||||
```
|
||||
total failover attempt time = vote-retry-wait * (vote-retries + 1)
|
||||
```
|
||||
and by consequence:
|
||||
```
|
||||
vote-retry-wait * (vote-retries + 1) > session-ms
|
||||
```
|
||||
For example with `session-ms = 18000 ms`, safe values for failover timeout are:
|
||||
##### Peer or Multi Primary
|
||||
With coordination delegated to the quorum service, roles are less important. It is possible to have two peer servers compete
|
||||
for activation; the winner activating as live, the loser taking up a backup role. On restart, 'any' peer server
|
||||
with the most up to date journal can activate.
|
||||
The instances need to know in advance what identity they will coordinate on.
|
||||
In the replication 'primary' ha policy we can explicitly set the 'coordination-id' to a common value for all peers in a cluster.
|
||||
|
||||
For `multi primary`:
|
||||
```xml
|
||||
<vote-retries>11</vote-retries>
|
||||
<vote-retry-wait>2000</vote-retry-wait>
|
||||
<ha-policy>
|
||||
<replication>
|
||||
<primary>
|
||||
<manager>
|
||||
<class-name>org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager</class-name>
|
||||
<properties>
|
||||
<property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
|
||||
</properties>
|
||||
</manager>
|
||||
<coordination-id>peer-journal-001</coordination-id>
|
||||
</primary>
|
||||
</replication>
|
||||
</ha-policy>
|
||||
```
|
||||
Because `11 * 2000 = 22000 ms` that's bigger then `18000 ms`.
|
||||
|
||||
There's no risk that a backup broker will early stop attempting to failover, losing its chance to become live.
|
||||
Note: the string value provided will be converted internally into a 16 byte UUID, so it may not be immediately recognisable or human-readable,
|
||||
however it will ensure that all 'peers' coordinate.
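As a sketch, every other peer in the group would presumably declare the same `coordination-id` and point at the same quorum manager, for example:
```xml
<!-- second peer: same coordination-id, same quorum manager (illustrative values) -->
<primary>
   <manager>
      <class-name>org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager</class-name>
      <properties>
         <property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
      </properties>
   </manager>
   <coordination-id>peer-journal-001</coordination-id>
</primary>
```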
|
||||
|
||||
### Shared Store
|
||||
|
||||
|
@ -637,10 +622,10 @@ another server using its nodeID. If it finds one, it will contact this
|
|||
server and try to "fail-back". Since this is a remote replication
|
||||
scenario, the "starting live" will have to synchronize its data with the
|
||||
server running with its ID, once they are in sync, it will request the
|
||||
other server (which it assumes it is a back that has assumed its duties)
|
||||
to shutdown for it to take over. This is necessary because otherwise the
|
||||
other server (which it assumes is a backup that has assumed its duties)
|
||||
to shutdown, for it to take over. This is necessary because otherwise the
|
||||
live server has no means to know whether there was a fail-over or not,
|
||||
and if there was if the server that took its duties is still running or
|
||||
and if there was, if the server that took its duties is still running or
|
||||
not. To configure this option at your `broker.xml`
|
||||
configuration file as follows, for classic replication:
|
||||
|
||||
|
@ -663,21 +648,21 @@ And pluggable quorum replication:
|
|||
<!-- some meaningful configuration -->
|
||||
</manager>
|
||||
<primary>
|
||||
<check-for-live-server>true</check-for-live-server>
|
||||
<!-- no need to check-for-live-server anymore -->
|
||||
</primary>
|
||||
</replication>
|
||||
</ha-policy>
|
||||
```
|
||||
|
||||
The key difference from classic replication is that if `master` cannot reach any
|
||||
live server with its same nodeID, it's going straight to become live, while `primary`
|
||||
request it to the quorum provider, searching again for any existing live if
|
||||
the quorum provider is not available (eg connectivity loss, consensus absence) or
|
||||
if there's another live broker with the same nodeID alive, in an endless loop.
|
||||
live server with its nodeID, it activates unilaterally.
|
||||
With `primary`, the responsibilities of coordination are delegated to the quorum provider;
|
||||
there are no unilateral decisions. The `primary` will only activate when
|
||||
it knows that it has the most up to date version of the journal identified by its nodeID.
|
||||
|
||||
In short: a started `primary` cannot become live without consensus.
|
||||
|
||||
> **Warning**
|
||||
> **Warning for classic replication**
|
||||
>
|
||||
> Be aware that if you restart a live server while after failover has
|
||||
> occurred then `check-for-live-server` must be set to `true`. If not the live server
|
||||
|
|
|
@ -62,6 +62,7 @@ under the License.
|
|||
<module>scale-down</module>
|
||||
<module>stop-server-failover</module>
|
||||
<module>transaction-failover</module>
|
||||
<module>zookeeper-single-pair-failback</module>
|
||||
</modules>
|
||||
</profile>
|
||||
<profile>
|
||||
|
@ -81,6 +82,7 @@ under the License.
|
|||
<module>replicated-transaction-failover</module>
|
||||
<module>scale-down</module>
|
||||
<module>transaction-failover</module>
|
||||
<module>zookeeper-single-pair-failback</module>
|
||||
</modules>
|
||||
</profile>
|
||||
</profiles>
|
||||
|
|
|
@ -0,0 +1,110 @@
|
|||
<?xml version='1.0'?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>org.apache.activemq.examples.failover</groupId>
|
||||
<artifactId>broker-failover</artifactId>
|
||||
<version>2.18.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>zookeeper-single-pair-ordered-failback</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
<name>ActiveMQ Artemis Zookeeper Single Pair Ordered Failback Example</name>
|
||||
|
||||
<properties>
|
||||
<activemq.basedir>${project.basedir}/../../../..</activemq.basedir>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.activemq</groupId>
|
||||
<artifactId>artemis-cli</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>jakarta.jms</groupId>
|
||||
<artifactId>jakarta.jms-api</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.activemq</groupId>
|
||||
<artifactId>artemis-maven-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>create0</id>
|
||||
<goals>
|
||||
<goal>create</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<!-- this makes it easier in certain envs -->
|
||||
<javaOptions>-Djava.net.preferIPv4Stack=true</javaOptions>
|
||||
<instance>${basedir}/target/server0</instance>
|
||||
<configuration>${basedir}/target/classes/activemq/server0</configuration>
|
||||
<javaOptions>-Dudp-address=${udp-address}</javaOptions>
|
||||
</configuration>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>create1</id>
|
||||
<goals>
|
||||
<goal>create</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<!-- this makes it easier in certain envs -->
|
||||
<javaOptions>-Djava.net.preferIPv4Stack=true</javaOptions>
|
||||
<instance>${basedir}/target/server1</instance>
|
||||
<configuration>${basedir}/target/classes/activemq/server1</configuration>
|
||||
<javaOptions>-Dudp-address=${udp-address}</javaOptions>
|
||||
</configuration>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>runClient</id>
|
||||
<goals>
|
||||
<goal>runClient</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<clientClass>org.apache.activemq.artemis.jms.example.ZookeeperSinglePairFailback</clientClass>
|
||||
<args>
|
||||
<param>${basedir}/target/server0</param>
|
||||
<param>${basedir}/target/server1</param>
|
||||
</args>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.activemq.examples.failover</groupId>
|
||||
<artifactId>zookeeper-single-pair-ordered-failback</artifactId>
|
||||
<version>2.18.0-SNAPSHOT</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-clean-plugin</artifactId>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
|
@ -0,0 +1,94 @@
|
|||
# Zookeeper Single Pair Ordered Failback Example
|
||||
|
||||
This example demonstrates two servers coupled as a primary-backup pair for high availability (HA) using the
|
||||
pluggable quorum vote replication Reference Implementation, based on [Apache Curator](https://curator.apache.org/), which uses
|
||||
[Apache Zookeeper](https://zookeeper.apache.org/) as the external quorum service.
|
||||
|
||||
The example shows a client connection failing over from live to backup when the live broker is crashed and
|
||||
then back to the original live when it is restarted (i.e. "failback").
|
||||
|
||||
To run the example, simply type **mvn verify** from this directory after running a Zookeeper node at `localhost:2181`.
|
||||
|
||||
If no Zookeeper node is configured, you can use the commands below (see the [Official Zookeeper Docker Image Site](https://hub.docker.com/_/zookeeper)
|
||||
for more details on how to configure it).
|
||||
|
||||
Run Zookeeper `3.6.3` with:
|
||||
```
|
||||
$ docker run --name artemis-zk --network host --restart always -d zookeeper:3.6.3
|
||||
```
|
||||
By default, the official docker image exposes `2181 2888 3888 8080` as client, follower, election and AdminServer ports.
|
||||
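If `--network host` is not usable in your environment (e.g. Docker Desktop), publishing just the client port should also work for this single-node setup; a sketch:
```
$ docker run --name artemis-zk -p 2181:2181 --restart always -d zookeeper:3.6.3
```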
|
||||
Verify the Zookeeper server started correctly by running:
|
||||
```
|
||||
$ docker logs --follow artemis-zk
|
||||
```
|
||||
It should print the Zookeeper welcome ASCII logs:
|
||||
```
|
||||
ZooKeeper JMX enabled by default
|
||||
Using config: /conf/zoo.cfg
|
||||
2021-08-05 14:29:29,431 [myid:] - INFO [main:QuorumPeerConfig@174] - Reading configuration from: /conf/zoo.cfg
|
||||
2021-08-05 14:29:29,434 [myid:] - INFO [main:QuorumPeerConfig@451] - clientPort is not set
|
||||
2021-08-05 14:29:29,434 [myid:] - INFO [main:QuorumPeerConfig@464] - secureClientPort is not set
|
||||
2021-08-05 14:29:29,434 [myid:] - INFO [main:QuorumPeerConfig@480] - observerMasterPort is not set
|
||||
2021-08-05 14:29:29,435 [myid:] - INFO [main:QuorumPeerConfig@497] - metricsProvider.className is org.apache.zookeeper.metrics.impl.DefaultMetricsProvider
|
||||
2021-08-05 14:29:29,438 [myid:] - ERROR [main:QuorumPeerConfig@722] - Invalid configuration, only one server specified (ignoring)
|
||||
2021-08-05 14:29:29,441 [myid:1] - INFO [main:DatadirCleanupManager@78] - autopurge.snapRetainCount set to 3
|
||||
2021-08-05 14:29:29,441 [myid:1] - INFO [main:DatadirCleanupManager@79] - autopurge.purgeInterval set to 0
|
||||
2021-08-05 14:29:29,441 [myid:1] - INFO [main:DatadirCleanupManager@101] - Purge task is not scheduled.
|
||||
2021-08-05 14:29:29,441 [myid:1] - WARN [main:QuorumPeerMain@138] - Either no config or no quorum defined in config, running in standalone mode
|
||||
2021-08-05 14:29:29,444 [myid:1] - INFO [main:ManagedUtil@44] - Log4j 1.2 jmx support found and enabled.
|
||||
2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@174] - Reading configuration from: /conf/zoo.cfg
|
||||
2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@451] - clientPort is not set
|
||||
2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@464] - secureClientPort is not set
|
||||
2021-08-05 14:29:29,449 [myid:1] - INFO [main:QuorumPeerConfig@480] - observerMasterPort is not set
|
||||
2021-08-05 14:29:29,450 [myid:1] - INFO [main:QuorumPeerConfig@497] - metricsProvider.className is org.apache.zookeeper.metrics.impl.DefaultMetricsProvider
|
||||
2021-08-05 14:29:29,450 [myid:1] - ERROR [main:QuorumPeerConfig@722] - Invalid configuration, only one server specified (ignoring)
|
||||
2021-08-05 14:29:29,451 [myid:1] - INFO [main:ZooKeeperServerMain@122] - Starting server
|
||||
2021-08-05 14:29:29,459 [myid:1] - INFO [main:ServerMetrics@62] - ServerMetrics initialized with provider org.apache.zookeeper.metrics.impl.DefaultMetricsProvider@525f1e4e
|
||||
2021-08-05 14:29:29,461 [myid:1] - INFO [main:FileTxnSnapLog@124] - zookeeper.snapshot.trust.empty : false
|
||||
2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] -
|
||||
2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - ______ _
|
||||
2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - |___ / | |
|
||||
2021-08-05 14:29:29,467 [myid:1] - INFO [main:ZookeeperBanner@42] - / / ___ ___ | | __ ___ ___ _ __ ___ _ __
|
||||
2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - / / / _ \ / _ \ | |/ / / _ \ / _ \ | '_ \ / _ \ | '__|
|
||||
2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - / /__ | (_) | | (_) | | < | __/ | __/ | |_) | | __/ | |
|
||||
2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - /_____| \___/ \___/ |_|\_\ \___| \___| | .__/ \___| |_|
|
||||
2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - | |
|
||||
2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] - |_|
|
||||
2021-08-05 14:29:29,468 [myid:1] - INFO [main:ZookeeperBanner@42] -
|
||||
```
|
||||
Alternatively, you can run
|
||||
```
|
||||
$ docker run -it --rm --network host zookeeper:3.6.3 zkCli.sh -server localhost:2181
|
||||
```
|
||||
The Zookeeper server can be reached at localhost:2181 if the client outputs something like:
|
||||
```
|
||||
2021-08-05 14:56:03,739 [myid:localhost:2181] - INFO [main-SendThread(localhost:2181):ClientCnxn$SendThread@1448] - Session establishment complete on server localhost/0:0:0:0:0:0:0:1:2181, session id = 0x100078b8cfc0002, negotiated timeout = 30000
|
||||
|
||||
```
|
||||
Type
|
||||
```
|
||||
[zk: localhost:2181(CONNECTED) 0] quit
|
||||
```
|
||||
to quit the client instance.
|
||||
|
||||
The 2 brokers of this example are already configured to connect to a single Zookeeper node at the mentioned address, thanks to the XML configuration of their `manager`:
|
||||
```xml
|
||||
<manager>
|
||||
<properties>
|
||||
<property key="connect-string" value="localhost:2181"/>
|
||||
<property key="namespace" value="examples"/>
|
||||
<property key="session-ms" value="18000"/>
|
||||
</properties>
|
||||
</manager>
|
||||
```
|
||||
**NOTE** the `namespace` parameter is used to separate this broker pair's coordination data from that of other applications when the Zookeeper node is shared.
|
||||
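Once the brokers are running, the same `zkCli.sh` client can be used to peek at the znodes the quorum manager creates under that namespace (the exact znode layout is an implementation detail and may differ between versions):
```
[zk: localhost:2181(CONNECTED) 0] ls /examples
```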
|
||||
**WARNING** As already recommended on the [High Availability section](https://activemq.apache.org/components/artemis/documentation/latest/ha.html), a production environment needs >= 3 nodes to protect against network partitions.
|
||||
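In such a deployment the `connect-string` would simply list every ensemble member, for example (hostnames are illustrative):
```xml
<property key="connect-string" value="zk1:2181,zk2:2181,zk3:2181"/>
```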
|
||||
This example can be run with
|
||||
```
|
||||
$ mvn verify
|
||||
```
|
||||
|
||||
For more information on ActiveMQ Artemis failover and HA, and clustering in general, please see the clustering section of the user manual.
|
|
@ -0,0 +1,157 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.activemq.artemis.jms.example;
|
||||
|
||||
import javax.jms.Connection;
|
||||
import javax.jms.ConnectionFactory;
|
||||
import javax.jms.JMSException;
|
||||
import javax.jms.MessageConsumer;
|
||||
import javax.jms.MessageProducer;
|
||||
import javax.jms.Queue;
|
||||
import javax.jms.Session;
|
||||
import javax.jms.TextMessage;
|
||||
import javax.naming.InitialContext;
|
||||
|
||||
import org.apache.activemq.artemis.util.ServerUtil;
|
||||
|
||||
/**
|
||||
* Example of live and replicating backup pair using Zookeeper as the quorum provider.
|
||||
* <p>
|
||||
* After both servers are started, the live server is killed and the backup becomes active ("fails-over").
|
||||
* <p>
|
||||
* Later the live server is restarted and takes back its position by asking the backup to stop ("fail-back").
|
||||
*/
|
||||
public class ZookeeperSinglePairFailback {
|
||||
|
||||
private static Process server0;
|
||||
|
||||
private static Process server1;
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
// Step 0. Prepare the Zookeeper environment as shown in readme.md
|
||||
|
||||
final int numMessages = 30;
|
||||
|
||||
Connection connection = null;
|
||||
|
||||
InitialContext initialContext = null;
|
||||
|
||||
try {
|
||||
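// Step 1. Start server #0 (the primary/live broker) and server #1 (the backup)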
server0 = ServerUtil.startServer(args[0], ZookeeperSinglePairFailback.class.getSimpleName() + "-primary", 0, 30000);
|
||||
server1 = ServerUtil.startServer(args[1], ZookeeperSinglePairFailback.class.getSimpleName() + "-backup", 1, 10000);
|
||||
|
||||
// Step 2. Get an initial context for looking up JNDI
|
||||
initialContext = new InitialContext();
|
||||
|
||||
// Step 3. Look up the JMS resources from JNDI
|
||||
Queue queue = (Queue) initialContext.lookup("queue/exampleQueue");
|
||||
ConnectionFactory connectionFactory = (ConnectionFactory) initialContext.lookup("ConnectionFactory");
|
||||
|
||||
// Step 4. Create a JMS Connection
|
||||
connection = connectionFactory.createConnection();
|
||||
|
||||
// Step 5. Create a *non-transacted* JMS Session with client acknowledgement
|
||||
Session session = connection.createSession(false, Session.CLIENT_ACKNOWLEDGE);
|
||||
|
||||
// Step 6. Start the connection to ensure delivery occurs
|
||||
connection.start();
|
||||
|
||||
// Step 7. Create a JMS MessageProducer and a MessageConsumer
|
||||
MessageProducer producer = session.createProducer(queue);
|
||||
MessageConsumer consumer = session.createConsumer(queue);
|
||||
|
||||
// Step 8. Send some messages to server #0, the live server
|
||||
for (int i = 0; i < numMessages; i++) {
|
||||
TextMessage message = session.createTextMessage("This is text message " + i);
|
||||
producer.send(message);
|
||||
System.out.println("Sent message: " + message.getText());
|
||||
}
|
||||
|
||||
// Step 9. Receive and acknowledge a third of the sent messages
|
||||
TextMessage message0 = null;
|
||||
for (int i = 0; i < numMessages / 3; i++) {
|
||||
message0 = (TextMessage) consumer.receive(5000);
|
||||
System.out.println("Got message: " + message0.getText());
|
||||
}
|
||||
message0.acknowledge();
|
||||
System.out.println("Received and acknowledged a third of the sent messages");
|
||||
|
||||
// Step 10. Receive the remaining two thirds of the sent messages but *do not* acknowledge them yet
|
||||
for (int i = numMessages / 3; i < numMessages; i++) {
|
||||
message0 = (TextMessage) consumer.receive(5000);
|
||||
System.out.println("Got message: " + message0.getText());
|
||||
}
|
||||
System.out.println("Received without acknowledged the rest of the sent messages");
|
||||
|
||||
Thread.sleep(2000);
|
||||
// Step 11. Crash server #0, the live server, and wait a little while to make sure
|
||||
// it has really crashed
|
||||
ServerUtil.killServer(server0);
|
||||
System.out.println("Killed primary");
|
||||
|
||||
Thread.sleep(2000);
|
||||
|
||||
// Step 12. Acknowledging the received messages will fail as failover to the backup server has occurred
|
||||
try {
|
||||
message0.acknowledge();
|
||||
} catch (JMSException e) {
|
||||
System.out.println("Got (the expected) exception while acknowledging message: " + e.getMessage());
|
||||
}
|
||||
|
||||
// Step 13. Consume the 2nd third of the messages again. Note that they are not considered redelivered.
|
||||
for (int i = numMessages / 3; i < (numMessages / 3) * 2; i++) {
|
||||
message0 = (TextMessage) consumer.receive(5000);
|
||||
System.out.printf("Got message: %s (redelivered?: %s)\n", message0.getText(), message0.getJMSRedelivered());
|
||||
}
|
||||
|
||||
// Step 14. Acknowledging them on the failed-over broker works fine
|
||||
message0.acknowledge();
|
||||
System.out.println("Acknowledged 2n third of messages");
|
||||
|
||||
// Step 15. Restarting primary
|
||||
server0 = ServerUtil.startServer(args[0], ZookeeperSinglePairFailback.class.getSimpleName() + "-primary", 0, 10000);
|
||||
System.out.println("Started primary");
|
||||
|
||||
// await fail-back to complete
|
||||
Thread.sleep(4000);
|
||||
|
||||
// Step 16. Consume the 3rd third of the messages. Note that they are not considered redelivered.
|
||||
for (int i = (numMessages / 3) * 2; i < numMessages; i++) {
|
||||
message0 = (TextMessage) consumer.receive(5000);
|
||||
System.out.printf("Got message: %s (redelivered?: %s)\n", message0.getText(), message0.getJMSRedelivered());
|
||||
}
|
||||
message0.acknowledge();
|
||||
System.out.println("Acknowledged 3d third of messages");
|
||||
|
||||
} finally {
|
||||
// Step 17. Be sure to close our resources!
|
||||
|
||||
if (connection != null) {
|
||||
connection.close();
|
||||
}
|
||||
|
||||
if (initialContext != null) {
|
||||
initialContext.close();
|
||||
}
|
||||
|
||||
ServerUtil.killServer(server0);
|
||||
ServerUtil.killServer(server1);
|
||||
|
||||
// Step 18. Stop the ZK server
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
<configuration xmlns="urn:activemq" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:activemq /schema/artemis-configuration.xsd">
|
||||
<core xmlns="urn:activemq:core">
|
||||
|
||||
<bindings-directory>./data/bindings</bindings-directory>
|
||||
|
||||
<journal-directory>./data/journal</journal-directory>
|
||||
|
||||
<large-messages-directory>./data/largemessages</large-messages-directory>
|
||||
|
||||
<paging-directory>./data/paging</paging-directory>
|
||||
|
||||
<cluster-user>exampleUser</cluster-user>
|
||||
|
||||
<cluster-password>secret</cluster-password>
|
||||
|
||||
<ha-policy>
|
||||
<replication>
|
||||
<primary>
|
||||
<manager>
|
||||
<properties>
|
||||
<property key="connect-string" value="localhost:2181"/>
|
||||
<property key="namespace" value="examples"/>
|
||||
<property key="session-ms" value="18000"/>
|
||||
</properties>
|
||||
</manager>
|
||||
</primary>
|
||||
</replication>
|
||||
</ha-policy>
|
||||
|
||||
<connectors>
|
||||
<connector name="netty-connector">tcp://localhost:61616</connector>
|
||||
<connector name="netty-backup-connector">tcp://localhost:61617</connector>
|
||||
</connectors>
|
||||
|
||||
<!-- Acceptors -->
|
||||
<acceptors>
|
||||
<acceptor name="netty-acceptor">tcp://localhost:61616</acceptor>
|
||||
</acceptors>
|
||||
|
||||
<cluster-connections>
|
||||
<cluster-connection name="my-cluster">
|
||||
<connector-ref>netty-connector</connector-ref>
|
||||
<static-connectors>
|
||||
<connector-ref>netty-backup-connector</connector-ref>
|
||||
</static-connectors>
|
||||
</cluster-connection>
|
||||
</cluster-connections>
|
||||
|
||||
<!-- Other config -->
|
||||
|
||||
<security-settings>
|
||||
<!--security for example queue-->
|
||||
<security-setting match="exampleQueue">
|
||||
<permission roles="guest" type="createDurableQueue"/>
|
||||
<permission roles="guest" type="deleteDurableQueue"/>
|
||||
<permission roles="guest" type="createNonDurableQueue"/>
|
||||
<permission roles="guest" type="deleteNonDurableQueue"/>
|
||||
<permission roles="guest" type="consume"/>
|
||||
<permission roles="guest" type="send"/>
|
||||
</security-setting>
|
||||
</security-settings>
|
||||
|
||||
<addresses>
|
||||
<address name="exampleQueue">
|
||||
<anycast>
|
||||
<queue name="exampleQueue"/>
|
||||
</anycast>
|
||||
</address>
|
||||
</addresses>
|
||||
</core>
|
||||
</configuration>
|
|
@ -0,0 +1,91 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
<configuration xmlns="urn:activemq" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:activemq /schema/artemis-configuration.xsd">
|
||||
<core xmlns="urn:activemq:core">
|
||||
|
||||
<bindings-directory>./data/bindings</bindings-directory>
|
||||
|
||||
<journal-directory>./data/journal</journal-directory>
|
||||
|
||||
<large-messages-directory>./data/largemessages</large-messages-directory>
|
||||
|
||||
<paging-directory>./data/paging</paging-directory>
|
||||
|
||||
<cluster-user>exampleUser</cluster-user>
|
||||
|
||||
<cluster-password>secret</cluster-password>
|
||||
|
||||
<ha-policy>
|
||||
<replication>
|
||||
<backup>
|
||||
<manager>
|
||||
<properties>
|
||||
<property key="connect-string" value="localhost:2181"/>
|
||||
<property key="namespace" value="examples"/>
|
||||
<property key="session-ms" value="18000"/>
|
||||
</properties>
|
||||
</manager>
|
||||
<allow-failback>true</allow-failback>
|
||||
</backup>
|
||||
</replication>
|
||||
</ha-policy>
|
||||
|
||||
<!-- Connectors -->
|
||||
<connectors>
|
||||
<connector name="netty-live-connector">tcp://localhost:61616</connector>
|
||||
<connector name="netty-connector">tcp://localhost:61617</connector>
|
||||
</connectors>
|
||||
|
||||
<!-- Acceptors -->
|
||||
<acceptors>
|
||||
<acceptor name="netty-acceptor">tcp://localhost:61617</acceptor>
|
||||
</acceptors>
|
||||
|
||||
<cluster-connections>
|
||||
<cluster-connection name="my-cluster">
|
||||
<connector-ref>netty-connector</connector-ref>
|
||||
<static-connectors>
|
||||
<connector-ref>netty-live-connector</connector-ref>
|
||||
</static-connectors>
|
||||
</cluster-connection>
|
||||
</cluster-connections>
|
||||
<!-- Other config -->
|
||||
|
||||
<security-settings>
|
||||
<!--security for example queue-->
|
||||
<security-setting match="exampleQueue">
|
||||
<permission roles="guest" type="createDurableQueue"/>
|
||||
<permission roles="guest" type="deleteDurableQueue"/>
|
||||
<permission roles="guest" type="createNonDurableQueue"/>
|
||||
<permission roles="guest" type="deleteNonDurableQueue"/>
|
||||
<permission roles="guest" type="consume"/>
|
||||
<permission roles="guest" type="send"/>
|
||||
</security-setting>
|
||||
</security-settings>
|
||||
|
||||
<addresses>
|
||||
<address name="exampleQueue">
|
||||
<anycast>
|
||||
<queue name="exampleQueue"/>
|
||||
</anycast>
|
||||
</address>
|
||||
</addresses>
|
||||
</core>
|
||||
</configuration>
|
|
@ -0,0 +1,20 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
java.naming.factory.initial=org.apache.activemq.artemis.jndi.ActiveMQInitialContextFactory
|
||||
connectionFactory.ConnectionFactory=tcp://localhost:61616?ha=true&retryInterval=1000&retryIntervalMultiplier=1.0&reconnectAttempts=-1
|
||||
queue.queue/exampleQueue=exampleQueue
|
|
@ -77,7 +77,6 @@ public class InfiniteRedeliveryTest extends ActiveMQTestBase {
|
|||
|
||||
Configuration backupConfig;
|
||||
Configuration liveConfig;
|
||||
NodeManager nodeManager;
|
||||
|
||||
protected TestableServer createTestableServer(Configuration config, NodeManager nodeManager) throws Exception {
|
||||
boolean isBackup = config.getHAPolicyConfiguration() instanceof ReplicaPolicyConfiguration || config.getHAPolicyConfiguration() instanceof SharedStoreSlavePolicyConfiguration;
|
||||
|
@ -99,13 +98,11 @@ public class InfiniteRedeliveryTest extends ActiveMQTestBase {
|
|||
|
||||
|
||||
|
||||
nodeManager = new InVMNodeManager(true, backupConfig.getJournalLocation());
|
||||
|
||||
backupServer = createTestableServer(backupConfig, nodeManager);
|
||||
backupServer = createTestableServer(backupConfig, new InVMNodeManager(true, backupConfig.getJournalLocation()));
|
||||
|
||||
liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(TransportConfigurationUtils.getNettyAcceptor(true, 0));
|
||||
|
||||
liveServer = createTestableServer(liveConfig, nodeManager);
|
||||
liveServer = createTestableServer(liveConfig, new InVMNodeManager(false, liveConfig.getJournalLocation()));
|
||||
}
|
||||
|
||||
protected void configureReplicationPair(TransportConfiguration backupConnector,
|
||||
|
|
|
@ -659,6 +659,8 @@ public class FailoverTest extends FailoverTestBase {
|
|||
|
||||
backupServer.getServer().fail(true);
|
||||
|
||||
decrementActivationSequenceForForceRestartOf(liveServer);
|
||||
|
||||
liveServer.start();
|
||||
|
||||
consumer.close();
|
||||
|
@ -823,6 +825,7 @@ public class FailoverTest extends FailoverTestBase {
|
|||
Assert.assertFalse("must NOT be a backup", isBackup);
|
||||
adaptLiveConfigForReplicatedFailBack(liveServer);
|
||||
beforeRestart(liveServer);
|
||||
decrementActivationSequenceForForceRestartOf(liveServer);
|
||||
liveServer.start();
|
||||
Assert.assertTrue("live initialized...", liveServer.getServer().waitForActivation(15, TimeUnit.SECONDS));
|
||||
|
||||
|
@ -931,12 +934,13 @@ public class FailoverTest extends FailoverTestBase {
|
|||
while (!backupServer.isStarted() && i++ < 100) {
|
||||
Thread.sleep(100);
|
||||
}
|
||||
liveServer.getServer().waitForActivation(5, TimeUnit.SECONDS);
|
||||
backupServer.getServer().waitForActivation(5, TimeUnit.SECONDS);
|
||||
Assert.assertTrue(backupServer.isStarted());
|
||||
|
||||
if (isReplicated) {
|
||||
FileMoveManager moveManager = new FileMoveManager(backupServer.getServer().getConfiguration().getJournalLocation(), 0);
|
||||
Assert.assertEquals(1, moveManager.getNumberOfFolders());
|
||||
// backup has not had a chance to restart as a backup and cleanup
|
||||
Wait.assertTrue(() -> moveManager.getNumberOfFolders() <= 2);
|
||||
}
|
||||
} else {
|
||||
backupServer.stop();
|
||||
|
@ -2427,6 +2431,10 @@ public class FailoverTest extends FailoverTestBase {
|
|||
// no-op
|
||||
}
|
||||
|
||||
protected void decrementActivationSequenceForForceRestartOf(TestableServer liveServer) throws Exception {
|
||||
// no-op
|
||||
}
|
||||
|
||||
protected ClientSession sendAndConsume(final ClientSessionFactory sf1, final boolean createQueue) throws Exception {
|
||||
ClientSession session = createSession(sf1, false, true, true);
|
||||
|
||||
|
|
|
@ -46,7 +46,6 @@ import org.apache.activemq.artemis.core.remoting.impl.invm.InVMRegistry;
|
|||
import org.apache.activemq.artemis.core.server.NodeManager;
|
||||
import org.apache.activemq.artemis.core.server.cluster.ha.HAPolicy;
|
||||
import org.apache.activemq.artemis.core.server.cluster.ha.ReplicatedPolicy;
|
||||
import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationPrimaryPolicy;
|
||||
import org.apache.activemq.artemis.core.server.impl.ActiveMQServerImpl;
|
||||
import org.apache.activemq.artemis.core.server.impl.InVMNodeManager;
|
||||
import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager;
|
||||
|
@ -88,6 +87,10 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
|
|||
|
||||
protected NodeManager nodeManager;
|
||||
|
||||
protected NodeManager backupNodeManager;
|
||||
|
||||
protected DistributedPrimitiveManagerConfiguration managerConfiguration;
|
||||
|
||||
protected boolean startBackupServer = true;
|
||||
|
||||
@Override
|
||||
|
@ -173,6 +176,10 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
|
|||
return new InVMNodeManager(false);
|
||||
}
|
||||
|
||||
protected NodeManager createNodeManager(Configuration configuration) throws Exception {
|
||||
return new InVMNodeManager(false, configuration.getNodeManagerLockLocation());
|
||||
}
|
||||
|
||||
protected void createConfigs() throws Exception {
|
||||
nodeManager = createNodeManager();
|
||||
TransportConfiguration liveConnector = getConnectorTransportConfiguration(true);
|
||||
|
@ -211,13 +218,14 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
|
|||
backupConfig.setBindingsDirectory(getBindingsDir(0, true)).setJournalDirectory(getJournalDir(0, true)).setPagingDirectory(getPageDir(0, true)).setLargeMessagesDirectory(getLargeMessagesDir(0, true)).setSecurityEnabled(false);
|
||||
|
||||
setupHAPolicyConfiguration();
|
||||
nodeManager = createReplicatedBackupNodeManager(backupConfig);
|
||||
backupNodeManager = createReplicatedBackupNodeManager(backupConfig);
|
||||
|
||||
backupServer = createTestableServer(backupConfig);
|
||||
backupServer = createTestableServer(backupConfig, backupNodeManager);
|
||||
|
||||
liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true));
|
||||
|
||||
liveServer = createTestableServer(liveConfig);
|
||||
nodeManager = createNodeManager(liveConfig);
|
||||
liveServer = createTestableServer(liveConfig, nodeManager);
|
||||
|
||||
if (supportsRetention()) {
|
||||
liveServer.getServer().getConfiguration().setJournalRetentionDirectory(getJournalDir(0, false) + "_retention");
|
||||
|
@ -233,7 +241,7 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
|
|||
backupConfig = createDefaultInVMConfig();
|
||||
liveConfig = createDefaultInVMConfig();
|
||||
|
||||
DistributedPrimitiveManagerConfiguration managerConfiguration =
|
||||
managerConfiguration =
|
||||
new DistributedPrimitiveManagerConfiguration(FileBasedPrimitiveManager.class.getName(),
|
||||
Collections.singletonMap("locks-folder", tmpFolder.newFolder("manager").toString()));
|
||||
|
||||
|
@ -242,13 +250,14 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
|
|||
backupConfig.setBindingsDirectory(getBindingsDir(0, true)).setJournalDirectory(getJournalDir(0, true)).setPagingDirectory(getPageDir(0, true)).setLargeMessagesDirectory(getLargeMessagesDir(0, true)).setSecurityEnabled(false);
|
||||
|
||||
setupHAPolicyConfiguration();
|
||||
nodeManager = createReplicatedBackupNodeManager(backupConfig);
|
||||
backupNodeManager = createReplicatedBackupNodeManager(backupConfig);
|
||||
|
||||
backupServer = createTestableServer(backupConfig);
|
||||
backupServer = createTestableServer(backupConfig, backupNodeManager);
|
||||
|
||||
liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true));
|
||||
|
||||
liveServer = createTestableServer(liveConfig);
|
||||
nodeManager = createNodeManager(liveConfig);
|
||||
liveServer = createTestableServer(liveConfig, nodeManager);
|
||||
}
|
||||
|
||||
protected void setupHAPolicyConfiguration() {
|
||||
|
@ -272,8 +281,6 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
|
|||
HAPolicy policy = server.getServer().getHAPolicy();
|
||||
if (policy instanceof ReplicatedPolicy) {
|
||||
((ReplicatedPolicy) policy).setCheckForLiveServer(true);
|
||||
} else if (policy instanceof ReplicationPrimaryPolicy) {
|
||||
Assert.assertTrue("Adapting won't work for the current configuration", ((ReplicationPrimaryPolicy) policy).isCheckForLiveServer());
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -294,6 +301,7 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
|
|||
|
||||
nodeManager = null;
|
||||
|
||||
backupNodeManager = null;
|
||||
try {
|
||||
ServerSocket serverSocket = new ServerSocket(61616);
|
||||
serverSocket.close();
|
||||
|
|
|
@ -182,8 +182,6 @@ public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase {
|
|||
break;
|
||||
case PluggableQuorumReplication:
|
||||
haPolicyConfiguration = ReplicationBackupPolicyConfiguration.withDefault()
|
||||
.setVoteRetries(1)
|
||||
.setVoteRetryWait(1000)
|
||||
.setDistributedManagerConfiguration(getOrCreatePluggableQuorumConfiguration())
|
||||
.setGroupName(getNodeGroupName() != null ? (getNodeGroupName() + "-" + i) : null);
|
||||
break;
|
||||
|
@ -278,12 +276,14 @@ public abstract class MultipleServerFailoverTestBase extends ActiveMQTestBase {
|
|||
return addClientSession(sf.createSession(xa, autoCommitSends, autoCommitAcks));
|
||||
}
|
||||
|
||||
protected void waitForDistribution(SimpleString address, ActiveMQServer server, int messageCount) throws Exception {
|
||||
protected boolean waitForDistribution(SimpleString address, ActiveMQServer server, int messageCount) throws Exception {
|
||||
ActiveMQServerLogger.LOGGER.debug("waiting for distribution of messages on server " + server);
|
||||
|
||||
Queue q = (Queue) server.getPostOffice().getBinding(address).getBindable();
|
||||
|
||||
Wait.waitFor(() -> getMessageCount(q) >= messageCount);
|
||||
return Wait.waitFor(() -> {
|
||||
return getMessageCount(q) >= messageCount;
|
||||
});
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -156,12 +156,13 @@ public class ReplicaTimeoutTest extends ActiveMQTestBase {
|
|||
liveConfig.setBindingsDirectory(getBindingsDir(0, false)).setJournalDirectory(getJournalDir(0, false)).
|
||||
setPagingDirectory(getPageDir(0, false)).setLargeMessagesDirectory(getLargeMessagesDir(0, false)).setSecurityEnabled(false);
|
||||
|
||||
NodeManager nodeManager = createReplicatedBackupNodeManager(backupConfig);
|
||||
NodeManager replicatedBackupNodeManager = createReplicatedBackupNodeManager(backupConfig);
|
||||
|
||||
backupServer = createTestableServer(backupConfig, nodeManager);
|
||||
backupServer = createTestableServer(backupConfig, replicatedBackupNodeManager);
|
||||
|
||||
liveConfig.clearAcceptorConfigurations().addAcceptorConfiguration(getAcceptorTransportConfiguration(true));
|
||||
|
||||
NodeManager nodeManager = createReplicatedBackupNodeManager(liveConfig);
|
||||
liveServer = createTestableServer(liveConfig, nodeManager);
|
||||
|
||||
final TestableServer theBackup = backupServer;
|
||||
|
|
|
@ -75,7 +75,10 @@ public class ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated
|
|||
|
||||
sendCrashReceive();
|
||||
Wait.assertTrue(backupServers.get(0)::isActive, 5000, 10);
|
||||
Wait.assertTrue(backupServers.get(1)::isActive, 5000, 10);
|
||||
waitForTopology(backupServers.get(0).getServer(), liveServers.size(), 2);
|
||||
waitForTopology(backupServers.get(1).getServer(), liveServers.size(), 2);
|
||||
|
||||
sendCrashBackupReceive();
|
||||
}
|
||||
|
||||
|
@ -115,6 +118,14 @@ public class ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated
|
|||
}
|
||||
|
||||
protected void sendCrashBackupReceive() throws Exception {
|
||||
|
||||
//make sure bindings are ready before sending messages b/c we verify strict load balancing in waitForDistribution
|
||||
this.waitForBindings( backupServers.get(0).getServer(), ADDRESS.toString(), false, 1, 0, 2000);
|
||||
this.waitForBindings( backupServers.get(0).getServer(), ADDRESS.toString(), false, 1, 0, 2000);
|
||||
|
||||
this.waitForBindings( backupServers.get(1).getServer(), ADDRESS.toString(), false, 1, 0, 2000);
|
||||
this.waitForBindings( backupServers.get(1).getServer(), ADDRESS.toString(), false, 1, 0, 2000);
|
||||
|
||||
ServerLocator locator0 = getBackupServerLocator(0);
|
||||
ServerLocator locator1 = getBackupServerLocator(1);
|
||||
|
||||
|
@ -138,8 +149,8 @@ public class ReplicatedMultipleServerFailoverExtraBackupsTest extends Replicated
|
|||
|
||||
producer.close();
|
||||
|
||||
waitForDistribution(ADDRESS, backupServers.get(0).getServer(), 100);
|
||||
waitForDistribution(ADDRESS, backupServers.get(1).getServer(), 100);
|
||||
assertTrue(waitForDistribution(ADDRESS, backupServers.get(0).getServer(), 100));
|
||||
assertTrue(waitForDistribution(ADDRESS, backupServers.get(1).getServer(), 100));
|
||||
|
||||
List<TestableServer> toCrash = new ArrayList<>();
|
||||
for (TestableServer backupServer : backupServers) {
|
||||
|
|
|
@ -24,7 +24,6 @@ import org.apache.activemq.artemis.api.core.ActiveMQException;
|
|||
import org.apache.activemq.artemis.api.core.Interceptor;
|
||||
import org.apache.activemq.artemis.api.core.TransportConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.protocol.core.Packet;
|
||||
import org.apache.activemq.artemis.core.protocol.core.impl.PacketImpl;
|
||||
import org.apache.activemq.artemis.spi.core.protocol.RemotingConnection;
|
||||
|
@ -90,7 +89,6 @@ public class PluggableQuorumBackupAuthenticationTest extends FailoverTestBase {
|
|||
|
||||
@Override
|
||||
protected void setupHAPolicyConfiguration() {
|
||||
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
|
||||
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
|
||||
}
|
||||
|
||||
|
|
|
@ -17,8 +17,7 @@
|
|||
|
||||
package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum;
|
||||
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.tests.integration.cluster.failover.BackupSyncJournalTest;
|
||||
|
||||
public class PluggableQuorumBackupSyncJournalTest extends BackupSyncJournalTest {
|
||||
|
@ -30,8 +29,6 @@ public class PluggableQuorumBackupSyncJournalTest extends BackupSyncJournalTest
|
|||
|
||||
@Override
|
||||
protected void setupHAPolicyConfiguration() {
|
||||
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration())
|
||||
.setCheckForLiveServer(true);
|
||||
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration())
|
||||
.setMaxSavedReplicatedJournalsSize(2)
|
||||
.setAllowFailBack(true);
|
||||
|
|
|
@ -29,18 +29,24 @@ import org.apache.activemq.artemis.api.core.QueueConfiguration;
|
|||
import org.apache.activemq.artemis.api.core.TransportConfiguration;
|
||||
import org.apache.activemq.artemis.api.core.client.ClientSession;
|
||||
import org.apache.activemq.artemis.component.WebServerComponent;
|
||||
import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.server.ActiveMQServer;
|
||||
import org.apache.activemq.artemis.core.server.NodeManager;
|
||||
import org.apache.activemq.artemis.core.server.ServiceComponent;
|
||||
import org.apache.activemq.artemis.dto.AppDTO;
|
||||
import org.apache.activemq.artemis.dto.WebServerDTO;
|
||||
import org.apache.activemq.artemis.quorum.MutableLong;
|
||||
import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager;
|
||||
import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTest;
|
||||
import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer;
|
||||
import org.apache.activemq.artemis.tests.util.Wait;
|
||||
import org.jboss.logging.Logger;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
public class PluggableQuorumNettyNoGroupNameReplicatedFailoverTest extends FailoverTest {
|
||||
private static final Logger log = Logger.getLogger(PluggableQuorumReplicatedLargeMessageFailoverTest.class);
|
||||
|
||||
protected void beforeWaitForRemoteBackupSynchronization() {
|
||||
}
|
||||
|
@ -170,8 +176,6 @@ public class PluggableQuorumNettyNoGroupNameReplicatedFailoverTest extends Failo
|
|||
|
||||
@Override
|
||||
protected void setupHAPolicyConfiguration() {
|
||||
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration())
|
||||
.setCheckForLiveServer(true);
|
||||
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration())
|
||||
.setMaxSavedReplicatedJournalsSize(2)
|
||||
.setAllowFailBack(true);
|
||||
|
@ -210,4 +214,30 @@ public class PluggableQuorumNettyNoGroupNameReplicatedFailoverTest extends Failo
|
|||
}
|
||||
super.crash(sessions);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void decrementActivationSequenceForForceRestartOf(TestableServer testableServer) throws Exception {
|
||||
doDecrementActivationSequenceForForceRestartOf(log, nodeManager, managerConfiguration);
|
||||
}
|
||||
|
||||
public static void doDecrementActivationSequenceForForceRestartOf(Logger log, NodeManager nodeManager, DistributedPrimitiveManagerConfiguration distributedPrimitiveManagerConfiguration) throws Exception {
|
||||
nodeManager.start();
|
||||
long localActivation = nodeManager.readNodeActivationSequence();
|
||||
// file based
|
||||
FileBasedPrimitiveManager fileBasedPrimitiveManager = new FileBasedPrimitiveManager(distributedPrimitiveManagerConfiguration.getProperties());
|
||||
fileBasedPrimitiveManager.start();
|
||||
try {
|
||||
MutableLong mutableLong = fileBasedPrimitiveManager.getMutableLong(nodeManager.getNodeId().toString());
|
||||
|
||||
if (!mutableLong.compareAndSet(localActivation + 1, localActivation)) {
|
||||
throw new Exception("Failed to decrement coordinated activation sequence to:" + localActivation + ", not +1 : " + mutableLong.get());
|
||||
}
|
||||
log.warn("Intentionally decrementing coordinated activation sequence for test, may result is lost data");
|
||||
|
||||
} finally {
|
||||
fileBasedPrimitiveManager.stop();
|
||||
nodeManager.stop();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum;
|
||||
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.tests.integration.cluster.failover.NettyReplicationStopTest;
|
||||
|
||||
public class PluggableQuorumNettyReplicationStopTest extends NettyReplicationStopTest {
|
||||
|
@ -29,7 +28,6 @@ public class PluggableQuorumNettyReplicationStopTest extends NettyReplicationSto
|
|||
|
||||
@Override
|
||||
protected void setupHAPolicyConfiguration() {
|
||||
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
|
||||
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
|
||||
}
|
||||
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum;
|
||||
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.tests.integration.cluster.failover.PageCleanupWhileReplicaCatchupTest;
|
||||
|
||||
public class PluggableQuorumPageCleanupWhileReplicaCatchupTest extends PageCleanupWhileReplicaCatchupTest {
|
||||
|
@ -29,7 +28,6 @@ public class PluggableQuorumPageCleanupWhileReplicaCatchupTest extends PageClean
|
|||
|
||||
@Override
|
||||
protected void setupHAPolicyConfiguration() {
|
||||
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
|
||||
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
|
||||
}
|
||||
|
||||
|
|
|
@ -48,7 +48,6 @@ public class PluggableQuorumReplicaTimeoutTest extends ReplicaTimeoutTest {
|
|||
managerConfiguration, managerConfiguration);
|
||||
ReplicationPrimaryPolicyConfiguration primaryConfiguration = ((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration());
|
||||
primaryConfiguration.setInitialReplicationSyncTimeout(1000);
|
||||
primaryConfiguration.setCheckForLiveServer(true);
|
||||
ReplicationBackupPolicyConfiguration backupConfiguration = ((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration());
|
||||
backupConfiguration.setInitialReplicationSyncTimeout(1000);
|
||||
backupConfiguration.setMaxSavedReplicatedJournalsSize(2)
|
||||
|
|
|
@ -18,11 +18,15 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum;
|
|||
|
||||
import org.apache.activemq.artemis.api.core.client.ClientSession;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.tests.integration.cluster.failover.LargeMessageFailoverTest;
|
||||
import org.apache.activemq.artemis.tests.integration.cluster.util.TestableServer;
|
||||
import org.jboss.logging.Logger;
|
||||
|
||||
import static org.apache.activemq.artemis.tests.integration.cluster.failover.quorum.PluggableQuorumNettyNoGroupNameReplicatedFailoverTest.doDecrementActivationSequenceForForceRestartOf;
|
||||
|
||||
public class PluggableQuorumReplicatedLargeMessageFailoverTest extends LargeMessageFailoverTest {
|
||||
|
||||
private static final Logger log = Logger.getLogger(PluggableQuorumReplicatedLargeMessageFailoverTest.class);
|
||||
@Override
|
||||
protected void createConfigs() throws Exception {
|
||||
createPluggableReplicatedConfigs();
|
||||
|
@ -30,7 +34,6 @@ public class PluggableQuorumReplicatedLargeMessageFailoverTest extends LargeMess
|
|||
|
||||
@Override
|
||||
protected void setupHAPolicyConfiguration() {
|
||||
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
|
||||
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
|
||||
}
|
||||
|
||||
|
@ -58,4 +61,8 @@ public class PluggableQuorumReplicatedLargeMessageFailoverTest extends LargeMess
|
|||
super.crash(sessions);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void decrementActivationSequenceForForceRestartOf(TestableServer liveServer) throws Exception {
|
||||
doDecrementActivationSequenceForForceRestartOf(log, nodeManager, managerConfiguration);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum;
|
|||
|
||||
import org.apache.activemq.artemis.api.core.client.ClientSession;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.tests.integration.cluster.util.BackupSyncDelay;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
|
@ -66,7 +65,6 @@ public class PluggableQuorumReplicatedLargeMessageWithDelayFailoverTest extends
|
|||
|
||||
@Override
|
||||
protected void setupHAPolicyConfiguration() {
|
||||
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
|
||||
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration())
|
||||
.setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
|
||||
}
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
package org.apache.activemq.artemis.tests.integration.cluster.failover.quorum;
|
||||
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.tests.integration.cluster.failover.PagingFailoverTest;
|
||||
|
||||
public class PluggableQuorumReplicatedPagingFailoverTest extends PagingFailoverTest {
|
||||
|
@ -29,7 +28,6 @@ public class PluggableQuorumReplicatedPagingFailoverTest extends PagingFailoverT
|
|||
|
||||
@Override
|
||||
protected void setupHAPolicyConfiguration() {
|
||||
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
|
||||
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -520,6 +520,11 @@ public class ActiveMQServerControlUsingCoreTest extends ActiveMQServerControlTes
|
|||
return (String) proxy.retrieveAttributeValue("nodeID");
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getActivationSequence() {
|
||||
return (Long) proxy.retrieveAttributeValue("activationSequence");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getManagementAddress() {
|
||||
return (String) proxy.retrieveAttributeValue("managementAddress");
|
||||
|
|
|
@ -45,8 +45,6 @@ public class PluggableQuorumReplicationFlowControlTest extends SharedNothingRepl
|
|||
ReplicationBackupPolicyConfiguration haPolicy = ReplicationBackupPolicyConfiguration.withDefault();
|
||||
haPolicy.setDistributedManagerConfiguration(managerConfiguration);
|
||||
haPolicy.setClusterName("cluster");
|
||||
// fail-fast in order to let the backup to quickly retry syncing with primary
|
||||
haPolicy.setVoteRetries(0);
|
||||
return haPolicy;
|
||||
}
|
||||
|
||||
|
@ -54,7 +52,6 @@ public class PluggableQuorumReplicationFlowControlTest extends SharedNothingRepl
|
|||
protected HAPolicyConfiguration createReplicationLiveConfiguration() {
|
||||
ReplicationPrimaryPolicyConfiguration haPolicy = ReplicationPrimaryPolicyConfiguration.withDefault();
|
||||
haPolicy.setDistributedManagerConfiguration(managerConfiguration);
|
||||
haPolicy.setCheckForLiveServer(false);
|
||||
return haPolicy;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.activemq.artemis.tests.integration.replication;
|
|||
|
||||
import org.apache.activemq.artemis.api.core.TransportConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.Parameterized;
|
||||
|
||||
|
@ -30,7 +29,7 @@ public class PluggableQuorumReplicationOrderTest extends ReplicationOrderTest {
|
|||
@Parameterized.Parameter
|
||||
public boolean useNetty;
|
||||
|
||||
@Parameterized.Parameters(name = "useNetty={1}")
|
||||
@Parameterized.Parameters(name = "useNetty={0}")
|
||||
public static Iterable<Object[]> getParams() {
|
||||
return asList(new Object[][]{{false}, {true}});
|
||||
}
|
||||
|
@ -42,8 +41,6 @@ public class PluggableQuorumReplicationOrderTest extends ReplicationOrderTest {
|
|||
|
||||
@Override
|
||||
protected void setupHAPolicyConfiguration() {
|
||||
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration())
|
||||
.setCheckForLiveServer(true);
|
||||
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration())
|
||||
.setMaxSavedReplicatedJournalsSize(2)
|
||||
.setAllowFailBack(true);
|
||||
|
|
|
@ -18,14 +18,37 @@ package org.apache.activemq.artemis.tests.integration.replication;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.activemq.artemis.api.core.Message;
|
||||
import org.apache.activemq.artemis.api.core.QueueConfiguration;
|
||||
import org.apache.activemq.artemis.api.core.RoutingType;
|
||||
import org.apache.activemq.artemis.api.core.client.ClientConsumer;
|
||||
import org.apache.activemq.artemis.api.core.client.ClientMessage;
|
||||
import org.apache.activemq.artemis.api.core.client.ClientProducer;
|
||||
import org.apache.activemq.artemis.api.core.client.ClientSession;
|
||||
import org.apache.activemq.artemis.api.core.client.ClientSessionFactory;
|
||||
import org.apache.activemq.artemis.api.core.client.ServerLocator;
|
||||
import org.apache.activemq.artemis.core.client.impl.ServerLocatorImpl;
|
||||
import org.apache.activemq.artemis.core.config.Configuration;
|
||||
import org.apache.activemq.artemis.core.config.HAPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.server.ActivateCallback;
|
||||
import org.apache.activemq.artemis.core.server.ActiveMQServer;
|
||||
import org.apache.activemq.artemis.core.server.ActiveMQServers;
|
||||
import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager;
|
||||
import org.apache.activemq.artemis.quorum.DistributedLock;
|
||||
import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
|
||||
import org.apache.activemq.artemis.quorum.MutableLong;
|
||||
import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager;
|
||||
import org.apache.activemq.artemis.tests.util.Wait;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
|
||||
public class PluggableQuorumReplicationTest extends SharedNothingReplicationTest {
|
||||
|
@ -43,7 +66,6 @@ public class PluggableQuorumReplicationTest extends SharedNothingReplicationTest
|
|||
protected HAPolicyConfiguration createReplicationLiveConfiguration() {
|
||||
ReplicationPrimaryPolicyConfiguration haPolicy = ReplicationPrimaryPolicyConfiguration.withDefault();
|
||||
haPolicy.setDistributedManagerConfiguration(managerConfiguration);
|
||||
haPolicy.setCheckForLiveServer(false);
|
||||
return haPolicy;
|
||||
}
|
||||
|
||||
|
@ -55,4 +77,616 @@ public class PluggableQuorumReplicationTest extends SharedNothingReplicationTest
|
|||
return haPolicy;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnReplicatedOrderedTransition() throws Exception {
|
||||
// start live
|
||||
final Configuration liveConfiguration = createLiveConfiguration();
|
||||
|
||||
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
|
||||
liveServer.setIdentity("LIVE");
|
||||
liveServer.start();
|
||||
|
||||
Wait.waitFor(liveServer::isStarted);
|
||||
|
||||
ServerLocator locator = ServerLocatorImpl.newLocator("(tcp://localhost:61616,tcp://localhost:61617)?ha=true");
|
||||
locator.setCallTimeout(60_000L);
|
||||
locator.setConnectionTTL(60_000L);
|
||||
|
||||
ClientSessionFactory csf = locator.createSessionFactory();
|
||||
ClientSession clientSession = csf.createSession();
|
||||
clientSession.createQueue(new QueueConfiguration("slow").setRoutingType(RoutingType.ANYCAST));
|
||||
clientSession.close();
|
||||
|
||||
// start backup
|
||||
Configuration backupConfiguration = createBackupConfiguration();
|
||||
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
|
||||
backupServer.setIdentity("BACKUP");
|
||||
backupServer.start();
|
||||
|
||||
Wait.waitFor(backupServer::isStarted);
|
||||
|
||||
waitForTopology(liveServer, 1, 1, 30000);
|
||||
waitForTopology(backupServer, 1, 1, 30000);
|
||||
|
||||
liveServer.stop();
|
||||
|
||||
// backup will take over and run un replicated
|
||||
|
||||
csf = locator.createSessionFactory();
|
||||
clientSession = csf.createSession();
|
||||
clientSession.createQueue(new QueueConfiguration("slow_un_replicated").setRoutingType(RoutingType.ANYCAST));
|
||||
clientSession.close();
|
||||
|
||||
waitForTopology(backupServer, 1, 0, 30000);
|
||||
assertTrue(Wait.waitFor(() -> 2L == backupServer.getNodeManager().getNodeActivationSequence()));
|
||||
|
||||
backupServer.stop(false);
|
||||
|
||||
// now only backup should be able to start as it has run un_replicated
|
||||
liveServer.start();
|
||||
Wait.assertFalse(liveServer::isActive);
|
||||
liveServer.stop();
|
||||
|
||||
// restart backup
|
||||
backupServer.start();
|
||||
|
||||
Wait.waitFor(backupServer::isStarted);
|
||||
assertEquals(3L, backupServer.getNodeManager().getNodeActivationSequence());
|
||||
|
||||
csf = locator.createSessionFactory();
|
||||
clientSession = csf.createSession();
|
||||
clientSession.createQueue(new QueueConfiguration("backup_as_un_replicated").setRoutingType(RoutingType.ANYCAST));
|
||||
clientSession.close();
|
||||
|
||||
// verify the live restarts as a backup of the restarted backupServer that has taken on the live role, no failback
|
||||
liveServer.start();
|
||||
|
||||
csf = locator.createSessionFactory();
|
||||
clientSession = csf.createSession();
|
||||
clientSession.createQueue(new QueueConfiguration("backup_as_replicated").setRoutingType(RoutingType.ANYCAST));
|
||||
clientSession.close();
|
||||
|
||||
assertTrue(Wait.waitFor(liveServer::isReplicaSync));
|
||||
assertTrue(Wait.waitFor(() -> 3L == liveServer.getNodeManager().getNodeActivationSequence()));
|
||||
|
||||
backupServer.stop(true);
|
||||
|
||||
waitForTopology(liveServer, 1, 0, 30000);
|
||||
assertTrue(Wait.waitFor(() -> 4L == liveServer.getNodeManager().getNodeActivationSequence()));
|
||||
|
||||
liveServer.stop(true);
|
||||
clientSession.close();
|
||||
locator.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBackupFailoverAndPrimaryFailback() throws Exception {
|
||||
final int timeout = (int) TimeUnit.SECONDS.toMillis(30);
|
||||
|
||||
// start live
|
||||
Configuration liveConfiguration = createLiveConfiguration();
|
||||
|
||||
ActiveMQServer primaryInstance = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
|
||||
primaryInstance.setIdentity("PRIMARY");
|
||||
primaryInstance.start();
|
||||
|
||||
// primary initially UN REPLICATED
|
||||
Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence());
|
||||
|
||||
// start backup
|
||||
Configuration backupConfiguration = createBackupConfiguration();
|
||||
((ReplicationBackupPolicyConfiguration)backupConfiguration.getHAPolicyConfiguration()).setAllowFailBack(true);
|
||||
|
||||
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
|
||||
backupServer.setIdentity("BACKUP");
|
||||
backupServer.start();
|
||||
|
||||
Wait.waitFor(backupServer::isStarted);
|
||||
|
||||
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout);
|
||||
|
||||
// primary REPLICATED, backup matches (has replicated) activation sequence
|
||||
Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence());
|
||||
Assert.assertEquals(1L, backupServer.getNodeManager().getNodeActivationSequence());
|
||||
|
||||
primaryInstance.stop();
|
||||
|
||||
// backup UN REPLICATED (new version)
|
||||
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> 2L == backupServer.getNodeManager().getNodeActivationSequence(), timeout);
|
||||
|
||||
// just to let the console logging breathe!
|
||||
TimeUnit.MILLISECONDS.sleep(100);
|
||||
|
||||
// restart primary that will request failback
|
||||
ActiveMQServer restartedPrimaryForFailBack = primaryInstance; //addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
|
||||
restartedPrimaryForFailBack.start();
|
||||
|
||||
// first step is backup getting replicated
|
||||
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout);
|
||||
|
||||
// the restarted primary will run un replicated (incrementing the sequence) while the backup restarts to revert to the backup role.
|
||||
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> {
|
||||
try {
|
||||
return 3L == restartedPrimaryForFailBack.getNodeManager().getNodeActivationSequence();
|
||||
} catch (NullPointerException ok) {
|
||||
return false;
|
||||
}
|
||||
}, timeout);
|
||||
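// the NullPointerException above is deliberately swallowed: while the restarted primary is still
// initializing, its NodeManager is presumably not yet available, so the poll simply retries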
|
||||
// the backup should then resume with an insync replica view of that version
|
||||
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> restartedPrimaryForFailBack.isReplicaSync(), timeout);
|
||||
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout);
|
||||
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> 3L == backupServer.getNodeManager().getNodeActivationSequence(), timeout);
|
||||
|
||||
// just to let the console logging breathe!
|
||||
TimeUnit.MILLISECONDS.sleep(100);
|
||||
|
||||
// stop backup to verify primary goes on with new sequence as un replicated
|
||||
backupServer.stop();
|
||||
|
||||
// just to let the console logging breathe!
|
||||
TimeUnit.MILLISECONDS.sleep(100);
|
||||
|
||||
// live goes un replicated
|
||||
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> {
|
||||
try {
|
||||
return 4L == restartedPrimaryForFailBack.getNodeManager().getNodeActivationSequence();
|
||||
} catch (NullPointerException ok) {
|
||||
return false;
|
||||
}
|
||||
}, timeout);
|
||||
|
||||
restartedPrimaryForFailBack.stop();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testPrimaryIncrementActivationSequenceOnUnReplicated() throws Exception {
|
||||
final int timeout = (int) TimeUnit.SECONDS.toMillis(30);
|
||||
|
||||
// start live
|
||||
Configuration liveConfiguration = createLiveConfiguration();
|
||||
|
||||
ActiveMQServer primaryInstance = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
|
||||
primaryInstance.setIdentity("PRIMARY");
|
||||
primaryInstance.start();
|
||||
|
||||
// primary UN REPLICATED
|
||||
Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence());
|
||||
|
||||
// start backup
|
||||
Configuration backupConfiguration = createBackupConfiguration();
|
||||
|
||||
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
|
||||
backupServer.setIdentity("BACKUP");
|
||||
backupServer.start();
|
||||
|
||||
Wait.waitFor(backupServer::isStarted);
|
||||
|
||||
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> backupServer.isReplicaSync(), timeout);
|
||||
|
||||
// primary REPLICATED, backup matches (has replicated) activation sequence
|
||||
Assert.assertEquals(1L, primaryInstance.getNodeManager().getNodeActivationSequence());
|
||||
Assert.assertEquals(1L, backupServer.getNodeManager().getNodeActivationSequence());
|
||||
|
||||
// transition to un replicated once backup goes away
|
||||
backupServer.stop();
|
||||
|
||||
org.apache.activemq.artemis.utils.Wait.assertTrue(() -> 2L == primaryInstance.getNodeManager().getNodeActivationSequence(), timeout);
|
||||
|
||||
// done
|
||||
primaryInstance.stop();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testBackupStartsFirst() throws Exception {
|
||||
|
||||
// start backup
|
||||
Configuration backupConfiguration = createBackupConfiguration();
|
||||
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
|
||||
backupServer.setIdentity("BACKUP");
|
||||
backupServer.start();
|
||||
|
||||
// start live
|
||||
final Configuration liveConfiguration = createLiveConfiguration();
|
||||
|
||||
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
|
||||
liveServer.setIdentity("LIVE");
|
||||
liveServer.start();
|
||||
|
||||
Wait.waitFor(liveServer::isStarted);
|
||||
|
||||
assertTrue(Wait.waitFor(backupServer::isStarted));
|
||||
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
|
||||
assertTrue(liveServer.isReplicaSync());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBackupOutOfSequenceReleasesLock() throws Exception {
|
||||
|
||||
// start backup
|
||||
Configuration backupConfiguration = createBackupConfiguration();
|
||||
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
|
||||
backupServer.setIdentity("BACKUP");
|
||||
backupServer.start();
|
||||
|
||||
// start live
|
||||
final Configuration liveConfiguration = createLiveConfiguration();
|
||||
|
||||
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
|
||||
liveServer.setIdentity("LIVE");
|
||||
liveServer.start();
|
||||
|
||||
Wait.waitFor(liveServer::isStarted);
|
||||
|
||||
assertTrue(Wait.waitFor(backupServer::isStarted));
|
||||
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
|
||||
assertTrue(liveServer.isReplicaSync());
|
||||
|
||||
backupServer.stop();
|
||||
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
|
||||
liveServer.stop();
|
||||
// backup can get lock but does not have the sequence to start, will try and be a backup
|
||||
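// (presumably its local activation sequence is behind the coordinated one, because the live ran
// un replicated after this backup stopped, so the backup releases the lock and keeps waiting)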
|
||||
backupServer.start();
|
||||
|
||||
// live server should be active
|
||||
liveServer.start();
|
||||
Wait.waitFor(liveServer::isStarted);
|
||||
|
||||
assertTrue(Wait.waitFor(backupServer::isStarted));
|
||||
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
|
||||
assertTrue(liveServer.isReplicaSync());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testBackupOutOfSequenceCheckActivationSequence() throws Exception {
|
||||
|
||||
// start backup
|
||||
Configuration backupConfiguration = createBackupConfiguration();
|
||||
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
|
||||
backupServer.setIdentity("BACKUP");
|
||||
backupServer.start();
|
||||
|
||||
// start live
|
||||
final Configuration liveConfiguration = createLiveConfiguration();
|
||||
|
||||
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
|
||||
liveServer.setIdentity("LIVE");
|
||||
liveServer.start();
|
||||
|
||||
Wait.waitFor(liveServer::isStarted);
|
||||
|
||||
assertTrue(Wait.waitFor(backupServer::isStarted));
|
||||
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
|
||||
assertTrue(liveServer.isReplicaSync());
|
||||
|
||||
backupServer.stop();
|
||||
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
|
||||
final String coordinatedId = liveServer.getNodeID().toString();
|
||||
liveServer.stop();
|
||||
|
||||
// backup can get lock but does not have the sequence to start, will try and be a backup
|
||||
// to verify it can short circuit with a dirty read we grab the lock for a little while
|
||||
DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager.newInstanceOf(
|
||||
managerConfiguration.getClassName(),
|
||||
managerConfiguration.getProperties());
|
||||
distributedPrimitiveManager.start();
|
||||
final DistributedLock lock = distributedPrimitiveManager.getDistributedLock(coordinatedId);
|
||||
assertTrue(lock.tryLock());
|
||||
CountDownLatch preActivate = new CountDownLatch(1);
|
||||
backupServer.registerActivateCallback(new ActivateCallback() {
|
||||
@Override
|
||||
public void preActivate() {
|
||||
ActivateCallback.super.preActivate();
|
||||
preActivate.countDown();
|
||||
}
|
||||
});
|
||||
backupServer.start();
|
||||
|
||||
// it should be able to do a dirty read of the sequence id and not have to wait to get a lock
|
||||
assertTrue(preActivate.await(1, TimeUnit.SECONDS));
|
||||
|
||||
// release the lock
|
||||
distributedPrimitiveManager.stop();
|
||||
|
||||
// live server should be active
|
||||
liveServer.start();
|
||||
Wait.waitFor(liveServer::isStarted);
|
||||
|
||||
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
|
||||
assertTrue(liveServer.isReplicaSync());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSelfRepairPrimary() throws Exception {
|
||||
// start live
|
||||
final Configuration liveConfiguration = createLiveConfiguration();
|
||||
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
|
||||
liveServer.setIdentity("LIVE");
|
||||
|
||||
liveServer.start();
|
||||
final String coordinatedId = liveServer.getNodeID().toString();
|
||||
Wait.waitFor(liveServer::isStarted);
|
||||
liveServer.stop();
|
||||
|
||||
liveServer.start();
|
||||
Wait.waitFor(liveServer::isStarted);
|
||||
Assert.assertEquals(2, liveServer.getNodeManager().getNodeActivationSequence());
|
||||
liveServer.stop();
|
||||
|
||||
// backup can get lock but does not have the sequence to start, will try and be a backup
|
||||
// to verify it can short circuit with a dirty read we grab the lock for a little while
|
||||
DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager
|
||||
.newInstanceOf(managerConfiguration.getClassName(), managerConfiguration.getProperties());
|
||||
distributedPrimitiveManager.start();
|
||||
try (DistributedLock lock = distributedPrimitiveManager.getDistributedLock(coordinatedId)) {
|
||||
assertTrue(lock.tryLock());
|
||||
distributedPrimitiveManager.getMutableLong(coordinatedId).compareAndSet(2, -2);
|
||||
}
|
||||
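// the negative value written above appears to model an uncommitted claim of sequence 2; on restart
// the primary is expected to self-repair it back to the committed value, as asserted below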
liveServer.start();
|
||||
Wait.waitFor(liveServer::isStarted);
|
||||
Assert.assertEquals(2, liveServer.getNodeManager().getNodeActivationSequence());
|
||||
Assert.assertEquals(2, distributedPrimitiveManager.getMutableLong(coordinatedId).get());
|
||||
|
||||
distributedPrimitiveManager.stop();
|
||||
|
||||
Configuration backupConfiguration = createBackupConfiguration();
|
||||
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
|
||||
backupServer.setIdentity("BACKUP");
|
||||
backupServer.start();
|
||||
Wait.waitFor(backupServer::isReplicaSync);
|
||||
Assert.assertEquals(2, backupServer.getNodeManager().getNodeActivationSequence());
|
||||
backupServer.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPrimaryPeers() throws Exception {
|
||||
final String PEER_NODE_ID = "some-shared-id-001";
|
||||
|
||||
final Configuration liveConfiguration = createLiveConfiguration();
|
||||
((ReplicationPrimaryPolicyConfiguration)liveConfiguration.getHAPolicyConfiguration()).setCoordinationId(PEER_NODE_ID);
|
||||
|
||||
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
|
||||
liveServer.setIdentity("LIVE");
|
||||
liveServer.start();
|
||||
|
||||
Wait.waitFor(liveServer::isStarted);
|
||||
|
||||
ServerLocator locator = ServerLocatorImpl.newLocator("(tcp://localhost:61616,tcp://localhost:61617)?ha=true");
|
||||
locator.setCallTimeout(60_000L);
|
||||
locator.setConnectionTTL(60_000L);
|
||||
final ClientSessionFactory keepLocatorAliveSLF = locator.createSessionFactory();
|
||||
|
||||
ClientSessionFactory csf = locator.createSessionFactory();
|
||||
sendTo(csf, "live_un_replicated");
|
||||
csf.close();
|
||||
|
||||
// start the peer; it will come up as a backup
|
||||
Configuration peerLiveConfiguration = createBackupConfiguration(); // reuse the backup configuration to get acceptor and locator ports that won't clash
|
||||
peerLiveConfiguration.setHAPolicyConfiguration(createReplicationLiveConfiguration());
|
||||
((ReplicationPrimaryPolicyConfiguration)peerLiveConfiguration.getHAPolicyConfiguration()).setCoordinationId(PEER_NODE_ID);
|
||||
peerLiveConfiguration.setName("localhost::live-peer");
|
||||
|
||||
ActiveMQServer livePeerServer = addServer(ActiveMQServers.newActiveMQServer(peerLiveConfiguration));
|
||||
livePeerServer.setIdentity("LIVE-PEER");
|
||||
livePeerServer.start();
|
||||
|
||||
Wait.waitFor(livePeerServer::isStarted);
|
||||
|
||||
waitForTopology(liveServer, 1, 1, 30000);
|
||||
waitForTopology(livePeerServer, 1, 1, 30000);
|
||||
|
||||
liveServer.stop();
|
||||
|
||||
// livePeerServer will take over and run un replicated
|
||||
|
||||
csf = locator.createSessionFactory();
|
||||
receiveFrom(csf, "live_un_replicated");
|
||||
sendTo(csf, "peer_un_replicated");
|
||||
csf.close();
|
||||
|
||||
waitForTopology(livePeerServer, 1, 0, 30000);
|
||||
|
||||
assertTrue(Wait.waitFor(() -> 2L == livePeerServer.getNodeManager().getNodeActivationSequence()));
|
||||
|
||||
livePeerServer.stop(false);
|
||||
|
||||
liveServer.start();
|
||||
|
||||
Wait.assertTrue(() -> !liveServer.isActive());
|
||||
|
||||
// restart backup
|
||||
livePeerServer.start();
|
||||
|
||||
Wait.waitFor(livePeerServer::isStarted);
|
||||
|
||||
assertEquals(3L, livePeerServer.getNodeManager().getNodeActivationSequence());
|
||||
|
||||
csf = locator.createSessionFactory();
|
||||
receiveFrom(csf, "peer_un_replicated");
|
||||
sendTo(csf, "backup_as_un_replicated");
|
||||
csf.close();
|
||||
|
||||
// verify the live restarts as a backup to the restarted livePeerServer that has taken on the live role
|
||||
liveServer.start();
|
||||
|
||||
csf = locator.createSessionFactory();
|
||||
receiveFrom(csf, "backup_as_un_replicated");
|
||||
sendTo(csf, "backup_as_replicated");
|
||||
csf.close();
|
||||
|
||||
assertTrue(Wait.waitFor(liveServer::isReplicaSync));
|
||||
assertTrue(Wait.waitFor(() -> 3L == liveServer.getNodeManager().getNodeActivationSequence()));
|
||||
|
||||
waitForTopology(liveServer, 1, 1, 30000);
|
||||
waitForTopology(livePeerServer, 1, 1, 30000);
|
||||
|
||||
livePeerServer.stop(true);
|
||||
|
||||
assertTrue(Wait.waitFor(() -> 4L == liveServer.getNodeManager().getNodeActivationSequence()));
|
||||
|
||||
csf = locator.createSessionFactory();
|
||||
receiveFrom(csf, "backup_as_replicated");
|
||||
csf.close();
|
||||
|
||||
waitForTopology(liveServer, 1, 0, 30000);
|
||||
|
||||
liveServer.stop(true);
|
||||
keepLocatorAliveSLF.close();
|
||||
locator.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnavailableSelfHeal() throws Exception {
|
||||
|
||||
// start backup
|
||||
Configuration backupConfiguration = createBackupConfiguration();
|
||||
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
|
||||
backupServer.setIdentity("BACKUP");
|
||||
backupServer.start();
|
||||
|
||||
// start live
|
||||
final Configuration liveConfiguration = createLiveConfiguration();
|
||||
|
||||
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
|
||||
liveServer.setIdentity("LIVE");
|
||||
liveServer.start();
|
||||
|
||||
Wait.waitFor(liveServer::isStarted);
|
||||
|
||||
assertTrue(Wait.waitFor(backupServer::isStarted));
|
||||
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
|
||||
assertTrue(liveServer.isReplicaSync());
|
||||
|
||||
final String coordinatedId = liveServer.getNodeID().toString();
|
||||
|
||||
backupServer.stop();
|
||||
TimeUnit.MILLISECONDS.sleep(500);
|
||||
liveServer.stop();
|
||||
|
||||
// some manual intervention to force an unavailable
|
||||
// simulate the live failing to update its local activation sequence on its un replicated run when the backup stops.
|
||||
|
||||
DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager.newInstanceOf(managerConfiguration.getClassName(), managerConfiguration.getProperties());
|
||||
distributedPrimitiveManager.start();
|
||||
final MutableLong activationSequence = distributedPrimitiveManager.getMutableLong(coordinatedId);
|
||||
Assert.assertTrue(activationSequence.compareAndSet(2, -2));
|
||||
|
||||
// case 1: the live seemingly failed to write 2 locally, but the write actually succeeded
|
||||
// should delay pending resolution of the uncommitted claim
|
||||
backupServer.start();
|
||||
|
||||
// live server should activate after self healing its outstanding claim
|
||||
liveServer.start();
|
||||
Wait.waitFor(liveServer::isStarted);
|
||||
|
||||
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
|
||||
assertTrue(liveServer.isReplicaSync());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnavailableAdminIntervention() throws Exception {
|
||||
// start backup
|
||||
Configuration backupConfiguration = createBackupConfiguration();
|
||||
ActiveMQServer backupServer = addServer(ActiveMQServers.newActiveMQServer(backupConfiguration));
|
||||
backupServer.setIdentity("BACKUP");
|
||||
backupServer.start();
|
||||
|
||||
// start live
|
||||
final Configuration liveConfiguration = createLiveConfiguration();
|
||||
|
||||
ActiveMQServer liveServer = addServer(ActiveMQServers.newActiveMQServer(liveConfiguration));
|
||||
liveServer.setIdentity("LIVE");
|
||||
liveServer.start();
|
||||
|
||||
Wait.waitFor(liveServer::isStarted);
|
||||
|
||||
assertTrue(Wait.waitFor(backupServer::isStarted));
|
||||
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
|
||||
assertTrue(liveServer.isReplicaSync());
|
||||
|
||||
final String coordinatedId = liveServer.getNodeID().toString();
|
||||
|
||||
System.err.println("coodr id: " + coordinatedId);
|
||||
backupServer.stop();
|
||||
TimeUnit.MILLISECONDS.sleep(500);
|
||||
liveServer.stop();
|
||||
|
||||
// some manual intervention to force an unavailable
|
||||
// simulate the live failing to update its local activation sequence on its un replicated run when the backup stops.
|
||||
|
||||
DistributedPrimitiveManager distributedPrimitiveManager = DistributedPrimitiveManager.newInstanceOf(
|
||||
managerConfiguration.getClassName(),
|
||||
managerConfiguration.getProperties());
|
||||
distributedPrimitiveManager.start();
|
||||
final MutableLong coordinatedActivationSequence = distributedPrimitiveManager.getMutableLong(coordinatedId);
|
||||
Assert.assertTrue(coordinatedActivationSequence.compareAndSet(2, -2));
|
||||
|
||||
// case 2: the live seemingly failed to write 2 locally, and the write actually did fail
// so we need to put 1 back in the local activation sequence of the live
|
||||
FileLockNodeManager fileLockNodeManager = new FileLockNodeManager(liveConfiguration.getNodeManagerLockLocation().getAbsoluteFile(), true);
|
||||
fileLockNodeManager.start();
|
||||
assertEquals(2, fileLockNodeManager.readNodeActivationSequence());
|
||||
fileLockNodeManager.writeNodeActivationSequence(1);
|
||||
fileLockNodeManager.stop();
|
||||
|
||||
// should delay pending resolution of the uncommitted claim
|
||||
backupServer.start();
|
||||
CountDownLatch liveStarting = new CountDownLatch(1);
|
||||
// should delay pending resolution of the uncommitted claim
|
||||
// IMPORTANT: primary activation runs on the thread that calls start()! We need another thread here
|
||||
final Thread liveServerStarterThread = new Thread(() -> {
|
||||
liveStarting.countDown();
|
||||
try {
|
||||
liveServer.start();
|
||||
} catch (Throwable e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
liveServerStarterThread.start();
|
||||
liveStarting.await();
|
||||
TimeUnit.MILLISECONDS.sleep(500);
|
||||
// both are candidates and one of them failed to commit the claim
|
||||
// let them compete on retry
|
||||
Assert.assertTrue(coordinatedActivationSequence.compareAndSet(-2, 1));
|
||||
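// (the compareAndSet above is the simulated admin intervention: the uncommitted claim of -2 is reset
// to the previously committed value 1, so both candidates, each holding a local sequence of 1, can compete)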
// one of the two can activate
|
||||
Wait.waitFor(() -> liveServer.isStarted() || backupServer.isStarted());
|
||||
|
||||
assertTrue(Wait.waitFor(backupServer::isReplicaSync));
|
||||
assertTrue(liveServer.isReplicaSync());
|
||||
|
||||
assertEquals(2, backupServer.getNodeManager().getNodeActivationSequence());
|
||||
assertEquals(2, liveServer.getNodeManager().getNodeActivationSequence());
|
||||
|
||||
}
|
||||
|
||||
private void sendTo(ClientSessionFactory clientSessionFactory, String addr) throws Exception {
|
||||
ClientSession clientSession = clientSessionFactory.createSession(true, true);
|
||||
clientSession.createQueue(new QueueConfiguration(addr).setRoutingType(RoutingType.ANYCAST).setDurable(true));
|
||||
ClientProducer producer = clientSession.createProducer(addr);
|
||||
ClientMessage message = clientSession.createMessage(true);
|
||||
message.putStringProperty("K", addr);
|
||||
message.putLongProperty("delay", 0L); // so slow interceptor does not get us
|
||||
producer.send(message);
|
||||
producer.close();
|
||||
clientSession.close();
|
||||
}
|
||||
|
||||
private void receiveFrom(ClientSessionFactory clientSessionFactory, String addr) throws Exception {
|
||||
ClientSession clientSession = clientSessionFactory.createSession(true, true);
|
||||
clientSession.start();
|
||||
ClientConsumer consumer = clientSession.createConsumer(addr);
|
||||
Message message = consumer.receive(4000);
|
||||
assertNotNull(message);
|
||||
assertTrue(message.getStringProperty("K").equals(addr));
|
||||
consumer.close();
|
||||
clientSession.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -244,7 +244,7 @@ public class SharedNothingReplicationTest extends ActiveMQTestBase {
|
|||
.setCheckForLiveServer(false);
|
||||
}
|
||||
|
||||
private Configuration createLiveConfiguration() throws Exception {
|
||||
protected Configuration createLiveConfiguration() throws Exception {
|
||||
Configuration conf = new ConfigurationImpl();
|
||||
conf.setName("localhost::live");
|
||||
|
||||
|
@ -275,7 +275,7 @@ public class SharedNothingReplicationTest extends ActiveMQTestBase {
|
|||
return new ReplicaPolicyConfiguration().setClusterName("cluster");
|
||||
}
|
||||
|
||||
private Configuration createBackupConfiguration() throws Exception {
|
||||
protected Configuration createBackupConfiguration() throws Exception {
|
||||
Configuration conf = new ConfigurationImpl();
|
||||
conf.setName("localhost::backup");
|
||||
|
||||
|
|
|
@ -24,7 +24,6 @@ import org.apache.activemq.artemis.api.core.client.ClientSession;
|
|||
import org.apache.activemq.artemis.api.core.client.ClientSessionFactory;
|
||||
import org.apache.activemq.artemis.core.config.ha.DistributedPrimitiveManagerConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationBackupPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
|
||||
import org.apache.activemq.artemis.quorum.file.FileBasedPrimitiveManager;
|
||||
import org.apache.activemq.artemis.spi.core.security.ActiveMQBasicSecurityManager;
|
||||
import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTestBase;
|
||||
|
@ -66,9 +65,9 @@ public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTes
|
|||
.setLargeMessagesDirectory(getLargeMessagesDir(0, true));
|
||||
|
||||
setupHAPolicyConfiguration();
|
||||
nodeManager = createReplicatedBackupNodeManager(backupConfig);
|
||||
backupNodeManager = createReplicatedBackupNodeManager(backupConfig);
|
||||
|
||||
backupServer = createTestableServer(backupConfig);
|
||||
backupServer = createTestableServer(backupConfig, backupNodeManager);
|
||||
|
||||
backupServer.getServer().setSecurityManager(new ActiveMQBasicSecurityManager());
|
||||
|
||||
|
@ -77,7 +76,8 @@ public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTes
|
|||
.clearAcceptorConfigurations()
|
||||
.addAcceptorConfiguration(getAcceptorTransportConfiguration(true));
|
||||
|
||||
liveServer = createTestableServer(liveConfig);
|
||||
nodeManager = createNodeManager(liveConfig);
|
||||
liveServer = createTestableServer(liveConfig, nodeManager);
|
||||
|
||||
liveServer.getServer().setSecurityManager(new ActiveMQBasicSecurityManager());
|
||||
}
|
||||
|
@ -94,7 +94,6 @@ public class PluggableQuorumBasicSecurityManagerFailoverTest extends FailoverTes
|
|||
|
||||
@Override
|
||||
protected void setupHAPolicyConfiguration() {
|
||||
((ReplicationPrimaryPolicyConfiguration) liveConfig.getHAPolicyConfiguration()).setCheckForLiveServer(true);
|
||||
((ReplicationBackupPolicyConfiguration) backupConfig.getHAPolicyConfiguration()).setMaxSavedReplicatedJournalsSize(2).setAllowFailBack(true);
|
||||
}
|
||||
|
||||
|
|
|
@ -677,6 +677,44 @@
|
|||
</args>
|
||||
</configuration>
|
||||
</execution>
|
||||
<execution>
|
||||
<phase>test-compile</phase>
|
||||
<id>create-zk-replication-primary-peer-a</id>
|
||||
<goals>
|
||||
<goal>create</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<configuration>${basedir}/target/classes/servers/zkReplicationPrimaryPeerA</configuration>
|
||||
<allowAnonymous>true</allowAnonymous>
|
||||
<user>admin</user>
|
||||
<password>admin</password>
|
||||
<instance>${basedir}/target/zkReplicationPrimaryPeerA</instance>
|
||||
<args>
|
||||
<!-- this is needed to run the server remotely -->
|
||||
<arg>--java-options</arg>
|
||||
<arg>-Djava.rmi.server.hostname=localhost</arg>
|
||||
</args>
|
||||
</configuration>
|
||||
</execution>
|
||||
<execution>
|
||||
<phase>test-compile</phase>
|
||||
<id>create-zk-replication-primary-peer-b</id>
|
||||
<goals>
|
||||
<goal>create</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<configuration>${basedir}/target/classes/servers/zkReplicationPrimaryPeerB</configuration>
|
||||
<allowAnonymous>true</allowAnonymous>
|
||||
<user>admin</user>
|
||||
<password>admin</password>
|
||||
<instance>${basedir}/target/zkReplicationPrimaryPeerB</instance>
|
||||
<args>
|
||||
<!-- this is needed to run the server remotely -->
|
||||
<arg>--java-options</arg>
|
||||
<arg>-Djava.rmi.server.hostname=localhost</arg>
|
||||
</args>
|
||||
</configuration>
|
||||
</execution>
|
||||
<execution>
|
||||
<phase>test-compile</phase>
|
||||
<id>create-zk-replication-backup</id>
|
||||
|
|
|
@ -39,7 +39,6 @@ under the License.
|
|||
<property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
|
||||
</properties>
|
||||
</manager>
|
||||
<check-for-live-server>true</check-for-live-server>
|
||||
</primary>
|
||||
</replication>
|
||||
</ha-policy>
|
||||
|
|
|
@ -0,0 +1,138 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
--><configuration xmlns="urn:activemq" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:activemq /schema/artemis-server.xsd">
|
||||
|
||||
<core xmlns="urn:activemq:core">
|
||||
|
||||
<name>primary-peer-a</name>
|
||||
|
||||
<bindings-directory>./data/bindings</bindings-directory>
|
||||
|
||||
<journal-directory>./data/journal</journal-directory>
|
||||
|
||||
<large-messages-directory>./data/largemessages</large-messages-directory>
|
||||
|
||||
<paging-directory>./data/paging</paging-directory>
|
||||
|
||||
<ha-policy>
|
||||
<replication>
|
||||
<primary>
|
||||
<manager>
|
||||
<class-name>org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager</class-name>
|
||||
<properties>
|
||||
<property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
|
||||
</properties>
|
||||
</manager>
|
||||
<coordination-id>peer-journal-001</coordination-id>
|
||||
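<!-- note: primary-peer-b uses this same coordination-id, so both peers compete for the same
     lock and activation sequence and either one can become the live broker -->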
</primary>
|
||||
</replication>
|
||||
</ha-policy>
|
||||
|
||||
<connectors>
|
||||
<!-- Connector used to be announced through cluster connections and notifications -->
|
||||
<connector name="artemis">tcp://localhost:61616</connector>
|
||||
<connector name="peer">tcp://localhost:61716</connector>
|
||||
</connectors>
|
||||
|
||||
|
||||
<!-- Acceptors -->
|
||||
<acceptors>
|
||||
<acceptor name="artemis">tcp://localhost:61616</acceptor>
|
||||
</acceptors>
|
||||
|
||||
<cluster-user>admin</cluster-user>
|
||||
|
||||
<cluster-password>password</cluster-password>
|
||||
|
||||
<cluster-connections>
|
||||
<cluster-connection name="my-cluster">
|
||||
<connector-ref>artemis</connector-ref>
|
||||
<message-load-balancing>OFF</message-load-balancing>
|
||||
<max-hops>1</max-hops>
|
||||
<static-connectors>
|
||||
<connector-ref>peer</connector-ref>
|
||||
</static-connectors>
|
||||
</cluster-connection>
|
||||
</cluster-connections>
|
||||
|
||||
<!-- Other config -->
|
||||
|
||||
<security-settings>
|
||||
<!--security for example queue-->
|
||||
<security-setting match="#">
|
||||
<permission type="createNonDurableQueue" roles="amq, guest"/>
|
||||
<permission type="deleteNonDurableQueue" roles="amq, guest"/>
|
||||
<permission type="createDurableQueue" roles="amq, guest"/>
|
||||
<permission type="deleteDurableQueue" roles="amq, guest"/>
|
||||
<permission type="createAddress" roles="amq, guest"/>
|
||||
<permission type="deleteAddress" roles="amq, guest"/>
|
||||
<permission type="consume" roles="amq, guest"/>
|
||||
<permission type="browse" roles="amq, guest"/>
|
||||
<permission type="send" roles="amq, guest"/>
|
||||
<!-- we need this otherwise ./artemis data imp wouldn't work -->
|
||||
<permission type="manage" roles="amq"/>
|
||||
</security-setting>
|
||||
</security-settings>
|
||||
|
||||
<address-settings>
|
||||
<!-- if you define auto-create on certain queues, management has to be auto-create -->
|
||||
<address-setting match="activemq.management#">
|
||||
<dead-letter-address>DLQ</dead-letter-address>
|
||||
<expiry-address>ExpiryQueue</expiry-address>
|
||||
<redelivery-delay>0</redelivery-delay>
|
||||
<!-- with -1 only the global-max-size is in use for limiting -->
|
||||
<max-size-bytes>-1</max-size-bytes>
|
||||
<message-counter-history-day-limit>10</message-counter-history-day-limit>
|
||||
<address-full-policy>PAGE</address-full-policy>
|
||||
<auto-create-queues>true</auto-create-queues>
|
||||
<auto-create-addresses>true</auto-create-addresses>
|
||||
<auto-create-jms-queues>true</auto-create-jms-queues>
|
||||
<auto-create-jms-topics>true</auto-create-jms-topics>
|
||||
</address-setting>
|
||||
<!--default for catch all-->
|
||||
<address-setting match="#">
|
||||
<dead-letter-address>DLQ</dead-letter-address>
|
||||
<expiry-address>ExpiryQueue</expiry-address>
|
||||
<redelivery-delay>0</redelivery-delay>
|
||||
<!-- with -1 only the global-max-size is in use for limiting -->
|
||||
<max-size-bytes>10MB</max-size-bytes>
|
||||
<page-size-bytes>1MB</page-size-bytes>
|
||||
|
||||
<message-counter-history-day-limit>10</message-counter-history-day-limit>
|
||||
<address-full-policy>PAGE</address-full-policy>
|
||||
<auto-create-queues>true</auto-create-queues>
|
||||
<auto-create-addresses>true</auto-create-addresses>
|
||||
<auto-create-jms-queues>true</auto-create-jms-queues>
|
||||
<auto-create-jms-topics>true</auto-create-jms-topics>
|
||||
</address-setting>
|
||||
</address-settings>
|
||||
|
||||
<addresses>
|
||||
<address name="exampleTopic">
|
||||
<multicast>
|
||||
</multicast>
|
||||
</address>
|
||||
<address name="exampleQueue">
|
||||
<anycast>
|
||||
<queue name="exampleQueue"/>
|
||||
</anycast>
|
||||
</address>
|
||||
</addresses>
|
||||
</core>
|
||||
</configuration>
|
|
@ -0,0 +1,20 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<!--
|
||||
~ Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
~ contributor license agreements. See the NOTICE file distributed with
|
||||
~ this work for additional information regarding copyright ownership.
|
||||
~ The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
~ (the "License"); you may not use this file except in compliance with
|
||||
~ the License. You may obtain a copy of the License at
|
||||
~
|
||||
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||
~
|
||||
~ Unless required by applicable law or agreed to in writing, software
|
||||
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
~ See the License for the specific language governing permissions and
|
||||
~ limitations under the License.
|
||||
-->
|
||||
<management-context xmlns="http://activemq.org/schema">
|
||||
<connector connector-port="10099" connector-host="localhost"/>
|
||||
</management-context>
|
|
@ -0,0 +1,138 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
--><configuration xmlns="urn:activemq" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:activemq /schema/artemis-server.xsd">
|
||||
|
||||
<core xmlns="urn:activemq:core">
|
||||
|
||||
<name>primary-peer-b</name>
|
||||
|
||||
<bindings-directory>./data/bindings</bindings-directory>
|
||||
|
||||
<journal-directory>./data/journal</journal-directory>
|
||||
|
||||
<large-messages-directory>./data/largemessages</large-messages-directory>
|
||||
|
||||
<paging-directory>./data/paging</paging-directory>
|
||||
|
||||
<ha-policy>
|
||||
<replication>
|
||||
<primary>
|
||||
<manager>
|
||||
<class-name>org.apache.activemq.artemis.quorum.zookeeper.CuratorDistributedPrimitiveManager</class-name>
|
||||
<properties>
|
||||
<property key="connect-string" value="127.0.0.1:6666,127.0.0.1:6667,127.0.0.1:6668"/>
|
||||
</properties>
|
||||
</manager>
|
||||
<coordination-id>peer-journal-001</coordination-id>
|
||||
</primary>
|
||||
</replication>
|
||||
</ha-policy>
|
||||
|
||||
<connectors>
|
||||
<!-- Connector used to be announced through cluster connections and notifications -->
|
||||
<connector name="artemis">tcp://localhost:61716</connector>
|
||||
<connector name="peer">tcp://localhost:61616</connector>
|
||||
</connectors>
|
||||
|
||||
|
||||
<!-- Acceptors -->
|
||||
<acceptors>
|
||||
<acceptor name="artemis">tcp://localhost:61716</acceptor>
|
||||
</acceptors>
|
||||
|
||||
<cluster-user>admin</cluster-user>
|
||||
|
||||
<cluster-password>password</cluster-password>
|
||||
|
||||
<cluster-connections>
|
||||
<cluster-connection name="my-cluster">
|
||||
<connector-ref>artemis</connector-ref>
|
||||
<message-load-balancing>OFF</message-load-balancing>
|
||||
<max-hops>1</max-hops>
|
||||
<static-connectors>
|
||||
<connector-ref>peer</connector-ref>
|
||||
</static-connectors>
|
||||
</cluster-connection>
|
||||
</cluster-connections>
|
||||
|
||||
<!-- Other config -->
|
||||
|
||||
<security-settings>
|
||||
<!--security for example queue-->
|
||||
<security-setting match="#">
|
||||
<permission type="createNonDurableQueue" roles="amq, guest"/>
|
||||
<permission type="deleteNonDurableQueue" roles="amq, guest"/>
|
||||
<permission type="createDurableQueue" roles="amq, guest"/>
|
||||
<permission type="deleteDurableQueue" roles="amq, guest"/>
|
||||
<permission type="createAddress" roles="amq, guest"/>
|
||||
<permission type="deleteAddress" roles="amq, guest"/>
|
||||
<permission type="consume" roles="amq, guest"/>
|
||||
<permission type="browse" roles="amq, guest"/>
|
||||
<permission type="send" roles="amq, guest"/>
|
||||
<!-- we need this otherwise ./artemis data imp wouldn't work -->
|
||||
<permission type="manage" roles="amq"/>
|
||||
</security-setting>
|
||||
</security-settings>
|
||||
|
||||
<address-settings>
|
||||
<!-- if you define auto-create on certain queues, management has to be auto-create -->
|
||||
<address-setting match="activemq.management#">
|
||||
<dead-letter-address>DLQ</dead-letter-address>
|
||||
<expiry-address>ExpiryQueue</expiry-address>
|
||||
<redelivery-delay>0</redelivery-delay>
|
||||
<!-- with -1 only the global-max-size is in use for limiting -->
|
||||
<max-size-bytes>-1</max-size-bytes>
|
||||
<message-counter-history-day-limit>10</message-counter-history-day-limit>
|
||||
<address-full-policy>PAGE</address-full-policy>
|
||||
<auto-create-queues>true</auto-create-queues>
|
||||
<auto-create-addresses>true</auto-create-addresses>
|
||||
<auto-create-jms-queues>true</auto-create-jms-queues>
|
||||
<auto-create-jms-topics>true</auto-create-jms-topics>
|
||||
</address-setting>
|
||||
<!--default for catch all-->
|
||||
<address-setting match="#">
|
||||
<dead-letter-address>DLQ</dead-letter-address>
|
||||
<expiry-address>ExpiryQueue</expiry-address>
|
||||
<redelivery-delay>0</redelivery-delay>
|
||||
<!-- with -1 only the global-max-size is in use for limiting -->
|
||||
<max-size-bytes>10MB</max-size-bytes>
|
||||
<page-size-bytes>1MB</page-size-bytes>
|
||||
|
||||
<message-counter-history-day-limit>10</message-counter-history-day-limit>
|
||||
<address-full-policy>PAGE</address-full-policy>
|
||||
<auto-create-queues>true</auto-create-queues>
|
||||
<auto-create-addresses>true</auto-create-addresses>
|
||||
<auto-create-jms-queues>true</auto-create-jms-queues>
|
||||
<auto-create-jms-topics>true</auto-create-jms-topics>
|
||||
</address-setting>
|
||||
</address-settings>
|
||||
|
||||
<addresses>
|
||||
<address name="exampleTopic">
|
||||
<multicast>
|
||||
</multicast>
|
||||
</address>
|
||||
<address name="exampleQueue">
|
||||
<anycast>
|
||||
<queue name="exampleQueue"/>
|
||||
</anycast>
|
||||
</address>
|
||||
</addresses>
|
||||
</core>
|
||||
</configuration>
|
|
@ -0,0 +1,20 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<!--
|
||||
~ Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
~ contributor license agreements. See the NOTICE file distributed with
|
||||
~ this work for additional information regarding copyright ownership.
|
||||
~ The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
~ (the "License"); you may not use this file except in compliance with
|
||||
~ the License. You may obtain a copy of the License at
|
||||
~
|
||||
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||
~
|
||||
~ Unless required by applicable law or agreed to in writing, software
|
||||
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
~ See the License for the specific language governing permissions and
|
||||
~ limitations under the License.
|
||||
-->
|
||||
<management-context xmlns="http://activemq.org/schema">
|
||||
<connector connector-port="10199" connector-host="localhost"/>
|
||||
</management-context>
|
|
@ -20,8 +20,7 @@ package org.apache.activemq.artemis.tests.smoke.quorum;
|
|||
import javax.management.remote.JMXServiceURL;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
@ -55,15 +54,15 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
|
|||
|
||||
private static final Logger LOGGER = Logger.getLogger(PluggableQuorumSinglePairTest.class);
|
||||
|
||||
private static final String JMX_SERVER_HOSTNAME = "localhost";
|
||||
private static final int JMX_PORT_PRIMARY = 10099;
|
||||
private static final int JMX_PORT_BACKUP = 10199;
|
||||
static final String JMX_SERVER_HOSTNAME = "localhost";
|
||||
static final int JMX_PORT_PRIMARY = 10099;
|
||||
static final int JMX_PORT_BACKUP = 10199;
|
||||
|
||||
private static final String PRIMARY_DATA_FOLDER = "ReplicationPrimary";;
|
||||
private static final String BACKUP_DATA_FOLDER = "ReplicationBackup";
|
||||
static final String PRIMARY_DATA_FOLDER = "ReplicationPrimary";
|
||||
static final String BACKUP_DATA_FOLDER = "ReplicationBackup";
|
||||
|
||||
private static final int PRIMARY_PORT_OFFSET = 0;
|
||||
private static final int BACKUP_PORT_OFFSET = PRIMARY_PORT_OFFSET + 100;
|
||||
static final int PRIMARY_PORT_OFFSET = 0;
|
||||
static final int BACKUP_PORT_OFFSET = PRIMARY_PORT_OFFSET + 100;
|
||||
|
||||
public static class BrokerControl {
|
||||
|
||||
|
@ -73,7 +72,7 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
|
|||
final JMXServiceURL jmxServiceURL;
|
||||
final int portID;
|
||||
|
||||
private BrokerControl(final String name, int jmxPort, String dataFolder, int portID) {
|
||||
BrokerControl(final String name, int jmxPort, String dataFolder, int portID) {
|
||||
this.portID = portID;
|
||||
this.dataFolder = dataFolder;
|
||||
try {
|
||||
|
@ -108,6 +107,14 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
|
|||
public Optional<String> listNetworkTopology() throws Exception {
|
||||
return Jmx.listNetworkTopology(jmxServiceURL, objectNameBuilder);
|
||||
}
|
||||
|
||||
public Optional<Long> getActivationSequence() throws Exception {
|
||||
return Jmx.getActivationSequence(jmxServiceURL, objectNameBuilder);
|
||||
}
|
||||
|
||||
public Optional<Boolean> isActive() throws Exception {
|
||||
return Jmx.isActive(jmxServiceURL, objectNameBuilder);
|
||||
}
|
||||
}
|
||||
|
||||
@Parameterized.Parameter
|
||||
|
@ -118,14 +125,14 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
|
|||
return Arrays.asList(new Object[][]{{false}, {true}});
|
||||
}
|
||||
|
||||
private final BrokerControl primary;
|
||||
private final BrokerControl backup;
|
||||
private final Collection<BrokerControl> brokers;
|
||||
protected BrokerControl primary;
|
||||
protected BrokerControl backup;
|
||||
protected LinkedList<BrokerControl> brokers;
|
||||
|
||||
public PluggableQuorumSinglePairTest(String brokerFolderPrefix) {
|
||||
primary = new BrokerControl("primary", JMX_PORT_PRIMARY, brokerFolderPrefix + PRIMARY_DATA_FOLDER, PRIMARY_PORT_OFFSET);
|
||||
backup = new BrokerControl("backup", JMX_PORT_BACKUP, brokerFolderPrefix + BACKUP_DATA_FOLDER, BACKUP_PORT_OFFSET);
|
||||
brokers = Collections.unmodifiableList(Arrays.asList(primary, backup));
|
||||
brokers = new LinkedList<>(Arrays.asList(primary, backup));
|
||||
}
|
||||
|
||||
protected abstract boolean awaitAsyncSetupCompleted(long timeout, TimeUnit unit) throws InterruptedException;
|
||||
|
@ -150,6 +157,10 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
|
|||
Process primaryInstance = primary.startServer(this, timeout);
|
||||
Assert.assertTrue(awaitAsyncSetupCompleted(timeout, TimeUnit.MILLISECONDS));
|
||||
Wait.assertTrue(() -> !primary.isBackup().orElse(true), timeout);
|
||||
|
||||
// primary UN REPLICATED
|
||||
Assert.assertEquals(1L, primary.getActivationSequence().get().longValue());
|
||||
|
||||
LOGGER.info("started primary");
|
||||
LOGGER.info("starting backup");
|
||||
Process backupInstance = backup.startServer(this, 0);
|
||||
|
@ -176,6 +187,11 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
|
|||
Assert.assertNotNull(urlPrimary);
|
||||
LOGGER.infof("primary: %s", urlPrimary);
|
||||
Assert.assertNotEquals(urlPrimary, urlBackup);
|
||||
|
||||
// primary REPLICATED, backup matches (has replicated) activation sequence
|
||||
Assert.assertEquals(1L, primary.getActivationSequence().get().longValue());
|
||||
Assert.assertEquals(1L, backup.getActivationSequence().get().longValue());
|
||||
|
||||
LOGGER.info("killing primary");
|
||||
ServerUtil.killServer(primaryInstance, forceKill);
|
||||
LOGGER.info("killed primary");
|
||||
|
@ -188,11 +204,15 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
|
|||
.and(withNodes(1))), timeout);
|
||||
LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get());
|
||||
Assert.assertEquals(nodeID, backup.getNodeID().get());
|
||||
|
||||
// backup UN REPLICATED (new version)
|
||||
Assert.assertEquals(2L, backup.getActivationSequence().get().longValue());
|
||||
|
||||
// wait a bit before restarting primary
|
||||
LOGGER.info("waiting before starting primary");
|
||||
TimeUnit.SECONDS.sleep(4);
|
||||
LOGGER.info("starting primary");
|
||||
primary.startServer(this, 0);
|
||||
primaryInstance = primary.startServer(this, 0);
|
||||
LOGGER.info("started primary");
|
||||
Wait.assertTrue(() -> backup.isBackup().orElse(false), timeout);
|
||||
Assert.assertTrue(!primary.isBackup().get());
|
||||
|
@ -209,6 +229,14 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
|
|||
Assert.assertTrue(backup.isReplicaSync().get());
|
||||
LOGGER.infof("backup is synchronized with live");
|
||||
Assert.assertEquals(nodeID, primary.getNodeID().get());
|
||||
|
||||
// primary ran un replicated for a short while after failback, before backup was in sync
|
||||
Assert.assertEquals(3L, primary.getActivationSequence().get().longValue());
|
||||
Assert.assertEquals(3L, backup.getActivationSequence().get().longValue());
|
||||
|
||||
LOGGER.infof("Done, killing both");
|
||||
ServerUtil.killServer(primaryInstance);
|
||||
ServerUtil.killServer(backupInstance);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -272,5 +300,87 @@ public abstract class PluggableQuorumSinglePairTest extends SmokeTestBase {
|
|||
Wait.waitFor(()-> !backupInstance.isAlive(), timeout);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testOnlyLastUnreplicatedCanStart() throws Exception {
|
||||
final int timeout = (int) TimeUnit.SECONDS.toMillis(30);
|
||||
LOGGER.info("starting primary");
|
||||
Process primaryInstance = primary.startServer(this, timeout);
|
||||
Assert.assertTrue(awaitAsyncSetupCompleted(timeout, TimeUnit.MILLISECONDS));
|
||||
Wait.assertTrue(() -> !primary.isBackup().orElse(true), timeout);
|
||||
LOGGER.info("started primary");
|
||||
LOGGER.info("starting backup");
|
||||
Process backupInstance = backup.startServer(this, 0);
|
||||
Wait.assertTrue(() -> backup.isBackup().orElse(false), timeout);
|
||||
final String nodeID = primary.getNodeID().get();
|
||||
Assert.assertNotNull(nodeID);
|
||||
LOGGER.infof("NodeID: %s", nodeID);
|
||||
for (BrokerControl broker : brokers) {
|
||||
Wait.assertTrue(() -> validateNetworkTopology(broker.listNetworkTopology().orElse(""),
|
||||
containsExactNodeIds(nodeID)
|
||||
.and(withLive(nodeID, Objects::nonNull))
|
||||
.and(withBackup(nodeID, Objects::nonNull))
|
||||
.and(withMembers(1))
|
||||
.and(withNodes(2))), timeout);
|
||||
}
|
||||
LOGGER.infof("primary topology is: %s", primary.listNetworkTopology().get());
|
||||
LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get());
|
||||
Assert.assertTrue(backup.isReplicaSync().get());
|
||||
LOGGER.infof("backup is synchronized with live");
|
||||
final String urlBackup = backupOf(nodeID, decodeNetworkTopologyJson(backup.listNetworkTopology().get()));
|
||||
Assert.assertNotNull(urlBackup);
|
||||
LOGGER.infof("backup: %s", urlBackup);
|
||||
final String urlPrimary = liveOf(nodeID, decodeNetworkTopologyJson(primary.listNetworkTopology().get()));
|
||||
Assert.assertNotNull(urlPrimary);
|
||||
LOGGER.infof("primary: %s", urlPrimary);
|
||||
Assert.assertNotEquals(urlPrimary, urlBackup);
|
||||
|
||||
|
||||
// verify sequence id's in sync
|
||||
Assert.assertEquals(1L, primary.getActivationSequence().get().longValue());
|
||||
Assert.assertEquals(1L, backup.getActivationSequence().get().longValue());
|
||||
|
||||
LOGGER.info("killing primary");
|
||||
ServerUtil.killServer(primaryInstance, forceKill);
|
||||
LOGGER.info("killed primary");
|
||||
Wait.assertTrue(() -> !backup.isBackup().orElse(true), timeout);
|
||||
Wait.assertTrue(() -> validateNetworkTopology(backup.listNetworkTopology().orElse(""),
|
||||
containsExactNodeIds(nodeID)
|
||||
.and(withLive(nodeID, urlBackup::equals))
|
||||
.and(withBackup(nodeID, Objects::isNull))
|
||||
.and(withMembers(1))
|
||||
.and(withNodes(1))), timeout);
|
||||
LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get());
|
||||
Assert.assertEquals(nodeID, backup.getNodeID().get());
|
||||
|
||||
|
||||
// backup now UNREPLICATED, it is the only node that can continue
|
||||
Assert.assertEquals(2L, backup.getActivationSequence().get().longValue());
|
||||
|
||||
LOGGER.info("killing backup");
|
||||
ServerUtil.killServer(backupInstance, forceKill);
|
||||
|
||||
// wait a bit before restarting primary
|
||||
LOGGER.info("waiting before starting primary");
|
||||
TimeUnit.SECONDS.sleep(4);
|
||||
LOGGER.info("restarting primary");
|
||||
|
||||
Process restartedPrimary = primary.startServer(this, 0);
|
||||
LOGGER.info("restarted primary, " + restartedPrimary);
|
||||
|
||||
Wait.assertFalse("Primary shouldn't activate", () -> primary.isActive().orElse(false), 5000);
|
||||
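// (the restarted primary holds a local activation sequence that is behind the coordinated one,
// so it does not activate; only the last un replicated node may start)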
|
||||
ServerUtil.killServer(restartedPrimary);
|
||||
|
||||
LOGGER.info("restarting backup");
|
||||
|
||||
// backup can resume with data seq 3
|
||||
final Process restartedBackupInstance = backup.startServer(this, 5000);
|
||||
Wait.waitFor(() -> backup.isActive().orElse(false), 5000);
|
||||
assertTrue(Wait.waitFor(() -> nodeID.equals(backup.getNodeID().orElse("not set yet"))));
|
||||
LOGGER.info("restarted backup");
|
||||
|
||||
Assert.assertEquals(3L, backup.getActivationSequence().get().longValue());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,109 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.activemq.artemis.tests.smoke.quorum;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.activemq.artemis.util.ServerUtil;
|
||||
import org.apache.activemq.artemis.utils.Wait;
|
||||
import org.jboss.logging.Logger;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.containsExactNodeIds;
|
||||
import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.validateNetworkTopology;
|
||||
import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withBackup;
|
||||
import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withLive;
|
||||
import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withMembers;
|
||||
import static org.apache.activemq.artemis.tests.smoke.utils.Jmx.withNodes;
|
||||
|
||||
public class ZookeeperPluggableQuorumPeerTest extends ZookeeperPluggableQuorumSinglePairTest {
|
||||
|
||||
private static final Logger LOGGER = Logger.getLogger(ZookeeperPluggableQuorumPeerTest.class);
|
||||
|
||||
public ZookeeperPluggableQuorumPeerTest() {
|
||||
super();
|
||||
// keeping the primary/backup variables to reuse the test; for peers these are interchangeable, as either can take
// either role: both want to be primary but will revert to backup
|
||||
primary = new BrokerControl("primary-peer-a", JMX_PORT_PRIMARY, "zkReplicationPrimaryPeerA", PRIMARY_PORT_OFFSET);
|
||||
backup = new BrokerControl("primary-peer-b", JMX_PORT_BACKUP, "zkReplicationPrimaryPeerB", BACKUP_PORT_OFFSET);
|
||||
brokers = new LinkedList<>(Arrays.asList(primary, backup));
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void testBackupFailoverAndPrimaryFailback() throws Exception {
|
||||
// peers don't request failback by default
// just wait for the setup to complete, to avoid partially stopping ZooKeeper via the fast teardown while the async setup is still running
|
||||
Wait.waitFor(this::ensembleHasLeader);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultiPrimary_Peer() throws Exception {
|
||||
|
||||
final int timeout = (int) TimeUnit.SECONDS.toMillis(30);
|
||||
LOGGER.info("starting peer b primary");
|
||||
|
||||
Process backupInstance = backup.startServer(this, timeout);
|
||||
|
||||
// alive as unreplicated; it has the configured node id
|
||||
assertTrue(Wait.waitFor(() -> 1L == backup.getActivationSequence().orElse(Long.MAX_VALUE).longValue()));
|
||||
|
||||
final String nodeID = backup.getNodeID().get();
|
||||
Assert.assertNotNull(nodeID);
|
||||
LOGGER.infof("NodeID: %s", nodeID);
|
||||
|
||||
LOGGER.info("starting peer a primary");
|
||||
primary.startServer(this, 0);
|
||||
Wait.assertTrue(() -> primary.isBackup().orElse(false), timeout);
|
||||
|
||||
Wait.assertTrue(() -> !backup.isBackup().orElse(true), timeout);
|
||||
|
||||
for (BrokerControl broker : brokers) {
|
||||
Wait.assertTrue(() -> validateNetworkTopology(broker.listNetworkTopology().orElse(""),
|
||||
containsExactNodeIds(nodeID)
|
||||
.and(withLive(nodeID, Objects::nonNull))
|
||||
.and(withBackup(nodeID, Objects::nonNull))
|
||||
.and(withMembers(1))
|
||||
.and(withNodes(2))), timeout);
|
||||
}
|
||||
|
||||
LOGGER.infof("primary topology is: %s", primary.listNetworkTopology().get());
|
||||
LOGGER.infof("backup topology is: %s", backup.listNetworkTopology().get());
|
||||
Assert.assertTrue(backup.isReplicaSync().get());
|
||||
Assert.assertTrue(primary.isReplicaSync().get());
|
||||
|
||||
|
||||
LOGGER.info("killing peer-b");
|
||||
ServerUtil.killServer(backupInstance, forceKill);
|
||||
|
||||
// peer-a now UNREPLICATED
|
||||
Wait.assertTrue(() -> 2L == primary.getActivationSequence().get().longValue());
|
||||
|
||||
LOGGER.info("restarting peer-b");
|
||||
backup.startServer(this, 0);
|
||||
|
||||
assertTrue(Wait.waitFor(() -> nodeID.equals(backup.getNodeID().orElse("not set yet"))));
|
||||
// peer-b now a REPLICA
|
||||
Wait.waitFor(() -> backup.isReplicaSync().get());
|
||||
Wait.assertTrue(() -> 2L == backup.getActivationSequence().get().longValue());
|
||||
}
|
||||
}
|
|
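For context on what the assertions above track: the first peer to activate un-replicated starts the activation sequence at 1, the surviving peer moves it to 2 when it activates after failover, and the restarted peer then syncs back up to the same value. The sequence is also exposed as a JMX attribute, so it can be checked outside the test harness. The snippet below is a minimal sketch that is not part of this commit; the connector URL, the broker name inside the object name, and the ActivationSequence attribute name (derived from the new getter) are assumptions for illustration.

import javax.management.MBeanServerConnection;
import javax.management.ObjectName;
import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

public class ActivationSequenceProbe {

   public static void main(String[] args) throws Exception {
      // hypothetical connector URL of a locally running broker with remote JMX enabled
      JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:1099/jmxrmi");
      JMXConnector connector = JMXConnectorFactory.connect(url);
      try {
         MBeanServerConnection connection = connector.getMBeanServerConnection();
         // assumes the default object name pattern and a broker named "0.0.0.0"
         ObjectName broker = new ObjectName("org.apache.activemq.artemis:broker=\"0.0.0.0\"");
         Long sequence = (Long) connection.getAttribute(broker, "ActivationSequence");
         System.out.println("activation sequence = " + sequence);
      } finally {
         connector.close();
      }
   }
}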
@@ -1,13 +1,13 @@
/**
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,6 +19,8 @@ package org.apache.activemq.artemis.tests.smoke.quorum;

import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.activemq.artemis.utils.ThreadLeakCheckRule;
import org.apache.curator.test.InstanceSpec;
import org.apache.curator.test.TestingCluster;
import org.apache.curator.test.TestingZooKeeperServer;

@@ -38,7 +40,7 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl

   @Rule
   public TemporaryFolder tmpFolder = new TemporaryFolder();
   private TestingCluster testingServer;
   protected TestingCluster testingServer;
   private InstanceSpec[] clusterSpecs;
   private int nodes;

@@ -60,6 +62,8 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl
   @Override
   @After
   public void after() throws Exception {
      // zk bits that leak from servers
      ThreadLeakCheckRule.addKownThread("ListenerHandler-");
      try {
         super.after();
      } finally {

@@ -76,6 +80,16 @@ public class ZookeeperPluggableQuorumSinglePairTest extends PluggableQuorumSingl
      return true;
   }

   protected boolean ensembleHasLeader() {
      return testingServer.getServers().stream().filter(ZookeeperPluggableQuorumSinglePairTest::isLeader).count() != 0;
   }

   private static boolean isLeader(TestingZooKeeperServer server) {
      long leaderId = server.getQuorumPeer().getLeaderId();
      long id = server.getQuorumPeer().getId();
      return id == leaderId;
   }

   @Override
   protected void stopMajority() throws Exception {
      List<TestingZooKeeperServer> followers = testingServer.getServers();
@@ -77,6 +77,16 @@ public class Jmx {
      return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::getNodeID, ActiveMQServerControl.class, throwable -> null);
   }

   public static Optional<Long> getActivationSequence(JMXServiceURL serviceURI, ObjectNameBuilder builder) throws Exception {
      return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::getActivationSequence, ActiveMQServerControl.class, throwable -> null);

   }

   public static Optional<Boolean> isActive(JMXServiceURL serviceURI, ObjectNameBuilder builder) throws Exception {
      return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::isActive, ActiveMQServerControl.class, throwable -> null);

   }

   public static Optional<String> listNetworkTopology(JMXServiceURL serviceURI,
                                                      ObjectNameBuilder builder) throws Exception {
      return queryControl(serviceURI, builder.getActiveMQServerObjectName(), ActiveMQServerControl::listNetworkTopology, ActiveMQServerControl.class, throwable -> null);
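A possible way a smoke test could consume the helper added above is to poll over JMX until a broker reports the expected activation sequence, mirroring the Wait-based assertions used elsewhere in this commit. The wrapper below is only a sketch and not part of the change; it assumes the broker is registered under ObjectNameBuilder.DEFAULT and reachable at the supplied service URL.

import javax.management.remote.JMXServiceURL;

import org.apache.activemq.artemis.api.core.management.ObjectNameBuilder;
import org.apache.activemq.artemis.tests.smoke.utils.Jmx;
import org.apache.activemq.artemis.utils.Wait;

public class ActivationSequenceAssertions {

   // waits until the broker behind serviceUri reports the expected activation sequence
   public static void awaitActivationSequence(JMXServiceURL serviceUri, long expected, long timeoutMillis) {
      ObjectNameBuilder builder = ObjectNameBuilder.DEFAULT;
      Wait.assertTrue(() -> expected == Jmx.getActivationSequence(serviceUri, builder)
                                           .orElse(Long.MIN_VALUE)
                                           .longValue(), timeoutMillis);
   }
}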
@@ -18,6 +18,7 @@ package org.apache.activemq.artemis.tests.unit.core.server.impl;

import java.io.File;

import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager;
import org.apache.activemq.artemis.tests.util.ActiveMQTestBase;
import org.junit.Before;

@@ -33,6 +34,25 @@ public class FileLockTest extends ActiveMQTestBase {
      file.mkdirs();
   }

   @Test
   public void testSetNodeID() throws Exception {
      FileLockNodeManager underTest = new FileLockNodeManager(getTestDirfile(), false);
      ReplicationPrimaryPolicyConfiguration replicationPrimaryPolicyConfiguration = ReplicationPrimaryPolicyConfiguration.withDefault();
      String seed = "";
      for (int i = 0; i < 20; i++) {
         replicationPrimaryPolicyConfiguration.setCoordinationId(seed);
         if (replicationPrimaryPolicyConfiguration.getCoordinationId() != null) {
            underTest.setNodeID(replicationPrimaryPolicyConfiguration.getCoordinationId());
         }
         seed += String.valueOf(i);
      }

      replicationPrimaryPolicyConfiguration.setCoordinationId("somme-dash-and-odd");
      if (replicationPrimaryPolicyConfiguration.getCoordinationId() != null) {
         underTest.setNodeID(replicationPrimaryPolicyConfiguration.getCoordinationId());
      }
   }

   @Test
   public void testNIOLock() throws Exception {
      doTestLock(new FileLockNodeManager(getTestDirfile(), false), new FileLockNodeManager(getTestDirfile(), false));
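The loop in testSetNodeID drives ReplicationPrimaryPolicyConfiguration with a series of coordination-id seeds and only hands an id to the NodeManager when the policy accepted it. A stand-alone sketch of that same flow, using a hypothetical peer id and data directory, could look like the following; it mirrors the test above rather than documenting any additional behaviour.

import java.io.File;

import org.apache.activemq.artemis.core.config.ha.ReplicationPrimaryPolicyConfiguration;
import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager;

public class CoordinationIdSketch {

   public static void main(String[] args) throws Exception {
      // both peers of a pair would be configured with the same (hypothetical) coordination id
      ReplicationPrimaryPolicyConfiguration policy = ReplicationPrimaryPolicyConfiguration.withDefault();
      policy.setCoordinationId("peer-journal-001");
      // as in the test above, only use the id once the policy has accepted it
      if (policy.getCoordinationId() != null) {
         // the data directory here is illustrative; no I/O happens until the manager is started
         FileLockNodeManager nodeManager = new FileLockNodeManager(new File("./data/journal"), false);
         nodeManager.setNodeID(policy.getCoordinationId());
         System.out.println("node id shared by the peers: " + nodeManager.getNodeId());
      }
   }
}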