fis-gtm/sr_unix/gtmrecv_fetchresync.c

422 lines
18 KiB
C

/****************************************************************
* *
* Copyright 2001, 2011 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "gtm_ipc.h"
#include "gtm_socket.h"
#include "gtm_string.h"
#include "gtm_inet.h"
#include "gtm_stdio.h"
#include <sys/un.h>
#include "gtm_time.h" /* needed for difftime() definition; if this file is not included, difftime returns bad values on AIX */
#include <sys/time.h>
#include <errno.h>
#include "gtm_fcntl.h"
#include "gtm_unistd.h"
#include <sys/shm.h>
#include <sys/wait.h>
#include "gdsroot.h"
#include "gdsblk.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "jnl.h"
#include "buddy_list.h" /* needed for muprec.h */
#include "hashtab_mname.h" /* needed for muprec.h */
#include "hashtab_int4.h" /* needed for muprec.h */
#include "hashtab_int8.h" /* needed for muprec.h */
#include "muprec.h"
#include "error.h"
#include "iosp.h"
#include "gtmrecv.h"
#include "repl_comm.h"
#include "repl_msg.h"
#include "repl_errno.h"
#include "repl_dbg.h"
#include "gtm_logicals.h"
#include "eintr_wrappers.h"
#include "repl_sp.h"
#include "repl_log.h"
#include "io.h"
#include "trans_log_name.h"
#include "util.h"
#include "gtmsource.h"
#include "repl_instance.h"
#include "iotcpdef.h"
#include "gtmio.h"
#include "replgbl.h"
/* Timing limits used by gtmrecv_fetchresync() below.
 * - MAX_ATTEMPTS_FOR_FETCH_RESYNC is the initial value of the poll counter that is decremented once for every
 *   iteration of the receive loop that completes without a full message; each such poll uses a timeout of
 *   FETCHRESYNC_PRIMARY_POLL microseconds, which is why the count approximates a number of seconds.
 * - MAX_WAIT_FOR_FETCHRESYNC_CONN is the tv_sec timeout given to select() on the listen socket.
 */
#define MAX_ATTEMPTS_FOR_FETCH_RESYNC 60 /* max-wait in seconds for source server response after connection is established */
#define MAX_WAIT_FOR_FETCHRESYNC_CONN 60 /* max-wait in seconds to establish connection with the source server */
#define FETCHRESYNC_PRIMARY_POLL (MICROSEC_IN_SEC - 1) /* micro seconds, almost 1 second */
GBLREF uint4 process_id;
GBLREF int recvpool_shmid;
GBLREF int gtmrecv_listen_sock_fd, gtmrecv_sock_fd;
GBLREF struct sockaddr_in primary_addr;
GBLREF seq_num seq_num_zero;
GBLREF jnl_gbls_t jgbl;
GBLREF int repl_max_send_buffsize, repl_max_recv_buffsize;
GBLREF jnlpool_addrs jnlpool;
GBLREF boolean_t repl_connection_reset;
GBLREF mur_gbls_t murgbl;
error_def(ERR_PRIMARYNOTROOT);
error_def(ERR_RECVPOOLSETUP);
error_def(ERR_REPLCOMM);
error_def(ERR_REPLINSTNOHIST);
error_def(ERR_TEXT);
/* Condition handler established by gtmrecv_fetchresync().
 * Closes whichever of the listen/connection sockets is still open, prints the error
 * and then passes control on to the next condition handler.
 */
CONDITION_HANDLER(gtmrecv_fetchresync_ch)
{
	int	close_status;

	START_CH;
	/* Listen socket first, then the accepted connection; each macro call also resets the
	 * corresponding global descriptor to FD_INVALID.
	 */
	if (gtmrecv_listen_sock_fd != FD_INVALID)
	{
		CLOSEFILE_RESET(gtmrecv_listen_sock_fd, close_status);
	}
	if (gtmrecv_sock_fd != FD_INVALID)
	{
		CLOSEFILE_RESET(gtmrecv_sock_fd, close_status);
	}
	PRN_ERROR;
	NEXTCH;
}
/* Obtain the resync sequence number from the primary (source server).
 *
 * Listens on "port", accepts one connection from the source server, sends a REPL_FETCH_RESYNC message and
 * then services the messages the source server sends back (REPL_NEED_INSTANCE_INFO, REPL_NEED_TRIPLE_INFO,
 * REPL_INST_NOHIST) until a REPL_RESYNC_SEQNO message arrives. The whole exchange is repeated with the other
 * protocol assumption if the assumed (multisite vs dualsite) protocol of the primary turns out to be wrong and
 * the two candidate seqnos differ.
 *
 * Parameters:
 *	port		- port number handed to gtmrecv_comm_init() to set up the listen socket.
 *	resync_seqno	- output; zeroed at the start of every attempt and set from the REPL_RESYNC_SEQNO message
 *			  (byte-swapped if the source node is of different endianness).
 *	max_reg_seqno	- seqno sent to the primary when it is assumed to be multisite capable.
 *
 * Returns:
 *	SS_NORMAL on success.
 *	A non-zero status (expected to be ERR_REPLINSTNOHIST) if the history lookup fails locally or the primary
 *	reports REPL_INST_NOHIST; in both cases the connection is closed and "repl_connection_reset" is set.
 *	Communication failures are reported through rts_error(); the "return ERR_REPLCOMM" statements that
 *	follow such calls only matter if rts_error() were to return.
 *
 * The condition handler gtmrecv_fetchresync_ch is established for the duration of the function; every path
 * that returns normally must REVERT it first.
 */
int gtmrecv_fetchresync(int port, seq_num *resync_seqno, seq_num max_reg_seqno)
{
	GTM_SOCKLEN_TYPE primary_addr_len;
	repl_resync_msg_t resync_msg;
	repl_msg_t msg;
	unsigned char *msg_ptr; /* needed for REPL_{SEND,RECV}_LOOP */
	int tosend_len, sent_len, sent_this_iter; /* needed for REPL_SEND_LOOP */
	int torecv_len, recvd_len, recvd_this_iter; /* needed for REPL_RECV_LOOP */
	int status; /* needed for REPL_{SEND,RECV}_LOOP */
	fd_set input_fds;
	int wait_count;
	time_t t1, t2;
	struct timeval gtmrecv_fetchresync_max_wait, gtmrecv_fetchresync_poll, sel_timeout_val;
	repl_instinfo_msg_t instinfo_msg;
	repl_needinst_msg_ptr_t need_instinfo_msg;
	repl_needtriple_msg_ptr_t need_tripleinfo_msg;
	int4 triple_num;
	repl_triple triple;
	signed char assumed_remote_proto_ver; /* Protocol version of the source server with which receiver
					       * server communicates. Explicitly "signed char" (plain "char" may
					       * be unsigned on some platforms) in order to be able to do signed
					       * comparisons of this with the macros REPL_PROTO_VER_DUALSITE(0)
					       * and REPL_PROTO_VER_UNINITIALIZED(-1)
					       */
	seq_num triple_seqnum;
	short retry_num;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assumed_remote_proto_ver = REPL_PROTO_VER_MULTISITE;
	repl_log(stdout, TRUE, TRUE, "Assuming primary supports multisite functionality. Connecting "
		"using multisite communication protocol.\n");
	ESTABLISH_RET(gtmrecv_fetchresync_ch, (!SS_NORMAL));
	do
	{
		QWASSIGN(*resync_seqno, seq_num_zero);
		gtmrecv_fetchresync_max_wait.tv_sec = MAX_WAIT_FOR_FETCHRESYNC_CONN;
		gtmrecv_fetchresync_max_wait.tv_usec = 0;
		gtmrecv_fetchresync_poll.tv_sec = 0;
		gtmrecv_fetchresync_poll.tv_usec = FETCHRESYNC_PRIMARY_POLL;
		gtmrecv_comm_init(port);
		primary_addr_len = SIZEOF(primary_addr);
		murgbl.remote_proto_ver = REPL_PROTO_VER_UNINITIALIZED;
		repl_log(stdout, TRUE, TRUE, "Waiting for a connection...\n");
		FD_ZERO(&input_fds);
		FD_SET(gtmrecv_listen_sock_fd, &input_fds);
		/* Note - the following call to select checks for EINTR. The SELECT macro is not used because
		 * the code also checks for EAGAIN and takes action before retrying the select.
		 */
		t1 = time(NULL);
		while ((status = select(gtmrecv_listen_sock_fd + 1, &input_fds, NULL, NULL, &gtmrecv_fetchresync_max_wait)) < 0)
		{
			if ((EINTR == errno) || (EAGAIN == errno))
			{	/* Interrupted: recompute how much of the overall wait is left and retry, or treat
				 * it as a timeout (status = 0) if the time is already used up.
				 */
				t2 = time(NULL);
				if (0 >= (int)(gtmrecv_fetchresync_max_wait.tv_sec =
						(MAX_WAIT_FOR_FETCHRESYNC_CONN - (int)difftime(t2, t1))))
				{
					status = 0;
					break;
				}
				gtmrecv_fetchresync_max_wait.tv_usec = 0;
				FD_SET(gtmrecv_listen_sock_fd, &input_fds);
				continue;
			} else
				rts_error(VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2,
					RTS_ERROR_LITERAL("Error in select on listen socket"), errno);
		}
		if (status == 0)
		{
			repl_log(stdout, TRUE, TRUE, "Waited about %d seconds for connection from primary source server\n",
				MAX_WAIT_FOR_FETCHRESYNC_CONN);
			rts_error(VARLSTCNT(6) ERR_REPLCOMM, 0, ERR_TEXT, 2,
				RTS_ERROR_LITERAL("Waited too long to get a connection request. Check if primary is alive."));
		}
		ACCEPT_SOCKET(gtmrecv_listen_sock_fd, (struct sockaddr *)&primary_addr,
			(GTM_SOCKLEN_TYPE *)&primary_addr_len, gtmrecv_sock_fd);
		if (0 > gtmrecv_sock_fd)
		{
#ifdef __hpux
			/* ENOBUFS in HP-UX is either because of a memory problem or when we have received a RST just
			 * after a SYN before an accept call. Normally this is not fatal and is just a transient state.
			 * Hence exiting just after a single error of this kind should not be done. So retry in case of
			 * HP-UX and ENOBUFS error.
			 */
			if (ENOBUFS == errno)
			{
				retry_num = 0;
				/* In case of succeeding with select in first go, accept will still get 5ms time difference */
				while (HPUX_MAX_RETRIES > retry_num)
				{
					SHORT_SLEEP(5);
					FD_ZERO(&input_fds);
					FD_SET(gtmrecv_listen_sock_fd, &input_fds);
					/* Since we use Blocking socket, check before re-trying whether there is a
					 * connection to be accepted. Timeout of HPUX_SEL_TIMEOUT. In case the earlier
					 * connection is not available there can be some time gap between the time the
					 * error occurred and the new client requests coming in.
					 */
					for ( ; HPUX_MAX_RETRIES > retry_num; retry_num++)
					{
						FD_ZERO(&input_fds);
						FD_SET(gtmrecv_listen_sock_fd, &input_fds);
						sel_timeout_val.tv_sec = 0;
						sel_timeout_val.tv_usec = HPUX_SEL_TIMEOUT;
						status = select(gtmrecv_listen_sock_fd + 1, &input_fds, NULL,
								NULL, &sel_timeout_val);
						if (0 < status)
							break;
						else
							SHORT_SLEEP(5);
					}
					/* NOTE(review): if every select above timed out (status == 0) the accept below
					 * is still attempted; with a blocking listen socket that presumably waits until
					 * a connection shows up - confirm this is intended.
					 */
					if (0 > status)
						rts_error(VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2,
							RTS_ERROR_LITERAL("Error in select on listen socket after ENOBUFS error"), errno);
					else
					{
						ACCEPT_SOCKET(gtmrecv_listen_sock_fd, (struct sockaddr *)&primary_addr,
							(GTM_SOCKLEN_TYPE *)&primary_addr_len, gtmrecv_sock_fd);
						if ((0 > gtmrecv_sock_fd) && (errno == ENOBUFS))
							retry_num++;
						else
							break;
					}
				}
			}
			if (0 > gtmrecv_sock_fd)
#endif
			{
				rts_error(VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2,
					RTS_ERROR_LITERAL("Error accepting connection from Source Server"), errno);
			}
		}
		/* Only one connection is serviced per attempt; stop listening once it is accepted */
		repl_close(&gtmrecv_listen_sock_fd);
		if (0 != (status = get_send_sock_buff_size(gtmrecv_sock_fd, &repl_max_send_buffsize))
			|| 0 != (status = get_recv_sock_buff_size(gtmrecv_sock_fd, &repl_max_recv_buffsize)))
		{
			rts_error(VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2,
				LEN_AND_LIT("Error getting socket send/recv buffsizes"), status);
			return ERR_REPLCOMM;
		}
		repl_log(stdout, TRUE, TRUE, "Connection established, using TCP send buffer size %d receive buffer size %d\n",
			repl_max_send_buffsize, repl_max_recv_buffsize);
		repl_log_conn_info(gtmrecv_sock_fd, stdout);
		/* Send REPL_FETCH_RESYNC message */
		memset(&resync_msg, 0, SIZEOF(resync_msg));
		/* If we assume remote primary is multisite capable, we need to send the journal seqno of this instance
		 * for comparison. If on the other hand, it is assumed to be only dualsite capable, we need to send the
		 * dualsite_resync_seqno of this instance which is maintained in "jgbl.max_dualsite_resync_seqno".
		 */
		if (REPL_PROTO_VER_DUALSITE != assumed_remote_proto_ver)
			resync_msg.resync_seqno = max_reg_seqno;
		else
			resync_msg.resync_seqno = jgbl.max_dualsite_resync_seqno;
		assert(resync_msg.resync_seqno);
		resync_msg.proto_ver = REPL_PROTO_VER_THIS;
		resync_msg.node_endianness = NODE_ENDIANNESS;
		/* Endianness of the source node is (re)learnt from the first message received on this connection */
		(TREF(replgbl)).src_node_same_endianness = TRUE;
		(TREF(replgbl)).src_node_endianness_known = FALSE;
		gtmrecv_repl_send((repl_msg_ptr_t)&resync_msg, REPL_FETCH_RESYNC, MIN_REPL_MSGLEN,
			"REPL_FETCH_RESYNC", resync_msg.resync_seqno);
		if (repl_connection_reset)
		{	/* Connection got reset during the above send */
			rts_error(VARLSTCNT(1) ERR_REPLCOMM);
			return ERR_REPLCOMM;
		}
		/* Wait for REPL_RESYNC_SEQNO (if dual-site primary) or REPL_NEED_INSTANCE_INFO (if multi-site primary) message */
		do
		{
			wait_count = MAX_ATTEMPTS_FOR_FETCH_RESYNC;
			REPL_RECV_LOOP(gtmrecv_sock_fd, &msg, MIN_REPL_MSGLEN, FALSE, &gtmrecv_fetchresync_poll)
			{	/* Loop body runs each time a poll ends without the complete message */
				if (0 >= wait_count)
					break;
				repl_log(stdout, TRUE, TRUE, "Waiting for REPL_NEED_INSTANCE_INFO or REPL_NEED_TRIPLE_INFO"
					" or REPL_RESYNC_SEQNO\n");
				wait_count--;
			}
			if (status != SS_NORMAL)
			{
				if (EREPL_RECV == repl_errno)
					rts_error(VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2,
						RTS_ERROR_LITERAL("Error receiving RESYNC JNLSEQNO. Error in recv"), status);
				if (EREPL_SELECT == repl_errno)
					rts_error(VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2,
						RTS_ERROR_LITERAL("Error receiving RESYNC JNLSEQNO. Error in select"), status);
			}
			if (wait_count <= 0)
				rts_error(VARLSTCNT(6) ERR_REPLCOMM, 0, ERR_TEXT, 2,
					LEN_AND_LIT("Waited too long to get message from primary. Check if primary is alive."));
			if (!(TREF(replgbl)).src_node_endianness_known)
			{	/* A message type that is out of range as received but in range once byte-swapped
				 * indicates that the source node is of the opposite endianness.
				 */
				(TREF(replgbl)).src_node_endianness_known = TRUE;
				if ((REPL_MSGTYPE_LAST < msg.type) && (REPL_MSGTYPE_LAST > GTM_BYTESWAP_32(msg.type)))
					(TREF(replgbl)).src_node_same_endianness = FALSE;
				else
					(TREF(replgbl)).src_node_same_endianness = TRUE;
			}
			if (!(TREF(replgbl)).src_node_same_endianness)
			{
				msg.type = GTM_BYTESWAP_32(msg.type);
				msg.len = GTM_BYTESWAP_32(msg.len);
			}
			switch(msg.type)
			{
				case REPL_NEED_INSTANCE_INFO:
					need_instinfo_msg = (repl_needinst_msg_ptr_t)&msg;
					repl_log(stdout, TRUE, TRUE, "Received REPL_NEED_INSTANCE_INFO message from primary "
						"instance [%s]\n", need_instinfo_msg->instname);
					murgbl.remote_proto_ver = need_instinfo_msg->proto_ver;
					assert(REPL_PROTO_VER_DUALSITE != murgbl.remote_proto_ver);
					assert(REPL_PROTO_VER_UNINITIALIZED != murgbl.remote_proto_ver);
					assert(REPL_PROTO_VER_MULTISITE <= murgbl.remote_proto_ver);
					memset(&instinfo_msg, 0, SIZEOF(instinfo_msg));
					assert(NULL != jnlpool.repl_inst_filehdr);
					memcpy(instinfo_msg.instname, jnlpool.repl_inst_filehdr->this_instname,
						MAX_INSTNAME_LEN - 1);
					instinfo_msg.was_rootprimary = (unsigned char)repl_inst_was_rootprimary();
					murgbl.was_rootprimary = instinfo_msg.was_rootprimary;
					gtmrecv_repl_send((repl_msg_ptr_t)&instinfo_msg, REPL_INSTANCE_INFO,
						MIN_REPL_MSGLEN, "REPL_INSTANCE_INFO", MAX_SEQNO);
					if (instinfo_msg.was_rootprimary && !need_instinfo_msg->is_rootprimary)
						rts_error(VARLSTCNT(4) ERR_PRIMARYNOTROOT, 2,
							LEN_AND_STR((char *)need_instinfo_msg->instname));
					break;
				case REPL_NEED_TRIPLE_INFO:
					need_tripleinfo_msg = RECAST(repl_needtriple_msg_ptr_t)&msg;
					if ((TREF(replgbl)).src_node_same_endianness)
						triple_seqnum = need_tripleinfo_msg->seqno;
					else
						triple_seqnum = GTM_BYTESWAP_64(need_tripleinfo_msg->seqno);
					repl_log(stdout, TRUE, TRUE, "Received REPL_NEED_TRIPLE_INFO message for seqno "
						"%llu [0x%llx]\n", triple_seqnum, triple_seqnum);
					assert(REPL_PROTO_VER_UNINITIALIZED != murgbl.remote_proto_ver);
					assert(NULL != jnlpool.jnlpool_dummy_reg);
					repl_inst_ftok_sem_lock();
					status = repl_inst_wrapper_triple_find_seqno(triple_seqnum, &triple, &triple_num);
					repl_inst_ftok_sem_release();
					if (0 != status)
					{	/* Close the connection. The function call above would have issued the error. */
						assert(ERR_REPLINSTNOHIST == status);
						repl_log(stdout, TRUE, TRUE, "Connection reset due to REPLINSTNOHIST error\n");
						repl_connection_reset = TRUE;
						repl_close(&gtmrecv_sock_fd);
						/* Pop the condition handler established above before leaving, just
						 * like the normal exit path does.
						 */
						REVERT;
						return status;
					}
					gtmrecv_send_triple_info(&triple, triple_num);
					break;
				case REPL_INST_NOHIST:
					repl_log(stdout, TRUE, TRUE, "Originating instance encountered a REPLINSTNOHIST error."
						" JNL_SEQNO of this replicating instance precedes the current history in the "
						"originating instance file. Rollback exiting.\n");
					status = ERR_REPLINSTNOHIST;
					repl_log(stdout, TRUE, TRUE, "Connection reset due to REPLINSTNOHIST error on primary\n");
					repl_connection_reset = TRUE;
					repl_close(&gtmrecv_sock_fd);
					/* Pop the condition handler established above before leaving, just like the
					 * normal exit path does.
					 */
					REVERT;
					return status;
				case REPL_RESYNC_SEQNO:
					repl_log(stdout, TRUE, TRUE, "Received REPL_RESYNC_SEQNO message\n");
					/* No REPL_NEED_INSTANCE_INFO was seen on this connection, so the primary is
					 * taken to be dualsite.
					 */
					if (REPL_PROTO_VER_UNINITIALIZED == murgbl.remote_proto_ver)
						murgbl.remote_proto_ver = REPL_PROTO_VER_DUALSITE;
					assert(REPL_PROTO_VER_DUALSITE <= murgbl.remote_proto_ver);
					break;
				default:
					repl_log(stdout, TRUE, TRUE, "Message of unknown type (%d) received\n", msg.type);
					assert(FALSE);
					rts_error(VARLSTCNT(1) ERR_REPLCOMM);
					break;
			}
		} while (!repl_connection_reset && (REPL_RESYNC_SEQNO != msg.type));
		if (repl_connection_reset)
		{	/* Connection got reset during the above send */
			rts_error(VARLSTCNT(1) ERR_REPLCOMM);
			return ERR_REPLCOMM;
		}
		if ((TREF(replgbl)).src_node_same_endianness)
			QWASSIGN(*resync_seqno, *(seq_num *)&msg.msg[0]);
		else
			QWASSIGN(*resync_seqno, GTM_BYTESWAP_64(*(seq_num *)&msg.msg[0]));
		/* Wait till connection is broken or REPL_CONN_CLOSE is received */
		REPL_RECV_LOOP(gtmrecv_sock_fd, &msg, MIN_REPL_MSGLEN, FALSE, &gtmrecv_fetchresync_poll)
		{
			REPL_DPRINT1("FETCH_RESYNC : Waiting for source to send CLOSE_CONN or connection breakage\n");
		}
		repl_close(&gtmrecv_sock_fd);
		/* Check if our assumed remote protocol version matches the actual. If so, no need for further communication.
		 * If not, we need to reset our assumed remote protocol version, and reconnect using the newly assumed protocol
		 * version. This is because if the remote side is dualsite, we will send the resync seqno across and if it is
		 * multisite, we will send the jnl seqno across. But if the resync seqno and jnl seqno are not different, there
		 * is no need to disconnect. Keep retrying until the assumed and actual protocol versions match.
		 */
		if (REPL_PROTO_VER_DUALSITE != assumed_remote_proto_ver)
		{
			if (REPL_PROTO_VER_DUALSITE != murgbl.remote_proto_ver)
				break;	/* Both assumed and actual is multisite */
			/* Assumed multisite, but actual is dualsite. */
			if (max_reg_seqno == jgbl.max_dualsite_resync_seqno)
				break;	/* avoid disconnect/reconnect if both jnl and resync seqnos are same */
			assumed_remote_proto_ver = REPL_PROTO_VER_DUALSITE;
			repl_log(stdout, TRUE, TRUE, "Primary does not support multisite functionality. Reconnecting "
				"using dualsite communication protocol.\n");
		} else
		{
			if (REPL_PROTO_VER_DUALSITE == murgbl.remote_proto_ver)
				break;	/* Both assumed and actual is dualsite */
			/* Assumed dualsite, but actual is multisite. */
			if (max_reg_seqno == jgbl.max_dualsite_resync_seqno)
				break;	/* avoid disconnect/reconnect if both jnl and resync seqnos are same */
			assumed_remote_proto_ver = REPL_PROTO_VER_MULTISITE;
			repl_log(stdout, TRUE, TRUE, "Primary supports multisite functionality. Reconnecting "
				"using multisite communication protocol.\n");
		}
	} while (TRUE);
	REVERT;
	repl_log(stdout, TRUE, TRUE, "Received RESYNC SEQNO is %llu [0x%llx]\n", *resync_seqno, *resync_seqno);
	assert((*resync_seqno <= max_reg_seqno) || (REPL_PROTO_VER_DUALSITE == murgbl.remote_proto_ver));
	return SS_NORMAL;
}