336 lines
13 KiB
C
336 lines
13 KiB
C
/****************************************************************
|
|
* *
|
|
* Copyright 2006, 2013 Fidelity Information Services, Inc *
|
|
* *
|
|
* This source code contains the intellectual property *
|
|
* of its copyright holder(s), and is made available *
|
|
* under a license. If you do not know the terms of *
|
|
* the license, please stop and do not read further. *
|
|
* *
|
|
****************************************************************/
|
|
|
|
#include "mdef.h"
|
|
|
|
#include "gtm_unistd.h"
|
|
#include "gtm_string.h"
|
|
#include "gtm_inet.h"
|
|
#include "gtm_fcntl.h"
|
|
#include "gtm_socket.h"
|
|
#include <sys/mman.h>
|
|
#include <sys/param.h>
|
|
#include <sys/time.h>
|
|
#include <errno.h>
|
|
#include <sys/sem.h>
|
|
#include "repl_instance.h"
|
|
#include "gdsroot.h"
|
|
#include "gdsblk.h"
|
|
#include "gtm_facility.h"
|
|
#include "fileinfo.h"
|
|
#include "gdsbt.h"
|
|
#include "gdsfhead.h"
|
|
#include "filestruct.h"
|
|
#include "repl_msg.h"
|
|
#include "gtmsource.h"
|
|
#include "repl_dbg.h"
|
|
#include "gtm_stdio.h"
|
|
#include "repl_shutdcode.h"
|
|
#include "repl_sem.h"
|
|
#include "is_proc_alive.h"
|
|
#include "repl_comm.h"
|
|
#include "repl_log.h"
|
|
#include "ftok_sems.h"
|
|
#include "gtm_c_stack_trace.h"
|
|
#ifdef DEBUG
|
|
#include "wbox_test_init.h"
|
|
#include "gtmio.h"
|
|
#include "anticipatory_freeze.h"
|
|
#include "gtm_threadgbl.h"
|
|
#endif
|
|
|
|
GBLREF jnlpool_addrs jnlpool;
|
|
GBLREF jnlpool_ctl_ptr_t jnlpool_ctl;
|
|
GBLREF uint4 process_id;
|
|
GBLREF int gtmsource_srv_count;
|
|
GBLREF gtmsource_options_t gtmsource_options;
|
|
GBLREF int4 jnlpool_shmid;
|
|
GBLREF boolean_t is_src_server;
|
|
GBLREF void (*call_on_signal)();
|
|
GBLREF boolean_t holds_sem[NUM_SEM_SETS][NUM_SRC_SEMS];
|
|
GBLREF boolean_t pool_init;
|
|
GBLREF gd_addr *gd_header;
|
|
|
|
error_def(ERR_JNLPOOLSETUP);
|
|
error_def(ERR_REPLFTOKSEM);
|
|
error_def(ERR_TEXT);
|
|
|
|
int gtmsource_shutdown(boolean_t auto_shutdown, int exit_status)
|
|
{
|
|
boolean_t all_dead, first_time, sem_incremented, regrab_lock;
|
|
uint4 savepid[NUM_GTMSRC_LCL];
|
|
int status, shutdown_status, save_errno;
|
|
int4 index, maxindex, lcnt, num_src_servers_running;
|
|
unix_db_info *udi;
|
|
gtmsource_local_ptr_t gtmsourcelocal_ptr;
|
|
#ifdef DEBUG
|
|
DCL_THREADGBL_ACCESS;
|
|
|
|
SETUP_THREADGBL_ACCESS;
|
|
#endif
|
|
/* Significance of shutdown field in gtmsource_local:
|
|
* This field is initially set to NO_SHUTDOWN. When a command to shut down the source server is issued,
|
|
* the process initiating the shutdown sets this field to SHUTDOWN. The Source Server on sensing
|
|
* that it has to shut down (reads SHUTDOWN in the shutdown field), flushes the database regions, writes
|
|
* (NORMAL_SHUTDOWN + its exit value) into this field and exits. On seeing a non SHUTDOWN value
|
|
* in this field, the process which initiated the shutdown removes the ipcs and exits with the exit value
|
|
* which is a combination of gtmsource_local->shutdown and its own exit value.
|
|
*
|
|
* Note : Exit values should be positive for error indication, zero for normal exit.
|
|
*/
|
|
call_on_signal = NULL; /* Don't reenter on error */
|
|
assert(pool_init); /* should have attached to the journal pool before coming here */
|
|
udi = (unix_db_info *)FILE_INFO(jnlpool.jnlpool_dummy_reg);
|
|
if (!auto_shutdown)
|
|
{ /* ftok semaphore and jnlpool access semaphore should already be held from the previous call to "jnlpool_init" */
|
|
assert(udi->grabbed_ftok_sem);
|
|
assert(holds_sem[SOURCE][JNL_POOL_ACCESS_SEM]);
|
|
if (NULL != jnlpool.gtmsource_local)
|
|
{ /* Shutdown source server for the secondary instance specified in the command line */
|
|
savepid[0] = jnlpool.gtmsource_local->gtmsource_pid;
|
|
/* Set flag to signal concurrently running source server to shutdown */
|
|
jnlpool.gtmsource_local->shutdown = SHUTDOWN;
|
|
repl_log(stdout, TRUE, TRUE, "Initiating SHUTDOWN operation on source server pid [%d] for secondary"
|
|
" instance [%s]\n", savepid[0], jnlpool.gtmsource_local->secondary_instname);
|
|
maxindex = 1; /* Only one process id to check */
|
|
} else
|
|
{ /* Shutdown ALL source servers that are up and running */
|
|
gtmsourcelocal_ptr = &jnlpool.gtmsource_local_array[0];
|
|
for (maxindex = 0, index = 0; index < NUM_GTMSRC_LCL; index++, gtmsourcelocal_ptr++)
|
|
{
|
|
savepid[index] = gtmsourcelocal_ptr->gtmsource_pid;
|
|
if (0 < savepid[index])
|
|
{
|
|
gtmsourcelocal_ptr->shutdown = SHUTDOWN;
|
|
repl_log(stdout, TRUE, TRUE, "Initiating SHUTDOWN operation on source server pid [%d] "
|
|
"for secondary instance [%s]\n", savepid[index],
|
|
gtmsourcelocal_ptr->secondary_instname);
|
|
maxindex = index + 1; /* Check at least until pid corresponding to "index" */
|
|
}
|
|
}
|
|
}
|
|
/* Wait for source server(s) to die. But before that release ftok semaphore and jnlpool access control semaphore.
|
|
* This way, other processes (either in this environment or a different one) don't encounter startup issues.
|
|
* However, to ensure that a concurrent argument-less rundown doesn't remove these semaphores (in case they
|
|
* are orphaned), increment the counter semaphore.
|
|
*/
|
|
if (0 != incr_sem(SOURCE, SRC_SERV_COUNT_SEM))
|
|
{
|
|
save_errno = errno;
|
|
repl_log(stderr, TRUE, TRUE, "Could not increment Journal Pool counter semaphore : %s. "
|
|
"Shutdown did not complete\n", STRERROR(save_errno));
|
|
/* Even though we hold the FTOK and JNL_POOL_ACCESS_SEM before entering this function (as ensured by
|
|
* asserts above), it is safe to release them in case of a premature error (like this one). The caller
|
|
* doesn't rely on the semaphores being held and this function is designed to release these semaphores
|
|
* eventually anyways (after gtmsource_ipc_cleanup())
|
|
*/
|
|
repl_inst_ftok_sem_release();
|
|
status = rel_sem(SOURCE, JNL_POOL_ACCESS_SEM);
|
|
assert(0 == status);
|
|
return ABNORMAL_SHUTDOWN;
|
|
}
|
|
if (0 != rel_sem(SOURCE, JNL_POOL_ACCESS_SEM))
|
|
{
|
|
save_errno = errno;
|
|
repl_log(stderr, TRUE, TRUE, "Could not release Journal Pool access control semaphore : %s. "
|
|
"Shutdown did not complete\n", STRERROR(save_errno));
|
|
repl_inst_ftok_sem_release(); /* see comment above for why this is okay */
|
|
status = decr_sem(SOURCE, SRC_SERV_COUNT_SEM);
|
|
assert(0 == status);
|
|
return ABNORMAL_SHUTDOWN;
|
|
}
|
|
repl_inst_ftok_sem_release();
|
|
regrab_lock = sem_incremented = TRUE;
|
|
gvinit(); /* Get the gd header*/
|
|
/* Wait for ONE particular or ALL source servers to die */
|
|
repl_log(stdout, TRUE, TRUE, "Waiting for upto [%d] seconds for the source server to shutdown\n",
|
|
GTMSOURCE_MAX_SHUTDOWN_WAITLOOP);
|
|
for (lcnt = 1; GTMSOURCE_MAX_SHUTDOWN_WAITLOOP >= lcnt; lcnt++)
|
|
{
|
|
all_dead = TRUE;
|
|
for (index = 0; index < maxindex; index++)
|
|
{
|
|
if ((0 < savepid[index]) && is_proc_alive(savepid[index], 0))
|
|
{
|
|
all_dead = FALSE;
|
|
# ifdef DEBUG
|
|
if (!(lcnt % 60))
|
|
GET_C_STACK_FROM_SCRIPT("ERR_SHUTDOWN_INFO", process_id, savepid[index], lcnt);
|
|
# endif
|
|
}
|
|
}
|
|
if (!all_dead)
|
|
SHORT_SLEEP(GTMSOURCE_WAIT_FOR_SHUTDOWN)
|
|
else
|
|
break;
|
|
}
|
|
if (GTMSOURCE_MAX_SHUTDOWN_WAITLOOP < lcnt)
|
|
{ /* Max timeout over, take stack trace of all the source server(s) which are still running.
|
|
* Display the list of pids that wont die along with the secondary instances they correspond to.
|
|
* Users need to kill these pids and reissue the shutdown command for the journal pool to be cleaned up.
|
|
*/
|
|
repl_log(stderr, TRUE, TRUE, "Error : Timed out waiting for following source server process(es) to die\n");
|
|
for (lcnt = 0, index = 0; index < maxindex; index++)
|
|
{
|
|
if ((0 < savepid[index]) && is_proc_alive(savepid[index], 0))
|
|
{
|
|
lcnt++;
|
|
GET_C_STACK_FROM_SCRIPT("ERR_SHUTDOWN", process_id, savepid[index], lcnt);
|
|
if (NULL != jnlpool.gtmsource_local)
|
|
{
|
|
assert(0 == index);
|
|
gtmsourcelocal_ptr = jnlpool.gtmsource_local;
|
|
} else
|
|
gtmsourcelocal_ptr = &jnlpool.gtmsource_local_array[index];
|
|
repl_log(stderr, FALSE, FALSE,
|
|
" ---> Source server pid [%d] for secondary instance [%s] is still alive\n",
|
|
savepid[index], gtmsourcelocal_ptr->secondary_instname);
|
|
}
|
|
}
|
|
repl_log(stderr, FALSE, TRUE, "Shutdown cannot proceed. Stop the above processes and reissue "
|
|
"the shutdown command.\n");
|
|
status = decr_sem(SOURCE, SRC_SERV_COUNT_SEM);
|
|
assert(0 == status);
|
|
return ABNORMAL_SHUTDOWN;
|
|
}
|
|
} else
|
|
{
|
|
sem_incremented = FALSE;
|
|
if (gtmsource_srv_count)
|
|
{
|
|
repl_log(stdout, TRUE, TRUE, "Initiating shut down\n");
|
|
/* A non-zero gtmsource_srv_count indicates we are the spawned off child source server. That means we
|
|
* are not holding any semaphores. More importantly, none of the source server's mainline code holds
|
|
* the ftok or the access control semaphore anymore. So, even if we reach here due to an external signal
|
|
* we are guaranteed that we don't hold any semaphores. Assert that.
|
|
*/
|
|
assert(!udi->grabbed_ftok_sem);
|
|
assert(!holds_sem[SOURCE][JNL_POOL_ACCESS_SEM]);
|
|
regrab_lock = TRUE;
|
|
} else
|
|
{
|
|
assert(udi->grabbed_ftok_sem);
|
|
assert(holds_sem[SOURCE][JNL_POOL_ACCESS_SEM]);
|
|
/* Do not release lock as this is a case of the source server startup command coming here after the
|
|
* forked off child source server errored out at startup itself. Just in case the jnlpool has not
|
|
* yet been initialized (possible if this process created the journal pool) we do not want to
|
|
* release the lock and let someone else sneak in and see uninitialized data. Better to remove the
|
|
* journal pool before anyone can come in. Hence hold on to the lock.
|
|
*/
|
|
regrab_lock = FALSE;
|
|
}
|
|
}
|
|
if (regrab_lock)
|
|
{ /* Now that the source servers are shutdown, regrab the FTOK and access control semaphore (IN THAT ORDER to avoid
|
|
* deadlocks)
|
|
*/
|
|
repl_inst_ftok_sem_lock();
|
|
# ifdef DEBUG
|
|
/* Sleep for a few seconds to test for concurrent argument-less RUNDOWN to ensure that the latter doesn't remove
|
|
* the JNL_POOL_ACCESS_SEM under the assumption that it is orphaned.
|
|
*/
|
|
if (gtm_white_box_test_case_enabled && (WBTEST_LONGSLEEP_IN_REPL_SHUTDOWN == gtm_white_box_test_case_number))
|
|
{
|
|
DBGFPF((stderr, "GTMSOURCE_SHUTDOWN is about to start long sleep\n"));
|
|
LONG_SLEEP(10);
|
|
}
|
|
# endif
|
|
if (0 > (status = grab_sem(SOURCE, JNL_POOL_ACCESS_SEM)))
|
|
{
|
|
save_errno = errno;
|
|
repl_log(stderr, TRUE, TRUE, "Could not acquire Journal Pool access control semaphore : %s. "
|
|
"Shutdown not complete\n", STRERROR(save_errno));
|
|
repl_inst_ftok_sem_release();
|
|
status = decr_sem(SOURCE, SRC_SERV_COUNT_SEM);
|
|
assert(0 == status);
|
|
return ABNORMAL_SHUTDOWN;
|
|
}
|
|
/* Now that the locks are re-acquired, decrease the counter sempahore */
|
|
if (sem_incremented && (0 > (status = decr_sem(SOURCE, SRC_SERV_COUNT_SEM))))
|
|
{
|
|
save_errno = errno;
|
|
repl_log(stderr, TRUE, TRUE, "Could not decrement Journal Pool counter semaphore : %s."
|
|
"Shutdown not complete\n", STRERROR(save_errno));
|
|
repl_inst_ftok_sem_release();
|
|
status = rel_sem(SOURCE, JNL_POOL_ACCESS_SEM);
|
|
assert(0 == status);
|
|
return ABNORMAL_SHUTDOWN;
|
|
}
|
|
}
|
|
if (!auto_shutdown)
|
|
{
|
|
first_time = TRUE;
|
|
for (index = 0; index < maxindex; index++)
|
|
{
|
|
if (NULL != jnlpool.gtmsource_local)
|
|
{
|
|
assert(0 == index);
|
|
gtmsourcelocal_ptr = jnlpool.gtmsource_local;
|
|
} else
|
|
gtmsourcelocal_ptr = &jnlpool.gtmsource_local_array[index];
|
|
exit_status = gtmsourcelocal_ptr->shutdown;
|
|
if (SHUTDOWN == exit_status)
|
|
{
|
|
if (0 == savepid[index]) /* No source server */
|
|
exit_status = NORMAL_SHUTDOWN;
|
|
else /* Source Server crashed */
|
|
{
|
|
repl_log(stderr, first_time, TRUE, "Source Server pid [%d] (secondary instance [%s])"
|
|
" exited abnormally. MUPIP RUNDOWN might be warranted\n",
|
|
savepid[index], gtmsourcelocal_ptr->secondary_instname);
|
|
first_time = FALSE;
|
|
}
|
|
}
|
|
}
|
|
if (!first_time) /* At least one source server did not exit normally. Reset "exit_status" */
|
|
exit_status = ABNORMAL_SHUTDOWN;
|
|
}
|
|
shutdown_status = exit_status;
|
|
/* gtmsource_ipc_cleanup will not be successful unless source server has completely exited.
|
|
* It relies on SRC_SERV_COUNT_SEM value. One thing to note here is that if shutdown of a specific source server
|
|
* is requested and that is successfully shutdown we should return NORMAL_SHUTDOWN if other source servers
|
|
* are running (currently returned as an ABNORMAL_SHUTDOWN "exit_status" in "gtmsource_ipc_cleanup". But if any
|
|
* other error occurs in that function causing it to return ABNORMAL_SHUTDOWN, then we should return ABNORMAL_SHUTDOWN
|
|
* from this function as well.
|
|
*/
|
|
if (FALSE == gtmsource_ipc_cleanup(auto_shutdown, &exit_status, &num_src_servers_running))
|
|
rel_sem_immediate(SOURCE, JNL_POOL_ACCESS_SEM);
|
|
else
|
|
{ /* Journal Pool and Access Control Semaphores removed. Invalidate corresponding fields in file header */
|
|
assert(NORMAL_SHUTDOWN == exit_status);
|
|
repl_inst_jnlpool_reset();
|
|
}
|
|
if (!ftok_sem_release(jnlpool.jnlpool_dummy_reg, TRUE, FALSE))
|
|
rts_error(VARLSTCNT(1) ERR_JNLPOOLSETUP);
|
|
assert(!num_src_servers_running || (ABNORMAL_SHUTDOWN == exit_status));
|
|
return (((1 == maxindex) && num_src_servers_running) ? shutdown_status : exit_status);
|
|
}
|
|
|
|
void gtmsource_stop(boolean_t exit)
|
|
{
|
|
int status;
|
|
|
|
assert(gtmsource_srv_count || is_src_server);
|
|
status = gtmsource_end1(TRUE);
|
|
status = gtmsource_shutdown(TRUE, status) - NORMAL_SHUTDOWN;
|
|
if (exit)
|
|
gtmsource_exit(status);
|
|
return;
|
|
}
|
|
|
|
void gtmsource_sigstop(void)
|
|
{
|
|
if (is_src_server)
|
|
gtmsource_stop(FALSE);
|
|
return;
|
|
}
|
|
|