fis-gtm/sr_port/wcs_phase2_commit_wait.c

/****************************************************************
 *								*
 *	Copyright 2008, 2010 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include "gtm_facility.h"
#include "gdsroot.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "sleep_cnt.h"
#include "gdsbgtr.h"
#include "memcoherency.h"

/* Include prototypes */
#include "wcs_phase2_commit_wait.h"
#include "gt_timer.h"
#include "wcs_sleep.h"
#include "rel_quant.h"
#include "send_msg.h"
#include "gtm_c_stack_trace.h"

/* Report a process that is still holding up a phase2 commit wait and try to capture its C-stack.
 *
 * BLOCKING_PID : pid value read from a cache-record's in_tend field; nothing is done if it is zero.
 * STUCK_CNT    : modifiable counter; incremented and then passed on to GET_C_STACK_FROM_SCRIPT.
 * CR           : cache-record pointer whose "blk" field is reported in the COMMITWAITPID message.
 * CSA          : sgmnt_addrs pointer whose "region" is handed to DB_LEN_STR for the COMMITWAITPID message.
 *
 * Every argument is parenthesized in the expansion. BLOCKING_PID and STUCK_CNT are evaluated more than once, so
 * pass side-effect-free expressions. The body is wrapped in do/while(0) so that an invocation followed by a
 * semicolon is exactly one statement (safe even as the unbraced body of an if/else).
 */
#define	SEND_COMMITWAITPID_GET_STACK_IF_NEEDED(BLOCKING_PID, STUCK_CNT, CR, CSA)					\
do																\
{																\
	GBLREF	uint4	process_id;												\
																\
	error_def(ERR_COMMITWAITPID);												\
																\
	if (BLOCKING_PID)													\
	{															\
		(STUCK_CNT)++;													\
		GET_C_STACK_FROM_SCRIPT("COMMITWAITPID", process_id, (BLOCKING_PID), (STUCK_CNT));				\
		send_msg(VARLSTCNT(8) ERR_COMMITWAITPID, 6, process_id, 1, (BLOCKING_PID), (CR)->blk,				\
			DB_LEN_STR((CSA)->region));										\
	}															\
} while (0)

/* Take a C-stack trace of the process doing the phase2 commit once half of the entire wait has elapsed.
 *
 * BLOCKING_PID : pid value read from a cache-record's in_tend field; nothing is done if it is zero or if it is
 *		  our own process_id.
 * STUCK_CNT    : modifiable counter; incremented and then passed on to GET_C_STACK_FROM_SCRIPT.
 *
 * Both arguments are parenthesized in the expansion but are evaluated more than once, so pass side-effect-free
 * expressions. The body is wrapped in do/while(0) so that an invocation followed by a semicolon is exactly one
 * statement (safe even as the unbraced body of an if/else).
 */
#define GET_STACK_AT_HALF_WAIT_IF_NEEDED(BLOCKING_PID, STUCK_CNT)						\
do														\
{														\
	GBLREF	uint4	process_id;										\
														\
	if ((BLOCKING_PID) && (process_id != (BLOCKING_PID)))							\
	{													\
		(STUCK_CNT)++;											\
		GET_C_STACK_FROM_SCRIPT("COMMITWAITPID_HALF_WAIT", process_id, (BLOCKING_PID), (STUCK_CNT));	\
	}													\
} while (0)

/* Globals defined elsewhere that this file reads (none of them is written here) */
GBLREF	uint4		process_id;			/* our own pid; compared against cr->in_tend and reported in messages */
GBLREF	boolean_t	mu_rndwn_file_dbjnl_flush;	/* asserted to be FALSE on entry to wcs_phase2_commit_wait */
GBLREF	boolean_t	gtm_white_box_test_case_enabled;/* asserted TRUE when we find our own pid in cr->in_tend outside crit */
GBLREF	int		process_exiting;		/* if non-zero, the heartbeat-based wait is not used */

#ifdef UNIX
/* Both are read to decide whether (and for how long) the heartbeat-based wait can be used */
GBLREF	volatile uint4		heartbeat_counter;	/* sampled at start of wait; the difference bounds the wait time */
GBLREF	volatile boolean_t	timer_in_handler;	/* if non-zero, the heartbeat-based wait is not used */
#endif

/* Wait for phase2 commits to finish, either for the whole database or for one particular cache-record.
 *
 * Parameters:
 *	csa - the database to wait on. Its file header (csa->hdr), node-local area (csa->nl), crit state
 *	      (csa->now_crit) and region (csa->region, for messages) are used.
 *	cr  - if cr == NULL, wait a maximum of 1 minute for ALL processes actively in bg_update_phase2 to finish
 *	      (i.e. until cnl->wcs_phase2_commit_pidcnt is seen as zero). The caller is asserted to hold crit.
 *	      if cr != NULL, wait a maximum of 1 minute for the particular cache-record to be done with phase2 commit
 *	      (i.e. until cr->in_tend is seen to differ from the value it had when we started waiting).
 *
 * This routine is invoked inside and outside of crit. If we hold crit, then we are guaranteed that cr->in_tend
 * cannot get reset to a non-zero value different from what we saw when we started waiting. This is not
 * guaranteed if we don't hold crit. In that case, we wait until cr->in_tend changes in value (zero or non-zero).
 *
 * Returns : TRUE if waiting event completed before timeout, FALSE otherwise.
 *	TRUE is also returned right away (without waiting) if the access method is not BG, if cr->in_tend is already
 *	zero, or if we don't hold crit and cr->in_tend is our own pid.
 *	FALSE is also returned right away if we are waiting on a cache-record without crit and csd->wc_blocked is set.
 *	On a timeout, a COMMITWAITPID message is sent for each blocking pid found (along with an attempt to get its
 *	C-stack) followed by a COMMITWAITSTUCK message, all before returning FALSE.
 */
boolean_t	wcs_phase2_commit_wait(sgmnt_addrs *csa, cache_rec_ptr_t cr)
{
	sgmnt_data_ptr_t	csd;
	node_local_ptr_t        cnl;
	uint4			lcnt, blocking_pid, start_in_tend, spincnt, maxspincnt, start_heartbeat, heartbeat_delta;
	int4			value;
	boolean_t		was_crit;
	boolean_t		use_heartbeat;
	block_id		blk;	/* NOTE(review): not referenced anywhere in this function */
#	ifdef VMS
	uint4			heartbeat_counter = 0;	/* dummy variable to make compiler happy */
#	endif
	int4			index, crarray_size, crarray_index;
	cache_rec_ptr_t		cr_lo, cr_top, curcr;
	phase2_wait_trace_t	crarray[MAX_PHASE2_WAIT_CR_TRACE_SIZE];	/* one (pid, cr) entry per distinct blocking pid */
#	ifdef DEBUG
	uint4			incrit_pid, phase2_commit_half_wait;
	int4			waitarray[1024];	/* circular history of the values seen at each sleep/yield */
	int4			waitarray_size;
	boolean_t		half_time = FALSE;
#	endif
	static uint4		stuck_cnt = 0; /* stuck_cnt signifies the number of times the same process
						* has called gtmstuckexec for the same condition. Being static, it
						* keeps its value across calls to this function.
						*/
	error_def(ERR_COMMITWAITPID);
	error_def(ERR_COMMITWAITSTUCK);

	DEBUG_ONLY(cr_lo = cr_top = NULL;)
	crarray_size = SIZEOF(crarray) / SIZEOF(crarray[0]);
	DEBUG_ONLY(waitarray_size = SIZEOF(waitarray) / SIZEOF(waitarray[0]);)

	assert(!mu_rndwn_file_dbjnl_flush);	/* caller should have avoided calling us if it was mupip rundown */
	csd = csa->hdr;
	/* To avoid unnecessary time spent waiting, we would like to do rel_quants instead of wcs_sleep. But this means
	 * we need to have some other scheme for limiting the total time slept. We use the heartbeat scheme which currently
	 * is available only in Unix. Every 8 seconds or so, the heartbeat timer increments a counter. But there are two
	 * cases where heartbeat_timer will not pop:
	 * (a) if we are in the process of exiting (through a call to cancel_timer(0) which cancels all active timers)
	 * (b) if we are already in timer_handler. This is possible if the flush timer pops and we end up invoking
	 *     wcs_clean_dbsync->wcs_flu->wcs_phase2_commit_wait. Since SIGALRM signals in Unix don't nest we cannot use
	 *     heartbeat scheme in this case as well.
	 * Therefore, if heartbeat timer is available and currently active, then use rel_quants. If not, use wcs_sleep.
	 * We have found that doing rel_quants (instead of sleeps) causes huge CPU usage in Tru64 even if the default spincnt is
	 * set to 0 and ALL processes are only waiting for one process to finish its phase2 commit. Therefore we choose
	 * the sleep approach for Tru64. Choosing a spincnt of 0 would choose the sleep approach (versus rel_quant).
	 */
#	if (defined(UNIX) && !defined(__osf__))
	use_heartbeat = (!process_exiting && csd->wcs_phase2_commit_wait_spincnt && !timer_in_handler);
#	else
	use_heartbeat = FALSE;
#	endif
	/* The half-way point is expressed in the same unit as the timeout check of the chosen scheme
	 * (heartbeat counts if use_heartbeat, sleep-loop iterations otherwise).
	 */
	DEBUG_ONLY(phase2_commit_half_wait = use_heartbeat ? (PHASE2_COMMIT_WAIT_HTBT >> 1) : (PHASE2_COMMIT_WAIT >> 1);)
	if (use_heartbeat)
	{	/* maxspincnt and start_heartbeat are initialized (and later used) only in the heartbeat scheme */
		maxspincnt = csd->wcs_phase2_commit_wait_spincnt;
		assert(maxspincnt);
		if (!maxspincnt)	/* the file header field is read a second time above so, in pro, handle a zero value */
			maxspincnt = WCS_PHASE2_COMMIT_DEFAULT_SPINCNT;
		start_heartbeat = heartbeat_counter;
	}
	assert(dba_bg == csd->acc_meth);
	if (dba_bg != csd->acc_meth)	/* in pro, be safe and return */
		return TRUE;
	cnl = csa->nl;
	was_crit = csa->now_crit;
	assert((NULL != cr) || was_crit);	/* waiting for ALL phase2 commits is expected only while holding crit */
	if (NULL != cr)
	{
		start_in_tend = cr->in_tend;
		/* Normally we should never find ourselves holding the lock on the cache-record we are waiting for. There is
		 * one exception though. And that is if we had encountered an error in the middle of phase1 or phase2 of the
		 * commit and ended up invoking "secshr_db_clnup" to finish the transaction for us. It is possible that we
		 * then proceeded with the next transaction doing a "t_qread" without any process invoking "wcs_recover"
		 * (possible only if they did a "grab_crit") until then. In that case, we could have one or more cache-records
		 * with non-zero value of cr->in_tend identical to our process_id. Since we will fix these cache-records
		 * while grabbing crit (which we have to before doing validation in t_end/tp_tend), it is safe to assume
		 * this block is not being touched for now and return right away. But this exception is possible only if
		 * we don't already hold crit (i.e. called from "t_qread"). In addition, errors in the midst of commit are
		 * possible only if we have enabled white-box testing. Assert accordingly.
		 */
		/* we better not deadlock waiting for ourselves */
		if (!was_crit && (process_id == start_in_tend))
		{
			assert(gtm_white_box_test_case_enabled);
			return TRUE;
		}
		if (process_id == start_in_tend)
			GTMASSERT;	/* should not deadlock on ourselves */
		if (!start_in_tend)	/* cache-record is not in phase2 commit; nothing to wait for */
			return TRUE;
	} else
	{	/* initialize the beginning and the end of cache-records to be used later (only in case of cr == NULL) */
		cr_lo = ((cache_rec_ptr_t)csa->acc_meth.bg.cache_state->cache_array) + csd->bt_buckets;
		cr_top = cr_lo + csd->n_bts;
	}
	/* Spin & sleep/yield alternately for the phase2 commit to complete.
	 * spincnt counts the checks done since the last sleep/yield; lcnt counts the sleeps/yields done so far.
	 * The loop is exited by a "return" once the awaited event is seen or by a "break" on timeout.
	 */
	for (spincnt = 0, lcnt = 0; ; spincnt++)
	{
		SHM_READ_MEMORY_BARRIER; /* read memory barrier done to minimize time spent spinning waiting for value to change */
		if (NULL == cr)
		{
			value = cnl->wcs_phase2_commit_pidcnt;
			if (!value)
				return TRUE;
		} else
		{	/* If we don't hold crit and are sleep looping waiting for cr->in_tend to become 0, it is
			 * theoretically possible (though very remote) that every one of the 1000s of iterations we look
			 * at the cache-record, cr->in_tend is set to the same pid even though the block could have
			 * been updated as part of multiple transactions. But we could have stopped the wait the moment the
			 * same buffer gets updated for the next transaction (even if by the same pid). To recognize that
			 * we note down the current db tn at the start of the wait and check if the block header tn
			 * throughout the wait gets higher than this. If so, we return right away even though cr->in_tend
			 * is non-zero. But since this comparison is done outside of crit it is possible that the block
			 * header tn could be temporarily GREATER than the db tn because of concurrent updates AND because
			 * an update to the 8-byte transaction number is not necessarily atomic AND because the block's tn
			 * that we read could be a mish-mash of low-order and high-order bytes taken from BEFORE and AFTER
			 * an update. Doing less than checks with these bad values is considered risky as a false return
			 * means a GTMASSERT in "t_end" or "tp_tend" in the PIN_CACHE_RECORD macro. Since this situation is
			 * almost an impossibility in practice, we handle this by returning FALSE after timing out and
			 * requiring the caller (t_qread) to restart. Eventually we will get crit (in the final retry) where
			 * we are guaranteed not to end up in this situation.
			 * NOTE(review): no tn comparison is done in the code below; only cr->in_tend is examined.
			 */
			value = cr->in_tend;
			if (value != start_in_tend)
			{	/* If we hold crit, the only change we expect is for cr->in_tend to become zero */
				assert(!was_crit || !value);
				return TRUE;
			}
			if (!was_crit && csd->wc_blocked)
			{	/* Some other process could be doing cache-recovery at this point and if it takes more than
				 * a minute, we will time out for no reason. No point proceeding with this transaction
				 * anyway as we are bound to restart. Do that right away. Caller knows to restart.
				 */
				return FALSE;
			}
		}
		if (use_heartbeat)
		{	/* Re-check the awaited value up to maxspincnt times before yielding the processor */
			if (spincnt < maxspincnt)
				continue;
			assert(spincnt == maxspincnt);
			heartbeat_delta = heartbeat_counter - start_heartbeat;
		}
		/* From here on, we are about to sleep/yield (or time out) */
		spincnt = 0;
		lcnt++;
		DEBUG_ONLY(waitarray[lcnt % waitarray_size] = value;)
		if (NULL != cr)
		{
			if (was_crit)
			{
				BG_TRACE_PRO_ANY(csa, phase2_commit_wait_sleep_in_crit);
			} else
			{
				BG_TRACE_PRO_ANY(csa, phase2_commit_wait_sleep_no_crit);
			}
		} else
		{
			BG_TRACE_PRO_ANY(csa, phase2_commit_wait_pidcnt);
		}
		if (use_heartbeat)
		{	/* Timeout is measured in heartbeat counts elapsed since the start of the wait */
			if (PHASE2_COMMIT_WAIT_HTBT < heartbeat_delta)
				break;
			/* NOTE(review): heartbeat_delta can stay equal to the half-wait value across many iterations
			 * of this loop, in which case half_time is TRUE for each of them - confirm this is intended.
			 */
			DEBUG_ONLY(half_time = (phase2_commit_half_wait == heartbeat_delta));
			rel_quant();
		} else
		{	/* Timeout is measured in number of sleeps done so far */
			if (lcnt >= PHASE2_COMMIT_WAIT)
				break;
			DEBUG_ONLY(half_time = (phase2_commit_half_wait == lcnt));
			wcs_sleep(PHASE2_COMMIT_SLEEP);
		}
#		ifdef DEBUG
		if (half_time)
		{	/* Half of the wait is over. Try to get the C-stack of whoever is still in phase2 commit */
			if (NULL != cr)
			{
				blocking_pid = cr->in_tend; /* Get a more recent value */
				GET_STACK_AT_HALF_WAIT_IF_NEEDED(blocking_pid, stuck_cnt);
			} else
			{
				assert((NULL != cr_lo) && (cr_lo < cr_top));
				for (curcr = cr_lo; curcr < cr_top; curcr++)
				{
					blocking_pid = curcr->in_tend;
					GET_STACK_AT_HALF_WAIT_IF_NEEDED(blocking_pid, stuck_cnt);
				}
			}
		}
#		endif

	}
	/* We reach here only if we timed out. Report the blocking pids before returning FALSE */
	if (NULL == cr)
	{	/* This is the case where we wait for all the phase2 commits to complete. Note down the cache records that
		 * are still not done with the commits. Since there can be multiple cache records held by the same PID, note
		 * down one cache record for each representative PID. We don't expect the list of distinct PIDs to be large.
		 * In any case, note down only as many as we can
		 */
		crarray_index = 0;
		for (curcr = cr_lo; curcr < cr_top;  curcr++)
		{
			blocking_pid = curcr->in_tend;
			/* In rare cases, wcs_phase2_commit_wait could be invoked from bg_update_phase1 (via bt_put->wcs_get_space)
			 * when bg_update_phase1 has already pinned a few cache records (with our PID). We don't want to note down
			 * such cache records and hence the (blocking_pid != process_id) check below
			 */
			if (blocking_pid && (blocking_pid != process_id))
			{
				/* go through the book-keeping array to see if we have already noted down this PID. We don't
				 * expect many processes to be in the phase2 commit section concurrently. So, in most cases,
				 * we won't scan the array more than once
				 */
				for (index = 0; index < crarray_index; ++index)
					if (crarray[index].blocking_pid == blocking_pid)
						break;
				if (index == crarray_index)
				{	/* cache-record with distinct PID */
					assert(crarray_size >= crarray_index);
					if (crarray_size <= crarray_index)	/* array is full; stop noting down any more */
						break;
					crarray[crarray_index].blocking_pid = blocking_pid;
					crarray[crarray_index].cr = curcr;
					crarray_index++;
				}
			}
		}
		/* Issue COMMITWAITPID and get c-stack trace (if possible) for all the distinct PID noted down above */
		for (index = 0; index < crarray_index; index++)
		{	/* It is possible that cr->in_tend changed since the time we added it to the crarray array.
			 * Account for this by rechecking.
			 */
			curcr = crarray[index].cr;
			blocking_pid = curcr->in_tend;
			SEND_COMMITWAITPID_GET_STACK_IF_NEEDED(blocking_pid, stuck_cnt, curcr, csa);
		}
	} else
	{	/* This is the case where we wait for a particular cache-record. Take the c-stack of the PID that is still
		 * holding this cr
		 */
		blocking_pid = cr->in_tend;
		SEND_COMMITWAITPID_GET_STACK_IF_NEEDED(blocking_pid, stuck_cnt, cr, csa);
	}
	DEBUG_ONLY(incrit_pid = cnl->in_crit;)	/* noted down but not otherwise used in this function */
	send_msg(VARLSTCNT(7) ERR_COMMITWAITSTUCK, 5, process_id, 1, cnl->wcs_phase2_commit_pidcnt, DB_LEN_STR(csa->region));
	BG_TRACE_PRO_ANY(csa, wcb_phase2_commit_wait);
	/* If called from wcs_recover(), we don't want to assert(FALSE) as it is possible (in case of STOP/IDs) that
	 * cnl->wcs_phase2_commit_pidcnt is non-zero even though there is no process in phase2 of commit. In this case
	 * wcs_recover will call wcs_verify which will clear the flag unconditionally and proceed with normal activity.
	 * So should not assert. If the caller is wcs_recover, then we expect csd->wc_blocked to be non-zero. Assert that.
	 */
	assert(csd->wc_blocked);
	return FALSE;
}