fis-gtm/sr_unix/wcs_get_space.c

/****************************************************************
 *								*
 *	Copyright 2007, 2011 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include "gtm_facility.h"
#include "gdsroot.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "interlock.h"
#include "jnl.h"
#include "sleep_cnt.h"
#include "gdsbgtr.h"
#include "wbox_test_init.h"

/* Include prototypes */
#include "send_msg.h"
#include "wcs_get_space.h"
#include "gtmmsg.h"
#include "gt_timer.h"
#include "wcs_sleep.h"
#include "relqop.h"
#include "error.h"		/* for gtm_fork_n_core() prototype */
#include "rel_quant.h"
#include "performcaslatchcheck.h"
#include "wcs_phase2_commit_wait.h"
#include "wcs_recover.h"
#include "gtm_c_stack_trace.h"

GBLDEF	cache_rec_ptr_t		get_space_fail_cr;	/* gbldefed to be accessible in a pro core */
GBLDEF	wcs_conflict_trace_t	*get_space_fail_array;	/* gbldefed to be accessilbe in a pro core */
GBLDEF	int4			get_space_fail_arridx;	/* gbldefed to be accessilbe in a pro core */

GBLREF	sgmnt_addrs		*cs_addrs;
GBLREF	sgmnt_data_ptr_t	cs_data;
GBLREF	gd_region		*gv_cur_region;	/* needed for the JNL_ENSURE_OPEN_WCS_WTSTART macro */
GBLREF	int			num_additional_processors;
GBLREF	uint4			process_id;
GBLREF	volatile int4		fast_lock_count;

error_def(ERR_DBFILERR);
error_def(ERR_WAITDSKSPACE);
error_def(ERR_GBLOFLOW);

#define	WCS_CONFLICT_TRACE_ARRAYSIZE	64
#define	LCNT_INTERVAL			DIVIDE_ROUND_UP(UNIX_GETSPACEWAIT, WCS_CONFLICT_TRACE_ARRAYSIZE)

#define WCS_GET_SPACE_RETURN_FAIL(TRACEARRAY, CR)					\
{											\
	assert(FALSE);			/* We have failed */				\
	get_space_fail_cr = CR;								\
	get_space_fail_array = TRACEARRAY;						\
	if (TREF(gtm_environment_init))							\
		gtm_fork_n_core();	/* take a snapshot in case running in-house */	\
	return FALSE;									\
}

#define GET_IO_LATCH_PID(CSA)		(CSA->jnl ? CSA->jnl->jnl_buff->io_in_prog_latch.u.parts.latch_pid : -1)
#define GET_FSYNC_LATCH_PID(CSA)	(CSA->jnl ? CSA->jnl->jnl_buff->fsync_in_prog_latch.u.parts.latch_pid : -1)

#define INVOKE_C_STACK_APPROPRIATE(CR, CSA, STUCK_CNT)										\
{																\
	int4	io_latch_pid, fsync_latch_pid;											\
																\
	if (CR->epid)														\
	{															\
		GET_C_STACK_FROM_SCRIPT("WCS_GET_SPACE_RETURN_FAIL_CR", process_id, CR->epid, STUCK_CNT);			\
	}															\
	if (0 < (io_latch_pid = GET_IO_LATCH_PID(CSA)))										\
	{															\
		GET_C_STACK_FROM_SCRIPT("WCS_GET_SPACE_RETURN_FAIL_IO_PROG", process_id, io_latch_pid, STUCK_CNT);		\
	}															\
	if (0 < (fsync_latch_pid = GET_FSYNC_LATCH_PID(CSA)))									\
	{															\
		GET_C_STACK_FROM_SCRIPT("WCS_GET_SPACE_RETURN_FAIL_FSYNC_PROG", process_id, fsync_latch_pid, STUCK_CNT);	\
	} 															\
}																\

/* go after a specific number of buffers or a particular buffer */
/* not called if UNTARGETED_MSYNC and MM mode */
bool	wcs_get_space(gd_region *reg, int needed, cache_rec_ptr_t cr)
{
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	node_local_ptr_t        cnl;
	cache_que_head_ptr_t	q0, base;
	int4			n, save_errno = 0, k, i, dummy_errno, max_count, count;
	int			maxspins, retries, spins;
	uint4			lcnt, size, to_wait, to_msg, this_idx;
	wcs_conflict_trace_t	wcs_conflict_trace[WCS_CONFLICT_TRACE_ARRAYSIZE];
	boolean_t		is_mm;
	cache_rec		cr_contents;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert((0 != needed) || (NULL != cr));
	get_space_fail_arridx = 0;
	csa = &FILE_INFO(reg)->s_addrs;
	csd = csa->hdr;
	cnl = csa->nl;
	is_mm = (dba_mm == csd->acc_meth);
	assert(is_mm || (dba_bg == csd->acc_meth));
	if (FALSE == csa->now_crit)
	{
		assert(0 != needed);	/* if needed == 0, then we should be in crit */
		for (lcnt = DIVIDE_ROUND_UP(needed, csd->n_wrt_per_flu);  0 < lcnt;  lcnt--)
			JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, 0, dummy_errno);
					/* a macro that ensure jnl is open, invokes wcs_wtstart() and checks for errors etc. */
		return TRUE;
	}
	UNTARGETED_MSYNC_ONLY(assert(!is_mm);)
	csd->flush_trigger = MAX(csd->flush_trigger - MAX(csd->flush_trigger / STEP_FACTOR, 1), MIN_FLUSH_TRIGGER(csd->n_bts));
	/* Routine actually serves two purposes:
	 *	1 - Free up required number of buffers or
	 *	2 - Free up a specific buffer
	 * Do a different kind of loop depending on which is our current calling.
	 */
	if (0 != needed)
	{
		BG_TRACE_ANY(csa, bufct_buffer_flush);
		for (lcnt = 1; (cnl->wc_in_free < needed) && (BUF_OWNER_STUCK > lcnt); ++lcnt)
		{
			JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, needed, save_errno);
			if (is_mm && (ERR_GBLOFLOW == save_errno))
				wcs_recover(reg);
			if (cnl->wc_in_free < needed)
			{
				if ((ENOSPC == save_errno) && (csa->hdr->wait_disk_space > 0))
				{
					/* not enough disk space to flush the buffers to regain them
					 * so wait for it to become available,
					 * and if it takes too long, just
					 * quit. Unfortunately, quitting would
					 * invoke the recovery logic which
					 * should be of no help to this
					 * situation. Then what?
					 */
					lcnt = BUF_OWNER_STUCK;
					to_wait = cs_data->wait_disk_space;
					to_msg = (to_wait / 8) ? (to_wait / 8) : 1; /* output error message around 8 times */
					while ((0 < to_wait) && (ENOSPC == save_errno))
					{
						if ((to_wait == cs_data->wait_disk_space)
							|| (0 == to_wait % to_msg))
						{
							send_msg(VARLSTCNT(7) ERR_WAITDSKSPACE, 4,
								process_id, to_wait, DB_LEN_STR(reg), save_errno);
							gtm_putmsg(VARLSTCNT(7) ERR_WAITDSKSPACE, 4,
								process_id, to_wait, DB_LEN_STR(reg), save_errno);
						}
						hiber_start(1000);
						to_wait--;
						JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, needed, save_errno);
						if (is_mm && (ERR_GBLOFLOW == save_errno))
							wcs_recover(reg);
						if (cnl->wc_in_free >= needed)
							break;
					}
				}
				wcs_sleep(lcnt);
			} else
				return TRUE;
			BG_TRACE_ANY(csa, bufct_buffer_flush_loop);
		}
		if (cnl->wc_in_free >= needed)
			return TRUE;
	} else
	{	/* Wait for a specific buffer to be flushed. We attempt to speed this along by shuffling the entry
		 * we want to the front of the queue before we call routines to do some writing.
		 * Formerly we used to wait for this buffer to be flushed irrespective of its position in the active queue.
		 * We keep this code commented just in case this needs to be resurrected in the future.
		 */
#		ifdef old_code
		BG_TRACE_ANY(csa, spcfc_buffer_flush);
		for (lcnt = 1; (0 != cr->dirty) && (BUF_OWNER_STUCK > lcnt); ++lcnt)
		{
			for (; 0 != cr->dirty && 0 != csa->acc_meth.bg.cache_state->cacheq_active.fl;)
				JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, 0, save_errno);
			if (0 != cr->dirty)
				wcs_sleep(lcnt);
			else
				return TRUE;
			BG_TRACE_ANY(csa, spcfc_buffer_flush_loop);
		}
		if (0 == cr->dirty)
			return TRUE;
#		endif
		assert(csa->now_crit);		/* must be crit to play with queues when not the writer */
		BG_TRACE_PRO_ANY(csa, spcfc_buffer_flush);
		++fast_lock_count;			/* Disable wcs_stale for duration */
		if (!is_mm)		/* Determine queue base to use */
		{
			base = &csa->acc_meth.bg.cache_state->cacheq_active;
			/* If another process is concurrently finishing up phase2 of commit, wait for that to complete first. */
			if (cr->in_tend && !wcs_phase2_commit_wait(csa, cr))
				return FALSE;	/* assumption is that caller will set wc_blocked and trigger cache recovery */
		} else
			base = &csa->acc_meth.mm.mmblk_state->mmblkq_active;
		maxspins = num_additional_processors ? MAX_LOCK_SPINS(LOCK_SPINS, num_additional_processors) : 1;
		for (retries = LOCK_TRIES - 1; retries > 0 ; retries--)
		{
			for (spins = maxspins; spins > 0 ; spins--)
			{
				if (GET_SWAPLOCK(&base->latch)) /* Lock queue to prevent interference */
				{
					if (0 != cr->state_que.fl)
					{	/* If it is still in the active queue, then insert it at the head of the queue */
						csa->wbuf_dqd++;
						q0 = (cache_que_head_ptr_t)((sm_uc_ptr_t)&cr->state_que + cr->state_que.fl);
						shuffqth((que_ent_ptr_t)q0, (que_ent_ptr_t)base);
						csa->wbuf_dqd--;
						VERIFY_QUEUE(base);
					}
					/* release the queue header lock so that the writers can proceed */
					RELEASE_SWAPLOCK(&base->latch);
					--fast_lock_count;
					assert(0 <= fast_lock_count);
					/* Fire off a writer to write it out. Another writer may grab our cache
					 * record so we have to be willing to wait for him to flush it.
					 * Flush this one buffer the first time through.
					 * If this didn't work, flush normal amount next time in the loop.
					 */
					JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, 1, save_errno);
					if (is_mm && (ERR_GBLOFLOW == save_errno))
						wcs_recover(reg);
					for (lcnt = 1; (0 != cr->dirty) && (UNIX_GETSPACEWAIT > lcnt); ++lcnt)
					{
						if (0 == (lcnt % LCNT_INTERVAL))
						{
							this_idx = (lcnt / LCNT_INTERVAL);
							assert(this_idx < WCS_CONFLICT_TRACE_ARRAYSIZE);
							wcs_conflict_trace[this_idx].wcs_active_lvl = cnl->wcs_active_lvl;
							wcs_conflict_trace[this_idx].io_in_prog_pid = GET_IO_LATCH_PID(csa);
							wcs_conflict_trace[this_idx].fsync_in_prog_pid = GET_FSYNC_LATCH_PID(csa);
						}
						get_space_fail_arridx = lcnt;
						max_count = ROUND_UP(cnl->wcs_active_lvl, csd->n_wrt_per_flu);
						/* Check if cache recovery is needed (could be set by another process in
						 * secshr_db_clnup finishing off a phase2 commit). If so, no point invoking
						 * wcs_wtstart as it will return right away. Instead return FALSE so
						 * cache-recovery can be triggered by the caller.
						 */
						if (csd->wc_blocked)
						{
							assert(gtm_white_box_test_case_enabled);
							return FALSE;
						}
						/* loop till the active queue is exhausted */
						for (count = 0; 0 != cr->dirty && 0 != cnl->wcs_active_lvl &&
							     max_count > count; count++)
						{
							BG_TRACE_PRO_ANY(csa, spcfc_buffer_flush_retries);
							JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, 0, save_errno);
							if (is_mm && (ERR_GBLOFLOW == save_errno))
								wcs_recover(reg);
						}
						/* Usually we want to sleep only if we need to wait on someone else
						 * i.e. (i) if we are waiting for another process' fsync to complete
						 *		We have seen jnl_fsync() to take more than a minute.
						 *		Hence we wait for a max. of 2 mins (UNIX_GETSPACEWAIT).
						 *     (ii) if some concurrent writer has taken this cache-record out.
						 *    (iii) if someone else is holding the io_in_prog lock.
						 * Right now we know of only one case where there is no point in waiting
						 *   which is if the cache-record is out of the active queue and is dirty.
						 * But since that is quite rare and we don't lose much in that case by
						 *   sleeping we do an unconditional sleep (only if cr is dirty).
						 */
						if (!cr->dirty)
							return TRUE;
						else
						{
							DEBUG_ONLY(cr_contents = *cr;)
							/* Assert that if the cache-record is dirty, it better be in the
							 * active queue or be in the process of getting flushed by a concurrent
							 * writer or phase2 of the commit is in progress. If none of this is
							 * true, it should have become non-dirty by now even though we found it
							 * dirty a few lines above. Note that the cache-record could be in the
							 * process of being released by a concurrent writer; This is done by
							 * resetting 3 fields cr->epid, cr->dirty, cr->interlock; Since the write
							 * interlock is the last field to be released, check that BEFORE dirty.
							 */
							assert(cr_contents.state_que.fl || cr_contents.epid || cnl->in_wtstart
								|| cr_contents.in_tend
								|| (LATCH_CLEAR != WRITE_LATCH_VAL(&cr_contents))
								|| !cr_contents.dirty);
							wcs_sleep(lcnt);
						}
						BG_TRACE_PRO_ANY(csa, spcfc_buffer_flush_loop);
					}
					if (0 == cr->dirty)
						return TRUE;
					INVOKE_C_STACK_APPROPRIATE(cr, csa, 1);
					WCS_GET_SPACE_RETURN_FAIL(wcs_conflict_trace, cr);
				} else
				{	/* buffer was locked */
					if (0 == cr->dirty)
					{
						BG_TRACE_ANY(csa, spcfc_buffer_flushed_during_lockwait);
						--fast_lock_count;
						assert(0 <= fast_lock_count);
						return TRUE;
					}
				}
			}
			if (retries & 0x3)	/* On all but every 4th pass, do a simple rel_quant */
				rel_quant();	/* Release processor to holder of lock (hopefully) */
			else
			{	/* On every 4th pass, we bide for awhile */
				wcs_sleep(LOCK_SLEEP);
				/* If near end of loop, see if target is dead and/or wake it up */
				if (RETRY_CASLATCH_CUTOFF == retries)
					performCASLatchCheck(&base->latch, TRUE);
			}
		}
		--fast_lock_count;
		assert(0 <= fast_lock_count);
		if (0 == cr->dirty)
			return TRUE;
	}
	if (ENOSPC == save_errno)
		rts_error(VARLSTCNT(7) ERR_WAITDSKSPACE, 4, process_id, to_wait, DB_LEN_STR(reg), save_errno);
	else
		assert(FALSE);
	INVOKE_C_STACK_APPROPRIATE(cr, csa, 2);
	WCS_GET_SPACE_RETURN_FAIL(wcs_conflict_trace, cr);
}