/****************************************************************
 *                                                              *
 *      Copyright 2001, 2012 Fidelity Information Services, Inc *
 *                                                              *
 *      This source code contains the intellectual property    *
 *      of its copyright holder(s), and is made available      *
 *      under a license.  If you do not know the terms of      *
 *      the license, please stop and do not read further.      *
 *                                                              *
 ****************************************************************/

#include "mdef.h"
|
|
|
|
#include <signal.h> /* needed for VSIG_ATOMIC_T */
|
|
|
|
#include "gdsroot.h"
|
|
#include "gdsblk.h"
|
|
#include "gtm_facility.h"
|
|
#include "fileinfo.h"
|
|
#include "gdsbt.h"
|
|
#include "gdsfhead.h"
|
|
#include "gdskill.h"
|
|
#include "gdscc.h"
|
|
#include "filestruct.h"
|
|
#include "interlock.h"
|
|
#include "jnl.h"
|
|
#include "buddy_list.h" /* needed for tp.h */
|
|
#include "hashtab.h" /* needed for cws_insert.h */
|
|
#include "hashtab_int4.h" /* needed for tp.h and cws_insert.h */
|
|
#include "tp.h"
|
|
#include "gdsbgtr.h"
|
|
#include "min_max.h"
|
|
#include "sleep_cnt.h"
|
|
#include "send_msg.h"
|
|
#include "relqop.h"
|
|
#include "is_proc_alive.h"
|
|
#include "cache.h"
|
|
#include "longset.h" /* needed for cws_insert.h */
|
|
#include "cws_insert.h"
|
|
#include "wcs_sleep.h"
|
|
#include "wcs_get_space.h"
|
|
#include "wcs_timer_start.h"
|
|
#include "add_inter.h"
|
|
#include "wbox_test_init.h"
|
|
#include "have_crit.h"
|
|
#include "memcoherency.h"
|
|
#include "gtm_c_stack_trace.h"

GBLREF	sgmnt_addrs	*cs_addrs;
GBLREF	gd_region	*gv_cur_region;
GBLREF	uint4		process_id;
GBLREF	uint4		image_count;
GBLREF	unsigned int	t_tries;
GBLREF	uint4		dollar_tlevel;
GBLREF	sgm_info	*sgm_info_ptr;
GBLREF	boolean_t	mu_reorg_process;
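
/* TRACE_AND_SLEEP: sleep helper used while waiting for a concurrent reader to finish with a
 * cache record. On the first wait (1 == ocnt) it counts the wait in the database statistics
 * (BG_TRACE_PRO) and remembers the pid of the reader we first saw in first_r_epid, which the
 * stuck-owner handling below compares against the latest reader pid.
 */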
#define TRACE_AND_SLEEP(ocnt)				\
{							\
	if (1 == ocnt)					\
	{						\
		BG_TRACE_PRO(db_csh_getn_rip_wait);	\
		first_r_epid = latest_r_epid;		\
	}						\
	wcs_sleep(ocnt);				\
}

error_def(ERR_BUFRDTIMEOUT);
error_def(ERR_INVALIDRIP);
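
/* db_csh_getn: return a global buffer cache record that can be (re)used for "block".
 * Starting just past the least-recently reused record (csa->nl->cur_lru_cache_rec_off), scan the
 * cache up to three times, relaxing the reuse criteria on each pass (see pass1/pass2/pass3 below).
 * On success, the record is returned with ownership established (cr->r_epid = process_id) and
 * read_in_progress locked; the caller is expected to read the block into the buffer and then
 * release the lock. If no record can be obtained, (cache_rec_ptr_t)CR_NOTVALID is returned and
 * the caller must treat the cache as blocked and force a cache recovery. Callers must hold crit
 * on the region (asserted below).
 *
 * Illustrative calling pattern (a hedged sketch only; see "t_qread" for the real usage):
 *
 *	cr = db_csh_getn(blk);
 *	if ((cache_rec_ptr_t)CR_NOTVALID == cr)
 *		... set wc_blocked and restart ...
 *	else
 *		... read block "blk" from disk into the buffer, then clear read_in_progress ...
 */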
cache_rec_ptr_t	db_csh_getn(block_id block)
{
	cache_rec_ptr_t		hdr, q0, start_cr, cr;
	bt_rec_ptr_t		bt;
	unsigned int		lcnt, ocnt;
	int			rip, max_ent, pass1, pass2, pass3;
	int4			flsh_trigger;
	uint4			first_r_epid, latest_r_epid;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	srch_blk_status		*tp_srch_status;
	ht_ent_int4		*tabent;

	csa = cs_addrs;
	csd = csa->hdr;
	assert(csa->now_crit);
	assert(csa == &FILE_INFO(gv_cur_region)->s_addrs);
	max_ent = csd->n_bts;
	cr = (cache_rec_ptr_t)GDS_REL2ABS(csa->nl->cur_lru_cache_rec_off);
	hdr = csa->acc_meth.bg.cache_state->cache_array + (block % csd->bt_buckets);
	start_cr = csa->acc_meth.bg.cache_state->cache_array + csd->bt_buckets;
	pass1 = max_ent;	/* skip referred or dirty or read-into cache records */
	pass2 = 2 * max_ent;	/* skip referred cache records */
	pass3 = 3 * max_ent;	/* skip nothing */
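	/* Worked example: with csd->n_bts = 1000, lcnt 0..999 is the first (pickiest) pass,
	 * 1000..1999 the second, and 2000..2999 the third (least picky); once lcnt exceeds
	 * 3 * max_ent, the loop below gives up and falls through to the force-a-recover path.
	 */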
	INCR_DB_CSH_COUNTER(csa, n_db_csh_getns, 1);
	DEFER_INTERRUPTS(INTRPT_IN_DB_CSH_GETN);
	for (lcnt = 0; ; lcnt++)
	{
		if (lcnt > pass3)
		{
			BG_TRACE_PRO(wc_blocked_db_csh_getn_loopexceed);
			assert(FALSE);
			break;
		}
		cr++;
		if (cr == start_cr + max_ent)
			cr = start_cr;
		VMS_ONLY(
			if ((lcnt == pass1) || (lcnt == pass2))
				wcs_wtfini(gv_cur_region);
		)
		if (cr->refer && (lcnt < pass2))
		{	/* in passes 1 & 2, set refer to FALSE and skip; in the third pass attempt reuse even if TRUE == refer */
			cr->refer = FALSE;
			continue;
		}
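		/* The refer bit gives this scan second-chance (clock) semantics: a record referenced
		 * since the last sweep survives one more trip around the cache before it can be reused.
		 */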
		if (cr->in_cw_set || cr->in_tend)
		{	/* some process already has this pinned for reading and/or updating - skip it */
			cr->refer = TRUE;
			continue;
		}
		if (CDB_STAGNATE <= t_tries || mu_reorg_process)
		{
			/* Prevent stepping on self when holding crit for the entire transaction.
			 * This is done by looking in sgm_info_ptr->blks_in_use and cw_stagnate for the presence of the block.
			 * The following two hashtable lookups are not redundant, since in TP, sgm_info_ptr->blks_in_use
			 * is updated to the latest cw_stagnate list of blocks only in "tp_hist".
			 * Also note that the lookup in sgm_info_ptr->blks_in_use reuses blocks that don't have cse's.
			 * This is to allow big-read TP transactions, which may use up more than the available global buffers.
			 * There is one issue here in that a block that has only been read till now may be stepped upon here
			 * but may later be needed for update. It is handled by updating the block's corresponding
			 * entry in the set of histories (the sgm_info_ptr->first_tp_hist[index] structure) to hold the
			 * "cr" and "cycle" of the t_qread done for the block when it was intended to be changed for the
			 * first time within the transaction, since otherwise the transaction would restart due to a
			 * cdb_sc_lostcr status. Note that "tn" (read_tn of the block) in the first_tp_hist will still
			 * remain the "tn" when the block was first read within this transaction, to ensure the block
			 * hasn't been modified since the start of the transaction. Once we intend on changing the
			 * block, i.e. srch_blk_status->cse is non-NULL, we ensure in the code below not to step on it.
			 * ["tp_hist" is the routine that updates the "cr", "cycle" and "tn" of the block.]
			 * Note that usually in a transaction the first_tp_hist[] structure holds the "cr", "cycle", and "tn"
			 * of the first t_qread of the block within that transaction. The above is the only exception.
			 * Also note that for blocks in cw_stagnate (i.e. the current TP mini-action), we don't reuse any of
			 * them even if they don't have a cse. This is to ensure that the current action doesn't
			 * encounter a restart due to cdb_sc_lostcr in "tp_hist", even in the fourth retry.
			 */
			tp_srch_status = NULL;
			if (dollar_tlevel && (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&cr->blk)))
				&& (tp_srch_status = (srch_blk_status *)tabent->value) && (tp_srch_status->cse))
			{	/* this process is already using the block - skip it */
				cr->refer = TRUE;
				continue;
			}
			if (NULL != lookup_hashtab_int4(&cw_stagnate, (uint4 *)&cr->blk))
			{	/* this process is already using the block for the current gvcst_search - skip it */
				cr->refer = TRUE;
				continue;
			}
			if (NULL != tp_srch_status)
			{	/* About to reuse a buffer that is part of the read-set of the current TP transaction.
				 * Reset the clue, as otherwise the next global reference of that global would use an
				 * out-of-date clue. Even though tp_srch_status is available right after the
				 * sgm_info_ptr->blks_in_use hashtable check, we don't want to reset the clue in case the
				 * cw_stagnate hashtable check causes the same cr to be skipped from reuse. Hence the
				 * placement of this reset logic AFTER the cw_stagnate check.
				 */
				tp_srch_status->blk_target->clue.end = 0;
			}
		}
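		/* Example of the big-read case from the comment above: a TP transaction that reads more
		 * blocks than there are global buffers can still proceed, because buffers whose blocks
		 * were only read (no cse) remain eligible for reuse here; only the clue is invalidated,
		 * so the next reference of that global re-searches from the root instead of restarting.
		 */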
		if (cr->dirty)
		{	/* Note that in Unix, it is possible that we see a stale value of cr->dirty (possible if a
			 * concurrent "wcs_wtstart" has reset dirty to 0 but that update did not reach us yet). In this
			 * case the call to "wcs_get_space" below will do the necessary memory barrier instructions
			 * (through calls to "aswp") which will allow us to see the non-stale value of cr->dirty.
			 *
			 * It is also possible that cr->dirty is non-zero but < cr->flushed_dirty_tn. In this case, the
			 * wcs_get_space done below will return FALSE, forcing a cache-rebuild which will fix this situation.
			 *
			 * In VMS, another process cannot be concurrently resetting cr->dirty to 0 as the resetting routine
			 * is "wcs_wtfini", which is executed in crit, which another process cannot be in as we are in crit now.
			 */
			if (gv_cur_region->read_only)
				continue;
			if (lcnt < pass1)
			{
				if (!csa->timer && (csa->nl->wcs_timers < 1))
					wcs_timer_start(gv_cur_region, FALSE);
				continue;
			}
			BG_TRACE_PRO(db_csh_getn_flush_dirty);
			if (FALSE == wcs_get_space(gv_cur_region, 0, cr))
			{	/* failed to flush it out - force a rebuild */
				BG_TRACE_PRO(wc_blocked_db_csh_getn_wcsstarvewrt);
				assert(csa->nl->wc_blocked);	/* only reason we currently know why wcs_get_space could fail */
				assert(gtm_white_box_test_case_enabled);
				break;
			}
			assert(0 == cr->dirty);
		}
		UNIX_ONLY(
			/* The cache-record is not free for reuse until the write-latch value becomes LATCH_CLEAR.
			 * In VMS, resetting the write-latch value occurs in "wcs_wtfini", which runs in crit, so we are fine.
			 * In Unix, this resetting is done by "wcs_wtstart", which is out-of-crit. Therefore, we need to
			 * wait for this value to become LATCH_CLEAR before reusing this cache-record.
			 * Note that we are examining the write-latch value without holding the interlock. It is ok to do
			 * this because the only two routines that modify the latch value are "bg_update" and
			 * "wcs_wtstart". The former cannot be concurrently executing because we are in crit.
			 * The latter will not update the latch value unless this cache-record is dirty. But in that
			 * case we would most likely have gone through the if (cr->dirty) check above. Most likely,
			 * because there is one rare possibility where a concurrent "wcs_wtstart" has set cr->dirty
			 * to 0 but not yet cleared the latch. In that case we wait for the latch to be cleared.
			 * In all other cases, nobody has been modifying the latch since we got crit, and therefore
			 * it is safe to observe the value of the latch without holding the interlock.
			 */
			if (LATCH_CLEAR != WRITE_LATCH_VAL(cr))
			{	/* possible if a concurrent "wcs_wtstart" has set cr->dirty to 0 but not yet
				 * cleared the latch. This should be very rare though.
				 */
				if (lcnt < pass2)
					continue;	/* try to find some other cache-record to reuse until the 3rd pass */
				for (ocnt = 1; (MAXWRTLATCHWAIT >= ocnt) && (LATCH_CLEAR != WRITE_LATCH_VAL(cr)); ocnt++)
					wcs_sleep(SLEEP_WRTLATCHWAIT);	/* since it is a short lock, sleep the minimum */
				if (MAXWRTLATCHWAIT <= ocnt)
				{
					BG_TRACE_PRO(db_csh_getn_wrt_latch_stuck);
					assert(FALSE);
					continue;
				}
			}
		)
		/* Note that before setting up a buffer for the requested block, we should make sure the cache-record's
		 * read_in_progress is set. This is so that no one else in t_qread gets access to this empty buffer.
		 * Setting up a buffer means assigning cr->blk in addition to inserting the cr in the blkques
		 * through "shuffqth" below.
		 * Note that "t_qread" has special code to handle read_in_progress.
		 */
		LOCK_BUFF_FOR_READ(cr, rip);
		if (0 != rip)
		{
			if (lcnt < pass2)
			{	/* someone is reading into this cache record. leave it for two passes.
				 * this is because if somebody is reading it, it is most likely to be referred to very soon.
				 * if we replace this, we will definitely be causing a restart for the reader.
				 * instead of that, see if some other cache record fits in for us.
				 */
				RELEASE_BUFF_READ_LOCK(cr);
				continue;
			}
			for (ocnt = 1; 0 != rip && BUF_OWNER_STUCK >= ocnt; ocnt++)
			{
				RELEASE_BUFF_READ_LOCK(cr);
				/* The owner has been unable to complete the read - check a few things before going to sleep.
				 * Since cr->r_epid can be changing concurrently, take a local copy before using it below,
				 * particularly before calling is_proc_alive, as we don't want to call it with a 0 r_epid.
				 */
				latest_r_epid = cr->r_epid;
				if (cr->read_in_progress < -1)
				{
					BG_TRACE_PRO(db_csh_getn_out_of_design);	/* outside of design; clear to known state */
					send_msg(VARLSTCNT(4) ERR_INVALIDRIP, 2, DB_LEN_STR(gv_cur_region));
					assert(cr->r_epid == 0);
					cr->r_epid = 0;
					INTERLOCK_INIT(cr);
				} else if (0 != latest_r_epid)
				{
					if (is_proc_alive(latest_r_epid, cr->image_count))
					{
#						ifdef DEBUG
						if ((BUF_OWNER_STUCK / 2) == ocnt)
							GET_C_STACK_FROM_SCRIPT("BUFRDTIMEOUT", process_id, latest_r_epid, ONCE);
#						endif
						TRACE_AND_SLEEP(ocnt);
					} else
					{
						cr->r_epid = 0;
						INTERLOCK_INIT(cr);	/* process gone - release that process's lock */
					}
				} else
				{
					TRACE_AND_SLEEP(ocnt);
				}
				LOCK_BUFF_FOR_READ(cr, rip);
			}
			if ((BUF_OWNER_STUCK < ocnt) && (0 != rip))
			{
				BG_TRACE_PRO(db_csh_getn_buf_owner_stuck);
				if (0 != latest_r_epid)
				{
					if (first_r_epid != latest_r_epid)
						GTMASSERT;
					GET_C_STACK_FROM_SCRIPT("BUFRDTIMEOUT", process_id, latest_r_epid,
						DEBUG_ONLY(TWICE) PRO_ONLY(ONCE));
					RELEASE_BUFF_READ_LOCK(cr);
					send_msg(VARLSTCNT(8) ERR_BUFRDTIMEOUT, 6, process_id,
						cr->blk, cr, first_r_epid, DB_LEN_STR(gv_cur_region));
					continue;
				}
				cr->r_epid = 0;
				INTERLOCK_INIT(cr);
				LOCK_BUFF_FOR_READ(cr, rip);
				assert(0 == rip);	/* since we hold crit, we expect to get the lock */
				if (0 != rip)
					continue;
				/* We successfully obtained the lock, so we can fall out of this block */
			}
		}
		assert(0 == rip);
		/* no other process "owns" the block */
		if (CDB_STAGNATE <= t_tries || mu_reorg_process)
		{	/* this should probably use cr->in_cw_set with a condition handler to cleanup */
			CWS_INSERT(block);
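			/* "block" is now in the cw_stagnate hashtable, so the reuse checks above keep
			 * any later db_csh_getn in this final retry from stealing the buffer we are
			 * about to set up for it.
			 */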
		}
		assert(LATCH_CLEAR == WRITE_LATCH_VAL(cr));
		/* got a block - set it up */
		assert(0 == cr->epid);
		assert(0 == cr->r_epid);
		cr->r_epid = process_id;	/* establish ownership */
		cr->image_count = image_count;
		cr->blk = block;
		/* We want cr->read_in_progress to be locked BEFORE cr->cycle is incremented. t_qread relies on this order.
		 * Enforce this order with a write memory barrier. Not doing so might cause the incremented cr->cycle to be
		 * seen by another process even though it sees the unlocked state of cr->read_in_progress. This could cause
		 * t_qread to incorrectly return with an up-to-date cr->cycle even though the buffer is still being read in
		 * from disk, and this could cause db integ errors, as validation (in t_end/tp_tend, which relies on
		 * cr->cycle) would detect no problems even though there is one. Note this memory barrier is still needed
		 * even though there is a memory barrier connotation in the LOCK_BUFF_FOR_READ() macro above:
		 * LOCK_BUFF_FOR_READ() does a read-type memory barrier, whereas here we need a write barrier.
		 */
		SHM_WRITE_MEMORY_BARRIER;
		cr->cycle++;
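		/* Illustration of the race the barrier prevents: without it, a concurrent t_qread could
		 * observe the incremented cr->cycle while still seeing read_in_progress as unlocked,
		 * treat the buffer as valid and current, and later pass validation in t_end/tp_tend even
		 * though the disk read into this buffer has not completed.
		 */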
		cr->jnl_addr = 0;
		cr->refer = TRUE;
		if (cr->bt_index != 0)
		{
			bt = (bt_rec_ptr_t)GDS_REL2ABS(cr->bt_index);
			bt->cache_index = CR_NOTVALID;
			cr->bt_index = 0;
		}
		q0 = (cache_rec_ptr_t)((sm_uc_ptr_t)cr + cr->blkque.fl);
		shuffqth((que_ent_ptr_t)q0, (que_ent_ptr_t)hdr);
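		/* This moves the cache record onto the blkque hash bucket for "block" (hdr), which is
		 * how "t_qread" locates buffers by block number (see the read_in_progress comment above).
		 */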
		assert(0 == cr->dirty);
		csa->nl->cur_lru_cache_rec_off = GDS_ABS2REL(cr);
		if (lcnt > pass1)
			csa->nl->cache_hits = 0;
		csa->nl->cache_hits++;
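		/* cache_hits counts replacements satisfied within the first pass; any scan that needed
		 * more than one pass resets it above. Once a full cache's worth of such replacements
		 * accumulates, the block below raises flush_trigger (the dirty-buffer count at which
		 * flushing kicks in) by roughly 1/STEP_FACTOR, capped at MAX_FLUSH_TRIGGER.
		 */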
		if (csa->nl->cache_hits > csd->n_bts)
		{
			flsh_trigger = csd->flush_trigger;
			csd->flush_trigger = MIN(flsh_trigger + MAX(flsh_trigger / STEP_FACTOR, 1), MAX_FLUSH_TRIGGER(csd->n_bts));
			csa->nl->cache_hits = 0;
		}
		INCR_DB_CSH_COUNTER(csa, n_db_csh_getn_lcnt, lcnt);
		ENABLE_INTERRUPTS(INTRPT_IN_DB_CSH_GETN);
		return cr;
	}
	/* force a recover */
	INCR_DB_CSH_COUNTER(csa, n_db_csh_getn_lcnt, lcnt);
	csa->nl->cur_lru_cache_rec_off = GDS_ABS2REL(cr);
	ENABLE_INTERRUPTS(INTRPT_IN_DB_CSH_GETN);
	return (cache_rec_ptr_t)CR_NOTVALID;
}