fis-gtm/sr_port/gvcst_search.c

/****************************************************************
 *								*
 *	Copyright 2001, 2012 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include "gtm_string.h"

#include "cdb_sc.h"
#include "gdsroot.h"
#include "gdskill.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "gdscc.h"
#include "copy.h"
#include "filestruct.h"
#include "jnl.h"
#include "buddy_list.h"		/* needed for tp.h */
#include "hashtab_int4.h"	/* needed for tp.h and cws_insert.h */
#include "tp.h"
#include "gvcst_blk_build.h"
#include "t_qread.h"
#include "longset.h"		/* needed for cws_insert.h */
#include "hashtab.h"		/* needed for cws_insert.h */
#include "cws_insert.h"
#include "gvcst_protos.h"	/* for gvcst_search_blk,gvcst_search_tail,gvcst_search prototype */
#include "min_max.h"

GBLREF	gd_region		*gv_cur_region;
GBLREF	sgmnt_addrs		*cs_addrs;
GBLREF	gv_namehead		*gv_target;
GBLREF	uint4			dollar_tlevel;
GBLREF	sgmnt_data_ptr_t	cs_data;
GBLREF	unsigned char		rdfail_detail;
GBLREF	sgm_info		*sgm_info_ptr;
GBLREF	unsigned int		t_tries;
GBLREF	srch_blk_status		*first_tp_srch_status;	/* overriding value of srch_blk_status given by t_qread in case of TP */
GBLREF	trans_num		local_tn;		/* transaction number for THIS PROCESS */
GBLREF	boolean_t		tp_restart_syslog;	/* for the TP_TRACE_HIST_MOD macro */
GBLREF	boolean_t		mu_reorg_process;
GBLREF	char			gvcst_search_clue;

/* Store X in the global gvcst_search_clue. Within this file the values used are 0 (set on entry to
 * gvcst_search), 1 (clue used via gvcst_search_tail), 2 (search key identical to the clue key) and
 * 3 (clue used via gvcst_search_blk on the clue's history).
 * The expansion is a single parenthesized expression with NO trailing semicolon: the caller supplies the
 * ';', so an invocation is exactly one statement and stays safe as the unbraced body of an if/else.
 */
#define	SET_GVCST_SEARCH_CLUE(X)	(gvcst_search_clue = (X))

/* gvcst_search - search the tree of the current gv_target for a key and record the path taken.
 *
 * The search first tries to reuse the gv_target's clue (the history and key range left by a previous search).
 * If the clue is absent, not allowed in the current retry/TP state, fails validation, or does not cover pKey,
 * the routine does a fresh traversal from gv_target->root down to the level-0 block, reading each block with
 * t_qread and locating the key within it with gvcst_search_blk.
 *
 * Parameters:
 *	pKey	Key to search for. Must not be the gv_target's own clue key (asserted).
 *	pHist	History to fill in. If NULL, gv_target->hist is filled in instead and, on a successful search,
 *		the gv_target's clue (clue key, first_rec, last_rec) is updated to describe this search.
 *
 * Returns:
 *	cdb_sc_normal if the search completed. Otherwise a non-normal status, which is one of: the status returned
 *	by gvcst_search_blk/gvcst_search_tail, rdfail_detail if t_qread returned NULL, or one of cdb_sc_maxlvl,
 *	cdb_sc_badlvl, cdb_sc_rmisalign, cdb_sc_keyoflow, cdb_sc_blknumerr, cdb_sc_helpedout, cdb_sc_blkmod,
 *	cdb_sc_lostcr detected by the checks in this routine.
 *
 * Side effects visible in this routine:
 *	- gvcst_search_clue is set to 0 on entry and to 1/2/3 when the clue is used (tail / same key / head).
 *	- The n_gvcst_srches, n_gvcst_srch_clues and n_clue_used_* counters are incremented via INCR_DB_CSH_COUNTER.
 *	- In TP, an unbuilt private copy (cse) of the clue's leaf block may be built with gvcst_blk_build.
 *	- gv_target->clue.end is set to 0 (clue invalidated) if the key sorts before the first record of the leaf.
 */
enum cdb_sc 	gvcst_search(gv_key *pKey,		/* Key to search for */
			     srch_hist *pHist)		/* History to fill in*/
{
	unsigned char		nLevl;
	enum cdb_sc		status;
	register int		n1;
	register uchar_ptr_t	c1, c2;
	register sm_uc_ptr_t	pRec, pBlkBase;
	register gv_namehead	*pTarg;	/* Local copy of gv_target;  hope it gets put into register */
	register srch_blk_status *pCurr;
	register srch_blk_status *pNonStar;
	register srch_hist	*pTargHist;
	block_id		nBlkId;
	cache_rec_ptr_t		cr;
	int			cycle;
	unsigned short		n0, nKeyLen;
	trans_num		tn;
	cw_set_element		*cse;
	off_chain		chain1, chain2;
	srch_blk_status		*tp_srch_status, *srch_status, *leaf_blk_hist;
	boolean_t		already_built, is_mm;
	ht_ent_int4		*tabent;
	sm_uc_ptr_t		buffaddr;
	trans_num		blkhdrtn;
	int			hist_size;

	pTarg = gv_target;
	assert(NULL != pTarg);
	assert(pTarg->root);
	assert(pKey != &pTarg->clue);
	/* Number of key bytes compared (memcmp) against the clue's keys and against curr_rec.match below */
	nKeyLen = pKey->end + 1;

	assert(!dollar_tlevel || ((NULL != sgm_info_ptr) && (cs_addrs->sgm_info_ptr == sgm_info_ptr)));
	SET_GVCST_SEARCH_CLUE(0);
	INCR_DB_CSH_COUNTER(cs_addrs, n_gvcst_srches, 1);
	/* History that gets validated/filled in: the caller's if supplied, else the gv_target's own */
	pTargHist = (NULL == pHist ? &pTarg->hist : pHist);
	/* If FINAL RETRY and TP then we can safely use clues of gv_targets that have been referenced in this
	 * TP transaction (read_local_tn == local_tn). While that is guaranteed to be true for all updates, it
	 * does not hold good for READs since we allow a lot more reads to be done inside a transaction compared
	 * to the # of updates allowed. We allow the same global to be read multiple times inside the same transaction
	 * using different global buffers for each read. This means that we need to validate any clues from the first
	 * read before using it for the second read even if it is in the final retry. This validation is done inside
	 * the below IF block. As for gv_targets which are referenced for the very first time in this TP transaction,
	 * we have no easy way of determining if their clues are still uptodate (i.e. using the clue will guarantee us
	 * no restart) and since we are in the final retry, we dont want to take a risk. So dont use the clue in that case.
	 *
	 * If FINAL RETRY and Non-TP, we will be dealing with only ONE gv_target so its clue would have been reset as
	 * part of the penultimate restart so we dont have any of the above issue in the non-tp case. The only exception
	 * is if we are in gvcst_kill in which case, gvcst_search will be called twice and the clue could be non-zero
	 * for the second invocation. In this case, the clue is guaranteed to be uptodate since it was set just now
	 * as part of the first invocation. So no need to do anything about clue in final retry for Non-TP.
	 */
	if ((0 != pTarg->clue.end) && ((CDB_STAGNATE > t_tries) || !dollar_tlevel || (pTarg->read_local_tn == local_tn)))
	{	/* Have non-zero clue. Check if it is usable for the current search key. If so validate clue then and use it. */
		/* In t_end, we skipped validating the clue in case of reorg due to the assumption that reorg never uses the clue
		 * i.e. it nullifies the clue before calling gvcst_search. However, it doesn't reset the clue for directory tree
		 * and so continue using the clue if called for root search. Assert accordingly.
		 */
		assert(!mu_reorg_process UNIX_ONLY(|| (pTarg->gd_csa->dir_tree == pTarg)));
		INCR_DB_CSH_COUNTER(cs_addrs, n_gvcst_srch_clues, 1);
		status = cdb_sc_normal;	/* clue is usable unless proved otherwise */
		if (NULL != pHist)
		{	/* Copy the full srch_hist and set loop terminator flag in unused srch_blk_status entry.
			 * If in TP and if leaf block in history has cse, we are guaranteed that it is built by the
			 * immediately previous call to "gvcst_search" (called by gvcst_kill which does two calls to
			 * gvcst_search of which this invocation is the second) so no need to build the block like
			 * is done for the (NULL == pHist) case below. Assert that and some more.
			 */
			hist_size = HIST_SIZE(pTarg->hist);
			memcpy(pHist, &pTarg->hist, hist_size);
			((srch_blk_status *)((char *)pHist + hist_size))->blk_num = 0;
#			ifdef DEBUG
			if (dollar_tlevel)
			{
				leaf_blk_hist = &pHist->h[0];
				assert(0 == leaf_blk_hist->level);
				chain1 = *(off_chain *)&leaf_blk_hist->blk_num;
				if (chain1.flag == 1)
				{
					assert((int)chain1.cw_index < sgm_info_ptr->cw_set_depth);
					tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
				} else
				{
					tp_srch_status = leaf_blk_hist->first_tp_srch_status;
					ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(tp_srch_status, sgm_info_ptr);
					cse = (NULL != tp_srch_status) ? tp_srch_status->cse : NULL;
				}
				assert((NULL == cse) || cse->done);
			}
#			endif
		} else if (dollar_tlevel)
		{	/* First nullify first_tp_srch_status member in gv_target history if out-of-date. This is logically done
			 * at tp_clean_up time but delayed until the time this gv_target is used next in a transaction. This way
			 * it saves some CPU cycles. pTarg->read_local_tn tells us whether this is the first usage of this
			 * gv_target in this TP transaction and if so we need to reset the out-of-date field.
			 */
			if (pTarg->read_local_tn != local_tn)
			{
				for (srch_status = &pTarg->hist.h[0]; HIST_TERMINATOR != srch_status->blk_num; srch_status++)
					srch_status->first_tp_srch_status = NULL;
			}
			/* TP & going to use clue. check if clue path contains a leaf block with a corresponding unbuilt
			 * cse from the previous traversal. If so build it first before gvcst_search_blk/gvcst_search_tail.
			 */
			tp_srch_status = NULL;
			leaf_blk_hist = &pTarg->hist.h[0];
			assert(0 == leaf_blk_hist->level);
			/* Reinterpret the leaf's blk_num as an off_chain; flag == 1 means it holds a cw_set index
			 * (looked up with tp_get_cw) rather than a block number.
			 */
			chain1 = *(off_chain *)&leaf_blk_hist->blk_num;
			if (chain1.flag == 1)
			{
				if ((int)chain1.cw_index >= sgm_info_ptr->cw_set_depth)
				{
					assert(sgm_info_ptr->tp_csa == cs_addrs);
					assert(FALSE == cs_addrs->now_crit);
					return cdb_sc_blknumerr;
				}
				tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
			} else
			{
				nBlkId = (block_id)leaf_blk_hist->blk_num;
				tp_srch_status = leaf_blk_hist->first_tp_srch_status;
				/* Not cached in the history: fall back to this transaction's blks_in_use hashtable */
				if ((NULL == tp_srch_status)
						&& (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use,
											(uint4 *)&leaf_blk_hist->blk_num))))
					tp_srch_status = tabent->value;
				ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(tp_srch_status, sgm_info_ptr);
				cse = (NULL != tp_srch_status) ? tp_srch_status->cse : NULL;
			}
			assert(!cse || !cse->high_tlevel);
			if ((NULL == tp_srch_status) || (tp_srch_status->blk_target == leaf_blk_hist->blk_target))
			{	/* Either the leaf level block in clue is not already present in the current TP transaction's
				 * hashtable OR it is already present and the corresponding globals match. If they dont match
				 * we know for sure the clue is out-of-date (i.e. using it will lead to a transaction restart)
				 * and hence needs to be discarded.
				 */
				leaf_blk_hist->first_tp_srch_status = tp_srch_status;
				if (NULL != cse)
				{
					if (!cse->done)
					{	/* there's a private copy and it's not up to date */
						already_built = (NULL != cse->new_buff);
						gvcst_blk_build(cse, cse->new_buff, 0);
						/* Validate the block's search history right after building a private copy.
						 * This is not needed in case gvcst_search is going to reuse the clue's search
						 * history and return (because tp_hist will do the validation of this block).
						 * But if gvcst_search decides to do a fresh traversal (because the clue does not
						 * cover the path of the current input key etc.) the block build that happened now
						 * will not get validated in tp_hist since it will instead be given the current
						 * key's search history path (a totally new path) for validation. Since a private
						 * copy of the block has been built, tp_tend would also skip validating this block
						 * so it is necessary that we validate the block right here. Since it is tricky to
						 * accurately differentiate between the two cases, we do the validation
						 * unconditionally here (besides it is only a few if checks done per block build
						 * so it is considered okay performance-wise).
						 */
						if (!already_built && !chain1.flag)
						{	/* is_mm is calculated twice, but this is done so as to speed up the
							 * most-frequent path, i.e. when there is a clue and either no cse or
							 * cse->done is TRUE
							 */
							is_mm = (dba_mm == cs_data->acc_meth);
							/* NOTE(review): tp_srch_status is dereferenced here before the assert
							 * below checks it; with chain1.flag == 0 a non-NULL cse can only have
							 * come from a non-NULL tp_srch_status, so it is non-NULL on this path.
							 */
							buffaddr = tp_srch_status->buffaddr;
							cr = tp_srch_status->cr;
							assert(tp_srch_status && (is_mm || cr) && buffaddr);
							blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn;
							if (TP_IS_CDB_SC_BLKMOD3(cr, tp_srch_status, blkhdrtn))
							{
								assert(CDB_STAGNATE > t_tries);
								TP_TRACE_HIST_MOD(leaf_blk_hist->blk_num, gv_target,
									tp_blkmod_gvcst_srch, cs_data, tp_srch_status->tn,
									blkhdrtn, ((blk_hdr_ptr_t)buffaddr)->levl);
								return cdb_sc_blkmod;
							}
							if (!is_mm && ((tp_srch_status->cycle != cr->cycle)
										|| (tp_srch_status->blk_num != cr->blk)))
							{
								assert(CDB_STAGNATE > t_tries);
								return cdb_sc_lostcr;
							}
						}
						/* Private copy is now built: point the leaf history entry at it */
						cse->done = TRUE;
						leaf_blk_hist->cr = 0;
						leaf_blk_hist->cycle = CYCLE_PVT_COPY;
						leaf_blk_hist->buffaddr = cse->new_buff;
					} else
					{	/* Keep leaf_blk_hist->buffaddr and cse->new_buff in sync. Dont know how they
						 * cannot be the same but it seems possible if the gvcst_blk_build happened as
						 * part of a t_qread call (which does not have enough information to update the
						 * search history buffer address) without going through gvcst_search. Since the
						 * consequences of these two not being in sync are database damage, we fix them
						 * in pro just in case they are different.
						 */
						assert(leaf_blk_hist->buffaddr == cse->new_buff);
						leaf_blk_hist->buffaddr = cse->new_buff;
					}
				}
			} else
				status = cdb_sc_lostcr;	/* two different gv_targets point to same block; discard out-of-date clue */
		}
		/* Validate EVERY level in the clue before using it for ALL retries. This way we avoid unnecessary restarts.
		 * This is NECESSARY for the final retry (e.g. in a TP transaction that does LOTS of reads of different globals,
		 * it is possible that one global's clue is invalidated by a later read of another global) and is DESIRABLE (for
		 * performance reasons) in the other tries. The cost of a restart (particularly in TP) is very high that it is
		 * considered okay to take the hit of validating the entire clue before using it even if it is not the final retry.
		 */
		DEBUG_ONLY(is_mm = (dba_mm == cs_data->acc_meth);)
		if (cdb_sc_normal == status)
		{
			/* A failure here only sets status (no return): the clue is then skipped and control falls
			 * through to the fresh traversal from the root below.
			 */
			for (srch_status = &pTargHist->h[0]; HIST_TERMINATOR != srch_status->blk_num; srch_status++)
			{
				assert(srch_status->level == srch_status - &pTargHist->h[0]);
				assert(is_mm || (NULL == srch_status->cr) || (NULL != srch_status->buffaddr));
				cr = srch_status->cr;
				assert(!is_mm || (NULL == cr));
				if (TP_IS_CDB_SC_BLKMOD(cr, srch_status))
				{
					status = cdb_sc_blkmod;
					break;
				}
				if (NULL != cr)
				{
					assert(NULL != srch_status->buffaddr);
					if (srch_status->cycle != cr->cycle)
					{
						status = cdb_sc_lostcr;
						break;
					}
					if (CDB_STAGNATE <= t_tries || mu_reorg_process)
						CWS_INSERT(cr->blk);
					cr->refer = TRUE;
				}
			}
		}
		if (cdb_sc_normal == status)
		{	/* Now that we are ready to use the clue, put more-likely case earlier in the if then else sequence.
			 * For sequential reads of globals, we expect the tail of the clue to be much more used than the head.
			 * For random reads, both are equally probable and hence it doesn't matter.
			 * The case (0 == n1) is not expected a lot (relatively) since the application may be able to optimize
			 *	a number of reads of the same key into one read by using a local-variable to store the value.
			 */
			/* n1 > 0: key sorts after the clue key; usable only if it is not beyond last_rec */
			if (0 < (n1 = memcmp(pKey->base, pTarg->clue.base, nKeyLen)))
			{
				if (memcmp(pKey->base, pTarg->last_rec->base, nKeyLen) <= 0)
				{
					SET_GVCST_SEARCH_CLUE(1);
					status = gvcst_search_tail(pKey, pTargHist->h, &pTarg->clue);
					if (NULL == pHist)
					{	/* Implies the search history is being filled in pTarg->hist so we can
						 * safely update pTarg->clue to reflect the new search key. It is important
						 * that this clue update be done AFTER the gvcst_search_tail invocation
						 * (as that needs to pass the previous clue key).
						 */
						COPY_CURRKEY_TO_GVTARGET_CLUE(pTarg, pKey);
					}
					INCR_DB_CSH_COUNTER(cs_addrs, n_clue_used_tail, 1);
					return status;
				}
			/* n1 < 0: key sorts before the clue key; usable only if it is not before first_rec */
			} else if (0 > n1)
			{
				if (memcmp(pKey->base, pTarg->first_rec->base, nKeyLen) >= 0)
				{
					SET_GVCST_SEARCH_CLUE(3);
					status = gvcst_search_blk(pKey, pTargHist->h);
					if (NULL == pHist)
					{	/* Implies the search history is being filled in pTarg->hist so we can
						 * safely update pTarg->clue to reflect the new search key. It does not
						 * matter if we update the clue BEFORE or AFTER the gvcst_search_blk
						 * invocation but for consistency with the gvcst_search_tail invocation
						 * we keep it AFTER.
						 */
						COPY_CURRKEY_TO_GVTARGET_CLUE(pTarg, pKey);
					}
					INCR_DB_CSH_COUNTER(cs_addrs, n_clue_used_head, 1);
					return status;
				}
			} else
			{	/* n1 == 0: the first nKeyLen bytes match the clue key; the validated history is returned as is */
				SET_GVCST_SEARCH_CLUE(2);
				INCR_DB_CSH_COUNTER(cs_addrs, n_clue_used_same, 1);
				return cdb_sc_normal;
			}
		}
	}
	/* Clue was absent, disallowed, invalid or did not cover pKey: do a fresh traversal starting at the root block */
	nBlkId = pTarg->root;
	/* Sample curr_tn BEFORE reading the block; it is recorded as this history entry's tn below */
	tn = cs_addrs->ti->curr_tn;
	if (NULL == (pBlkBase = t_qread(nBlkId, (sm_int_ptr_t)&cycle, &cr)))
		return (enum cdb_sc)rdfail_detail;
	nLevl = ((blk_hdr_ptr_t)pBlkBase)->levl;
	if (MAX_BT_DEPTH < (int)nLevl)
	{
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_maxlvl;
	}
	/* The root block read here is rejected (cdb_sc_badlvl) if its header says level 0 */
	if (0 == (int)nLevl)
	{
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_badlvl;
	}
	is_mm = (dba_mm == cs_data->acc_meth);
	pTargHist->depth = (int)nLevl;
	/* History is indexed by level: h[nLevl] is the root entry, h[0] the leaf. The entry just above the
	 * root gets blk_num 0, the same end-of-history marker written when a history is copied above.
	 */
	pCurr = &pTargHist->h[nLevl];
	(pCurr + 1)->blk_num = 0;
	pCurr->tn = tn;
	pCurr->first_tp_srch_status = first_tp_srch_status;
	pCurr->cycle = cycle;
	pCurr->cr = cr;
	pNonStar = NULL;
	/* Descend one level per iteration; the loop exits (break) once the level-0 block has been searched */
	for (;;)
	{
		assert(pCurr->level == nLevl);
		pCurr->cse = NULL;
		pCurr->blk_num = nBlkId;
		pCurr->buffaddr = pBlkBase;
		if (cdb_sc_normal != (status = gvcst_search_blk(pKey, pCurr)))
			return status;
		if (0 == nLevl)
			break;
		/* Pick the record to follow: curr_rec, or prev_rec if curr_rec's offset is at/after the block size */
		if ((n0 = pCurr->curr_rec.offset) >= ((blk_hdr_ptr_t)pBlkBase)->bsiz)
			n0 = pCurr->prev_rec.offset;
		pRec = pBlkBase + n0;
		/* From here on n0 is the record size (rsiz), no longer an offset */
		GET_USHORT(n0, &((rec_hdr_ptr_t)pRec)->rsiz);
		if (FALSE == CHKRECLEN(pRec, pBlkBase, n0))
		{
			assert(CDB_STAGNATE > t_tries);
			return cdb_sc_rmisalign;
		}
		/* The child block number is read from the last SIZEOF(block_id) bytes of the record */
		GET_LONG(nBlkId, (pRec + n0 - SIZEOF(block_id)));
		if (is_mm)
		{
			PUT_LONG(&chain2, nBlkId);
			if ((0 == chain2.flag) && (nBlkId > cs_addrs->total_blks))
			{	/* private copy should be taken care of by .flag */
				if (cs_addrs->total_blks < cs_addrs->ti->total_blks)
					return cdb_sc_helpedout;
				else
					return cdb_sc_blknumerr;
			}
		}
		/* Remember the most recent (i.e. lowest-level) index block in which the record followed was not of
		 * size BSTAR_REC_SIZE (presumably: not a star record); it is used to build last_rec below.
		 */
		if (BSTAR_REC_SIZE != n0)
			pNonStar = pCurr;
		pCurr--;
		pCurr->tn = cs_addrs->ti->curr_tn;
		if (NULL == (pBlkBase = t_qread(nBlkId, (sm_int_ptr_t)&pCurr->cycle, &pCurr->cr)))
			return (enum cdb_sc)rdfail_detail;
		pCurr->first_tp_srch_status = first_tp_srch_status;
		/* The child's header level must be exactly one less than its parent's */
		if (((blk_hdr_ptr_t)pBlkBase)->levl != --nLevl)
		{
			assert(CDB_STAGNATE > t_tries);
			return cdb_sc_badlvl;
		}
	}
	/* Only when the gv_target's own history was filled in do we rebuild its clue (first_rec, last_rec, clue key).
	 * At this point pCurr is the level-0 history entry and pBlkBase is that leaf block's buffer.
	 */
	if (NULL == pHist)
	{
		if ((pCurr->curr_rec.offset < SIZEOF(blk_hdr)) ||
			((pCurr->curr_rec.offset == SIZEOF(blk_hdr)) && (pCurr->curr_rec.match < nKeyLen)))
		{	/* Clue less than first rec, invalidate */
			pTarg->clue.end = 0;
			return cdb_sc_normal;
		}
		/* first_rec: copy the key of the first record in the leaf block (it starts right after the block header) */
		pRec = pBlkBase + SIZEOF(blk_hdr);
		GET_USHORT(n0, &((rec_hdr_ptr_t)pRec)->rsiz);
		if (FALSE == CHKRECLEN(pRec, pBlkBase, n0))
		{
			assert(CDB_STAGNATE > t_tries);
			return cdb_sc_rmisalign;
		}
		c1 = pRec + SIZEOF(rec_hdr);
		c2 = pTarg->first_rec->base;
		/* n0 becomes the byte budget for the copy. "status" is preset to the error to report if the budget
		 * runs out: keyoflow when the budget was capped by the destination's top, else rmisalign.
		 */
		if (n0 > (pTarg->first_rec->top))
		{
			n0 = pTarg->first_rec->top;
			status = cdb_sc_keyoflow;
		} else
			status = cdb_sc_rmisalign;
		if (0 != n0)
		{
			/* Copy byte by byte; stop once a zero byte has been copied and the next source byte is also zero */
			do
			{
				--n0;
				if ((0 == (*c2++ = *c1++)) && (0 == *c1))
					break;
			} while (n0);
		}
		/* n0 == 0 here means the budget was used up (no room left for the second zero byte): report the preset error */
		if (0 == n0)
		{
			assert(CDB_STAGNATE > t_tries);
			return status;
		}
		assert(c2 < &pTarg->first_rec->base[pTarg->first_rec->top]);	/* make sure we don't exceed allocated bounds */
		/* Copy the second of the two terminating zero bytes */
		*c2 = *c1;
		DEBUG_ONLY(pTarg->first_rec->end = c2 - pTarg->first_rec->base;)
		if (NULL == pNonStar)
		{	/* No qualifying index record was seen on the way down: store the GVT_CLUE_LAST_REC_MAXKEY marker */
			*((short *)pTarg->last_rec->base) = GVT_CLUE_LAST_REC_MAXKEY;
			DEBUG_ONLY(pTarg->last_rec->end = SIZEOF(short);)
		} else
		{	/* last_rec: rebuild the key of the record that was current in the pNonStar index block */
			pRec = pNonStar->buffaddr + pNonStar->curr_rec.offset;
			GET_USHORT(n0, &((rec_hdr_ptr_t)pRec)->rsiz);
			c1 = pNonStar->buffaddr;
			if (FALSE == CHKRECLEN(pRec, c1, n0))
			{
				assert(CDB_STAGNATE > t_tries);
				return cdb_sc_rmisalign;
			}
			/* The first cmpc bytes are taken from pKey below, which requires that at least cmpc bytes matched */
			if (pNonStar->curr_rec.match < ((rec_hdr_ptr_t)pRec)->cmpc)
			{
				assert(CDB_STAGNATE > t_tries);
			 	return cdb_sc_rmisalign;
			}
			if ((n1 = ((rec_hdr_ptr_t)pRec)->cmpc) > (int)(pTarg->last_rec->top))
			{
				assert(CDB_STAGNATE > t_tries);
				return cdb_sc_keyoflow;
			}
			c2 = pTarg->last_rec->base;
			/* Leading n1 (cmpc) bytes of the key come from the search key itself */
			if (0 != n1)
				memcpy(c2, pKey->base, n1);
			c2 = (sm_uc_ptr_t)c2 + n1;

			/* Remaining key bytes come from the record; same budgeted copy as for first_rec above,
			 * with the destination room reduced by the n1 bytes already stored.
			 */
			c1 = pRec + SIZEOF(rec_hdr);
			if ((int)n0 > (int)(pTarg->last_rec->top) - n1)
			{
				n0 = pTarg->last_rec->top - n1;
				status = cdb_sc_keyoflow;
			} else
				status = cdb_sc_rmisalign;
			if (0 != n0)
			{
				do
				{
					--n0;
					if ((0 == (*c2++ = *c1++)) && (0 == *c1))
						break;
				} while (n0);
			}
			if (0 == n0)
			{
				assert(CDB_STAGNATE > t_tries);
				return status;
			}
			assert(c2 < &pTarg->last_rec->base[pTarg->last_rec->top]); /* make sure we don't exceed allocated bounds */
			*c2 = *c1;
			DEBUG_ONLY(pTarg->last_rec->end = c2 - pTarg->last_rec->base;)
		}
		/* first_rec/last_rec are in place: record pKey as the gv_target's clue key */
		COPY_CURRKEY_TO_GVTARGET_CLUE(pTarg, pKey);
	}
	return cdb_sc_normal;
}