fis-gtm/sr_port/gvcst_search.c

/****************************************************************
 *								*
 *	Copyright 2001, 2011 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include "gtm_string.h"

#include "cdb_sc.h"
#include "gdsroot.h"
#include "gdskill.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "gdscc.h"
#include "copy.h"
#include "filestruct.h"
#include "jnl.h"
#include "buddy_list.h"		/* needed for tp.h */
#include "hashtab_int4.h"	/* needed for tp.h and cws_insert.h */
#include "tp.h"
#include "gvcst_blk_build.h"
#include "t_qread.h"
#include "longset.h"		/* needed for cws_insert.h */
#include "hashtab.h"		/* needed for cws_insert.h */
#include "cws_insert.h"
#include "gvcst_protos.h"	/* for gvcst_search_blk,gvcst_search_tail,gvcst_search prototype */
#include "min_max.h"

GBLREF	gd_region		*gv_cur_region;
GBLREF	sgmnt_addrs		*cs_addrs;
GBLREF	gv_namehead		*gv_target;
GBLREF	uint4			dollar_tlevel;
GBLREF	sgmnt_data_ptr_t	cs_data;
GBLREF	unsigned char		rdfail_detail;
GBLREF	sgm_info		*sgm_info_ptr;
GBLREF	unsigned int		t_tries;
GBLREF	srch_blk_status		*first_tp_srch_status;	/* overriding value of srch_blk_status given by t_qread in case of TP */
GBLREF	trans_num		local_tn;		/* transaction number for THIS PROCESS */
GBLREF	boolean_t		tp_restart_syslog;	/* for the TP_TRACE_HIST_MOD macro */
GBLREF	boolean_t		mu_reorg_process;
GBLREF	char			gvcst_search_clue;

/* Record in the global gvcst_search_clue how gvcst_search used the clue of the current gv_target.
 * Values stored by gvcst_search : 0 on entry (clue not used so far), 1 (clue used, gvcst_search_tail),
 * 2 (search key is the same as the clue key), 3 (clue used, gvcst_search_blk on the leaf block).
 * The expansion is a parenthesized expression WITHOUT a trailing semicolon so that
 * "SET_GVCST_SEARCH_CLUE(x);" is exactly one statement and is therefore safe in an unbraced if/else.
 */
#define	SET_GVCST_SEARCH_CLUE(X)	(gvcst_search_clue = (X))

/* Search the global variable tree of the current gv_target for the key pKey.
 *
 * The search history is filled into pHist if it is non-NULL, otherwise into gv_target->hist.
 * There are two paths through this function.
 *   1) Clue path : If gv_target->clue.end is non-zero (and the retry/TP conditions checked below permit it), the
 *	history saved in gv_target is validated level by level. If it is still valid and pKey lies between
 *	gv_target->first_rec and gv_target->last_rec (compared using memcmp over pKey->end + 1 bytes), only the
 *	leaf block is searched (gvcst_search_tail or gvcst_search_blk), or nothing is searched at all if pKey
 *	compares equal to the clue key.
 *   2) Full traversal : Starting at gv_target->root, each level is read using t_qread and searched using
 *	gvcst_search_blk until level 0 is reached. If pHist is NULL, gv_target->first_rec, gv_target->last_rec
 *	and gv_target->clue are then recomputed from the blocks that were traversed.
 *
 * The global gvcst_search_clue is set to 0 on entry and to 1, 2 or 3 if the clue ends up being used.
 *
 * Returns cdb_sc_normal on success. Otherwise returns the status handed back by gvcst_search_blk or
 * gvcst_search_tail, the value of rdfail_detail (if t_qread returned NULL), or one of cdb_sc_blkmod, cdb_sc_lostcr,
 * cdb_sc_blknumerr, cdb_sc_helpedout, cdb_sc_maxlvl, cdb_sc_badlvl, cdb_sc_rmisalign, cdb_sc_keyoflow.
 */
enum cdb_sc 	gvcst_search(gv_key *pKey,		/* Key to search for */
			     srch_hist *pHist)		/* History to fill in*/
{
	unsigned char		nLevl;
	enum cdb_sc		status;
	register int		n1;
	register uchar_ptr_t	c1, c2;
	register sm_uc_ptr_t	pRec, pBlkBase;
	register gv_namehead	*pTarg;	/* Local copy of gv_target;  hope it gets put into register */
	register srch_blk_status *pCurr;
	register srch_blk_status *pNonStar;
	register srch_hist	*pTargHist;
	block_id		nBlkId;
	cache_rec_ptr_t		cr;
	int			cycle;
	unsigned short		n0, nKeyLen;
	trans_num		tn;
	cw_set_element		*cse;
	off_chain		chain1, chain2;
	srch_blk_status		*tp_srch_status, *srch_status, *leaf_blk_hist;
	boolean_t		already_built, is_mm;
	ht_ent_int4		*tabent;
	sm_uc_ptr_t		buffaddr;
	trans_num		blkhdrtn;
	int			hist_size;

	pTarg = gv_target;
	assert(NULL != pTarg);
	assert(pTarg->root);
	assert(pKey != &pTarg->clue);
	/* Number of bytes of pKey->base used in all the memcmp comparisons against clue/first_rec/last_rec below */
	nKeyLen = pKey->end + 1;

	assert(!dollar_tlevel || ((NULL != sgm_info_ptr) && (cs_addrs->sgm_info_ptr == sgm_info_ptr)));
	SET_GVCST_SEARCH_CLUE(0);
	INCR_DB_CSH_COUNTER(cs_addrs, n_gvcst_srches, 1);
	/* History that this invocation fills in : caller supplied one if any, else the one in gv_target */
	pTargHist = (NULL == pHist ? &pTarg->hist : pHist);
	/* If FINAL RETRY and TP then we can safely use clues of gv_targets that have been referenced in this
	 * TP transaction (read_local_tn == local_tn). While that is guaranteed to be true for all updates, it
	 * does not hold good for READs since we allow a lot more reads to be done inside a transaction compared
	 * to the # of updates allowed. We allow the same global to be read multiple times inside the same transaction
	 * using different global buffers for each read. This means that we need to validate any clues from the first
	 * read before using it for the second read even if it is in the final retry. This validation is done inside
	 * the below IF block. As for gv_targets which are referenced for the very first time in this TP transaction,
	 * we have no easy way of determining if their clues are still uptodate (i.e. using the clue will guarantee us
	 * no restart) and since we are in the final retry, we dont want to take a risk. So dont use the clue in that case.
	 *
	 * If FINAL RETRY and Non-TP, we will be dealing with only ONE gv_target so its clue would have been reset as
	 * part of the penultimate restart so we dont have any of the above issue in the non-tp case. The only exception
	 * is if we are in gvcst_kill in which case, gvcst_search will be called twice and the clue could be non-zero
	 * for the second invocation. In this case, the clue is guaranteed to be uptodate since it was set just now
	 * as part of the first invocation. So no need to do anything about clue in final retry for Non-TP.
	 */
	if ((0 != pTarg->clue.end) && ((CDB_STAGNATE > t_tries) || !dollar_tlevel || (pTarg->read_local_tn == local_tn)))
	{	/* Have non-zero clue. Check if it is usable for the current search key. If so validate clue then and use it. */
		/* In t_end, we skipped validating the clue in case of reorg due to the assumption that reorg never
		 * uses the clue i.e. it nullifies the clue before calling gvcst_search. Assert that here.
		 */
		assert(!mu_reorg_process);
		INCR_DB_CSH_COUNTER(cs_addrs, n_gvcst_srch_clues, 1);
		status = cdb_sc_normal;	/* clue is usable unless proved otherwise */
		if (NULL != pHist)
		{	/* Copy the full srch_hist and set loop terminator flag in unused srch_blk_status entry.
			 * If in TP and if leaf block in history has cse, we are guaranteed that it is built by the
			 * immediately previous call to "gvcst_search" (called by gvcst_kill which does two calls to
			 * gvcst_search of which this invocation is the second) so no need to build the block like
			 * is done for the (NULL == pHist) case below. Assert that and some more.
			 */
			hist_size = HIST_SIZE(pTarg->hist);
			memcpy(pHist, &pTarg->hist, hist_size);
			((srch_blk_status *)((char *)pHist + hist_size))->blk_num = 0;
#			ifdef DEBUG
			if (dollar_tlevel)
			{
				leaf_blk_hist = &pHist->h[0];
				assert(0 == leaf_blk_hist->level);
				chain1 = *(off_chain *)&leaf_blk_hist->blk_num;
				if (chain1.flag == 1)
				{
					assert((int)chain1.cw_index < sgm_info_ptr->cw_set_depth);
					tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
				} else
				{
					tp_srch_status = leaf_blk_hist->first_tp_srch_status;
					ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(tp_srch_status, sgm_info_ptr);
					cse = (NULL != tp_srch_status) ? tp_srch_status->cse : NULL;
				}
				assert((NULL == cse) || cse->done);
			}
#			endif
		} else if (dollar_tlevel)
		{	/* First nullify first_tp_srch_status member in gv_target history if out-of-date. This is logically done
			 * at tp_clean_up time but delayed until the time this gv_target is used next in a transaction. This way
			 * it saves some CPU cycles. pTarg->read_local_tn tells us whether this is the first usage of this
			 * gv_target in this TP transaction and if so we need to reset the out-of-date field.
			 */
			if (pTarg->read_local_tn != local_tn)
			{
				for (srch_status = &pTarg->hist.h[0]; HIST_TERMINATOR != srch_status->blk_num; srch_status++)
					srch_status->first_tp_srch_status = NULL;
			}
			/* TP & going to use clue. check if clue path contains a leaf block with a corresponding unbuilt
			 * cse from the previous traversal. If so build it first before gvcst_search_blk/gvcst_search_tail.
			 */
			tp_srch_status = NULL;
			leaf_blk_hist = &pTarg->hist.h[0];
			assert(0 == leaf_blk_hist->level);
			/* Reinterpret the leaf blk_num as an off_chain; flag == 1 means it carries a cw_index into this
			 * transaction's cw-set (used below with tp_get_cw) instead of a block number.
			 */
			chain1 = *(off_chain *)&leaf_blk_hist->blk_num;
			if (chain1.flag == 1)
			{
				if ((int)chain1.cw_index >= sgm_info_ptr->cw_set_depth)
				{	/* cw_index is beyond the current cw-set depth so the clue cannot be trusted */
					assert(sgm_info_ptr->tp_csa == cs_addrs);
					assert(FALSE == cs_addrs->now_crit);
					return cdb_sc_blknumerr;
				}
				tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
			} else
			{	/* Find this block's TP history entry : use the cached pointer if present, else look the
				 * block number up in the blks_in_use hashtable of the current sgm_info.
				 */
				nBlkId = (block_id)leaf_blk_hist->blk_num;
				tp_srch_status = leaf_blk_hist->first_tp_srch_status;
				if ((NULL == tp_srch_status)
						&& (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use,
											(uint4 *)&leaf_blk_hist->blk_num))))
					tp_srch_status = tabent->value;
				ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(tp_srch_status, sgm_info_ptr);
				cse = (NULL != tp_srch_status) ? tp_srch_status->cse : NULL;
			}
			assert(!cse || !cse->high_tlevel);
			if ((NULL == tp_srch_status) || (tp_srch_status->blk_target == leaf_blk_hist->blk_target))
			{	/* Either the leaf level block in clue is not already present in the current TP transaction's
				 * hashtable OR it is already present and the corresponding globals match. If they dont match
				 * we know for sure the clue is out-of-date (i.e. using it will lead to a transaction restart)
				 * and hence needs to be discarded.
				 */
				leaf_blk_hist->first_tp_srch_status = tp_srch_status;
				if (NULL != cse)
				{
					if (!cse->done)
					{	/* there's a private copy and it's not up to date */
						already_built = (NULL != cse->new_buff);
						gvcst_blk_build(cse, cse->new_buff, 0);
						/* Validate the block's search history right after building a private copy.
						 * This is not needed in case gvcst_search is going to reuse the clue's search
						 * history and return (because tp_hist will do the validation of this block).
						 * But if gvcst_search decides to do a fresh traversal (because the clue does not
						 * cover the path of the current input key etc.) the block build that happened now
						 * will not get validated in tp_hist since it will instead be given the current
						 * key's search history path (a totally new path) for validation. Since a private
						 * copy of the block has been built, tp_tend would also skip validating this block
						 * so it is necessary that we validate the block right here. Since it is tricky to
						 * accurately differentiate between the two cases, we do the validation
						 * unconditionally here (besides it is only a few if checks done per block build
						 * so it is considered okay performance-wise).
						 */
						if (!already_built && !chain1.flag)
						{	/* is_mm is calculated twice, but this is done so as to speed up the
							 * most-frequent path, i.e. when there is a clue and either no cse or
							 * cse->done is TRUE
							 */
							/* Note : tp_srch_status is non-NULL here since chain1.flag is 0 and
							 * cse (which is non-NULL) was obtained through tp_srch_status->cse.
							 */
							is_mm = (dba_mm == cs_data->acc_meth);
							buffaddr = tp_srch_status->buffaddr;
							cr = tp_srch_status->cr;
							assert(tp_srch_status && (is_mm || cr) && buffaddr);
							blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn;
							if (TP_IS_CDB_SC_BLKMOD3(cr, tp_srch_status, blkhdrtn))
							{
								assert(CDB_STAGNATE > t_tries);
								TP_TRACE_HIST_MOD(leaf_blk_hist->blk_num, gv_target,
									tp_blkmod_gvcst_srch, cs_data, tp_srch_status->tn,
									blkhdrtn, ((blk_hdr_ptr_t)buffaddr)->levl);
								return cdb_sc_blkmod;
							}
							/* Not MM : the cache record must still have the same cycle and hold the
							 * same block as when this TP history entry was recorded.
							 */
							if (!is_mm && ((tp_srch_status->cycle != cr->cycle)
										|| (tp_srch_status->blk_num != cr->blk)))
							{
								assert(CDB_STAGNATE > t_tries);
								return cdb_sc_lostcr;
							}
						}
						/* Point the leaf history entry at the freshly built private copy */
						cse->done = TRUE;
						leaf_blk_hist->cr = 0;
						leaf_blk_hist->cycle = CYCLE_PVT_COPY;
						leaf_blk_hist->buffaddr = cse->new_buff;
					} else
					{	/* Keep leaf_blk_hist->buffaddr and cse->new_buff in sync. Dont know how they
						 * cannot be the same but it seems possible if the gvcst_blk_build happened as
						 * part of a t_qread call (which does not have enough information to update the
						 * search history buffer address) without going through gvcst_search. Since the
						 * consequences of these two not being in sync are database damage, we fix them
						 * in pro just in case they are different.
						 */
						assert(leaf_blk_hist->buffaddr == cse->new_buff);
						leaf_blk_hist->buffaddr = cse->new_buff;
					}
				}
			} else
				status = cdb_sc_lostcr;	/* two different gv_targets point to same block; discard out-of-date clue */
		}
		/* Validate EVERY level in the clue before using it for ALL retries. This way we avoid unnecessary restarts.
		 * This is NECESSARY for the final retry (e.g. in a TP transaction that does LOTS of reads of different globals,
		 * it is possible that one global's clue is invalidated by a later read of another global) and is DESIRABLE (for
		 * performance reasons) in the other tries. The cost of a restart (particularly in TP) is very high that it is
		 * considered okay to take the hit of validating the entire clue before using it even if it is not the final retry.
		 */
		DEBUG_ONLY(is_mm = (dba_mm == cs_data->acc_meth);)
		if (cdb_sc_normal == status)
		{	/* A non-normal status set inside this loop is not returned to the caller; it only causes the clue to
			 * be skipped, after which control falls through to the full traversal from the root further below.
			 */
			for (srch_status = &pTargHist->h[0]; HIST_TERMINATOR != srch_status->blk_num; srch_status++)
			{
				assert(srch_status->level == srch_status - &pTargHist->h[0]);
				assert(is_mm || (NULL == srch_status->cr) || (NULL != srch_status->buffaddr));
				cr = srch_status->cr;
				assert(!is_mm || (NULL == cr));
				if (TP_IS_CDB_SC_BLKMOD(cr, srch_status))
				{
					status = cdb_sc_blkmod;
					break;
				}
				if (NULL != cr)
				{
					assert(NULL != srch_status->buffaddr);
					if (srch_status->cycle != cr->cycle)
					{
						status = cdb_sc_lostcr;
						break;
					}
					if (CDB_STAGNATE <= t_tries || mu_reorg_process)
						CWS_INSERT(cr->blk);
					cr->refer = TRUE;
				}
			}
		}
		if (cdb_sc_normal == status)
		{	/* Now that we are ready to use the clue, put more-likely case earlier in the if then else sequence.
			 * For sequential reads of globals, we expect the tail of the clue to be much more used than the head.
			 * For random reads, both are equally probable and hence it doesn't matter.
			 * The case (0 == n1) is not expected a lot (relatively) since the application may be able to optimize
			 *	a number of reads of the same key into one read by using a local-variable to store the value.
			 */
			if (0 < (n1 = memcmp(pKey->base, pTarg->clue.base, nKeyLen)))
			{	/* pKey sorts after the clue key : clue is usable only if pKey does not sort after last_rec */
				if (memcmp(pKey->base, pTarg->last_rec->base, nKeyLen) <= 0)
				{
					SET_GVCST_SEARCH_CLUE(1);
					status = gvcst_search_tail(pKey, pTargHist->h, &pTarg->clue);
					if (NULL == pHist)
					{	/* Implies the search history is being filled in pTarg->hist so we can
						 * safely update pTarg->clue to reflect the new search key. It is important
						 * that this clue update be done AFTER the gvcst_search_tail invocation
						 * (as that needs to pass the previous clue key).
						 */
						COPY_CURRKEY_TO_GVTARGET_CLUE(pTarg, pKey);
					}
					INCR_DB_CSH_COUNTER(cs_addrs, n_clue_used_tail, 1);
					return status;
				}
			} else if (0 > n1)
			{	/* pKey sorts before the clue key : clue is usable only if pKey does not sort before first_rec */
				if (memcmp(pKey->base, pTarg->first_rec->base, nKeyLen) >= 0)
				{
					SET_GVCST_SEARCH_CLUE(3);
					status = gvcst_search_blk(pKey, pTargHist->h);
					if (NULL == pHist)
					{	/* Implies the search history is being filled in pTarg->hist so we can
						 * safely update pTarg->clue to reflect the new search key. It does not
						 * matter if we update the clue BEFORE or AFTER the gvcst_search_blk
						 * invocation but for consistency with the gvcst_search_tail invocation
						 * we keep it AFTER.
						 */
						COPY_CURRKEY_TO_GVTARGET_CLUE(pTarg, pKey);
					}
					INCR_DB_CSH_COUNTER(cs_addrs, n_clue_used_head, 1);
					return status;
				}
			} else
			{	/* pKey compares equal to the clue key : the validated history is returned as is */
				SET_GVCST_SEARCH_CLUE(2);
				INCR_DB_CSH_COUNTER(cs_addrs, n_clue_used_same, 1);
				return cdb_sc_normal;
			}
		}
	}
	/* No clue, or the clue could not be used for this key : do a full traversal starting at the root block.
	 * Note that the current transaction number is noted down BEFORE the root block is read.
	 */
	nBlkId = pTarg->root;
	tn = cs_addrs->ti->curr_tn;
	if (NULL == (pBlkBase = t_qread(nBlkId, (sm_int_ptr_t)&cycle, &cr)))
		return (enum cdb_sc)rdfail_detail;
	nLevl = ((blk_hdr_ptr_t)pBlkBase)->levl;
	if (MAX_BT_DEPTH < (int)nLevl)
	{
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_maxlvl;
	}
	if (0 == (int)nLevl)
	{	/* A root block at level 0 is rejected */
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_badlvl;
	}
	is_mm = (dba_mm == cs_data->acc_meth);
	pTargHist->depth = (int)nLevl;
	/* History entries are indexed by level : the root goes in h[nLevl] and the entry just past it gets a zero
	 * blk_num (the same end-of-history marking that is done for the copied history in the clue path above).
	 */
	pCurr = &pTargHist->h[nLevl];
	(pCurr + 1)->blk_num = 0;
	pCurr->tn = tn;
	pCurr->first_tp_srch_status = first_tp_srch_status;
	pCurr->cycle = cycle;
	pCurr->cr = cr;
	pNonStar = NULL;
	/* Each iteration searches the block at level nLevl and then reads the child block that the search points to */
	for (;;)
	{
		assert(pCurr->level == nLevl);
		pCurr->cse = NULL;
		pCurr->blk_num = nBlkId;
		pCurr->buffaddr = pBlkBase;
		if (cdb_sc_normal != (status = gvcst_search_blk(pKey, pCurr)))
			return status;
		if (0 == nLevl)
			break;
		/* If the current record offset is at or beyond the block size, use the previous record instead */
		if ((n0 = pCurr->curr_rec.offset) >= ((blk_hdr_ptr_t)pBlkBase)->bsiz)
			n0 = pCurr->prev_rec.offset;
		pRec = pBlkBase + n0;
		GET_USHORT(n0, &((rec_hdr_ptr_t)pRec)->rsiz);	/* n0 is now the size of the chosen record */
		if (FALSE == CHKRECLEN(pRec, pBlkBase, n0))
		{
			assert(CDB_STAGNATE > t_tries);
			return cdb_sc_rmisalign;
		}
		/* The child block number is read from the last SIZEOF(block_id) bytes of the record */
		GET_LONG(nBlkId, (pRec + n0 - SIZEOF(block_id)));
		if (is_mm)
		{	/* MM access method : range-check the child block number before it is read */
			PUT_LONG(&chain2, nBlkId);
			if ((0 == chain2.flag) && (nBlkId > cs_addrs->total_blks))
			{	/* private copy should be taken care of by .flag */
				if (cs_addrs->total_blks < cs_addrs->ti->total_blks)
					return cdb_sc_helpedout;
				else
					return cdb_sc_blknumerr;
			}
		}
		/* Remember the most recent (i.e. lowest) index level at which the record followed did not have size
		 * BSTAR_REC_SIZE. That record is used after the loop to compute last_rec.
		 */
		if (BSTAR_REC_SIZE != n0)
			pNonStar = pCurr;
		pCurr--;
		pCurr->tn = cs_addrs->ti->curr_tn;
		if (NULL == (pBlkBase = t_qread(nBlkId, (sm_int_ptr_t)&pCurr->cycle, &pCurr->cr)))
			return (enum cdb_sc)rdfail_detail;
		pCurr->first_tp_srch_status = first_tp_srch_status;
		/* The level in the header of the child block must be exactly one less than that of its parent */
		if (((blk_hdr_ptr_t)pBlkBase)->levl != --nLevl)
		{
			assert(CDB_STAGNATE > t_tries);
			return cdb_sc_badlvl;
		}
	}
	/* At this point pCurr and pBlkBase refer to the leaf (level 0) block. The clue related fields of gv_target are
	 * refreshed only if the history was filled into pTarg->hist (i.e. the caller did not supply its own history).
	 */
	if (NULL == pHist)
	{
		if ((pCurr->curr_rec.offset < SIZEOF(blk_hdr)) ||
			((pCurr->curr_rec.offset == SIZEOF(blk_hdr)) && (pCurr->curr_rec.match < nKeyLen)))
		{	/* Clue less than first rec, invalidate */
			pTarg->clue.end = 0;
			return cdb_sc_normal;
		}
		/* Copy the key of the first record in the leaf block into pTarg->first_rec */
		pRec = pBlkBase + SIZEOF(blk_hdr);
		GET_USHORT(n0, &((rec_hdr_ptr_t)pRec)->rsiz);
		if (FALSE == CHKRECLEN(pRec, pBlkBase, n0))
		{
			assert(CDB_STAGNATE > t_tries);
			return cdb_sc_rmisalign;
		}
		c1 = pRec + SIZEOF(rec_hdr);
		c2 = pTarg->first_rec->base;
		/* Limit the copy to first_rec->top bytes. "status" is preset to the error that is returned if the copy
		 * loop below uses up all n0 bytes.
		 */
		if (n0 > (pTarg->first_rec->top))
		{
			n0 = pTarg->first_rec->top;
			status = cdb_sc_keyoflow;
		} else
			status = cdb_sc_rmisalign;
		if (0 != n0)
		{	/* Copy byte by byte; stop right after copying a zero byte that is followed by another zero byte */
			do
			{
				--n0;
				if ((0 == (*c2++ = *c1++)) && (0 == *c1))
					break;
			} while (n0);
		}
		if (0 == n0)
		{	/* All n0 bytes got used up (no double-zero found, or found only at the very last byte) */
			assert(CDB_STAGNATE > t_tries);
			return status;
		}
		assert(c2 < &pTarg->first_rec->base[pTarg->first_rec->top]);	/* make sure we don't exceed allocated bounds */
		*c2 = *c1;	/* copy the second of the two terminating zero bytes */
		DEBUG_ONLY(pTarg->first_rec->end = c2 - pTarg->first_rec->base;)
		if (NULL == pNonStar)
		{	/* Every index record followed during the traversal had size BSTAR_REC_SIZE */
			*((short *)pTarg->last_rec->base) = GVT_CLUE_LAST_REC_MAXKEY;
			DEBUG_ONLY(pTarg->last_rec->end = SIZEOF(short);)
		} else
		{	/* Construct last_rec from the current record of the index block noted in pNonStar : the first "cmpc"
			 * bytes are taken from pKey and the remaining bytes are taken from the record itself.
			 */
			pRec = pNonStar->buffaddr + pNonStar->curr_rec.offset;
			GET_USHORT(n0, &((rec_hdr_ptr_t)pRec)->rsiz);
			c1 = pNonStar->buffaddr;
			if (FALSE == CHKRECLEN(pRec, c1, n0))
			{
				assert(CDB_STAGNATE > t_tries);
				return cdb_sc_rmisalign;
			}
			if (pNonStar->curr_rec.match < ((rec_hdr_ptr_t)pRec)->cmpc)
			{
				assert(CDB_STAGNATE > t_tries);
			 	return cdb_sc_rmisalign;
			}
			if ((n1 = ((rec_hdr_ptr_t)pRec)->cmpc) > (int)(pTarg->last_rec->top))
			{
				assert(CDB_STAGNATE > t_tries);
				return cdb_sc_keyoflow;
			}
			c2 = pTarg->last_rec->base;
			if (0 != n1)
				memcpy(c2, pKey->base, n1);
			c2 = (sm_uc_ptr_t)c2 + n1;

			c1 = pRec + SIZEOF(rec_hdr);
			/* Same bounded copy as done for first_rec above, with the capacity reduced by the n1 prefix bytes */
			if ((int)n0 > (int)(pTarg->last_rec->top) - n1)
			{
				n0 = pTarg->last_rec->top - n1;
				status = cdb_sc_keyoflow;
			} else
				status = cdb_sc_rmisalign;
			if (0 != n0)
			{
				do
				{
					--n0;
					if ((0 == (*c2++ = *c1++)) && (0 == *c1))
						break;
				} while (n0);
			}
			if (0 == n0)
			{
				assert(CDB_STAGNATE > t_tries);
				return status;
			}
			assert(c2 < &pTarg->last_rec->base[pTarg->last_rec->top]); /* make sure we don't exceed allocated bounds */
			*c2 = *c1;
			DEBUG_ONLY(pTarg->last_rec->end = c2 - pTarg->last_rec->base;)
		}
		/* first_rec and last_rec are now set, so save the search key as the new clue */
		COPY_CURRKEY_TO_GVTARGET_CLUE(pTarg, pKey);
	}
	return cdb_sc_normal;
}
ENH: Initial import from sourceforge. This source tree was directly imported from http://sourceforge.net/projects/fis-gtm/files/GT.M-x86-Linux-src/V5.4-002B/ by extracting the file: gtm_V54002B_linux_i686_src.tar.gz 2012-02-05 11:35:58 -05:00			`/****************************************************************`
			`* *`
			`* Copyright 2001, 2011 Fidelity Information Services, Inc *`
			`* *`
			`* This source code contains the intellectual property *`
			`* of its copyright holder(s), and is made available *`
			`* under a license. If you do not know the terms of *`
			`* the license, please stop and do not read further. *`
			`* *`
			`****************************************************************/`

			`#include "mdef.h"`

			`#include "gtm_string.h"`

			`#include "cdb_sc.h"`
			`#include "gdsroot.h"`
			`#include "gdskill.h"`
			`#include "gtm_facility.h"`
			`#include "fileinfo.h"`
			`#include "gdsbt.h"`
			`#include "gdsblk.h"`
			`#include "gdsfhead.h"`
			`#include "gdscc.h"`
			`#include "copy.h"`
			`#include "filestruct.h"`
			`#include "jnl.h"`
			`#include "buddy_list.h" /* needed for tp.h */`
			`#include "hashtab_int4.h" /* needed for tp.h and cws_insert.h */`
			`#include "tp.h"`
			`#include "gvcst_blk_build.h"`
			`#include "t_qread.h"`
			`#include "longset.h" /* needed for cws_insert.h */`
			`#include "hashtab.h" /* needed for cws_insert.h */`
			`#include "cws_insert.h"`
			`#include "gvcst_protos.h" /* for gvcst_search_blk,gvcst_search_tail,gvcst_search prototype */`
			`#include "min_max.h"`

			`GBLREF gd_region *gv_cur_region;`
			`GBLREF sgmnt_addrs *cs_addrs;`
			`GBLREF gv_namehead *gv_target;`
			`GBLREF uint4 dollar_tlevel;`
			`GBLREF sgmnt_data_ptr_t cs_data;`
			`GBLREF unsigned char rdfail_detail;`
			`GBLREF sgm_info *sgm_info_ptr;`
			`GBLREF unsigned int t_tries;`
			`GBLREF srch_blk_status first_tp_srch_status; / overriding value of srch_blk_status given by t_qread in case of TP */`
			`GBLREF trans_num local_tn; /* transaction number for THIS PROCESS */`
			`GBLREF boolean_t tp_restart_syslog; /* for the TP_TRACE_HIST_MOD macro */`
			`GBLREF boolean_t mu_reorg_process;`
			`GBLREF char gvcst_search_clue;`

			`#define SET_GVCST_SEARCH_CLUE(X) gvcst_search_clue = X;`

			`enum cdb_sc gvcst_search(gv_key pKey, / Key to search for */`
			`srch_hist pHist) / History to fill in*/`
			`{`
			`unsigned char nLevl;`
			`enum cdb_sc status;`
			`register int n1;`
			`register uchar_ptr_t c1, c2;`
			`register sm_uc_ptr_t pRec, pBlkBase;`
			`register gv_namehead pTarg; / Local copy of gv_target; hope it gets put into register */`
			`register srch_blk_status *pCurr;`
			`register srch_blk_status *pNonStar;`
			`register srch_hist *pTargHist;`
			`block_id nBlkId;`
			`cache_rec_ptr_t cr;`
			`int cycle;`
			`unsigned short n0, nKeyLen;`
			`trans_num tn;`
			`cw_set_element *cse;`
			`off_chain chain1, chain2;`
			`srch_blk_status tp_srch_status, srch_status, *leaf_blk_hist;`
			`boolean_t already_built, is_mm;`
			`ht_ent_int4 *tabent;`
			`sm_uc_ptr_t buffaddr;`
			`trans_num blkhdrtn;`
			`int hist_size;`

			`pTarg = gv_target;`
			`assert(NULL != pTarg);`
			`assert(pTarg->root);`
			`assert(pKey != &pTarg->clue);`
			`nKeyLen = pKey->end + 1;`

			`assert(!dollar_tlevel \|\| ((NULL != sgm_info_ptr) && (cs_addrs->sgm_info_ptr == sgm_info_ptr)));`
			`SET_GVCST_SEARCH_CLUE(0);`
			`INCR_DB_CSH_COUNTER(cs_addrs, n_gvcst_srches, 1);`
			`pTargHist = (NULL == pHist ? &pTarg->hist : pHist);`
			`/* If FINAL RETRY and TP then we can safely use clues of gv_targets that have been referenced in this`
			`* TP transaction (read_local_tn == local_tn). While that is guaranteed to be true for all updates, it`
			`* does not hold good for READs since we allow a lot more reads to be done inside a transaction compared`
			`* to the # of updates allowed. We allow the same global to be read multiple times inside the same transaction`
			`* using different global buffers for each read. This means that we need to validate any clues from the first`
			`* read before using it for the second read even if it is in the final retry. This validation is done inside`
			`* the below IF block. As for gv_targets which are referenced for the very first time in this TP transaction,`
			`* we have no easy way of determining if their clues are still uptodate (i.e. using the clue will guarantee us`
			`* no restart) and since we are in the final retry, we dont want to take a risk. So dont use the clue in that case.`
			`*`
			`* If FINAL RETRY and Non-TP, we will be dealing with only ONE gv_target so its clue would have been reset as`
			`* part of the penultimate restart so we dont have any of the above issue in the non-tp case. The only exception`
			`* is if we are in gvcst_kill in which case, gvcst_search will be called twice and the clue could be non-zero`
			`* for the second invocation. In this case, the clue is guaranteed to be uptodate since it was set just now`
			`* as part of the first invocation. So no need to do anything about clue in final retry for Non-TP.`
			`*/`
			`if ((0 != pTarg->clue.end) && ((CDB_STAGNATE > t_tries) \|\| !dollar_tlevel \|\| (pTarg->read_local_tn == local_tn)))`
			`{ /* Have non-zero clue. Check if it is usable for the current search key. If so validate clue then and use it. */`
			`/* In t_end, we skipped validating the clue in case of reorg due to the assumption that reorg never`
			`* uses the clue i.e. it nullifies the clue before calling gvcst_search. Assert that here.`
			`*/`
			`assert(!mu_reorg_process);`
			`INCR_DB_CSH_COUNTER(cs_addrs, n_gvcst_srch_clues, 1);`
			`status = cdb_sc_normal; /* clue is usable unless proved otherwise */`
			`if (NULL != pHist)`
			`{ /* Copy the full srch_hist and set loop terminator flag in unused srch_blk_status entry.`
			`* If in TP and if leaf block in history has cse, we are guaranteed that it is built by the`
			`* immediately previous call to "gvcst_search" (called by gvcst_kill which does two calls to`
			`* gvcst_search of which this invocation is the second) so no need to build the block like`
			`* is done for the (NULL == pHist) case below. Assert that and some more.`
			`*/`
			`hist_size = HIST_SIZE(pTarg->hist);`
			`memcpy(pHist, &pTarg->hist, hist_size);`
			`((srch_blk_status )((char )pHist + hist_size))->blk_num = 0;`
			`# ifdef DEBUG`
			`if (dollar_tlevel)`
			`{`
			`leaf_blk_hist = &pHist->h[0];`
			`assert(0 == leaf_blk_hist->level);`
			`chain1 = (off_chain )&leaf_blk_hist->blk_num;`
			`if (chain1.flag == 1)`
			`{`
			`assert((int)chain1.cw_index < sgm_info_ptr->cw_set_depth);`
			`tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);`
			`} else`
			`{`
			`tp_srch_status = leaf_blk_hist->first_tp_srch_status;`
			`ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(tp_srch_status, sgm_info_ptr);`
			`cse = (NULL != tp_srch_status) ? tp_srch_status->cse : NULL;`
			`}`
			`assert((NULL == cse) \|\| cse->done);`
			`}`
			`# endif`
			`} else if (dollar_tlevel)`
			`{ /* First nullify first_tp_srch_status member in gv_target history if out-of-date. This is logically done`
			`* at tp_clean_up time but delayed until the time this gv_target is used next in a transaction. This way`
			`* it saves some CPU cycles. pTarg->read_local_tn tells us whether this is the first usage of this`
			`* gv_target in this TP transaction and if so we need to reset the out-of-date field.`
			`*/`
			`if (pTarg->read_local_tn != local_tn)`
			`{`
			`for (srch_status = &pTarg->hist.h[0]; HIST_TERMINATOR != srch_status->blk_num; srch_status++)`
			`srch_status->first_tp_srch_status = NULL;`
			`}`
			`/* TP & going to use clue. check if clue path contains a leaf block with a corresponding unbuilt`
			`* cse from the previous traversal. If so build it first before gvcst_search_blk/gvcst_search_tail.`
			`*/`
			`tp_srch_status = NULL;`
			`leaf_blk_hist = &pTarg->hist.h[0];`
			`assert(0 == leaf_blk_hist->level);`
			`chain1 = (off_chain )&leaf_blk_hist->blk_num;`
			`if (chain1.flag == 1)`
			`{`
			`if ((int)chain1.cw_index >= sgm_info_ptr->cw_set_depth)`
			`{`
			`assert(sgm_info_ptr->tp_csa == cs_addrs);`
			`assert(FALSE == cs_addrs->now_crit);`
			`return cdb_sc_blknumerr;`
			`}`
			`tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);`
			`} else`
			`{`
			`nBlkId = (block_id)leaf_blk_hist->blk_num;`
			`tp_srch_status = leaf_blk_hist->first_tp_srch_status;`
			`if ((NULL == tp_srch_status)`
			`&& (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use,`
			`(uint4 *)&leaf_blk_hist->blk_num))))`
			`tp_srch_status = tabent->value;`
			`ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(tp_srch_status, sgm_info_ptr);`
			`cse = (NULL != tp_srch_status) ? tp_srch_status->cse : NULL;`
			`}`
			`assert(!cse \|\| !cse->high_tlevel);`
			`if ((NULL == tp_srch_status) \|\| (tp_srch_status->blk_target == leaf_blk_hist->blk_target))`
			`{ /* Either the leaf level block in clue is not already present in the current TP transaction's`
			`* hashtable OR it is already present and the corresponding globals match. If they dont match`
			`* we know for sure the clue is out-of-date (i.e. using it will lead to a transaction restart)`
			`* and hence needs to be discarded.`
			`*/`
			`leaf_blk_hist->first_tp_srch_status = tp_srch_status;`
			`if (NULL != cse)`
			`{`
			`if (!cse->done)`
			`{ /* there's a private copy and it's not up to date */`
			`already_built = (NULL != cse->new_buff);`
			`gvcst_blk_build(cse, cse->new_buff, 0);`
			`/* Validate the block's search history right after building a private copy.`
			`* This is not needed in case gvcst_search is going to reuse the clue's search`
			`* history and return (because tp_hist will do the validation of this block).`
			`* But if gvcst_search decides to do a fresh traversal (because the clue does not`
			`* cover the path of the current input key etc.) the block build that happened now`
			`* will not get validated in tp_hist since it will instead be given the current`
			`* key's search history path (a totally new path) for validation. Since a private`
			`* copy of the block has been built, tp_tend would also skip validating this block`
			`* so it is necessary that we validate the block right here. Since it is tricky to`
			`* accurately differentiate between the two cases, we do the validation`
			`* unconditionally here (besides it is only a few if checks done per block build`
			`* so it is considered okay performance-wise).`
			`*/`
			`if (!already_built && !chain1.flag)`
			`{ /* is_mm is calculated twice, but this is done so as to speed up the`
			`* most-frequent path, i.e. when there is a clue and either no cse or`
			`* cse->done is TRUE`
			`*/`
			`is_mm = (dba_mm == cs_data->acc_meth);`
			`buffaddr = tp_srch_status->buffaddr;`
			`cr = tp_srch_status->cr;`
			`assert(tp_srch_status && (is_mm \|\| cr) && buffaddr);`
			`blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn;`
			`if (TP_IS_CDB_SC_BLKMOD3(cr, tp_srch_status, blkhdrtn))`
			`{`
			`assert(CDB_STAGNATE > t_tries);`
			`TP_TRACE_HIST_MOD(leaf_blk_hist->blk_num, gv_target,`
			`tp_blkmod_gvcst_srch, cs_data, tp_srch_status->tn,`
			`blkhdrtn, ((blk_hdr_ptr_t)buffaddr)->levl);`
			`return cdb_sc_blkmod;`
			`}`
			`if (!is_mm && ((tp_srch_status->cycle != cr->cycle)`
			`\|\| (tp_srch_status->blk_num != cr->blk)))`
			`{`
			`assert(CDB_STAGNATE > t_tries);`
			`return cdb_sc_lostcr;`
			`}`
			`}`
			`cse->done = TRUE;`
			`leaf_blk_hist->cr = 0;`
			`leaf_blk_hist->cycle = CYCLE_PVT_COPY;`
			`leaf_blk_hist->buffaddr = cse->new_buff;`
			`} else`
			`{ /* Keep leaf_blk_hist->buffaddr and cse->new_buff in sync. Dont know how they`
			`* cannot be the same but it seems possible if the gvcst_blk_build happened as`
			`* part of a t_qread call (which does not have enough information to update the`
			`* search history buffer address) without going through gvcst_search. Since the`
			`* consequences of these two not being in sync are database damage, we fix them`
			`* in pro just in case they are different.`
			`*/`
			`assert(leaf_blk_hist->buffaddr == cse->new_buff);`
			`leaf_blk_hist->buffaddr = cse->new_buff;`
			`}`
			`}`
			`} else`
			`status = cdb_sc_lostcr; /* two different gv_targets point to same block; discard out-of-date clue */`
			`}`
			`/* Validate EVERY level in the clue before using it for ALL retries. This way we avoid unnecessary restarts.`
			`* This is NECESSARY for the final retry (e.g. in a TP transaction that does LOTS of reads of different globals,`
			`* it is possible that one global's clue is invalidated by a later read of another global) and is DESIRABLE (for`
			`* performance reasons) in the other tries. The cost of a restart (particularly in TP) is so high that it is`
			`* considered okay to take the hit of validating the entire clue before using it even if it is not the final retry.`
			`*/`
			`DEBUG_ONLY(is_mm = (dba_mm == cs_data->acc_meth);)`
			`if (cdb_sc_normal == status)`
			`{`
			`for (srch_status = &pTargHist->h[0]; HIST_TERMINATOR != srch_status->blk_num; srch_status++)`
			`{`
			`assert(srch_status->level == srch_status - &pTargHist->h[0]);`
			`assert(is_mm \|\| (NULL == srch_status->cr) \|\| (NULL != srch_status->buffaddr));`
			`cr = srch_status->cr;`
			`assert(!is_mm \|\| (NULL == cr));`
			`if (TP_IS_CDB_SC_BLKMOD(cr, srch_status))`
			`{`
			`status = cdb_sc_blkmod;`
			`break;`
			`}`
			`if (NULL != cr)`
			`{`
			`assert(NULL != srch_status->buffaddr);`
			`if (srch_status->cycle != cr->cycle)`
			`{`
			`status = cdb_sc_lostcr;`
			`break;`
			`}`
			`if (CDB_STAGNATE <= t_tries \|\| mu_reorg_process)`
			`CWS_INSERT(cr->blk);`
			`cr->refer = TRUE;`
			`}`
			`}`
			`}`
			`if (cdb_sc_normal == status)`
			`{ /* Now that we are ready to use the clue, put more-likely case earlier in the if then else sequence.`
			`* For sequential reads of globals, we expect the tail of the clue to be much more used than the head.`
			`* For random reads, both are equally probable and hence it doesn't matter.`
			`* The case (0 == n1) is not expected a lot (relatively) since the application may be able to optimize`
			`* a number of reads of the same key into one read by using a local-variable to store the value.`
			`*/`
			`if (0 < (n1 = memcmp(pKey->base, pTarg->clue.base, nKeyLen)))`
			`{`
			`if (memcmp(pKey->base, pTarg->last_rec->base, nKeyLen) <= 0)`
			`{`
			`SET_GVCST_SEARCH_CLUE(1);`
			`status = gvcst_search_tail(pKey, pTargHist->h, &pTarg->clue);`
			`if (NULL == pHist)`
			`{ /* Implies the search history is being filled in pTarg->hist so we can`
			`* safely update pTarg->clue to reflect the new search key. It is important`
			`* that this clue update be done AFTER the gvcst_search_tail invocation`
			`* (as that needs to pass the previous clue key).`
			`*/`
			`COPY_CURRKEY_TO_GVTARGET_CLUE(pTarg, pKey);`
			`}`
			`INCR_DB_CSH_COUNTER(cs_addrs, n_clue_used_tail, 1);`
			`return status;`
			`}`
			`} else if (0 > n1)`
			`{`
			`if (memcmp(pKey->base, pTarg->first_rec->base, nKeyLen) >= 0)`
			`{`
			`SET_GVCST_SEARCH_CLUE(3);`
			`status = gvcst_search_blk(pKey, pTargHist->h);`
			`if (NULL == pHist)`
			`{ /* Implies the search history is being filled in pTarg->hist so we can`
			`* safely update pTarg->clue to reflect the new search key. It does not`
			`* matter if we update the clue BEFORE or AFTER the gvcst_search_blk`
			`* invocation but for consistency with the gvcst_search_tail invocation`
			`* we keep it AFTER.`
			`*/`
			`COPY_CURRKEY_TO_GVTARGET_CLUE(pTarg, pKey);`
			`}`
			`INCR_DB_CSH_COUNTER(cs_addrs, n_clue_used_head, 1);`
			`return status;`
			`}`
			`} else`
			`{`
			`SET_GVCST_SEARCH_CLUE(2);`
			`INCR_DB_CSH_COUNTER(cs_addrs, n_clue_used_same, 1);`
			`return cdb_sc_normal;`
			`}`
			`}`
			`}`
			`nBlkId = pTarg->root;`
			`tn = cs_addrs->ti->curr_tn;`
			`if (NULL == (pBlkBase = t_qread(nBlkId, (sm_int_ptr_t)&cycle, &cr)))`
			`return (enum cdb_sc)rdfail_detail;`
			`nLevl = ((blk_hdr_ptr_t)pBlkBase)->levl;`
			`if (MAX_BT_DEPTH < (int)nLevl)`
			`{`
			`assert(CDB_STAGNATE > t_tries);`
			`return cdb_sc_maxlvl;`
			`}`
			`if (0 == (int)nLevl)`
			`{`
			`assert(CDB_STAGNATE > t_tries);`
			`return cdb_sc_badlvl;`
			`}`
			`is_mm = (dba_mm == cs_data->acc_meth);`
			`pTargHist->depth = (int)nLevl;`
			`pCurr = &pTargHist->h[nLevl];`
			`(pCurr + 1)->blk_num = 0;`
			`pCurr->tn = tn;`
			`pCurr->first_tp_srch_status = first_tp_srch_status;`
			`pCurr->cycle = cycle;`
			`pCurr->cr = cr;`
			`pNonStar = NULL;`
			`for (;;)`
			`{`
			`assert(pCurr->level == nLevl);`
			`pCurr->cse = NULL;`
			`pCurr->blk_num = nBlkId;`
			`pCurr->buffaddr = pBlkBase;`
			`if (cdb_sc_normal != (status = gvcst_search_blk(pKey, pCurr)))`
			`return status;`
			`if (0 == nLevl)`
			`break;`
			`if ((n0 = pCurr->curr_rec.offset) >= ((blk_hdr_ptr_t)pBlkBase)->bsiz)`
			`n0 = pCurr->prev_rec.offset;`
			`pRec = pBlkBase + n0;`
			`GET_USHORT(n0, &((rec_hdr_ptr_t)pRec)->rsiz);`
			`if (FALSE == CHKRECLEN(pRec, pBlkBase, n0))`
			`{`
			`assert(CDB_STAGNATE > t_tries);`
			`return cdb_sc_rmisalign;`
			`}`
			`GET_LONG(nBlkId, (pRec + n0 - SIZEOF(block_id)));`
			`if (is_mm)`
			`{`
			`PUT_LONG(&chain2, nBlkId);`
			`if ((0 == chain2.flag) && (nBlkId > cs_addrs->total_blks))`
			`{ /* private copy should be taken care of by .flag */`
			`if (cs_addrs->total_blks < cs_addrs->ti->total_blks)`
			`return cdb_sc_helpedout;`
			`else`
			`return cdb_sc_blknumerr;`
			`}`
			`}`
			`if (BSTAR_REC_SIZE != n0)`
			`pNonStar = pCurr;`
			`pCurr--;`
			`pCurr->tn = cs_addrs->ti->curr_tn;`
			`if (NULL == (pBlkBase = t_qread(nBlkId, (sm_int_ptr_t)&pCurr->cycle, &pCurr->cr)))`
			`return (enum cdb_sc)rdfail_detail;`
			`pCurr->first_tp_srch_status = first_tp_srch_status;`
			`if (((blk_hdr_ptr_t)pBlkBase)->levl != --nLevl)`
			`{`
			`assert(CDB_STAGNATE > t_tries);`
			`return cdb_sc_badlvl;`
			`}`
			`}`
			`if (NULL == pHist)`
			`{`
			`if ((pCurr->curr_rec.offset < SIZEOF(blk_hdr)) \|\|`
			`((pCurr->curr_rec.offset == SIZEOF(blk_hdr)) && (pCurr->curr_rec.match < nKeyLen)))`
			`{ /* Clue less than first rec, invalidate */`
			`pTarg->clue.end = 0;`
			`return cdb_sc_normal;`
			`}`
			`pRec = pBlkBase + SIZEOF(blk_hdr);`
			`GET_USHORT(n0, &((rec_hdr_ptr_t)pRec)->rsiz);`
			`if (FALSE == CHKRECLEN(pRec, pBlkBase, n0))`
			`{`
			`assert(CDB_STAGNATE > t_tries);`
			`return cdb_sc_rmisalign;`
			`}`
			`c1 = pRec + SIZEOF(rec_hdr);`
			`c2 = pTarg->first_rec->base;`
			`if (n0 > (pTarg->first_rec->top))`
			`{`
			`n0 = pTarg->first_rec->top;`
			`status = cdb_sc_keyoflow;`
			`} else`
			`status = cdb_sc_rmisalign;`
			`if (0 != n0)`
			`{`
			`do`
			`{`
			`--n0;`
			`if ((0 == (c2++ = c1++)) && (0 == *c1))`
			`break;`
			`} while (n0);`
			`}`
			`if (0 == n0)`
			`{`
			`assert(CDB_STAGNATE > t_tries);`
			`return status;`
			`}`
			`assert(c2 < &pTarg->first_rec->base[pTarg->first_rec->top]); /* make sure we don't exceed allocated bounds */`
			`c2 = c1;`
			`DEBUG_ONLY(pTarg->first_rec->end = c2 - pTarg->first_rec->base;)`
			`if (NULL == pNonStar)`
			`{`
			`((short )pTarg->last_rec->base) = GVT_CLUE_LAST_REC_MAXKEY;`
			`DEBUG_ONLY(pTarg->last_rec->end = SIZEOF(short);)`
			`} else`
			`{`
			`pRec = pNonStar->buffaddr + pNonStar->curr_rec.offset;`
			`GET_USHORT(n0, &((rec_hdr_ptr_t)pRec)->rsiz);`
			`c1 = pNonStar->buffaddr;`
			`if (FALSE == CHKRECLEN(pRec, c1, n0))`
			`{`
			`assert(CDB_STAGNATE > t_tries);`
			`return cdb_sc_rmisalign;`
			`}`
			`if (pNonStar->curr_rec.match < ((rec_hdr_ptr_t)pRec)->cmpc)`
			`{`
			`assert(CDB_STAGNATE > t_tries);`
			`return cdb_sc_rmisalign;`
			`}`
			`if ((n1 = ((rec_hdr_ptr_t)pRec)->cmpc) > (int)(pTarg->last_rec->top))`
			`{`
			`assert(CDB_STAGNATE > t_tries);`
			`return cdb_sc_keyoflow;`
			`}`
			`c2 = pTarg->last_rec->base;`
			`if (0 != n1)`
			`memcpy(c2, pKey->base, n1);`
			`c2 = (sm_uc_ptr_t)c2 + n1;`

			`c1 = pRec + SIZEOF(rec_hdr);`
			`if ((int)n0 > (int)(pTarg->last_rec->top) - n1)`
			`{`
			`n0 = pTarg->last_rec->top - n1;`
			`status = cdb_sc_keyoflow;`
			`} else`
			`status = cdb_sc_rmisalign;`
			`if (0 != n0)`
			`{`
			`do`
			`{`
			`--n0;`
			`if ((0 == (c2++ = c1++)) && (0 == *c1))`
			`break;`
			`} while (n0);`
			`}`
			`if (0 == n0)`
			`{`
			`assert(CDB_STAGNATE > t_tries);`
			`return status;`
			`}`
			`assert(c2 < &pTarg->last_rec->base[pTarg->last_rec->top]); /* make sure we don't exceed allocated bounds */`
			`c2 = c1;`
			`DEBUG_ONLY(pTarg->last_rec->end = c2 - pTarg->last_rec->base;)`
			`}`
			`COPY_CURRKEY_TO_GVTARGET_CLUE(pTarg, pKey);`
			`}`
			`return cdb_sc_normal;`
			`}`