fis-gtm/sr_port/gvcst_search.c

504 lines
19 KiB
C

/****************************************************************
* *
* Copyright 2001, 2012 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "gtm_string.h"
#include "cdb_sc.h"
#include "gdsroot.h"
#include "gdskill.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "gdscc.h"
#include "copy.h"
#include "filestruct.h"
#include "jnl.h"
#include "buddy_list.h" /* needed for tp.h */
#include "hashtab_int4.h" /* needed for tp.h and cws_insert.h */
#include "tp.h"
#include "gvcst_blk_build.h"
#include "t_qread.h"
#include "longset.h" /* needed for cws_insert.h */
#include "hashtab.h" /* needed for cws_insert.h */
#include "cws_insert.h"
#include "gvcst_protos.h" /* for gvcst_search_blk,gvcst_search_tail,gvcst_search prototype */
#include "min_max.h"
/* Globals defined in other modules that gvcst_search (below) reads or updates. */
GBLREF gd_region *gv_cur_region;
GBLREF sgmnt_addrs *cs_addrs; /* source of ti->curr_tn and total_blks; also handed to INCR_DB_CSH_COUNTER */
GBLREF gv_namehead *gv_target; /* the global being searched: its root, hist, clue, first_rec and last_rec are used */
GBLREF uint4 dollar_tlevel; /* non-zero selects the TP-specific code paths */
GBLREF sgmnt_data_ptr_t cs_data; /* acc_meth is compared against dba_mm */
GBLREF unsigned char rdfail_detail; /* returned (cast to enum cdb_sc) when t_qread returns NULL */
GBLREF sgm_info *sgm_info_ptr; /* TP only: supplies first_cw_set, cw_set_depth and the blks_in_use hashtable */
GBLREF unsigned int t_tries; /* compared against CDB_STAGNATE to detect the final retry */
GBLREF srch_blk_status *first_tp_srch_status; /* overriding value of srch_blk_status given by t_qread in case of TP */
GBLREF trans_num local_tn; /* transaction number for THIS PROCESS */
GBLREF boolean_t tp_restart_syslog; /* for the TP_TRACE_HIST_MOD macro */
GBLREF boolean_t mu_reorg_process; /* checked in the clue-usage assert and when deciding whether to CWS_INSERT */
GBLREF char gvcst_search_clue; /* written via SET_GVCST_SEARCH_CLUE: 0 = no clue used, 1 = tail, 2 = same key, 3 = head */
/* Record in gvcst_search_clue which search path gvcst_search took.
 * Defined as a single parenthesized expression (argument parenthesized, no trailing semicolon)
 * so that "SET_GVCST_SEARCH_CLUE(n);" is exactly one statement and stays correct inside an
 * unbraced if/else.
 */
#define SET_GVCST_SEARCH_CLUE(X) (gvcst_search_clue = (X))
/* gvcst_search
 *
 * Searches the tree of the current global (gv_target) for pKey and records, for every level visited,
 * a srch_blk_status entry in a search history.
 *
 * Parameters:
 *	pKey	- key to look for; must not be gv_target's own clue (asserted).
 *	pHist	- history to fill in. If NULL, gv_target->hist is filled in instead and gv_target's
 *		  clue/first_rec/last_rec are refreshed for use by later calls.
 *
 * Outline:
 *	1) If gv_target holds a non-zero clue that is allowed to be used, validate every level of the
 *	   clue's history. If it validates and pKey lies within the range bounded by first_rec and
 *	   last_rec, only the leaf is searched (gvcst_search_tail or gvcst_search_blk), or nothing is
 *	   searched at all if pKey equals the clue.
 *	2) Otherwise read the root block with t_qread and descend one level at a time, calling
 *	   gvcst_search_blk at each level and following the block number stored at the end of the
 *	   selected record.
 *	3) After a full descent with pHist == NULL, rebuild first_rec/last_rec and copy pKey into the clue.
 *
 * Returns cdb_sc_normal on success. Other values returned directly by this function are
 * cdb_sc_blknumerr, cdb_sc_blkmod, cdb_sc_lostcr, cdb_sc_maxlvl, cdb_sc_badlvl, cdb_sc_rmisalign,
 * cdb_sc_keyoflow, cdb_sc_helpedout, the rdfail_detail value when t_qread returns NULL, and any
 * non-normal status handed back by gvcst_search_blk/gvcst_search_tail.
 *
 * Side effect: gvcst_search_clue is set to 0 on entry and to 1 (tail), 2 (same key) or 3 (head) when
 * the clue is used.
 */
enum cdb_sc gvcst_search(gv_key *pKey, /* Key to search for */
srch_hist *pHist) /* History to fill in*/
{
unsigned char nLevl;
enum cdb_sc status;
register int n1;
register uchar_ptr_t c1, c2;
register sm_uc_ptr_t pRec, pBlkBase;
register gv_namehead *pTarg; /* Local copy of gv_target; hope it gets put into register */
register srch_blk_status *pCurr;
register srch_blk_status *pNonStar;
register srch_hist *pTargHist;
block_id nBlkId;
cache_rec_ptr_t cr;
int cycle;
unsigned short n0, nKeyLen;
trans_num tn;
cw_set_element *cse;
off_chain chain1, chain2;
srch_blk_status *tp_srch_status, *srch_status, *leaf_blk_hist;
boolean_t already_built, is_mm;
ht_ent_int4 *tabent;
sm_uc_ptr_t buffaddr;
trans_num blkhdrtn;
int hist_size;
pTarg = gv_target;
assert(NULL != pTarg);
assert(pTarg->root);
assert(pKey != &pTarg->clue);
/* Number of key bytes used in every memcmp below: one more than pKey->end
 * (presumably so the byte at offset "end" takes part in the comparison - confirm against gv_key layout).
 */
nKeyLen = pKey->end + 1;
assert(!dollar_tlevel || ((NULL != sgm_info_ptr) && (cs_addrs->sgm_info_ptr == sgm_info_ptr)));
SET_GVCST_SEARCH_CLUE(0);
INCR_DB_CSH_COUNTER(cs_addrs, n_gvcst_srches, 1);
/* Destination history: the caller's if one was supplied, else the one embedded in gv_target */
pTargHist = (NULL == pHist ? &pTarg->hist : pHist);
/* If FINAL RETRY and TP then we can safely use clues of gv_targets that have been referenced in this
* TP transaction (read_local_tn == local_tn). While that is guaranteed to be true for all updates, it
* does not hold good for READs since we allow a lot more reads to be done inside a transaction compared
* to the # of updates allowed. We allow the same global to be read multiple times inside the same transaction
* using different global buffers for each read. This means that we need to validate any clues from the first
* read before using it for the second read even if it is in the final retry. This validation is done inside
* the below IF block. As for gv_targets which are referenced for the very first time in this TP transaction,
* we have no easy way of determining if their clues are still uptodate (i.e. using the clue will guarantee us
* no restart) and since we are in the final retry, we dont want to take a risk. So dont use the clue in that case.
*
* If FINAL RETRY and Non-TP, we will be dealing with only ONE gv_target so its clue would have been reset as
* part of the penultimate restart so we dont have any of the above issue in the non-tp case. The only exception
* is if we are in gvcst_kill in which case, gvcst_search will be called twice and the clue could be non-zero
* for the second invocation. In this case, the clue is guaranteed to be uptodate since it was set just now
* as part of the first invocation. So no need to do anything about clue in final retry for Non-TP.
*/
if ((0 != pTarg->clue.end) && ((CDB_STAGNATE > t_tries) || !dollar_tlevel || (pTarg->read_local_tn == local_tn)))
{ /* Have non-zero clue. Check if it is usable for the current search key. If so validate clue then and use it. */
/* In t_end, we skipped validating the clue in case of reorg due to the assumption that reorg never uses the clue
* i.e. it nullifies the clue before calling gvcst_search. However, it doesn't reset the clue for directory tree
* and so continue using the clue if called for root search. Assert accordingly.
*/
assert(!mu_reorg_process UNIX_ONLY(|| (pTarg->gd_csa->dir_tree == pTarg)));
INCR_DB_CSH_COUNTER(cs_addrs, n_gvcst_srch_clues, 1);
status = cdb_sc_normal; /* clue is usable unless proved otherwise */
if (NULL != pHist)
{ /* Copy the full srch_hist and set loop terminator flag in unused srch_blk_status entry.
* If in TP and if leaf block in history has cse, we are guaranteed that it is built by the
* immediately previous call to "gvcst_search" (called by gvcst_kill which does two calls to
* gvcst_search of which this invocation is the second) so no need to build the block like
* is done for the (NULL == pHist) case below. Assert that and some more.
*/
hist_size = HIST_SIZE(pTarg->hist);
memcpy(pHist, &pTarg->hist, hist_size);
/* The entry that starts right after the copied bytes gets blk_num 0, which ends the validation loop below */
((srch_blk_status *)((char *)pHist + hist_size))->blk_num = 0;
# ifdef DEBUG
if (dollar_tlevel)
{
/* Debug-only: find the cse (if any) for the leaf block and check that it has already been built */
leaf_blk_hist = &pHist->h[0];
assert(0 == leaf_blk_hist->level);
chain1 = *(off_chain *)&leaf_blk_hist->blk_num;
if (chain1.flag == 1)
{
assert((int)chain1.cw_index < sgm_info_ptr->cw_set_depth);
tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
} else
{
tp_srch_status = leaf_blk_hist->first_tp_srch_status;
ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(tp_srch_status, sgm_info_ptr);
cse = (NULL != tp_srch_status) ? tp_srch_status->cse : NULL;
}
assert((NULL == cse) || cse->done);
}
# endif
} else if (dollar_tlevel)
{ /* First nullify first_tp_srch_status member in gv_target history if out-of-date. This is logically done
* at tp_clean_up time but delayed until the time this gv_target is used next in a transaction. This way
* it saves some CPU cycles. pTarg->read_local_tn tells us whether this is the first usage of this
* gv_target in this TP transaction and if so we need to reset the out-of-date field.
*/
if (pTarg->read_local_tn != local_tn)
{
for (srch_status = &pTarg->hist.h[0]; HIST_TERMINATOR != srch_status->blk_num; srch_status++)
srch_status->first_tp_srch_status = NULL;
}
/* TP & going to use clue. check if clue path contains a leaf block with a corresponding unbuilt
* cse from the previous traversal. If so build it first before gvcst_search_blk/gvcst_search_tail.
*/
tp_srch_status = NULL;
leaf_blk_hist = &pTarg->hist.h[0];
assert(0 == leaf_blk_hist->level);
/* blk_num is reinterpreted as an off_chain; flag == 1 means it carries a cw_index into the
 * cw-set rather than a block number to look up.
 */
chain1 = *(off_chain *)&leaf_blk_hist->blk_num;
if (chain1.flag == 1)
{
if ((int)chain1.cw_index >= sgm_info_ptr->cw_set_depth)
{
assert(sgm_info_ptr->tp_csa == cs_addrs);
assert(FALSE == cs_addrs->now_crit);
return cdb_sc_blknumerr;
}
tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
} else
{
nBlkId = (block_id)leaf_blk_hist->blk_num;
tp_srch_status = leaf_blk_hist->first_tp_srch_status;
/* No cached pointer: fall back to looking the block up in this transaction's blks_in_use hashtable */
if ((NULL == tp_srch_status)
&& (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use,
(uint4 *)&leaf_blk_hist->blk_num))))
tp_srch_status = tabent->value;
ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(tp_srch_status, sgm_info_ptr);
cse = (NULL != tp_srch_status) ? tp_srch_status->cse : NULL;
}
assert(!cse || !cse->high_tlevel);
if ((NULL == tp_srch_status) || (tp_srch_status->blk_target == leaf_blk_hist->blk_target))
{ /* Either the leaf level block in clue is not already present in the current TP transaction's
* hashtable OR it is already present and the corresponding globals match. If they dont match
* we know for sure the clue is out-of-date (i.e. using it will lead to a transaction restart)
* and hence needs to be discarded.
*/
leaf_blk_hist->first_tp_srch_status = tp_srch_status;
if (NULL != cse)
{
if (!cse->done)
{ /* there's a private copy and it's not up to date */
/* Sample new_buff BEFORE the build; it is used below to decide whether validation is needed */
already_built = (NULL != cse->new_buff);
gvcst_blk_build(cse, cse->new_buff, 0);
/* Validate the block's search history right after building a private copy.
* This is not needed in case gvcst_search is going to reuse the clue's search
* history and return (because tp_hist will do the validation of this block).
* But if gvcst_search decides to do a fresh traversal (because the clue does not
* cover the path of the current input key etc.) the block build that happened now
* will not get validated in tp_hist since it will instead be given the current
* key's search history path (a totally new path) for validation. Since a private
* copy of the block has been built, tp_tend would also skip validating this block
* so it is necessary that we validate the block right here. Since it is tricky to
* accurately differentiate between the two cases, we do the validation
* unconditionally here (besides it is only a few if checks done per block build
* so it is considered okay performance-wise).
*/
if (!already_built && !chain1.flag)
{ /* is_mm is calculated twice, but this is done so as to speed up the
* most-frequent path, i.e. when there is a clue and either no cse or
* cse->done is TRUE
*/
is_mm = (dba_mm == cs_data->acc_meth);
/* tp_srch_status is non-NULL here: with chain1.flag == 0, a non-NULL cse can only have
 * come from tp_srch_status->cse above.
 */
buffaddr = tp_srch_status->buffaddr;
cr = tp_srch_status->cr;
assert(tp_srch_status && (is_mm || cr) && buffaddr);
blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn;
if (TP_IS_CDB_SC_BLKMOD3(cr, tp_srch_status, blkhdrtn))
{
assert(CDB_STAGNATE > t_tries);
TP_TRACE_HIST_MOD(leaf_blk_hist->blk_num, gv_target,
tp_blkmod_gvcst_srch, cs_data, tp_srch_status->tn,
blkhdrtn, ((blk_hdr_ptr_t)buffaddr)->levl);
return cdb_sc_blkmod;
}
/* Not MM: the cache record must still have the same cycle and block number as when it was recorded */
if (!is_mm && ((tp_srch_status->cycle != cr->cycle)
|| (tp_srch_status->blk_num != cr->blk)))
{
assert(CDB_STAGNATE > t_tries);
return cdb_sc_lostcr;
}
}
/* From here on the leaf history entry refers to the private copy: no cache record, buffer is cse->new_buff */
cse->done = TRUE;
leaf_blk_hist->cr = 0;
leaf_blk_hist->cycle = CYCLE_PVT_COPY;
leaf_blk_hist->buffaddr = cse->new_buff;
} else
{ /* Keep leaf_blk_hist->buffaddr and cse->new_buff in sync. Dont know how they
* cannot be the same but it seems possible if the gvcst_blk_build happened as
* part of a t_qread call (which does not have enough information to update the
* search history buffer address) without going through gvcst_search. Since the
* consequences of these two not being in sync are database damage, we fix them
* in pro just in case they are different.
*/
assert(leaf_blk_hist->buffaddr == cse->new_buff);
leaf_blk_hist->buffaddr = cse->new_buff;
}
}
} else
status = cdb_sc_lostcr; /* two different gv_targets point to same block; discard out-of-date clue */
}
/* Validate EVERY level in the clue before using it for ALL retries. This way we avoid unnecessary restarts.
* This is NECESSARY for the final retry (e.g. in a TP transaction that does LOTS of reads of different globals,
* it is possible that one global's clue is invalidated by a later read of another global) and is DESIRABLE (for
* performance reasons) in the other tries. The cost of a restart (particularly in TP) is very high that it is
* considered okay to take the hit of validating the entire clue before using it even if it is not the final retry.
*/
/* is_mm is set here only in debug builds because, within this validation loop, only asserts read it */
DEBUG_ONLY(is_mm = (dba_mm == cs_data->acc_meth);)
if (cdb_sc_normal == status)
{
for (srch_status = &pTargHist->h[0]; HIST_TERMINATOR != srch_status->blk_num; srch_status++)
{
assert(srch_status->level == srch_status - &pTargHist->h[0]);
assert(is_mm || (NULL == srch_status->cr) || (NULL != srch_status->buffaddr));
cr = srch_status->cr;
assert(!is_mm || (NULL == cr));
if (TP_IS_CDB_SC_BLKMOD(cr, srch_status))
{
status = cdb_sc_blkmod;
break;
}
if (NULL != cr)
{
assert(NULL != srch_status->buffaddr);
/* A changed cycle means the cache record no longer matches what the history recorded */
if (srch_status->cycle != cr->cycle)
{
status = cdb_sc_lostcr;
break;
}
if (CDB_STAGNATE <= t_tries || mu_reorg_process)
CWS_INSERT(cr->blk);
cr->refer = TRUE;
}
}
}
/* A non-normal status at this point is not returned: the clue is simply not used and control
 * falls through to the full traversal from the root further down.
 */
if (cdb_sc_normal == status)
{ /* Now that we are ready to use the clue, put more-likely case earlier in the if then else sequence.
* For sequential reads of globals, we expect the tail of the clue to be much more used than the head.
* For random reads, both are equally probable and hence it doesn't matter.
* The case (0 == n1) is not expected a lot (relatively) since the application may be able to optimize
* a number of reads of the same key into one read by using a local-variable to store the value.
*/
if (0 < (n1 = memcmp(pKey->base, pTarg->clue.base, nKeyLen)))
{
/* Key sorts after the clue: the clue is usable only if the key does not sort after last_rec */
if (memcmp(pKey->base, pTarg->last_rec->base, nKeyLen) <= 0)
{
SET_GVCST_SEARCH_CLUE(1);
status = gvcst_search_tail(pKey, pTargHist->h, &pTarg->clue);
if (NULL == pHist)
{ /* Implies the search history is being filled in pTarg->hist so we can
* safely update pTarg->clue to reflect the new search key. It is important
* that this clue update be done AFTER the gvcst_search_tail invocation
* (as that needs to pass the previous clue key).
*/
COPY_CURRKEY_TO_GVTARGET_CLUE(pTarg, pKey);
}
INCR_DB_CSH_COUNTER(cs_addrs, n_clue_used_tail, 1);
return status;
}
} else if (0 > n1)
{
/* Key sorts before the clue: the clue is usable only if the key does not sort before first_rec */
if (memcmp(pKey->base, pTarg->first_rec->base, nKeyLen) >= 0)
{
SET_GVCST_SEARCH_CLUE(3);
status = gvcst_search_blk(pKey, pTargHist->h);
if (NULL == pHist)
{ /* Implies the search history is being filled in pTarg->hist so we can
* safely update pTarg->clue to reflect the new search key. It does not
* matter if we update the clue BEFORE or AFTER the gvcst_search_blk
* invocation but for consistency with the gvcst_search_tail invocation
* we keep it AFTER.
*/
COPY_CURRKEY_TO_GVTARGET_CLUE(pTarg, pKey);
}
INCR_DB_CSH_COUNTER(cs_addrs, n_clue_used_head, 1);
return status;
}
} else
{
/* Key compares equal to the clue over nKeyLen bytes: the validated history is returned as is */
SET_GVCST_SEARCH_CLUE(2);
INCR_DB_CSH_COUNTER(cs_addrs, n_clue_used_same, 1);
return cdb_sc_normal;
}
}
}
/* Full traversal: reached when there is no usable clue, the clue failed validation, or the key
 * falls outside the range covered by the clue.
 */
nBlkId = pTarg->root;
tn = cs_addrs->ti->curr_tn; /* sampled before the root block is read */
if (NULL == (pBlkBase = t_qread(nBlkId, (sm_int_ptr_t)&cycle, &cr)))
return (enum cdb_sc)rdfail_detail;
nLevl = ((blk_hdr_ptr_t)pBlkBase)->levl;
if (MAX_BT_DEPTH < (int)nLevl)
{
assert(CDB_STAGNATE > t_tries);
return cdb_sc_maxlvl;
}
/* A root block claiming level 0 is rejected */
if (0 == (int)nLevl)
{
assert(CDB_STAGNATE > t_tries);
return cdb_sc_badlvl;
}
is_mm = (dba_mm == cs_data->acc_meth);
/* The history array is indexed by level: the root goes in h[nLevl] and the entry just above it
 * gets blk_num 0 to mark the end of the history.
 */
pTargHist->depth = (int)nLevl;
pCurr = &pTargHist->h[nLevl];
(pCurr + 1)->blk_num = 0;
pCurr->tn = tn;
pCurr->first_tp_srch_status = first_tp_srch_status;
pCurr->cycle = cycle;
pCurr->cr = cr;
pNonStar = NULL;
/* Descend one level per iteration until the level 0 block has been searched */
for (;;)
{
assert(pCurr->level == nLevl);
pCurr->cse = NULL;
pCurr->blk_num = nBlkId;
pCurr->buffaddr = pBlkBase;
if (cdb_sc_normal != (status = gvcst_search_blk(pKey, pCurr)))
return status;
if (0 == nLevl)
break;
/* Choose the record to descend through: curr_rec, unless its offset is at or beyond the
 * block's bsiz, in which case prev_rec is used.
 */
if ((n0 = pCurr->curr_rec.offset) >= ((blk_hdr_ptr_t)pBlkBase)->bsiz)
n0 = pCurr->prev_rec.offset;
pRec = pBlkBase + n0;
/* n0 is reused: from here on it holds the record size (rsiz) */
GET_USHORT(n0, &((rec_hdr_ptr_t)pRec)->rsiz);
if (FALSE == CHKRECLEN(pRec, pBlkBase, n0))
{
assert(CDB_STAGNATE > t_tries);
return cdb_sc_rmisalign;
}
/* The child block number is read from the last SIZEOF(block_id) bytes of the record */
GET_LONG(nBlkId, (pRec + n0 - SIZEOF(block_id)));
if (is_mm)
{
PUT_LONG(&chain2, nBlkId);
if ((0 == chain2.flag) && (nBlkId > cs_addrs->total_blks))
{ /* private copy should be taken care of by .flag */
/* Process-local total_blks lagging behind ti->total_blks yields cdb_sc_helpedout instead of cdb_sc_blknumerr */
if (cs_addrs->total_blks < cs_addrs->ti->total_blks)
return cdb_sc_helpedout;
else
return cdb_sc_blknumerr;
}
}
/* Remember the lowest level so far at which the record followed was not of star-record size;
 * it is used after the loop to build last_rec.
 */
if (BSTAR_REC_SIZE != n0)
pNonStar = pCurr;
pCurr--;
pCurr->tn = cs_addrs->ti->curr_tn;
if (NULL == (pBlkBase = t_qread(nBlkId, (sm_int_ptr_t)&pCurr->cycle, &pCurr->cr)))
return (enum cdb_sc)rdfail_detail;
pCurr->first_tp_srch_status = first_tp_srch_status;
/* The child must be exactly one level below its parent */
if (((blk_hdr_ptr_t)pBlkBase)->levl != --nLevl)
{
assert(CDB_STAGNATE > t_tries);
return cdb_sc_badlvl;
}
}
/* Only when the history went into gv_target itself are the clue and its bounds refreshed */
if (NULL == pHist)
{
if ((pCurr->curr_rec.offset < SIZEOF(blk_hdr)) ||
((pCurr->curr_rec.offset == SIZEOF(blk_hdr)) && (pCurr->curr_rec.match < nKeyLen)))
{ /* Clue less than first rec, invalidate */
pTarg->clue.end = 0;
return cdb_sc_normal;
}
/* Build first_rec from the key of the first record in the leaf block */
pRec = pBlkBase + SIZEOF(blk_hdr);
GET_USHORT(n0, &((rec_hdr_ptr_t)pRec)->rsiz);
if (FALSE == CHKRECLEN(pRec, pBlkBase, n0))
{
assert(CDB_STAGNATE > t_tries);
return cdb_sc_rmisalign;
}
c1 = pRec + SIZEOF(rec_hdr);
c2 = pTarg->first_rec->base;
/* Cap the number of bytes copied at first_rec->top, and pick in advance the status to return
 * if the copy loop below runs out of bytes before finding the end of the key.
 */
if (n0 > (pTarg->first_rec->top))
{
n0 = pTarg->first_rec->top;
status = cdb_sc_keyoflow;
} else
status = cdb_sc_rmisalign;
if (0 != n0)
{
/* Copy byte by byte; stop once a zero byte has been copied and the next source byte is also zero */
do
{
--n0;
if ((0 == (*c2++ = *c1++)) && (0 == *c1))
break;
} while (n0);
}
/* n0 reaching 0 is treated as failure to find the end of the key within the allowed length */
if (0 == n0)
{
assert(CDB_STAGNATE > t_tries);
return status;
}
assert(c2 < &pTarg->first_rec->base[pTarg->first_rec->top]); /* make sure we don't exceed allocated bounds */
*c2 = *c1; /* copy the second of the two zero bytes */
DEBUG_ONLY(pTarg->first_rec->end = c2 - pTarg->first_rec->base;)
if (NULL == pNonStar)
{
/* Every record followed on the way down was of star-record size: last_rec gets the GVT_CLUE_LAST_REC_MAXKEY marker */
*((short *)pTarg->last_rec->base) = GVT_CLUE_LAST_REC_MAXKEY;
DEBUG_ONLY(pTarg->last_rec->end = SIZEOF(short);)
} else
{
/* Build last_rec from the record at curr_rec of the remembered non-star level */
pRec = pNonStar->buffaddr + pNonStar->curr_rec.offset;
GET_USHORT(n0, &((rec_hdr_ptr_t)pRec)->rsiz);
c1 = pNonStar->buffaddr;
if (FALSE == CHKRECLEN(pRec, c1, n0))
{
assert(CDB_STAGNATE > t_tries);
return cdb_sc_rmisalign;
}
/* The first cmpc bytes of last_rec are taken from pKey below, so pKey's match count at this
 * level must be at least cmpc (presumably cmpc is the record's compression count - confirm).
 */
if (pNonStar->curr_rec.match < ((rec_hdr_ptr_t)pRec)->cmpc)
{
assert(CDB_STAGNATE > t_tries);
return cdb_sc_rmisalign;
}
if ((n1 = ((rec_hdr_ptr_t)pRec)->cmpc) > (int)(pTarg->last_rec->top))
{
assert(CDB_STAGNATE > t_tries);
return cdb_sc_keyoflow;
}
c2 = pTarg->last_rec->base;
if (0 != n1)
memcpy(c2, pKey->base, n1);
c2 = (sm_uc_ptr_t)c2 + n1;
c1 = pRec + SIZEOF(rec_hdr);
/* Same capping and copy scheme as for first_rec, with n1 bytes of the buffer already used */
if ((int)n0 > (int)(pTarg->last_rec->top) - n1)
{
n0 = pTarg->last_rec->top - n1;
status = cdb_sc_keyoflow;
} else
status = cdb_sc_rmisalign;
if (0 != n0)
{
do
{
--n0;
if ((0 == (*c2++ = *c1++)) && (0 == *c1))
break;
} while (n0);
}
if (0 == n0)
{
assert(CDB_STAGNATE > t_tries);
return status;
}
assert(c2 < &pTarg->last_rec->base[pTarg->last_rec->top]); /* make sure we don't exceed allocated bounds */
*c2 = *c1;
DEBUG_ONLY(pTarg->last_rec->end = c2 - pTarg->last_rec->base;)
}
/* first_rec and last_rec are in place; make the searched key the new clue */
COPY_CURRKEY_TO_GVTARGET_CLUE(pTarg, pKey);
}
return cdb_sc_normal;
}