fis-gtm/sr_port/t_qread.c

707 lines
31 KiB
C

/****************************************************************
* *
* Copyright 2001, 2013 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#ifdef VMS
#include <ssdef.h>
#endif
#include "ast.h" /* needed for JNL_ENSURE_OPEN_WCS_WTSTART macro in gdsfhead.h */
#include "copy.h"
#include "gdsroot.h"
#include "gdskill.h"
#include "gdsblk.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "gdscc.h"
#include "filestruct.h"
#include "iosp.h"
#include "interlock.h"
#include "jnl.h"
#include "buddy_list.h" /* needed for tp.h */
#include "hashtab_int4.h" /* needed for tp.h and cws_insert.h */
#include "tp.h"
#include "gdsbgtr.h"
#include "sleep_cnt.h"
#include "send_msg.h"
#include "t_qread.h"
#include "gvcst_blk_build.h"
#include "mm_read.h"
#include "is_proc_alive.h"
#include "cache.h"
#include "longset.h" /* needed for cws_insert.h */
#include "hashtab.h" /* needed for cws_insert.h */
#include "cws_insert.h"
#include "wcs_sleep.h"
#include "add_inter.h"
#include "wbox_test_init.h"
#include "memcoherency.h"
#include "wcs_flu.h" /* for SET_CACHE_FAIL_STATUS macro */
#ifdef UNIX
# ifdef GTM_CRYPT
# include "gtmcrypt.h"
# endif
#include "io.h" /* needed by gtmsecshr.h */
#include "gtmsecshr.h" /* for continue_proc */
#endif
#include "wcs_phase2_commit_wait.h"
#include "gtm_c_stack_trace.h"
/* Globals defined by this module */
GBLDEF srch_blk_status *first_tp_srch_status; /* the first srch_blk_status for this block in this transaction */
GBLDEF unsigned char rdfail_detail; /* t_qread uses a 0 return to indicate a failure (no buffer filled) and the real
status of the read is returned using a global reference, as the status detail
should typically not be needed and optimizing the call is important */
/* Globals defined elsewhere */
GBLREF gd_region *gv_cur_region;
GBLREF sgmnt_addrs *cs_addrs;
GBLREF sgmnt_data_ptr_t cs_data;
GBLREF sgm_info *sgm_info_ptr;
GBLREF short crash_count;
GBLREF uint4 dollar_tlevel; /* t_qread takes its TP-specific paths when this is non-zero */
GBLREF unsigned int t_tries; /* compared against CDB_STAGNATE in t_qread to recognize the final retry */
GBLREF uint4 process_id;
GBLREF boolean_t tp_restart_syslog; /* for the TP_TRACE_HIST_MOD macro */
GBLREF gv_namehead *gv_target;
GBLREF boolean_t dse_running;
GBLREF boolean_t disk_blk_read; /* set to TRUE by t_qread after a successful dsk_read */
GBLREF uint4 t_err; /* compared against ERR_GVPUTFAIL in t_qread to recognize a SET operation */
GBLREF boolean_t block_is_free; /* noted down and reset to FALSE on entry to t_qread; the noted value is passed to dsk_read */
GBLREF boolean_t mupip_jnl_recover; /* exempts the process from the "hold crit in final retry" assert in t_qread */
/* There are 3 passes (of the do-while loop in t_qread below) we allow now.
* The first pass which is potentially out-of-crit and hence can end up not locating the cache-record for the input block.
* The second pass which holds crit and is waiting for a concurrent reader to finish reading the input block in.
* The third pass is needed because the concurrent reader (in dsk_read) might encounter a DYNUPGRDFAIL error in which case
* it is going to increment the cycle in the cache-record and reset the blk to CR_BLKEMPTY.
* We don't need any pass more than this because if we hold crit then no one else can start a dsk_read for this block.
* This # of passes is hardcoded in the macro BAD_LUCK_ABOUNDS. t_qread counts completed passes in "ocnt" and GTMASSERTs
* once ocnt exceeds (BAD_LUCK_ABOUNDS - was_crit), so the value 2 permits 3 passes when crit was not held on entry and
* one pass fewer when it was.
*/
#define BAD_LUCK_ABOUNDS 2
/* Re-point a transaction's srch_blk_status entry at a new cache-record: stores NEWCR and NEWCYCLE into the "cr" and
 * "cycle" fields and recomputes "buffaddr" as the absolute address of NEWCR's buffer. Asserts that the entry really
 * changes (i.e. it does not already hold this <cr,cycle> pair).
 * The body is wrapped in do { } while (0) so that the whole macro behaves as ONE statement; without the wrapper an
 * unbraced "if (cond) RESET_FIRST_TP_SRCH_STATUS(...);" would guard only the assert and run the assignments always.
 * Note that each argument is evaluated more than once, so do not pass expressions with side effects.
 */
#define RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, newcr, newcycle)					\
do														\
{														\
	assert((first_tp_srch_status)->cr != (newcr) || (first_tp_srch_status)->cycle != (newcycle));		\
	(first_tp_srch_status)->cr = (newcr);									\
	(first_tp_srch_status)->cycle = (newcycle);								\
	(first_tp_srch_status)->buffaddr = (sm_uc_ptr_t)GDS_REL2ABS((newcr)->buffaddr);				\
} while (0)
/* Release crit on REG if this process holds crit now (CSA->now_crit) but did not hold it on entry (WAS_CRIT) and has
 * not been asked to hold onto it (HOLD_ONTO_CRIT). Asserts that crit held on entry is still held.
 * Every parameter is parenthesized so that expression arguments (e.g. "&some_csa" or "a || b") parse as intended, and
 * the body is wrapped in do { } while (0) so that "REL_CRIT_IF_NEEDED(...);" is a single statement that is safe to use
 * in an unbraced if/else. Note that CSA and WAS_CRIT are evaluated more than once.
 */
#define REL_CRIT_IF_NEEDED(CSA, REG, WAS_CRIT, HOLD_ONTO_CRIT)					\
do												\
{	/* If currently have crit, but didn't have it upon entering, release crit now. */	\
	assert(!(WAS_CRIT) || (CSA)->now_crit);							\
	if (((WAS_CRIT) != (CSA)->now_crit) && !(HOLD_ONTO_CRIT))				\
		rel_crit(REG);									\
} while (0)
/* Error codes referenced by t_qread below */
error_def(ERR_BUFOWNERSTUCK);
error_def(ERR_CRYPTBADCONFIG);
error_def(ERR_DBFILERR);
error_def(ERR_DYNUPGRDFAIL);
error_def(ERR_GVPUTFAIL);
/* t_qread : Return the address of a buffer holding the contents of database block "blk" for the region described by
 *	cs_addrs, reading the block in from disk (dsk_read) if it is not already in the cache.
 *
 * Parameters
 *	blk    - the block to read. Inside a TP transaction with a non-empty cw-set, blk is also examined as an off_chain;
 *		 if its "flag" field is 1 it selects the cw-set element at index "cw_index".
 *	cycle  - output. Set to the cache-record cycle noted down for the returned buffer, or to CYCLE_PVT_COPY when a
 *		 private (cw-set) copy is returned, or to CYCLE_SHRD_COPY when an MM buffer is returned.
 *	cr_out - output. Set to the cache-record of the returned buffer, or to 0 for a private copy or an MM buffer.
 *
 * Returns
 *	The buffer address on success. On failure, NULL is returned and the reason is left in the global "rdfail_detail".
 *	Some error conditions do not return but instead invoke rts_error/rts_error_csa.
 *
 * Side effects visible in this function: "block_is_free" is reset to FALSE, "first_tp_srch_status" is (re)set, and
 * "disk_blk_read" is set to TRUE after a successful dsk_read.
 */
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out)
/* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */
{
int4 status;
uint4 blocking_pid;
cache_rec_ptr_t cr;
bt_rec_ptr_t bt;
boolean_t clustered, hold_onto_crit, was_crit;
int dummy, lcnt, ocnt;
cw_set_element *cse;
off_chain chain1;
register sgmnt_addrs *csa;
register sgmnt_data_ptr_t csd;
enum db_ver ondsk_blkver;
int4 dummy_errno, gtmcrypt_errno;
boolean_t already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked, sleep_invoked;
ht_ent_int4 *tabent;
srch_blk_status *blkhist;
trans_num dirty, blkhdrtn;
sm_uc_ptr_t buffaddr;
uint4 stuck_cnt = 0;
boolean_t lcl_blk_free;
node_local_ptr_t cnl;
# ifdef GTM_CRYPT
gd_segment *seg;
# endif
DCL_THREADGBL_ACCESS;
SETUP_THREADGBL_ACCESS;
lcl_blk_free = block_is_free;
block_is_free = FALSE; /* Reset to FALSE so that if t_qread fails below, we don't have an incorrect state of this var */
first_tp_srch_status = NULL;
reset_first_tp_srch_status = FALSE;
csa = cs_addrs;
csd = csa->hdr;
INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1);
is_mm = (dba_mm == csd->acc_meth);
/* We better hold crit in the final retry (TP & non-TP). Only exception is journal recovery */
assert((t_tries < CDB_STAGNATE) || csa->now_crit || mupip_jnl_recover);
/* Inside TP, first consult this transaction's own state : its cw-set (blocks it has modified) and its
 * "blks_in_use" hashtable (blocks it has already read).
 */
if (dollar_tlevel)
{
assert(sgm_info_ptr);
if (0 != sgm_info_ptr->cw_set_depth)
{
/* View blk as an off_chain; a flag of 1 means it selects the cw-set element at index chain1.cw_index */
chain1 = *(off_chain *)&blk;
if (1 == chain1.flag)
{
assert(sgm_info_ptr->cw_set_depth);
if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth)
tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
else
{
assert(FALSE == csa->now_crit);
rdfail_detail = cdb_sc_blknumerr;
return (sm_uc_ptr_t)NULL;
}
} else
{
if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
first_tp_srch_status = tabent->value;
else
first_tp_srch_status = NULL;
ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
cse = first_tp_srch_status ? first_tp_srch_status->cse : NULL;
}
assert(!cse || !cse->high_tlevel);
assert(!chain1.flag || cse);
if (cse)
{ /* transaction has modified the sought after block */
if ((gds_t_committed != cse->mode) || (n_gds_t_op < cse->old_mode))
{ /* Changes have not been committed to shared memory, i.e. still in private memory.
* Build block in private buffer if not already done and return the same.
*/
assert(gds_t_writemap != cse->mode);
if (FALSE == cse->done)
{ /* out of date, so make it current */
assert(gds_t_committed != cse->mode);
already_built = (NULL != cse->new_buff);
/* Validate the block's search history right after building a private copy.
* This is not needed in case gvcst_search is going to reuse the clue's search
* history and return (because tp_hist will do the validation of this block).
* But if gvcst_search decides to do a fresh traversal (because the clue does not
* cover the path of the current input key etc.) the block build that happened now
* will not get validated in tp_hist since it will instead be given the current
* key's search history path (a totally new path) for validation. Since a private
* copy of the block has been built, tp_tend would also skip validating this block
* so it is necessary that we validate the block right here. Since it is tricky to
* accurately differentiate between the two cases, we do the validation
* unconditionally here (besides it is only a few if checks done per block build
* so it is considered okay performance-wise).
*/
gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0);
assert(NULL != cse->blk_target);
if (!already_built && !chain1.flag)
{
buffaddr = first_tp_srch_status->buffaddr;
cr = first_tp_srch_status->cr;
assert((is_mm || cr) && buffaddr);
blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn;
if (TP_IS_CDB_SC_BLKMOD3(cr, first_tp_srch_status, blkhdrtn))
{
assert(CDB_STAGNATE > t_tries);
rdfail_detail = cdb_sc_blkmod; /* should this be something else */
TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data,
first_tp_srch_status->tn, blkhdrtn,
((blk_hdr_ptr_t)buffaddr)->levl);
return (sm_uc_ptr_t)NULL;
}
if (!is_mm && ((first_tp_srch_status->cycle != cr->cycle)
|| (first_tp_srch_status->blk_num != cr->blk)))
{
assert(CDB_STAGNATE > t_tries);
rdfail_detail = cdb_sc_lostcr; /* should this be something else */
return (sm_uc_ptr_t)NULL;
}
}
cse->done = TRUE;
}
/* Return the private copy; there is no cache-record associated with it */
*cycle = CYCLE_PVT_COPY;
*cr_out = 0;
return (sm_uc_ptr_t)cse->new_buff;
} else
{ /* Block changes are already committed to shared memory (possible if we are in TP
* in the 2nd phase of M-Kill in gvcst_expand_free_subtree.c). In this case, read
* block from shared memory; do not look at private memory (i.e. cse) as that might
* not be as uptodate as shared memory.
*/
assert(csa->now_crit); /* gvcst_expand_free_subtree does t_qread in crit */
/* If this block was newly created as part of the TP transaction, it should not be killed
* as part of the 2nd phase of M-kill. This is because otherwise the block's cse would
* have had an old_mode of kill_t_create in which case we would not have come into this
* else block. Assert accordingly.
*/
assert(!chain1.flag);
first_tp_srch_status = NULL; /* do not use any previous srch_hist information */
}
}
} else
{
/* The cw-set is empty, so only the lookup of blk in this transaction's "blks_in_use" hashtable applies */
if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
first_tp_srch_status = tabent->value;
else
first_tp_srch_status = NULL;
}
ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
if (!is_mm && first_tp_srch_status)
{
cr = first_tp_srch_status->cr;
assert(cr && !first_tp_srch_status->cse);
if (first_tp_srch_status->cycle == cr->cycle)
{
/* The cache-record noted by this transaction still has the same cycle, so return the same buffer again */
*cycle = first_tp_srch_status->cycle;
*cr_out = cr;
cr->refer = TRUE;
if (CDB_STAGNATE <= t_tries) /* mu_reorg doesn't use TP else should have an || for that */
CWS_INSERT(blk);
return (sm_uc_ptr_t)first_tp_srch_status->buffaddr;
} else
{ /* Block was already part of the read-set of this transaction, but got recycled in the cache.
* Allow block recycling by resetting first_tp_srch_status for this blk to reflect the new
* buffer, cycle and cache-record. tp_hist (invoked much later) has validation checks to detect
* if block recycling happened within the same mini-action and restart in that case.
* Updating first_tp_srch_status has to wait until the end of t_qread since only then do we know
* the values to update to. Set a variable that will enable the update before returning.
* Also assert that if we are in the final retry, we are never in a situation where we have a
* block that got recycled since the start of the current mini-action. This is easily detected since
* as part of the final retry we maintain a hash-table "cw_stagnate" that holds the blocks that
* have been read as part of the current mini-action until now.
*/
assert(CDB_STAGNATE > t_tries || (NULL == lookup_hashtab_int4(&cw_stagnate, (uint4 *)&blk)));
reset_first_tp_srch_status = TRUE;
}
}
}
if ((uint4)blk >= (uint4)csa->ti->total_blks)
{ /* Requested block out of range; could occur because of a concurrency conflict. mm_read and dsk_read assume blk is
* never negative or greater than the maximum possible file size. If a concurrent REORG truncates the file, t_qread
* can proceed despite blk being greater than total_blks. But dsk_read handles this fine; see comments below.
*/
assert((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data));
assert(!csa->now_crit);
rdfail_detail = cdb_sc_blknumerr;
return (sm_uc_ptr_t)NULL;
}
/* MM access method : no cache-record is involved; return whatever mm_read returns for blk */
if (is_mm)
{
*cycle = CYCLE_SHRD_COPY;
*cr_out = 0;
return (sm_uc_ptr_t)(mm_read(blk));
}
# ifdef GTM_CRYPT
if ((GTMCRYPT_INVALID_KEY_HANDLE == csa->encr_key_handle) && !IS_BITMAP_BLK(blk))
{ /* A non-GT.M process is attempting to read a non-bitmap block but doesn't have a valid encryption key handle. This
* is an indication that the process encountered an error during db_init and reported it with a -W- severity. But,
* since the block it is attempting to read can be in the unencrypted shared memory, we cannot let it access it
* without a valid handle. So, issue an rts_error
*/
assert(!IS_GTM_IMAGE); /* GT.M would have error'ed out in db_init */
gtmcrypt_errno = SET_REPEAT_MSG_MASK(SET_CRYPTERR_MASK(ERR_CRYPTBADCONFIG));
seg = gv_cur_region->dyn.addr;
GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, rts_error, seg->fname_len, seg->fname);
}
# endif
/* From here on the access method is BG (asserted below) and the block is located through the global buffer cache */
assert(dba_bg == csd->acc_meth);
assert(!first_tp_srch_status || !first_tp_srch_status->cr
|| first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle);
if (FALSE == (clustered = csd->clustered))
bt = NULL;
was_crit = csa->now_crit;
ocnt = 0;
cnl = csa->nl;
set_wc_blocked = FALSE; /* to indicate whether cnl->wc_blocked was set to TRUE by us */
hold_onto_crit = csa->hold_onto_crit; /* note down in local to avoid csa-> dereference in multiple usages below */
/* Each pass of this loop tries to locate (or read in) a buffer for blk. "ocnt" counts the passes completed; see the
 * comment above the BAD_LUCK_ABOUNDS macro for why the number of passes is bounded.
 */
do
{
if (NULL == (cr = db_csh_get(blk)))
{ /* not in memory */
if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing))
bt = NULL;
if (!csa->now_crit)
{
assert(!hold_onto_crit);
if (NULL != bt)
{ /* at this point, bt is not NULL only if clustered and flushing - wait no crit */
assert(clustered);
wait_for_block_flush(bt, blk); /* try for no other node currently writing the block */
}
if ((csd->flush_trigger <= cnl->wcs_active_lvl) && (FALSE == gv_cur_region->read_only))
JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, 0, dummy_errno);
/* a macro that dclast's "wcs_wtstart" and checks for errors etc. */
grab_crit(gv_cur_region);
cr = db_csh_get(blk); /* in case blk arrived before crit */
}
if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing))
{ /* Once crit, need to assure that if clustered, that flushing is [still] complete
* If it isn't, we missed an entire WM cycle and have to wait for another node to finish */
wait_for_block_flush(bt, blk); /* ensure no other node currently writing the block */
}
if (NULL == cr)
{ /* really not in memory - must get a new buffer */
assert(csa->now_crit);
cr = db_csh_getn(blk);
if (CR_NOTVALID == (sm_long_t)cr)
{
assert(cnl->wc_blocked); /* only reason we currently know wcs_get_space could fail */
assert(gtm_white_box_test_case_enabled);
SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk);
set_wc_blocked = TRUE;
break;
}
assert(0 <= cr->read_in_progress);
*cycle = cr->cycle;
cr->tn = csd->trans_hist.curr_tn;
/* Record history of most recent disk reads only in dbg builds for now. Although the macro
* is just a couple dozen instructions, it is done while holding crit so we want to avoid
* delaying crit unless really necessary. Whoever wants this information can enable it
* by a build change to remove the DEBUG_ONLY part below.
*/
DEBUG_ONLY(DSKREAD_TRACE(csa, GDS_ANY_ABS2REL(csa,cr), cr->tn, process_id, blk, cr->cycle);)
if (!was_crit && !hold_onto_crit)
rel_crit(gv_cur_region);
/* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */
assert(0 == cr->dirty);
assert(cr->read_in_progress >= 0);
CR_BUFFER_CHECK(gv_cur_region, csa, csd, cr);
buffaddr = (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
if (SS_NORMAL != (status = dsk_read(blk, buffaddr, &ondsk_blkver, lcl_blk_free)))
{ /* buffer does not contain valid data, so reset blk to be empty */
cr->cycle++; /* increment cycle for blk number changes (for tp_hist and others) */
cr->blk = CR_BLKEMPTY;
cr->r_epid = 0;
RELEASE_BUFF_READ_LOCK(cr);
assert(-1 <= cr->read_in_progress);
assert(was_crit == csa->now_crit);
if (FUTURE_READ == status)
{ /* in cluster, block can be in the "future" with respect to the local history */
assert(TRUE == clustered);
assert(FALSE == csa->now_crit);
rdfail_detail = cdb_sc_future_read; /* t_retry forces the history up to date */
return (sm_uc_ptr_t)NULL;
}
if (ERR_DYNUPGRDFAIL == status)
{ /* if we don't hold crit on the region, it is possible due to concurrency conflicts
* that this block is unused (i.e. marked free/recycled in bitmap, see comments in
* gds_blk_upgrade.h). in this case we should not error out but instead restart.
*/
if (was_crit)
{
assert(FALSE);
rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) status, 3, blk,
DB_LEN_STR(gv_cur_region));
} else
{
rdfail_detail = cdb_sc_lostcr;
return (sm_uc_ptr_t)NULL;
}
}
if ((-1 == status) && !was_crit)
{ /* LSEEKREAD and, consequently, dsk_read return -1 in case pread is unable to fetch
* a full database block's length of data. This can happen if the requested read is
* past the end of the file, which can happen if a concurrent truncate occurred
* after the blk >= csa->ti->total_blks comparison above. Allow for this scenario
* by restarting. However, if we've had crit the whole time, no truncate could have
* happened. -1 indicates a problem with the file, so fall through to DBFILERR.
*/
rdfail_detail = cdb_sc_truncate;
return (sm_uc_ptr_t)NULL;
}
# ifdef GTM_CRYPT
else if (IS_CRYPTERR_MASK(status))
{
seg = gv_cur_region->dyn.addr;
GTMCRYPT_REPORT_ERROR(status, rts_error, seg->fname_len, seg->fname);
}
# endif
else
{ /* A DBFILERR can be thrown for two possible reasons:
* (1) LSEEKREAD returned an unexpected error due to a filesystem problem; or
* (2) csa/cs_addrs/csd/cs_data are out of sync, and we're trying to read a block
* number for one region from another region with fewer total_blks.
* We suspect the former is what happened in GTM-7623. Apparently the latter
* has been an issue before, too. If either occurs again in pro, this assertpro
* distinguishes the two possibilities.
*/
assertpro((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data));
rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region),
status);
}
}
disk_blk_read = TRUE;
assert(0 <= cr->read_in_progress);
assert(0 == cr->dirty);
/* Only set in cache if read was success */
cr->ondsk_blkver = (lcl_blk_free ? GDSVCURR : ondsk_blkver);
cr->r_epid = 0;
RELEASE_BUFF_READ_LOCK(cr);
assert(-1 <= cr->read_in_progress);
*cr_out = cr;
assert(was_crit == csa->now_crit);
if (reset_first_tp_srch_status)
{ /* keep the braces for the if (although single line) since the following is a macro */
RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle);
}
return buffaddr;
} else if (!was_crit && (BAD_LUCK_ABOUNDS > ocnt))
{
assert(!hold_onto_crit);
assert(TRUE == csa->now_crit);
assert(cnl->in_crit == process_id);
rel_crit(gv_cur_region);
}
}
if (CR_NOTVALID == (sm_long_t)cr)
{
SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_get_invalid_blk);
set_wc_blocked = TRUE;
break;
}
/* It is very important for cycle to be noted down BEFORE checking for read_in_progress/in_tend.
* Because of this instruction order requirement, we need to have a read barrier just after noting down cr->cycle.
* Doing it the other way round introduces the scope for a bug in the concurrency control validation logic in
* t_end/tp_hist/tp_tend. This is because the validation logic relies on t_qread returning an atomically
* consistent value of <"cycle","cr"> for a given input blk such that cr->buffaddr held the input blk's
* contents at the time when cr->cycle was "cycle". It is important that cr->read_in_progress is -1
* (indicating the read from disk into the buffer is complete) AND cr->in_tend is FALSE (indicating
* that the buffer is not being updated) when t_qread returns. The only exception is if cr->cycle is higher
* than the "cycle" returned by t_qread (signifying the buffer got reused for another block concurrently)
* in which case the cycle check in the validation logic will detect this.
*/
*cycle = cr->cycle;
SHM_READ_MEMORY_BARRIER;
sleep_invoked = FALSE;
/* Wait for any in-progress read into "cr" to finish (cr->read_in_progress goes negative). A "break" out of this
 * loop (e.g. because cr no longer holds blk) goes on to the next pass of the outer do-while loop, unless
 * set_wc_blocked has been set in which case the outer loop is exited as well.
 */
for (lcnt = 1; ; lcnt++)
{
if (0 > cr->read_in_progress)
{ /* it's not being read */
if (clustered && (0 == cr->bt_index) && (cr->tn < OLDEST_HIST_TN(csa)))
{ /* can't rely on the buffer */
cr->cycle++; /* increment cycle whenever blk number changes (tp_hist depends on this) */
cr->blk = CR_BLKEMPTY;
break;
}
*cr_out = cr;
VMS_ONLY(
/* If we were doing the "db_csh_get" above (in t_qread itself) and located the cache-record
* which, before coming here and taking a copy of cr->cycle a few lines above, was made an
* older twin by another process in bg_update (note this can happen in VMS only) which has
* already incremented the cycle, we will end up having a copy of the old cache-record with
* its incremented cycle number and hence will succeed in tp_hist validation if we return
* this <cr,cycle> combination although we don't want to since this "cr" is not current for
* the given block as of now. Note that the "indexmod" optimization in "tp_tend" relies on
* an accurate intermediate validation by "tp_hist" which in turn relies on the <cr,cycle>
* value returned by t_qread to be accurate for a given blk at the current point in time.
* We detect the older-twin case by the following check. Note that here we depend on the
* the fact that "bg_update" sets cr->bt_index to 0 before incrementing cr->cycle.
* Given that order, cr->bt_index can be guaranteed to be 0 if we read the incremented cycle
*/
if (cr->twin && (0 == cr->bt_index))
break;
)
if (cr->blk != blk)
break;
REL_CRIT_IF_NEEDED(csa, gv_cur_region, was_crit, hold_onto_crit);
assert(was_crit == csa->now_crit);
/* Check if "cr" is locked for phase2 update by a concurrent process. Before doing so, need to
* do a read memory barrier to ensure we read a consistent state. Otherwise, we could see
* cr->in_tend as 0 even though it is actually non-zero in another processor (due to cache
* coherency delays in multi-processor environments) and this could lead to mysterious
* failures including GTMASSERTs and database damage as the validation logic in t_end/tp_tend
* relies on the fact that the cr->in_tend check here is accurate as of this point.
*
* Note that on architectures where a change done by another process needs two steps to be made
* visible by another process (write memory barrier on the writer side AND a read memory barrier
* on the reader side) this read memory barrier also serves the purpose of ensuring this process
* sees an uptodate state of the global buffer whose contents got modified by the disk read (done
* by another process) that finished just now. Example is the Alpha architecture where this is
* needed. Example where this is not needed is the Power architecture (as of this writing) where
* only the write memory barrier on the write side is necessary. As long as the reader sees any
* update done AFTER the write memory barrier, it is guaranteed to see all updates done BEFORE
* the write memory barrier.
*/
SHM_READ_MEMORY_BARRIER;
blocking_pid = cr->in_tend;
if (blocking_pid)
{ /* Wait for cr->in_tend to become zero. But in the case we are doing a TP transaction and
* the global has NOISOLATION turned ON and this is a leaf level block and this is a SET
* operation (t_err == ERR_GVPUTFAIL), avoid the sleep but ensure a cdb_sc_blkmod type
* restart will be triggered (in tp_tend) and the function "recompute_upd_array" will be
* invoked. Avoiding the sleep in this case (at the cost of recomputing the update array
* in crit) is expected to improve throughput. The only exception is if we are in the
* final retry in which case it is better to wait here as we don't want to end up in a
* situation where "recompute_upd_array" indicates that a restart is necessary.
*/
if (dollar_tlevel && (gv_target && gv_target->noisolation) && (ERR_GVPUTFAIL == t_err)
&& (CDB_STAGNATE > t_tries)) /* do not skip wait in case of final retry */
{ /* We know that the only caller in this case would be the function "gvcst_search".
* If the input cr and cycle match corresponding fields of gv_target->hist.h[0],
* we update the corresponding "tn" field to reset it BACK thereby ensuring the
* cdb_sc_blkmod check in tp_tend will fail and that the function
* "recompute_upd_array" will be invoked to try and recompute the update array.
* We do this only in case of gv_target->hist.h[0] as recomputations
* are currently done for NOISOLATION globals only for leaf level blocks.
*/
blkhist = &gv_target->hist.h[0];
dirty = cr->dirty;
if (((sm_int_ptr_t)&blkhist->cycle == (sm_int_ptr_t)cycle)
&& ((cache_rec_ptr_ptr_t)&blkhist->cr == (cache_rec_ptr_ptr_t)cr_out))
{
if (blkhist->tn > dirty)
{
blkhist->tn = dirty;
if (reset_first_tp_srch_status)
first_tp_srch_status->tn = dirty;
}
blocking_pid = 0; /* do not sleep in the for loop below */
}
}
if (blocking_pid)
{
if (TREF(tqread_nowait) && ((sm_int_ptr_t)&gv_target->hist.h[0].cycle == cycle))
{ /* We're an update helper. Don't waste time waiting on a leaf blk */
rdfail_detail = cdb_sc_tqreadnowait;
return (sm_uc_ptr_t)NULL;
}
if (!wcs_phase2_commit_wait(csa, cr))
{ /* Timed out waiting for cr->in_tend to become zero. Restart. */
rdfail_detail = cdb_sc_phase2waitfail;
return (sm_uc_ptr_t)NULL;
}
}
}
if (reset_first_tp_srch_status)
{ /* keep the braces for the if (although single line) since the following is a macro */
RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle);
}
assert(!csa->now_crit || !cr->twin || cr->bt_index);
assert(!csa->now_crit || (NULL == (bt = bt_get(blk)))
|| (CR_NOTVALID == bt->cache_index)
|| (cr == (cache_rec_ptr_t)GDS_REL2ABS(bt->cache_index)) && (0 == cr->in_tend));
/* Note that at this point we expect t_qread to return a <cr,cycle> combination that
* corresponds to "blk" passed in. It is crucial to get an accurate value for both the fields
* since "tp_hist" relies on this for its intermediate validation.
*/
return (sm_uc_ptr_t)GDS_ANY_REL2ABS(csa, cr->buffaddr);
}
if (blk != cr->blk)
break;
/* Every BUF_OWNER_STUCK iterations, grab crit (if needed) and examine why the read has still not completed */
if (lcnt >= BUF_OWNER_STUCK && (0 == (lcnt % BUF_OWNER_STUCK)))
{
if (!csa->now_crit && !hold_onto_crit)
grab_crit(gv_cur_region);
if (cr->read_in_progress < -1)
{ /* outside of design; clear to known state */
BG_TRACE_PRO(t_qread_out_of_design);
assert(0 == cr->r_epid);
cr->r_epid = 0;
INTERLOCK_INIT(cr);
} else if (cr->read_in_progress >= 0)
{
BG_TRACE_PRO(t_qread_buf_owner_stuck);
blocking_pid = cr->r_epid;
if ((0 != blocking_pid) && (process_id != blocking_pid))
{
if (FALSE == is_proc_alive(blocking_pid, cr->image_count))
{ /* process gone: release that process's lock */
assert(0 == cr->bt_index);
if (cr->bt_index)
{
SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_bad_bt_index1);
set_wc_blocked = TRUE;
break;
}
cr->cycle++; /* increment cycle for blk number changes (for tp_hist) */
cr->blk = CR_BLKEMPTY;
cr->r_epid = 0;
RELEASE_BUFF_READ_LOCK(cr);
} else
{
/* The reader process is alive : report the stuck buffer owner and keep waiting */
if (!hold_onto_crit)
rel_crit(gv_cur_region);
send_msg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2,
DB_LEN_STR(gv_cur_region));
send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_BUFOWNERSTUCK, 7, process_id,
blocking_pid, cr->blk, cr->blk, (lcnt / BUF_OWNER_STUCK),
cr->read_in_progress, cr->rip_latch.u.parts.latch_pid);
stuck_cnt++;
GET_C_STACK_FROM_SCRIPT("BUFOWNERSTUCK", process_id, blocking_pid,
stuck_cnt);
/* Kickstart the process taking a long time in case it was suspended */
UNIX_ONLY(continue_proc(blocking_pid));
}
} else
{ /* process stopped before could set r_epid OR
* Process is waiting on the lock held by itself.
* Process waiting on the lock held by itself is an out-of-design
* situation that we don't know how it can occur hence the following assert
* but know how to handle so we don't have to gtmassert in pro.
*/
assert(process_id != blocking_pid);
assert(0 == cr->bt_index);
if (cr->bt_index)
{
SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_bad_bt_index2);
set_wc_blocked = TRUE;
break;
}
cr->cycle++; /* increment cycle for blk number changes (for tp_hist) */
cr->blk = CR_BLKEMPTY;
cr->r_epid = 0; /* If the process itself is lock holder, r_epid is non-zero */
RELEASE_BUFF_READ_LOCK(cr);
if (cr->read_in_progress < -1) /* race: process released since if r_epid */
LOCK_BUFF_FOR_READ(cr, dummy);
}
}
REL_CRIT_IF_NEEDED(csa, gv_cur_region, was_crit, hold_onto_crit);
} else
{
if (TREF(tqread_nowait) && ((sm_int_ptr_t)&gv_target->hist.h[0].cycle == cycle))
{ /* We're an update helper. Don't waste time waiting on a leaf blk; move on to useful work */
REL_CRIT_IF_NEEDED(csa, gv_cur_region, was_crit, hold_onto_crit);
rdfail_detail = cdb_sc_tqreadnowait;
return (sm_uc_ptr_t)NULL;
}
BG_TRACE_PRO_ANY(csa, t_qread_ripsleep_cnt);
if (!sleep_invoked) /* Count # of blks for which we ended up sleeping on the read */
BG_TRACE_PRO_ANY(csa, t_qread_ripsleep_nblks);
wcs_sleep(lcnt);
sleep_invoked = TRUE;
}
}
if (set_wc_blocked) /* cannot use cnl->wc_blocked here as we might not necessarily have crit */
break;
ocnt++;
assert((0 == was_crit) || (1 == was_crit));
/* if we held crit while entering t_qread we might need BAD_LUCK_ABOUNDS - 1 passes.
* otherwise we might need BAD_LUCK_ABOUNDS passes. if we are beyond this GTMASSERT.
*/
if ((BAD_LUCK_ABOUNDS - was_crit) < ocnt)
{
assert(!hold_onto_crit);
assert(!csa->now_crit);
GTMASSERT;
}
if (!csa->now_crit && !hold_onto_crit)
grab_crit(gv_cur_region);
} while (TRUE);
/* The do-while loop above is exited only through the "break"s taken after set_wc_blocked was set to TRUE.
 * Note the failure in rdfail_detail (through SET_CACHE_FAIL_STATUS), release crit if we grabbed it and return NULL.
 */
assert(set_wc_blocked && (cnl->wc_blocked || !csa->now_crit));
SET_CACHE_FAIL_STATUS(rdfail_detail, csd);
REL_CRIT_IF_NEEDED(csa, gv_cur_region, was_crit, hold_onto_crit);
assert(was_crit == csa->now_crit);
return (sm_uc_ptr_t)NULL;
}