fis-gtm/sr_port/db_csh_getn.c


/****************************************************************
 *								*
 *	Copyright 2001, 2011 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/
#include "mdef.h"
#include <signal.h> /* needed for VSIG_ATOMIC_T */
#include "gdsroot.h"
#include "gdsblk.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "gdskill.h"
#include "gdscc.h"
#include "filestruct.h"
#include "interlock.h"
#include "jnl.h"
#include "buddy_list.h" /* needed for tp.h */
#include "hashtab.h" /* needed for cws_insert.h */
#include "hashtab_int4.h" /* needed for tp.h and cws_insert.h */
#include "tp.h"
#include "gdsbgtr.h"
#include "min_max.h"
#include "sleep_cnt.h"
#include "send_msg.h"
#include "relqop.h"
#include "is_proc_alive.h"
#include "cache.h"
#include "longset.h" /* needed for cws_insert.h */
#include "cws_insert.h"
#include "wcs_sleep.h"
#include "wcs_get_space.h"
#include "wcs_timer_start.h"
#include "add_inter.h"
#include "wbox_test_init.h"
#include "have_crit.h"
#include "memcoherency.h"
#include "gtm_c_stack_trace.h"
GBLREF sgmnt_addrs *cs_addrs;
GBLREF gd_region *gv_cur_region;
GBLREF uint4 process_id;
GBLREF uint4 image_count;
GBLREF unsigned int t_tries;
GBLREF uint4 dollar_tlevel;
GBLREF sgm_info *sgm_info_ptr;
GBLREF boolean_t mu_reorg_process;
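
/* Wait helper used while another process has a read in progress on the cache record: on the first
 * iteration, record the wait in the BG trace and remember the pid of the reader we first observed
 * (first_r_epid); every iteration then sleeps for a period scaled by the iteration count.
 */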
#define TRACE_AND_SLEEP(ocnt)				\
{							\
	if (1 == ocnt)					\
	{						\
		BG_TRACE_PRO(db_csh_getn_rip_wait);	\
		first_r_epid = latest_r_epid;		\
	}						\
	wcs_sleep(ocnt);				\
}
error_def(ERR_BUFRDTIMEOUT);
error_def(ERR_INVALIDRIP);
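
/* db_csh_getn: return a free cache record into which the requested block can be read, using an
 * approximate-LRU sweep over the cache record array. The caller must hold crit on the region
 * (asserted below). On success, the returned cache record has read_in_progress locked, r_epid set
 * to this process and cr->blk set to "block". If no record can be freed (e.g. a dirty buffer cannot
 * be flushed out), (cache_rec_ptr_t)CR_NOTVALID is returned so the caller can force a cache recovery.
 */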
cache_rec_ptr_t db_csh_getn(block_id block)
{
	cache_rec_ptr_t		hdr, q0, start_cr, cr;
	bt_rec_ptr_t		bt;
	unsigned int		lcnt, ocnt;
	int			rip, max_ent, pass1, pass2, pass3;
	int4			flsh_trigger;
	uint4			first_r_epid, latest_r_epid;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	srch_blk_status		*tp_srch_status;
	ht_ent_int4		*tabent;

	csa = cs_addrs;
	csd = csa->hdr;
	assert(csa->now_crit);
	assert(csa == &FILE_INFO(gv_cur_region)->s_addrs);
	max_ent = csd->n_bts;
	cr = (cache_rec_ptr_t)GDS_REL2ABS(csa->nl->cur_lru_cache_rec_off);
	hdr = csa->acc_meth.bg.cache_state->cache_array + (block % csd->bt_buckets);
	start_cr = csa->acc_meth.bg.cache_state->cache_array + csd->bt_buckets;
	pass1 = max_ent;	/* skip referred or dirty or read-into cache records */
	pass2 = 2 * max_ent;	/* skip referred cache records */
	pass3 = 3 * max_ent;	/* skip nothing */
	INCR_DB_CSH_COUNTER(csa, n_db_csh_getns, 1);
	DEFER_INTERRUPTS(INTRPT_IN_DB_CSH_GETN);
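	/* Sweep the cache records with up to three passes over the array, relaxing the reuse criteria each
	 * pass: pass 1 skips records that are referred, dirty or being read in; pass 2 still skips referred
	 * and read-in-progress records but tries to flush dirty ones via wcs_get_space; pass 3 attempts
	 * reuse even if the record is referred and waits out a stuck reader.
	 */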
	for (lcnt = 0; ; lcnt++)
	{
		if (lcnt > pass3)
		{
			BG_TRACE_PRO(wc_blocked_db_csh_getn_loopexceed);
			assert(FALSE);
			break;
		}
		cr++;
		if (cr == start_cr + max_ent)
			cr = start_cr;
		VMS_ONLY(
			if ((lcnt == pass1) || (lcnt == pass2))
				wcs_wtfini(gv_cur_region);
		)
		if (cr->refer && (lcnt < pass2))
		{	/* in passes 1 & 2, set refer to FALSE and skip; in the third pass attempt reuse even if TRUE == refer */
			cr->refer = FALSE;
			continue;
		}
		if (cr->in_cw_set || cr->in_tend)
		{	/* some process already has this pinned for reading and/or updating. skip it. */
			cr->refer = TRUE;
			continue;
		}
		if (CDB_STAGNATE <= t_tries || mu_reorg_process)
		{	/* Prevent stepping on self when holding crit for the entire transaction.
			 * This is done by looking up the block in sgm_info_ptr->blks_in_use and in cw_stagnate.
			 * The two hashtable lookups are not equivalent since, in TP, sgm_info_ptr->blks_in_use
			 * is updated to the latest cw_stagnate list of blocks only in "tp_hist".
			 * Also note that the lookup in sgm_info_ptr->blks_in_use reuses blocks that don't have cse's.
			 * This is to allow big-read TP transactions which may use up more than the available global buffers.
			 * There is one issue here in that a block that has only been read so far may be stepped upon here
			 * but may later be needed for update. It is handled by updating the block's corresponding
			 * entry in the set of histories (the sgm_info_ptr->first_tp_hist[index] structure) to hold the
			 * "cr" and "cycle" of the t_qread done for the block when it is intended to be changed for the
			 * first time within the transaction, since otherwise the transaction would restart due to a
			 * cdb_sc_lostcr status. Note that "tn" (read_tn of the block) in first_tp_hist will still
			 * remain the "tn" when the block was first read within this transaction, to ensure the block
			 * hasn't been modified since the start of the transaction. Once we intend on changing the
			 * block, i.e. srch_blk_status->cse is non-NULL, we ensure in the code below not to step on it.
			 * ["tp_hist" is the routine that updates the "cr", "cycle" and "tn" of the block.]
			 * Note that usually in a transaction the first_tp_hist[] structure holds the "cr", "cycle" and "tn"
			 * of the first t_qread of the block within that transaction. The above is the only exception.
			 * Also note that for blocks in cw_stagnate (i.e. the current TP mini-action), we don't reuse any of
			 * them even if they don't have a cse. This is to ensure that the current action doesn't
			 * encounter a restart due to cdb_sc_lostcr in "tp_hist" even in the fourth retry.
			 */
			tp_srch_status = NULL;
			if (dollar_tlevel && (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&cr->blk)))
				&& (tp_srch_status = (srch_blk_status *)tabent->value) && (tp_srch_status->cse))
			{	/* this process is already using the block - skip it */
				cr->refer = TRUE;
				continue;
			}
			if (NULL != lookup_hashtab_int4(&cw_stagnate, (uint4 *)&cr->blk))
			{	/* this process is already using the block for the current gvcst_search - skip it */
				cr->refer = TRUE;
				continue;
			}
			if (NULL != tp_srch_status)
			{	/* About to reuse a buffer that is part of the read-set of the current TP transaction.
				 * Reset the clue, as otherwise the next global reference of that global would use an
				 * out-of-date clue. Even though tp_srch_status is available right after the
				 * sgm_info_ptr->blks_in_use hashtable check, we don't want to reset the clue in case the
				 * cw_stagnate hashtable check causes the same cr to be skipped from reuse. Hence the
				 * placement of this reset logic AFTER the cw_stagnate check.
				 */
				tp_srch_status->blk_target->clue.end = 0;
			}
		}
		if (cr->dirty)
		{	/* Note that in Unix, it is possible that we see a stale value of cr->dirty (possible if a
			 * concurrent "wcs_wtstart" has reset dirty to 0 but that update did not reach us yet). In this
			 * case the call to "wcs_get_space" below will do the necessary memory barrier instructions
			 * (through calls to "aswp") which will allow us to see the non-stale value of cr->dirty.
			 *
			 * It is also possible that cr->dirty is non-zero but < cr->flushed_dirty_tn. In this case, the
			 * wcs_get_space done below will return FALSE, forcing a cache-rebuild which will fix this situation.
			 *
			 * In VMS, another process cannot be concurrently resetting cr->dirty to 0, as the resetting routine
			 * is "wcs_wtfini", which is executed in crit, which another process cannot be in as we are in crit now.
			 */
			if (gv_cur_region->read_only)
				continue;
			if (lcnt < pass1)
			{
				if (!csa->timer && (csa->nl->wcs_timers < 1))
					wcs_timer_start(gv_cur_region, FALSE);
				continue;
			}
			BG_TRACE_PRO(db_csh_getn_flush_dirty);
			if (FALSE == wcs_get_space(gv_cur_region, 0, cr))
			{	/* failed to flush it out - force a rebuild */
				BG_TRACE_PRO(wc_blocked_db_csh_getn_wcsstarvewrt);
				assert(csd->wc_blocked);	/* only reason we currently know why wcs_get_space could fail */
				assert(gtm_white_box_test_case_enabled);
				break;
			}
			assert(0 == cr->dirty);
		}
		UNIX_ONLY(
			/* The cache-record is not free for reuse until the write-latch value becomes LATCH_CLEAR.
			 * In VMS, resetting the write-latch value occurs in "wcs_wtfini", which runs in crit, so we are
			 * fine. In Unix, this resetting is done by "wcs_wtstart", which is out-of-crit. Therefore, we
			 * need to wait for this value to be LATCH_CLEAR before reusing this cache-record.
			 * Note that we are examining the write-latch value without holding the interlock. It is ok to do
			 * this because the only two routines that modify the latch value are "bg_update" and
			 * "wcs_wtstart". The former cannot be concurrently executing because we are in crit.
			 * The latter will not update the latch value unless this cache-record is dirty. But in that
			 * case we would most likely have gone through the if (cr->dirty) check above. Most likely,
			 * because there is one rare possibility where a concurrent "wcs_wtstart" has set cr->dirty
			 * to 0 but not yet cleared the latch. In that case we wait for the latch to be cleared.
			 * In all other cases, nobody has been modifying the latch since we got crit, and therefore
			 * it is safe to observe the value of the latch without holding the interlock.
			 */
			if (LATCH_CLEAR != WRITE_LATCH_VAL(cr))
			{	/* possible if a concurrent "wcs_wtstart" has set cr->dirty to 0 but not yet
				 * cleared the latch. this should be very rare though.
				 */
				if (lcnt < pass2)
					continue;	/* try to find some other cache-record to reuse until the 3rd pass */
				for (ocnt = 1; (MAXWRTLATCHWAIT >= ocnt) && (LATCH_CLEAR != WRITE_LATCH_VAL(cr)); ocnt++)
					wcs_sleep(SLEEP_WRTLATCHWAIT);	/* since it is a short lock, sleep the minimum */
				if (MAXWRTLATCHWAIT <= ocnt)
				{
					BG_TRACE_PRO(db_csh_getn_wrt_latch_stuck);
					assert(FALSE);
					continue;
				}
			}
		)
		/* Note that before setting up a buffer for the requested block, we should make sure the cache-record's
		 * read_in_progress is set. This is so that no one else in t_qread gets access to this empty buffer.
		 * By "setting up a buffer" is meant assigning cr->blk as well as inserting the cr in the blkques
		 * through "shuffqth" below.
		 * Note that "t_qread" has special code to handle read_in_progress.
		 */
		LOCK_BUFF_FOR_READ(cr, rip);
		if (0 != rip)
		{
			if (lcnt < pass2)
			{	/* Someone is reading into this cache record; leave it alone for two passes.
				 * This is because if somebody is reading it, it is most likely to be referred to very soon.
				 * If we replace this, we will definitely be causing a restart for the reader.
				 * Instead of that, see if some other cache record fits in for us.
				 */
				RELEASE_BUFF_READ_LOCK(cr);
				continue;
			}
			for (ocnt = 1; 0 != rip && BUF_OWNER_STUCK >= ocnt; ocnt++)
			{
				RELEASE_BUFF_READ_LOCK(cr);
				/* The owner has been unable to complete the read - check for some things before going to
				 * sleep. Since cr->r_epid can be changing concurrently, take a local copy before using it
				 * below, particularly before calling is_proc_alive, as we don't want to call it with a
				 * 0 r_epid.
				 */
				latest_r_epid = cr->r_epid;
				if (cr->read_in_progress < -1)
				{
					BG_TRACE_PRO(db_csh_getn_out_of_design);	/* outside of design; clear to known state */
					send_msg(VARLSTCNT(4) ERR_INVALIDRIP, 2, DB_LEN_STR(gv_cur_region));
					assert(cr->r_epid == 0);
					cr->r_epid = 0;
					INTERLOCK_INIT(cr);
				} else if (0 != latest_r_epid)
				{
					if (is_proc_alive(latest_r_epid, cr->image_count))
					{
#						ifdef DEBUG
						if ((BUF_OWNER_STUCK / 2) == ocnt)
							GET_C_STACK_FROM_SCRIPT("BUFRDTIMEOUT", process_id, latest_r_epid, ONCE);
#						endif
						TRACE_AND_SLEEP(ocnt);
					} else
					{
						cr->r_epid = 0;
						INTERLOCK_INIT(cr);	/* Process gone, release that process's lock */
					}
				} else
				{
					TRACE_AND_SLEEP(ocnt);
				}
				LOCK_BUFF_FOR_READ(cr, rip);
			}
			if ((BUF_OWNER_STUCK < ocnt) && (0 != rip))
			{
				BG_TRACE_PRO(db_csh_getn_buf_owner_stuck);
				if (0 != latest_r_epid)
				{
					if (first_r_epid != latest_r_epid)
						GTMASSERT;
					GET_C_STACK_FROM_SCRIPT("BUFRDTIMEOUT", process_id, latest_r_epid,
						DEBUG_ONLY(TWICE) PRO_ONLY(ONCE));
					RELEASE_BUFF_READ_LOCK(cr);
					send_msg(VARLSTCNT(8) ERR_BUFRDTIMEOUT, 6, process_id,
						cr->blk, cr, first_r_epid, DB_LEN_STR(gv_cur_region));
					continue;
				}
				cr->r_epid = 0;
				INTERLOCK_INIT(cr);
				LOCK_BUFF_FOR_READ(cr, rip);
				assert(0 == rip);	/* Since we hold crit, we expect to get the lock */
				if (0 != rip)
					continue;
				/* We successfully obtained the lock, so we can fall out of this block */
			}
		}
		assert(0 == rip);
		/* no other process "owns" the block */
		if (CDB_STAGNATE <= t_tries || mu_reorg_process)
		{	/* this should probably use cr->in_cw_set with a condition handler to clean up */
			CWS_INSERT(block);
		}
		assert(LATCH_CLEAR == WRITE_LATCH_VAL(cr));
		/* got a block - set it up */
		assert(0 == cr->epid);
		assert(0 == cr->r_epid);
		cr->r_epid = process_id;	/* establish ownership */
		cr->image_count = image_count;
		cr->blk = block;
		/* We want cr->read_in_progress to be locked BEFORE cr->cycle is incremented. t_qread relies on this
		 * order. Enforce this order with a write memory barrier. Not doing so might cause the incremented
		 * cr->cycle to be seen by another process even though it sees the unlocked state of
		 * cr->read_in_progress. This could cause t_qread to incorrectly return with an up-to-date cr->cycle
		 * even though the buffer is still being read in from disk, and this could cause db integ errors as
		 * validation (in t_end/tp_tend, which relies on cr->cycle) would detect no problems even though there
		 * is one. Note that this memory barrier is still needed even though there is a memory barrier
		 * connotation in the LOCK_BUFF_FOR_READ() macro above: LOCK_BUFF_FOR_READ() does a read-type memory
		 * barrier whereas here we need a write barrier.
		 */
		SHM_WRITE_MEMORY_BARRIER;
		cr->cycle++;
		cr->jnl_addr = 0;
		cr->refer = TRUE;
		if (cr->bt_index != 0)
		{
			bt = (bt_rec_ptr_t)GDS_REL2ABS(cr->bt_index);
			bt->cache_index = CR_NOTVALID;
			cr->bt_index = 0;
		}
		q0 = (cache_rec_ptr_t)((sm_uc_ptr_t)cr + cr->blkque.fl);
		shuffqth((que_ent_ptr_t)q0, (que_ent_ptr_t)hdr);
		assert(0 == cr->dirty);
		csa->nl->cur_lru_cache_rec_off = GDS_ABS2REL(cr);
		if (lcnt > pass1)
			csa->nl->cache_hits = 0;
		csa->nl->cache_hits++;
		if (csa->nl->cache_hits > csd->n_bts)
		{
			flsh_trigger = csd->flush_trigger;
			csd->flush_trigger = MIN(flsh_trigger + MAX(flsh_trigger / STEP_FACTOR, 1), MAX_FLUSH_TRIGGER(csd->n_bts));
			csa->nl->cache_hits = 0;
		}
		INCR_DB_CSH_COUNTER(csa, n_db_csh_getn_lcnt, lcnt);
		ENABLE_INTERRUPTS(INTRPT_IN_DB_CSH_GETN);
		return cr;
	}
	/* force a cache recovery */
	INCR_DB_CSH_COUNTER(csa, n_db_csh_getn_lcnt, lcnt);
	csa->nl->cur_lru_cache_rec_off = GDS_ABS2REL(cr);
	ENABLE_INTERRUPTS(INTRPT_IN_DB_CSH_GETN);
	return (cache_rec_ptr_t)CR_NOTVALID;
}
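
#if 0
/* Illustrative sketch only, not part of the original file: how a reader in the style of "t_qread"
 * might consume db_csh_getn()'s return value. The helper name read_block_into_cache is hypothetical,
 * the disk read is elided, and the real t_qread protocol (cycle/validation handling, wc_blocked
 * signalling on CR_NOTVALID) is intentionally omitted.
 */
static cache_rec_ptr_t read_block_into_cache(block_id block)
{
	cache_rec_ptr_t	cr;

	assert(cs_addrs->now_crit);		/* db_csh_getn requires crit to be held on the region */
	cr = db_csh_getn(block);
	if ((cache_rec_ptr_t)CR_NOTVALID == cr)
		return NULL;			/* caller would set wc_blocked and force a cache recovery */
	/* At this point cr->read_in_progress is locked and cr->r_epid == process_id, so concurrent
	 * t_qread calls wait on this record instead of handing out the still-empty buffer.
	 */
	/* ... read the block image from disk into the buffer associated with cr ... */
	RELEASE_BUFF_READ_LOCK(cr);		/* publish the buffer once the read completes */
	return cr;
}
#endif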