/****************************************************************
 *                                                              *
 *      Copyright 2001, 2011 Fidelity Information Services, Inc *
 *                                                              *
 *      This source code contains the intellectual property    *
 *      of its copyright holder(s), and is made available      *
 *      under a license.  If you do not know the terms of      *
 *      the license, please stop and do not read further.      *
 *                                                              *
 ****************************************************************/

#include "mdef.h"

#include <signal.h>             /* needed for VSIG_ATOMIC_T */

#include "gdsroot.h"
#include "gdsblk.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "gdskill.h"
#include "gdscc.h"
#include "filestruct.h"
#include "interlock.h"
#include "jnl.h"
#include "buddy_list.h"         /* needed for tp.h */
#include "hashtab.h"            /* needed for cws_insert.h */
#include "hashtab_int4.h"       /* needed for tp.h and cws_insert.h */
#include "tp.h"
#include "gdsbgtr.h"
#include "min_max.h"
#include "sleep_cnt.h"
#include "send_msg.h"
#include "relqop.h"
#include "is_proc_alive.h"
#include "cache.h"
#include "longset.h"            /* needed for cws_insert.h */
#include "cws_insert.h"
#include "wcs_sleep.h"
#include "wcs_get_space.h"
#include "wcs_timer_start.h"
#include "add_inter.h"
#include "wbox_test_init.h"
#include "have_crit.h"
#include "memcoherency.h"
#include "gtm_c_stack_trace.h"

GBLREF sgmnt_addrs      *cs_addrs;
GBLREF gd_region        *gv_cur_region;
GBLREF uint4            process_id;
GBLREF uint4            image_count;
GBLREF unsigned int     t_tries;
GBLREF uint4            dollar_tlevel;
GBLREF sgm_info         *sgm_info_ptr;
GBLREF boolean_t        mu_reorg_process;

/* Record the start of a read-in-progress wait on the first iteration, then sleep for "ocnt" ticks */
#define TRACE_AND_SLEEP(ocnt)                                   \
{                                                               \
        if (1 == ocnt)                                          \
        {                                                       \
                BG_TRACE_PRO(db_csh_getn_rip_wait);             \
                first_r_epid = latest_r_epid;                   \
        }                                                       \
        wcs_sleep(ocnt);                                        \
}

error_def(ERR_BUFRDTIMEOUT);
error_def(ERR_INVALIDRIP);

cache_rec_ptr_t db_csh_getn(block_id block)
{
        cache_rec_ptr_t         hdr, q0, start_cr, cr;
        bt_rec_ptr_t            bt;
        unsigned int            lcnt, ocnt;
        int                     rip, max_ent, pass1, pass2, pass3;
        int4                    flsh_trigger;
        uint4                   first_r_epid, latest_r_epid;
        sgmnt_addrs             *csa;
        sgmnt_data_ptr_t        csd;
        srch_blk_status         *tp_srch_status;
        ht_ent_int4             *tabent;

        csa = cs_addrs;
        csd = csa->hdr;
        assert(csa->now_crit);
        assert(csa == &FILE_INFO(gv_cur_region)->s_addrs);
        max_ent = csd->n_bts;
        cr = (cache_rec_ptr_t)GDS_REL2ABS(csa->nl->cur_lru_cache_rec_off);
        hdr = csa->acc_meth.bg.cache_state->cache_array + (block % csd->bt_buckets);
        start_cr = csa->acc_meth.bg.cache_state->cache_array + csd->bt_buckets;
        pass1 = max_ent;        /* skip referred or dirty or read-into cache records */
        pass2 = 2 * max_ent;    /* skip referred cache records */
        pass3 = 3 * max_ent;    /* skip nothing */
        INCR_DB_CSH_COUNTER(csa, n_db_csh_getns, 1);
        DEFER_INTERRUPTS(INTRPT_IN_DB_CSH_GETN);
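        /* The loop below is essentially a clock-style sweep over the cache, relaxing its reuse criteria each
         * time it has examined all max_ent records (see the pass1/pass2/pass3 settings above):
         *      lcnt <  pass1           : skip referred, dirty, and read-in-progress records
         *      pass1 <= lcnt < pass2   : skip referred records, but flush dirty ones synchronously
         *      pass2 <= lcnt <= pass3  : attempt reuse even if the record is referred
         * For example, with csd->n_bts = 1000, at most 3000 records are examined before the loop gives up
         * and returns CR_NOTVALID to force a cache recovery.
         */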
        for (lcnt = 0; ; lcnt++)
        {
                if (lcnt > pass3)
                {
                        BG_TRACE_PRO(wc_blocked_db_csh_getn_loopexceed);
                        assert(FALSE);
                        break;
                }
                cr++;
                if (cr == start_cr + max_ent)
                        cr = start_cr;
                VMS_ONLY(
                        if ((lcnt == pass1) || (lcnt == pass2))
                                wcs_wtfini(gv_cur_region);
                )
                if (cr->refer && (lcnt < pass2))
                {       /* in passes 1 & 2, set refer to FALSE and skip; in the third pass attempt reuse even if TRUE == refer */
                        cr->refer = FALSE;
                        continue;
                }
                if (cr->in_cw_set || cr->in_tend)
                {       /* some process already has this pinned for reading and/or updating - skip it */
                        cr->refer = TRUE;
                        continue;
                }
                if (CDB_STAGNATE <= t_tries || mu_reorg_process)
                {       /* Prevent stepping on self when holding crit for the entire transaction.
                         * This is done by looking up the block in sgm_info_ptr->blks_in_use and cw_stagnate.
                         * The following two hashtable lookups are not similar, since in TP, sgm_info_ptr->blks_in_use
                         * is updated to the latest cw_stagnate list of blocks only in "tp_hist".
                         * Also note that the lookup in sgm_info_ptr->blks_in_use reuses blocks that don't have cse's.
                         * This is to allow big-read TP transactions which may use up more than the available global buffers.
                         * There is one issue here in that a block that has only been read till now may be stepped upon here
                         * but may later be needed for update. It is handled by updating the block's corresponding
                         * entry in the set of histories (the sgm_info_ptr->first_tp_hist[index] structure) to hold the
                         * "cr" and "cycle" of the t_qread done for the block when it is intended to be changed for the
                         * first time within the transaction, since otherwise the transaction would restart due to a
                         * cdb_sc_lostcr status. Note that "tn" (read_tn of the block) in the first_tp_hist will still
                         * remain the "tn" when the block was first read within this transaction, to ensure the block
                         * hasn't been modified since the start of the transaction. Once we intend to change the
                         * block, i.e. srch_blk_status->cse is non-NULL, we ensure in the code below not to step on it.
                         * ["tp_hist" is the routine that updates the "cr", "cycle" and "tn" of the block.]
                         * Note that usually in a transaction the first_tp_hist[] structure holds the "cr", "cycle", and "tn"
                         * of the first t_qread of the block within that transaction. The above is the only exception.
                         * Also note that for blocks in cw_stagnate (i.e. the current TP mini-action), we don't reuse any of
                         * them even if they don't have a cse. This is to ensure that the current action doesn't
                         * encounter a restart due to cdb_sc_lostcr in "tp_hist" even in the fourth retry.
                         */
                        tp_srch_status = NULL;
                        if (dollar_tlevel
                                && (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&cr->blk)))
                                && (tp_srch_status = (srch_blk_status *)tabent->value)
                                && (tp_srch_status->cse))
                        {       /* this process is already using the block - skip it */
                                cr->refer = TRUE;
                                continue;
                        }
                        if (NULL != lookup_hashtab_int4(&cw_stagnate, (uint4 *)&cr->blk))
                        {       /* this process is already using the block for the current gvcst_search - skip it */
                                cr->refer = TRUE;
                                continue;
                        }
                        if (NULL != tp_srch_status)
                        {       /* About to reuse a buffer that is part of the read-set of the current TP transaction.
                                 * Reset the clue, as otherwise the next global reference of that global will use an
                                 * out-of-date clue. Even though tp_srch_status is available right after the
                                 * sgm_info_ptr->blks_in_use hashtable check, we don't want to reset the clue in case
                                 * the cw_stagnate hashtable check causes the same cr to be skipped from reuse. Hence
                                 * the placement of this reset logic AFTER the cw_stagnate check.
                                 */
                                tp_srch_status->blk_target->clue.end = 0;
                        }
                }
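                /* A dirty buffer cannot be reused until its contents reach disk. In the first pass we only make
                 * sure a flush timer is running and move on; from the second pass onwards we synchronously flush
                 * this cache-record via "wcs_get_space" before considering it further.
                 */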
                if (cr->dirty)
                {       /* Note that in Unix, it is possible that we see a stale value of cr->dirty (possible if a
                         * concurrent "wcs_wtstart" has reset dirty to 0 but that update has not reached us yet). In this
                         * case the call to "wcs_get_space" below will do the necessary memory barrier instructions
                         * (through calls to "aswp") which will allow us to see the non-stale value of cr->dirty.
                         *
                         * It is also possible that cr->dirty is non-zero but < cr->flushed_dirty_tn. In this case, the
                         * "wcs_get_space" done below will return FALSE, forcing a cache-rebuild which will fix this
                         * situation.
                         *
                         * In VMS, another process cannot be concurrently resetting cr->dirty to 0 as the resetting routine
                         * is "wcs_wtfini", which is executed in crit, which another process cannot be in as we are in
                         * crit now.
                         */
                        if (gv_cur_region->read_only)
                                continue;
                        if (lcnt < pass1)
                        {
                                if (!csa->timer && (csa->nl->wcs_timers < 1))
                                        wcs_timer_start(gv_cur_region, FALSE);
                                continue;
                        }
                        BG_TRACE_PRO(db_csh_getn_flush_dirty);
                        if (FALSE == wcs_get_space(gv_cur_region, 0, cr))
                        {       /* failed to flush it out - force a rebuild */
                                BG_TRACE_PRO(wc_blocked_db_csh_getn_wcsstarvewrt);
                                assert(csd->wc_blocked);        /* only reason we currently know why wcs_get_space could fail */
                                assert(gtm_white_box_test_case_enabled);
                                break;
                        }
                        assert(0 == cr->dirty);
                }
                UNIX_ONLY(
                        /* The cache-record is not free for reuse until the write-latch value becomes LATCH_CLEAR.
                         * In VMS, resetting the write-latch value occurs in "wcs_wtfini", which runs in crit, so we are fine.
                         * In Unix, this resetting is done by "wcs_wtstart", which is out-of-crit. Therefore, we need to
                         * wait for this value to be LATCH_CLEAR before reusing this cache-record.
                         * Note that we are examining the write-latch value without holding the interlock. It is ok to do
                         * this because the only two routines that modify the latch value are "bg_update" and
                         * "wcs_wtstart". The former cannot be concurrently executing because we are in crit. The latter
                         * will not update the latch value unless this cache-record is dirty. But in that case we would
                         * most likely have gone through the if (cr->dirty) check above. Most likely, because there is
                         * one rare possibility where a concurrent "wcs_wtstart" has set cr->dirty to 0 but not yet
                         * cleared the latch. In that case we wait for the latch to be cleared. In all other cases,
                         * nobody has been modifying the latch since we got crit, and therefore it is safe to observe
                         * the value of the latch without holding the interlock.
                         */
                        if (LATCH_CLEAR != WRITE_LATCH_VAL(cr))
                        {       /* possible if a concurrent "wcs_wtstart" has set cr->dirty to 0 but not yet
                                 * cleared the latch; this should be very rare though
                                 */
                                if (lcnt < pass2)
                                        continue;       /* try to find some other cache-record to reuse until the 3rd pass */
                                for (ocnt = 1; (MAXWRTLATCHWAIT >= ocnt) && (LATCH_CLEAR != WRITE_LATCH_VAL(cr)); ocnt++)
                                        wcs_sleep(SLEEP_WRTLATCHWAIT);  /* since it is a short lock, sleep the minimum */
                                if (MAXWRTLATCHWAIT <= ocnt)
                                {
                                        BG_TRACE_PRO(db_csh_getn_wrt_latch_stuck);
                                        assert(FALSE);
                                        continue;
                                }
                        }
                )
                /* Note that before setting up a buffer for the requested block, we should make sure the cache-record's
                 * read_in_progress is set. This is so that no one else in t_qread gets access to this empty buffer.
                 * Setting up a buffer here means assigning cr->blk in addition to inserting the cr into the blkques
                 * through "shuffqth" below. Note that "t_qread" has special code to handle read_in_progress.
                 */
                LOCK_BUFF_FOR_READ(cr, rip);
                if (0 != rip)
                {
                        if (lcnt < pass2)
                        {       /* Someone is reading into this cache record - leave it alone for two passes.
                                 * If somebody is reading it, it is most likely to be referred to very soon, and
                                 * replacing it would definitely cause a restart for the reader. Instead, see if
                                 * some other cache record fits in for us.
                                 */
                                RELEASE_BUFF_READ_LOCK(cr);
                                continue;
                        }
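                        /* From here on, wait for the reader: sleep up to BUF_OWNER_STUCK times while the owning
                         * process is alive, seize its read lock if the process has died, and re-initialize the
                         * interlock if read_in_progress has gone outside its designed range.
                         */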
                        for (ocnt = 1; (0 != rip) && (BUF_OWNER_STUCK >= ocnt); ocnt++)
                        {
                                RELEASE_BUFF_READ_LOCK(cr);
                                /* The owner has been unable to complete the read - check some things before going to sleep.
                                 * Since cr->r_epid can be changing concurrently, take a local copy before using it below,
                                 * particularly before calling is_proc_alive, as we don't want to call it with a 0 r_epid.
                                 */
                                latest_r_epid = cr->r_epid;
                                if (cr->read_in_progress < -1)
                                {       /* outside of design; clear to known state */
                                        BG_TRACE_PRO(db_csh_getn_out_of_design);
                                        send_msg(VARLSTCNT(4) ERR_INVALIDRIP, 2, DB_LEN_STR(gv_cur_region));
                                        assert(0 == cr->r_epid);
                                        cr->r_epid = 0;
                                        INTERLOCK_INIT(cr);
                                } else if (0 != latest_r_epid)
                                {
                                        if (is_proc_alive(latest_r_epid, cr->image_count))
                                        {
#                                               ifdef DEBUG
                                                if ((BUF_OWNER_STUCK / 2) == ocnt)
                                                        GET_C_STACK_FROM_SCRIPT("BUFRDTIMEOUT", process_id,
                                                                latest_r_epid, ONCE);
#                                               endif
                                                TRACE_AND_SLEEP(ocnt);
                                        } else
                                        {
                                                cr->r_epid = 0;
                                                INTERLOCK_INIT(cr);     /* process gone; release that process's lock */
                                        }
                                } else
                                        TRACE_AND_SLEEP(ocnt);
                                LOCK_BUFF_FOR_READ(cr, rip);
                        }
                        if ((BUF_OWNER_STUCK < ocnt) && (0 != rip))
                        {
                                BG_TRACE_PRO(db_csh_getn_buf_owner_stuck);
                                if (0 != latest_r_epid)
                                {
                                        if (first_r_epid != latest_r_epid)
                                                GTMASSERT;
                                        GET_C_STACK_FROM_SCRIPT("BUFRDTIMEOUT", process_id, latest_r_epid,
                                                DEBUG_ONLY(TWICE) PRO_ONLY(ONCE));
                                        RELEASE_BUFF_READ_LOCK(cr);
                                        send_msg(VARLSTCNT(8) ERR_BUFRDTIMEOUT, 6, process_id, cr->blk, cr, first_r_epid,
                                                DB_LEN_STR(gv_cur_region));
                                        continue;
                                }
                                cr->r_epid = 0;
                                INTERLOCK_INIT(cr);
                                LOCK_BUFF_FOR_READ(cr, rip);
                                assert(0 == rip);       /* since we hold crit, we expect to get the lock */
                                if (0 != rip)
                                        continue;
                                /* We successfully obtained the lock, so we can fall out of this block */
                        }
                }
                assert(0 == rip);       /* no other process "owns" the block */
                if (CDB_STAGNATE <= t_tries || mu_reorg_process)
                {       /* this should probably use cr->in_cw_set with a condition handler to clean up */
                        CWS_INSERT(block);
                }
                assert(LATCH_CLEAR == WRITE_LATCH_VAL(cr));
                /* got a block - set it up */
                assert(0 == cr->epid);
                assert(0 == cr->r_epid);
                cr->r_epid = process_id;        /* establish ownership */
                cr->image_count = image_count;
                cr->blk = block;
                /* We want cr->read_in_progress to be locked BEFORE cr->cycle is incremented; t_qread relies on this order.
                 * Enforce this order with a write memory barrier. Not doing so might cause the incremented cr->cycle to
                 * be seen by another process while it still sees the unlocked state of cr->read_in_progress. This could
                 * cause t_qread to incorrectly return with an up-to-date cr->cycle even though the buffer is still being
                 * read in from disk, and this could cause db integ errors, as validation (in t_end/tp_tend, which relies
                 * on cr->cycle) would detect no problem even though there is one. Note this memory barrier is still
                 * needed even though there is a memory barrier connotation in the LOCK_BUFF_FOR_READ() macro above:
                 * LOCK_BUFF_FOR_READ() does a read-type memory barrier, whereas here we need a write barrier.
                 */
                SHM_WRITE_MEMORY_BARRIER;
                cr->cycle++;
                cr->jnl_addr = 0;
                cr->refer = TRUE;
                if (0 != cr->bt_index)
                {       /* detach the block-table entry that still points at this cache-record */
                        bt = (bt_rec_ptr_t)GDS_REL2ABS(cr->bt_index);
                        bt->cache_index = CR_NOTVALID;
                        cr->bt_index = 0;
                }
                q0 = (cache_rec_ptr_t)((sm_uc_ptr_t)cr + cr->blkque.fl);
                shuffqth((que_ent_ptr_t)q0, (que_ent_ptr_t)hdr);
                assert(0 == cr->dirty);
                csa->nl->cur_lru_cache_rec_off = GDS_ABS2REL(cr);
                if (lcnt > pass1)
                        csa->nl->cache_hits = 0;
                csa->nl->cache_hits++;
                if (csa->nl->cache_hits > csd->n_bts)
                {       /* bump the flush trigger by flsh_trigger/STEP_FACTOR (at least 1), capped at the maximum */
                        flsh_trigger = csd->flush_trigger;
                        csd->flush_trigger = MIN(flsh_trigger + MAX(flsh_trigger / STEP_FACTOR, 1),
                                MAX_FLUSH_TRIGGER(csd->n_bts));
                        csa->nl->cache_hits = 0;
                }
                INCR_DB_CSH_COUNTER(csa, n_db_csh_getn_lcnt, lcnt);
                ENABLE_INTERRUPTS(INTRPT_IN_DB_CSH_GETN);
                return cr;
        }
        /* force a recover */
        INCR_DB_CSH_COUNTER(csa, n_db_csh_getn_lcnt, lcnt);
        csa->nl->cur_lru_cache_rec_off = GDS_ABS2REL(cr);
        ENABLE_INTERRUPTS(INTRPT_IN_DB_CSH_GETN);
        return (cache_rec_ptr_t)CR_NOTVALID;
}