fis-gtm/sr_port/wcs_phase2_commit_wait.c

337 lines
14 KiB
C

/****************************************************************
* *
* Copyright 2008, 2012 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "gtm_facility.h"
#include "gdsroot.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "sleep_cnt.h"
#include "gdsbgtr.h"
#include "memcoherency.h"
/* Include prototypes */
#include "wcs_phase2_commit_wait.h"
#include "gt_timer.h"
#include "wcs_sleep.h"
#include "rel_quant.h"
#include "send_msg.h"
#include "gtm_c_stack_trace.h"
#include "wbox_test_init.h"
error_def(ERR_COMMITWAITPID);
error_def(ERR_COMMITWAITSTUCK);
/* If BLOCKING_PID is non-zero, bump STUCK_CNT, request a C-stack trace of BLOCKING_PID (tagged "COMMITWAITPID") and
 * send a COMMITWAITPID message naming this process, BLOCKING_PID, the block number of cache-record CR and the
 * database file of CSA's region.
 *
 * BLOCKING_PID : pid read from a cache-record's in_tend field; 0 makes the macro a no-op
 * STUCK_CNT    : modifiable counter lvalue; incremented before being passed on to GET_C_STACK_FROM_SCRIPT
 * CR           : pointer to the cache-record whose "blk" is reported
 * CSA          : pointer to the sgmnt_addrs whose "region" is reported
 *
 * BLOCKING_PID and STUCK_CNT are evaluated more than once, so pass side-effect-free expressions.
 * The body is wrapped in do/while(0) so that the invocation (followed by ';') is a single statement and can be
 * used safely as the body of an unbraced if/else.
 */
#define SEND_COMMITWAITPID_GET_STACK_IF_NEEDED(BLOCKING_PID, STUCK_CNT, CR, CSA) \
do \
{ \
	GBLREF uint4 process_id; \
 \
	if (BLOCKING_PID) \
	{ \
		(STUCK_CNT)++; \
		GET_C_STACK_FROM_SCRIPT("COMMITWAITPID", process_id, BLOCKING_PID, STUCK_CNT); \
		send_msg(VARLSTCNT(8) ERR_COMMITWAITPID, 6, process_id, 1, BLOCKING_PID, (CR)->blk, \
			DB_LEN_STR((CSA)->region)); \
	} \
} while (0)
/* Take a C-stack trace (tagged "COMMITWAITPID_HALF_WAIT") of the process doing the phase2 commit once half of the
 * entire wait has elapsed. The trace is requested only if BLOCKING_PID is non-zero and is not this process.
 *
 * BLOCKING_PID : pid read from a cache-record's in_tend field
 * STUCK_CNT    : modifiable counter lvalue; incremented before being passed on to GET_C_STACK_FROM_SCRIPT
 *
 * Both arguments are evaluated more than once, so pass side-effect-free expressions.
 * The body is wrapped in do/while(0) so that the invocation (followed by ';') is a single statement and can be
 * used safely as the body of an unbraced if/else.
 */
#define GET_STACK_AT_HALF_WAIT_IF_NEEDED(BLOCKING_PID, STUCK_CNT) \
do \
{ \
	GBLREF uint4 process_id; \
 \
	if ((BLOCKING_PID) && (process_id != (BLOCKING_PID))) \
	{ \
		(STUCK_CNT)++; \
		GET_C_STACK_FROM_SCRIPT("COMMITWAITPID_HALF_WAIT", process_id, BLOCKING_PID, STUCK_CNT); \
	} \
} while (0)
/* Globals defined elsewhere that this file reads (none of them is written here) */
GBLREF uint4 process_id; /* compared against cr->in_tend values and reported in the COMMITWAIT* messages */
GBLREF int process_exiting; /* when non-zero, the heartbeat-based (rel_quant) wait is not used */
#ifdef DEBUG
GBLREF boolean_t in_mu_rndwn_file; /* asserted to be FALSE on entry to wcs_phase2_commit_wait */
#endif
#ifdef UNIX
GBLREF volatile uint4 heartbeat_counter; /* sampled to measure elapsed time in the heartbeat-based wait */
GBLREF volatile int4 timer_stack_count; /* the heartbeat-based wait is used only while this is less than 1 */
#endif
/* Wait for phase2 of one or more in-progress commits to finish.
*
* if cr == NULL, wait a maximum of 1 minute for ALL processes actively in bg_update_phase2 to finish, i.e. until
* cnl->wcs_phase2_commit_pidcnt is seen to be zero. The caller is expected to hold crit in this case (asserted).
* if cr != NULL, wait a maximum of 1 minute for the particular cache-record to be done with phase2 commit.
*
* This routine is invoked inside and outside of crit. If we hold crit, then we are guaranteed that cr->in_tend
* cannot get reset to a non-zero value different from what we saw when we started waiting. This is not
* guaranteed if we don't hold crit. In that case, we wait until cr->in_tend changes in value (zero or non-zero).
*
* Parameters :
*	csa - database the wait applies to; its file header (csa->hdr), node-local area (csa->nl), crit state
*	      (csa->now_crit) and region (for messages) are read.
*	cr  - cache-record to wait for, or NULL to wait for all phase2 commits.
*
* Returns : TRUE if waiting event completed before timeout, FALSE otherwise. More precisely
*	TRUE  - the access method is not BG (pro builds only; asserted otherwise).
*	TRUE  - cr != NULL and cr->in_tend was already zero on entry, or changed from its entry value during the wait.
*	TRUE  - cr != NULL, we do not hold crit and cr->in_tend is our own pid (see comment in the code).
*	TRUE  - cr == NULL and cnl->wcs_phase2_commit_pidcnt became zero.
*	FALSE - cr != NULL, we do not hold crit and cnl->wc_blocked is set (caller is expected to restart).
*	FALSE - the wait timed out. In this case a COMMITWAITPID message is sent (and a C-stack trace requested) for
*	        each blocking pid that was noted, followed by a COMMITWAITSTUCK message.
* If cr != NULL, we hold crit and cr->in_tend is our own pid, the routine GTMASSERTs instead of returning.
*/
boolean_t wcs_phase2_commit_wait(sgmnt_addrs *csa, cache_rec_ptr_t cr)
{
sgmnt_data_ptr_t csd;
node_local_ptr_t cnl;
uint4 lcnt, blocking_pid, start_in_tend, spincnt, maxspincnt, start_heartbeat, heartbeat_delta;
int4 value;
boolean_t was_crit;
boolean_t use_heartbeat;
block_id blk;
# ifdef VMS
uint4 heartbeat_counter = 0; /* dummy variable to make compiler happy */
# endif
int4 index, crarray_size, crarray_index;
cache_rec_ptr_t cr_lo, cr_top, curcr;
phase2_wait_trace_t crarray[MAX_PHASE2_WAIT_CR_TRACE_SIZE];
# ifdef DEBUG
uint4 incrit_pid, phase2_commit_half_wait;
int4 waitarray[1024];
int4 waitarray_size;
boolean_t half_time = FALSE;
# endif
static uint4 stuck_cnt = 0; /* stuck_cnt signifies the number of times the same process
has called gtmstuckexec for the same condition*/
DEBUG_ONLY(cr_lo = cr_top = NULL;)
/* Number of elements (not bytes) in the local book-keeping arrays */
crarray_size = SIZEOF(crarray) / SIZEOF(crarray[0]);
DEBUG_ONLY(waitarray_size = SIZEOF(waitarray) / SIZEOF(waitarray[0]);)
assert(!in_mu_rndwn_file);
csd = csa->hdr;
/* To avoid unnecessary time spent waiting, we would like to do rel_quants instead of wcs_sleep. But this means
* we need to have some other scheme for limiting the total time slept. We use the heartbeat scheme which currently
* is available only in Unix. Every 8 seconds or so, the heartbeat timer increments a counter. But there are two
* cases where heartbeat_timer will not pop:
* (a) if we are in the process of exiting (through a call to cancel_timer(0) which cancels all active timers)
* (b) if we are already in timer_handler. This is possible if the flush timer pops and we end up invoking
* wcs_clean_dbsync->wcs_flu->wcs_phase2_commit_wait. But since the heartbeat timer cannot pop as long as
* timer_in_handler is TRUE (which it will be until at least we exit this function), we cannot use the heartbeat
* scheme in this case as well.
* Therefore, if heartbeat timer is available and currently active, then use rel_quants. If not, use wcs_sleep.
* We have found that doing rel_quants (instead of sleeps) causes huge CPU usage in Tru64 even if the default spincnt is
* set to 0 and ALL processes are only waiting for one process to finish its phase2 commit. Therefore we choose
* the sleep approach for Tru64. Choosing a spincnt of 0 would choose the sleep approach (versus rel_quant).
*/
# if (defined(UNIX) && !defined(__osf__))
use_heartbeat = (!process_exiting && csd->wcs_phase2_commit_wait_spincnt && (1 > timer_stack_count));
# else
use_heartbeat = FALSE;
# endif
/* The half-way point is expressed in the same unit as the timeout check used further down :
* heartbeats if use_heartbeat, sleep iterations (lcnt) otherwise.
*/
DEBUG_ONLY(phase2_commit_half_wait = use_heartbeat ? (PHASE2_COMMIT_WAIT_HTBT >> 1) : (PHASE2_COMMIT_WAIT >> 1);)
if (use_heartbeat)
{
/* The spincnt in the file header is re-read here (use_heartbeat was computed from an earlier read), hence
* the assert plus the pro-safe fallback to the default if it is found to be zero this time.
*/
maxspincnt = csd->wcs_phase2_commit_wait_spincnt;
assert(maxspincnt);
if (!maxspincnt)
maxspincnt = WCS_PHASE2_COMMIT_DEFAULT_SPINCNT;
/* Note down the heartbeat at the start of the wait; elapsed time is measured relative to this */
start_heartbeat = heartbeat_counter;
}
assert(dba_bg == csd->acc_meth);
if (dba_bg != csd->acc_meth) /* in pro, be safe and return */
return TRUE;
cnl = csa->nl;
was_crit = csa->now_crit;
/* Waiting for ALL phase2 commits (cr == NULL) is expected only while holding crit */
assert((NULL != cr) || was_crit);
if (NULL != cr)
{
/* Value of cr->in_tend at the start of the wait; the wait below ends as soon as it is seen to differ */
start_in_tend = cr->in_tend;
/* Normally we should never find ourselves holding the lock on the cache-record we are waiting for. There is
* one exception though. And that is if we had encountered an error in the middle of phase1 or phase2 of the
* commit and ended up invoking "secshr_db_clnup" to finish the transaction for us. It is possible that we
* then proceeded with the next transaction doing a "t_qread" without any process invoking "wcs_recover"
* (possible only if they did a "grab_crit") until then. In that case, we could have one or more cache-records
* with non-zero value of cr->in_tend identical to our process_id. Since we will fix these cache-records
* while grabbing crit (which we have to before doing validation in t_end/tp_tend), it is safe to assume
* this block is not being touched for now and return right away. But this exception is possible only if
* we don't already hold crit (i.e. called from "t_qread"). In addition, errors in the midst of commit are
* possible only if we have enabled white-box testing. Assert accordingly.
*/
/* we better not deadlock wait for ourself */
if (!was_crit && (process_id == start_in_tend))
{
assert(gtm_white_box_test_case_enabled);
return TRUE;
}
if (process_id == start_in_tend)
GTMASSERT; /* should not deadlock on our self */
/* No process has this cache-record in phase2 commit; nothing to wait for */
if (!start_in_tend)
return TRUE;
} else
{ /* initialize the beginning and the end of cache-records to be used later (only in case of cr == NULL) */
cr_lo = ((cache_rec_ptr_t)csa->acc_meth.bg.cache_state->cache_array) + csd->bt_buckets;
cr_top = cr_lo + csd->n_bts;
}
/* Spin & sleep/yield alternately for the phase2 commit to complete */
for (spincnt = 0, lcnt = 0; ; spincnt++)
{
SHM_READ_MEMORY_BARRIER; /* read memory barrier done to minimize time spent spinning waiting for value to change */
if (NULL == cr)
{
/* Waiting for all phase2 commits : done once the count of processes in phase2 drops to zero */
value = cnl->wcs_phase2_commit_pidcnt;
if (!value)
return TRUE;
} else
{ /* If we don't hold crit and are sleep looping waiting for cr->in_tend to become 0, it is
* theoretically possible (though very remote) that every one of the 1000s of iterations we look
* at the cache-record, cr->in_tend is set to the same pid even though the block could have
* been updated as part of multiple transactions. But we could have stopped the wait the moment the
* same buffer gets updated for the next transaction (even if by the same pid). To recognize that
* we note down the current db tn at the start of the wait and check if the block header tn
* throughout the wait gets higher than this. If so, we return right away even though cr->in_tend
* is non-zero. But since this comparison is done outside of crit it is possible that the block
* header tn could be temporarily GREATER than the db tn because of concurrent updates AND because
* an update to the 8-byte transaction number is not necessarily atomic AND because the block's tn
* that we read could be a mish-mash of low-order and high-order bytes taken from BEFORE and AFTER
* an update. Doing less than checks with these bad values is considered risky as a false return
* means a GTMASSERT in "t_end" or "tp_tend" in the PIN_CACHE_RECORD macro. Since this situation is
* almost an impossibility in practice, we handle this by returning FALSE after timing out and
* requiring the caller (t_qread) to restart. Eventually we will get crit (in the final retry) where
* we are guaranteed not to end up in this situation.
*/
value = cr->in_tend;
if (value != start_in_tend)
{
/* If we hold crit, the only change we expect to see is a reset to zero */
assert(!was_crit || !value);
return TRUE;
}
if (!was_crit && cnl->wc_blocked)
{ /* Some other process could be doing cache-recovery at this point and if it takes more than
* a minute, we will time out for no reason. No point proceeding with this transaction
* anyway as we are bound to restart. Do that right away. Caller knows to restart.
*/
return FALSE;
}
}
if (use_heartbeat)
{
/* Re-check the awaited value up to maxspincnt times before yielding the processor below */
if (spincnt < maxspincnt)
continue;
assert(spincnt == maxspincnt);
/* Number of heartbeats that have elapsed since the start of the wait */
heartbeat_delta = heartbeat_counter - start_heartbeat;
}
/* From here on is the once-per-sleep/yield part of the iteration. lcnt counts how many times we get here. */
spincnt = 0;
lcnt++;
/* Debug builds keep the most recently observed values in a circular buffer indexed by lcnt */
DEBUG_ONLY(waitarray[lcnt % waitarray_size] = value;)
if (NULL != cr)
{
if (was_crit)
{
BG_TRACE_PRO_ANY(csa, phase2_commit_wait_sleep_in_crit);
} else
{
BG_TRACE_PRO_ANY(csa, phase2_commit_wait_sleep_no_crit);
}
} else
{
BG_TRACE_PRO_ANY(csa, phase2_commit_wait_pidcnt);
}
if (use_heartbeat)
{
/* Timeout is measured in elapsed heartbeats; yield the processor between rounds of spinning */
if (PHASE2_COMMIT_WAIT_HTBT < heartbeat_delta)
break;
/* NOTE(review): heartbeat_delta presumably stays equal to the half-wait value for every iteration until
* the heartbeat counter next changes, so half_time could be TRUE on many consecutive iterations (each
* of which requests a stack trace below) - confirm whether that is intended.
*/
DEBUG_ONLY(half_time = (phase2_commit_half_wait == heartbeat_delta));
rel_quant();
} else
{
/* Timeout is measured in number of sleeps, each of PHASE2_COMMIT_SLEEP */
if (lcnt >= PHASE2_COMMIT_WAIT)
break;
DEBUG_ONLY(half_time = (phase2_commit_half_wait == lcnt));
wcs_sleep(PHASE2_COMMIT_SLEEP);
}
# ifdef DEBUG
/* Debug builds only : at the half-way point of the wait, request a C-stack trace of the blocking process(es) */
if (half_time)
{
if (NULL != cr)
{
blocking_pid = cr->in_tend; /* Get a more recent value */
GET_STACK_AT_HALF_WAIT_IF_NEEDED(blocking_pid, stuck_cnt);
} else
{
/* Scan every cache-record; a trace is requested for each one that has in_tend set to a pid other
* than ours (the same pid can therefore be traced more than once).
*/
assert((NULL != cr_lo) && (cr_lo < cr_top));
for (curcr = cr_lo; curcr < cr_top; curcr++)
{
blocking_pid = curcr->in_tend;
GET_STACK_AT_HALF_WAIT_IF_NEEDED(blocking_pid, stuck_cnt);
}
}
}
# endif
}
/* We get here only by breaking out of the above loop i.e. the wait timed out. Report who is holding us up. */
if (NULL == cr)
{ /* This is the case where we wait for all the phase2 commits to complete. Note down the cache records that
* are still not done with the commits. Since there can be multiple cache records held by the same PID, note
* down one cache record for each representative PID. We don't expect the list of distinct PIDs to be large.
* In any case, note down only as many as we can
*/
crarray_index = 0;
for (curcr = cr_lo; curcr < cr_top; curcr++)
{
blocking_pid = curcr->in_tend;
/* In rare cases, wcs_phase2_commit_wait could be invoked from bg_update_phase1 (via bt_put->wcs_get_space)
* when bg_update_phase1 has already pinned a few cache records (with our PID). We don't want to note down
* such cache records and hence the (blocking_pid != process_id) check below
*/
if (blocking_pid && (blocking_pid != process_id))
{
/* go through the book-keeping array to see if we have already noted down this PID. We don't
* expect many processes to be in the phase2 commit section concurrently. So, in most cases,
* we won't scan the array more than once
*/
for (index = 0; index < crarray_index; ++index)
if (crarray[index].blocking_pid == blocking_pid)
break;
if (index == crarray_index)
{ /* cache-record with distinct PID */
assert(crarray_size >= crarray_index);
/* Book-keeping array is full; stop scanning and report only what has been noted so far */
if (crarray_size <= crarray_index)
break;
crarray[crarray_index].blocking_pid = blocking_pid;
crarray[crarray_index].cr = curcr;
crarray_index++;
}
}
}
/* Issue COMMITWAITPID and get c-stack trace (if possible) for all the distinct PID noted down above */
for (index = 0; index < crarray_index; index++)
{ /* It is possible that cr->in_tend changed since the time we added it to the crarray array.
* Account for this by rechecking.
*/
curcr = crarray[index].cr;
blocking_pid = curcr->in_tend;
SEND_COMMITWAITPID_GET_STACK_IF_NEEDED(blocking_pid, stuck_cnt, curcr, csa);
}
} else
{ /* This is the case where we wait for a particular cache-record. Take the c-stack of the PID that is still
* holding this cr
*/
blocking_pid = cr->in_tend;
SEND_COMMITWAITPID_GET_STACK_IF_NEEDED(blocking_pid, stuck_cnt, cr, csa);
}
/* incrit_pid is only assigned, never read, in this function - presumably kept for inspection in a debugger/core */
DEBUG_ONLY(incrit_pid = cnl->in_crit;)
send_msg(VARLSTCNT(7) ERR_COMMITWAITSTUCK, 5, process_id, 1, cnl->wcs_phase2_commit_pidcnt, DB_LEN_STR(csa->region));
BG_TRACE_PRO_ANY(csa, wcb_phase2_commit_wait);
/* If called from wcs_recover(), we don't want to assert(FALSE) as it is possible (in case of STOP/IDs) that
* cnl->wcs_phase2_commit_pidcnt is non-zero even though there is no process in phase2 of commit. In this case
* wcs_recover will call wcs_verify which will clear the flag unconditionally and proceed with normal activity.
* So should not assert. If the caller is wcs_recover, then we expect cnl->wc_blocked to be non-zero. Assert
* that. If we are called from wcs_flu via ONLINE ROLLBACK, then wc_blocked will NOT be set. Instead, wcs_flu
* will return with a failure status back to ROLLBACK which will invoke wcs_recover and that will take care of
* resetting cnl->wcs_phase2_commit_pidcnt. But, ONLINE ROLLBACK called in a crash situation is done only with
* whitebox test cases. So, assert accordingly.
*/
assert(cnl->wc_blocked || (WBTEST_CRASH_SHUTDOWN_EXPECTED == gtm_white_box_test_case_number));
return FALSE;
}