fis-gtm/sr_unix/wcs_clean_dbsync.c

249 lines
13 KiB
C

/****************************************************************
* *
* Copyright 2001, 2013 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "gtm_fcntl.h" /* needed for silly aix's expansion of open to open64 */
#include "gtm_unistd.h"
#include "gdsroot.h"
#include "gtm_facility.h"
#include "gdskill.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "gdscc.h"
#include "jnl.h"
#include "buddy_list.h" /* for tp.h */
#include "hashtab_int4.h" /* needed for tp.h */
#include "tp.h" /* for tp_region definition */
#include "gt_timer.h" /* for TID definition */
#include "timers.h" /* for TIM_DEFER_DBSYNC #define */
#include "gdsbgtr.h" /* for the BG_TRACE_PRO macros */
#include "gtmio.h" /* for the GET_LSEEK_FLAG macro */
#include "wcs_clean_dbsync.h"
#include "wcs_flu.h"
#include "lockconst.h"
/* GTM_MALLOC_NO_RENT_ONLY(X) expands to X only when GTM_MALLOC_RENT is NOT defined and to nothing otherwise.
 * wcs_clean_dbsync uses it to include the gtmMallocDepth check in its "safe to sync now?" condition only in
 * builds without GTM_MALLOC_RENT.
 */
#ifdef GTM_MALLOC_RENT
# define GTM_MALLOC_NO_RENT_ONLY(X)
#else
# define GTM_MALLOC_NO_RENT_ONLY(X) X
#endif
/* Globals referenced by wcs_clean_dbsync.
 * gv_cur_region, cs_addrs and cs_data are saved on entry, switched to the region being synced and restored on exit.
 * The remaining non-debug globals are only read, to decide whether it is safe to do the sync from within the timer
 * handler or whether the sync has to be deferred.
 */
NOPIO_ONLY(GBLREF boolean_t *lseekIoInProgress_flags;) /* needed for the LSEEK* macros in gtmio.h */
GBLREF gd_region *gv_cur_region;
GBLREF sgmnt_addrs *cs_addrs;
GBLREF sgmnt_data_ptr_t cs_data;
GBLREF volatile int4 crit_count;
GBLREF volatile boolean_t in_mutex_deadlock_check;
GBLREF volatile int4 db_fsync_in_prog, jnl_qio_in_prog;
GBLREF volatile int4 fast_lock_count;
GBLREF volatile int4 gtmMallocDepth; /* Recursion indicator */
GBLREF boolean_t mupip_jnl_recover;
/* The following are referenced only in debug builds (asserts and the ok_to_call_wcs_recover save/restore) */
#ifdef DEBUG
GBLREF unsigned int t_tries;
GBLREF volatile boolean_t timer_in_handler;
#endif
/* Timer handler invoked when the dbsync timer of a database region pops.
 *
 * Sync the filehdr (and epoch in the journal file if before imaging). The goal is to sync the database (through
 * wcs_flu), but if we find ourselves in a situation where we would need to block on someone else, then we defer
 * this to the next round by starting a new dbsync timer with the TIM_DEFER_DBSYNC interval.
 *
 * Parameters:
 *	tid	- not referenced in this routine.
 *	hd_len	- not referenced in this routine.
 *	csaptr	- points to the sgmnt_addrs pointer of the region whose dbsync timer popped.
 *
 * Outline:
 *	1) Reset csa->dbsync_timer; return right away if the region is no longer open.
 *	2) Save gv_cur_region/cs_addrs/cs_data and switch to the region to be synced.
 *	3) If the region has no active level and we did not interrupt wcs_wtstart on it, check that it is safe to sync
 *	   now and that crit can be obtained without waiting. If so, invoke wcs_flu (only when it is not redundant) and
 *	   release crit. If not, start a deferred dbsync timer.
 *	4) Restore gv_cur_region/cs_addrs/cs_data.
 */
void wcs_clean_dbsync(TID tid, int4 hd_len, sgmnt_addrs **csaptr)
{
boolean_t dbsync_defer_timer;
gd_region *reg, *save_region;
jnl_private_control *jpc;
node_local_ptr_t cnl;
sgmnt_addrs *csa, *check_csaddrs, *save_csaddrs;
sgmnt_data_ptr_t csd, save_csdata;
NOPIO_ONLY(boolean_t lseekIoInProgress_flag;)
DEBUG_ONLY(boolean_t save_ok_to_call_wcs_recover;)
boolean_t is_mm;
DCL_THREADGBL_ACCESS;
SETUP_THREADGBL_ACCESS;
csa = *csaptr;
assert(timer_in_handler);
assert(csa->dbsync_timer); /* to ensure no duplicate dbsync timers */
CANCEL_DBSYNC_TIMER(csa); /* reset csa->dbsync_timer now that the dbsync timer has popped */
assert(!csa->dbsync_timer);
reg = csa->region;
/* Don't know how this can happen, but if region is closed, just return in PRO.
 * Nothing has been saved or switched yet at this point, so there is nothing to restore.
 */
if (!reg->open)
{
assert(FALSE);
return;
}
/* Remember the access method: in the MM case cs_data might need to be refreshed after wcs_flu (see below) */
is_mm = (dba_mm == reg->dyn.addr->acc_meth);
save_region = gv_cur_region; /* Save for later restore. See notes about restore at the end of this routine */
save_csaddrs = cs_addrs;
save_csdata = cs_data;
/* Note down the sgmnt_addrs of the region that was current when the timer popped (NULL if there was no current
 * region or if it is not open). It is used below to check if we are in crit/commit/write in that region.
 */
check_csaddrs = ((NULL == save_region || FALSE == save_region->open) ? NULL : (&FILE_INFO(save_region)->s_addrs));
/* Note the non-usage of TP_CHANGE_REG_IF_NEEDED macros since this routine can be timer driven. */
TP_CHANGE_REG(reg);
csd = csa->hdr;
cnl = csa->nl;
jpc = csa->jnl;
BG_TRACE_PRO_ANY(csa, n_dbsync_timers);
assert(csa == cs_addrs);
assert(!JNL_ALLOWED(csd) || NULL != jpc);
/* Note that even if the active queue was emptied when this routine was called, due to
 * concurrent update activity, cnl->wcs_active_lvl can be non-zero when we reach here. We
 * defer syncing in this case to the next time the active queue becomes empty (or when we
 * reach the next scheduled epoch_time -- in case of before-imaging) whichever is earlier.
 *
 * Note that if we are already in wcs_wtstart for this region, then invoking wcs_flu() won't
 * recurse on wcs_wtstart. In any case the interrupted wcs_wtstart invocation will take care
 * of the dbsync_timer once it is done. Therefore in this case too no need to do the dbsync.
 */
dbsync_defer_timer = FALSE; /* stays FALSE (i.e. no new timer is started here) if the "if" block below is skipped */
if (!cnl->wcs_active_lvl && !csa->in_wtstart)
{ /* Similar to wcs_stale, defer expensive IO flushing if any of the following is true.
 * 1) We are in the midst of lseek/read/write IO. This could reset an lseek.
 * 2) We are acquiring/releasing crit in any region (Strictly speaking it is enough
 * to check this in the current region, but doesn't harm us much).
 * Note that the function "mutex_deadlock_check" resets crit_count to 0 temporarily even though we
 * might actually be in the midst of acquiring crit. Therefore we should not interrupt mainline code
 * if we are in the "mutex_deadlock_check" as otherwise it presents reentrancy issues.
 * 3) We have crit in the current region OR are in the middle of commit for this region (even though
 * we don't hold crit) OR are in wcs_wtstart (potentially holding write interlock and keeping another
 * process in crit waiting) OR we need to wait to obtain crit. At least one reason why we should not wait
 * to obtain crit is because the timeout mechanism for the critical section is currently (as of 2004 May)
 * driven by heartbeat on Tru64, AIX, Solaris and HPUX. The periodic heartbeat handler cannot pop as
 * it is a SIGALRM handler and cannot nest while we are already in a SIGALRM handler for the wcs_clean_dbsync.
 * Were this to happen, we could end up waiting for crit, not being able to interrupt the wait
 * with a timeout resulting in a hang until crit became available.
 * 4) We are in a "fast lock".
 * 5) We are in gtm_malloc. Don't want to recurse on malloc.
 * Other deadlock causing conditions that need to be taken care of
 * 1) We already have either the fsync_in_prog or the io_in_prog lock.
 * 2) We are currently doing a db_fsync on some region.
 */
dbsync_defer_timer = TRUE; /* assume we have to defer; reset to FALSE below only once crit has been obtained */
GET_LSEEK_FLAG(FILE_INFO(reg)->fd, lseekIoInProgress_flag);
DEBUG_ONLY(
/* We invoke grab_crit_immediate below which can potentially do cache-recoveries if cnl->wc_blocked is set.
 * But wcs_recover has an assert that we never invoke it in the final retry. This is to avoid
 * restarts in the final retry. But wcs_clean_dbsync invokes grab_crit_immediate only if we do not already
 * hold crit and that means we have already finished commit on this particular region (e.g. if
 * commit is complete on all regions and crit is released on all of them but before we reset t_tries
 * to 0 in t_end/tp_tend) so it is okay to invoke wcs_recover in that case. Signal that to wcs_recover
 * by setting ok_to_call_wcs_recover to TRUE. Need to save and restore the global as it could be
 * TRUE or FALSE depending on where wcs_clean_dbsync interrupted mainline code.
 */
assert(CDB_STAGNATE >= t_tries || WBTEST_ENABLED(WBTEST_ANTIFREEZE_GVDATAFAIL));
if (CDB_STAGNATE <= t_tries)
{
save_ok_to_call_wcs_recover = TREF(ok_to_call_wcs_recover);
TREF(ok_to_call_wcs_recover) = TRUE;
}
)
/* The order of the checks below matters: grab_crit_immediate comes last so that we attempt to obtain crit only
 * after all the other (side-effect free) checks have indicated that it is safe to proceed.
 */
if (!mupip_jnl_recover NOPIO_ONLY(&& (FALSE == lseekIoInProgress_flag))
GTM_MALLOC_NO_RENT_ONLY(&& 0 == gtmMallocDepth)
&& (0 == crit_count) && !in_mutex_deadlock_check
&& (0 == fast_lock_count)
&& (!jnl_qio_in_prog) && (!db_fsync_in_prog)
&& (!jpc || !jpc->jnl_buff || (LOCK_AVAILABLE == jpc->jnl_buff->fsync_in_prog_latch.u.parts.latch_pid))
&& ((NULL == check_csaddrs) || !T_IN_CRIT_OR_COMMIT_OR_WRITE(check_csaddrs))
&& !T_IN_CRIT_OR_COMMIT_OR_WRITE(csa)
&& (FALSE != grab_crit_immediate(reg)))
{ /* Note that grab_crit_immediate invokes wcs_recover in case cnl->wc_blocked is non-zero. This means we
 * could be doing cache recovery even though we are in interrupt code. If this is found undesirable, the
 * logic in grab_crit_immediate that invokes wcs_recover has to be re-examined.
 */
/* Note that if we are here, we have obtained crit using grab_crit_immediate. */
assert(csa->ti->early_tn == csa->ti->curr_tn);
/* Do not invoke wcs_flu if the database has a newer journal file than what this process had open
 * when the dbsync timer was started in wcs_wtstart. This is because mainline (non-interrupt) code
 * in jnl_write_attempt/jnl_output_sp assumes that interrupt code will not update jpc structures to
 * point to latest journal file (i.e. will not do a jnl_ensure_open) but wcs_flu might invoke just
 * that. It is ok not to do a wcs_flu since whichever process did the journal switch would have
 * written the EPOCH record in the older generation journal file. Therefore there is no need to
 * start a new dbsync timer in this case.
 *
 * If journaling and writing EPOCHs, do a wcs_flu only if there has been at least one transaction
 * since the last time someone wrote an EPOCH.
 *
 * If NOT journaling or if NOT writing EPOCHs, do a wcs_flu only if there has been at least one
 * transaction since the last time someone did a wcs_flu.
 *
 * This way wcs_flu is not redundantly invoked and it ensures that the least number of epochs
 * (only the necessary ones) are written OR the least number of db file header flushes are done.
 *
 * If MM and not writing EPOCHs, we need to flush the fileheader out as that is not mmap'ed.
 */
/* Write idle/free epoch only if db curr_tn did not change since when the last dirty cache record was
 * written in wcs_wtstart to when the dbsync timer (5 seconds) popped. If the curr_tn changed it means
 * some other update happened in between and things are no longer idle so the previous idle dbsync
 * timer can be stopped. A new timer will be started when the later updates finish and leave the db
 * idle again. Note that there are some race conditions where we might not be accurate in writing idle
 * EPOCH only when necessary (since we don't hold crit at the time we record csa->dbsync_timer_tn). But
 * any error will always be on the side of caution so we might end up writing more idle EPOCHs than
 * necessary. Also, even if we don't write an idle EPOCH (for example because we found an update
 * happened later but that update turned out to be a duplicate SET which will not start an idle
 * EPOCH timer), journal recovery already knows to handle the case where an idle EPOCH did not get
 * written. So things will still work but it might just take a little longer than usual.
 */
if (csa->dbsync_timer_tn == csa->ti->curr_tn)
{ /* Note that it is possible in rare cases that an online rollback took csa->ti->curr_tn back
 * and the exact # of updates happened concurrently to take csa->ti->curr_tn back to where it
 * was to match csa->dbsync_timer_tn. In this case, we will be writing an epoch unnecessarily
 * but this is a very rare situation that is considered okay to write the epoch in that case
 * as it keeps the if check simple for the most frequent path.
 */
/* Journaling with EPOCHs : flush only if the journal file was not switched and no EPOCH covers curr_tn yet.
 * Otherwise : flush only if no wcs_flu has been done since the last transaction.
 */
if ((NULL != jpc) && JNL_HAS_EPOCH(jpc->jnl_buff)
? (((NOJNL == jpc->channel) || !JNL_FILE_SWITCHED(jpc))
&& (jpc->jnl_buff->epoch_tn < csa->ti->curr_tn))
: (cnl->last_wcsflu_tn < csa->ti->curr_tn))
{
wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH | WCSFLU_CLEAN_DBSYNC
| WCSFLU_SPEEDUP_NOBEFORE);
BG_TRACE_PRO_ANY(csa, n_dbsync_writes);
/* If MM, file could have been remapped by wcs_flu above.
 * If so, cs_data needs to be reset. This matters only if the region we restore to at the end of
 * this routine is the one we just flushed (save_csaddrs == cs_addrs); in that case the saved
 * cs_data value is replaced with the current header pointer.
 */
if (is_mm && (save_csaddrs == cs_addrs) && (save_csdata != cs_data))
save_csdata = cs_addrs->hdr;
}
}
/* We obtained crit, so irrespective of whether a wcs_flu was needed or not, there is no need for a deferred retry */
dbsync_defer_timer = FALSE;
assert(!csa->hold_onto_crit); /* this ensures we can safely do unconditional rel_crit */
rel_crit(reg);
}
DEBUG_ONLY(
/* Restore the global saved before the grab_crit_immediate attempt (same condition as for the save above) */
if (CDB_STAGNATE <= t_tries)
TREF(ok_to_call_wcs_recover) = save_ok_to_call_wcs_recover;
)
}
if (dbsync_defer_timer)
{ /* It was not safe to sync now, or crit could not be obtained immediately. Try again after TIM_DEFER_DBSYNC. */
assert(SIZEOF(INTPTR_T) == SIZEOF(csa));
/* Adding a new dbsync timer should typically be done in a deferred zone to avoid duplicate timer additions for the
 * same TID. But, in this case, we are guaranteed that timers won't pop as we are already in a timer handler. As
 * for the external interrupts, they should be okay to interrupt at this point since, unlike timer interrupts,
 * control won't return to mainline code. So, in either case, we can safely add the new timer.
 */
/* NOTE(review): csa->dbsync_timer was reset at the top of this routine; presumably this check guards against a
 * dbsync timer having been started again in the meantime -- confirm.
 */
if (!csa->dbsync_timer)
START_DBSYNC_TIMER(csa, TIM_DEFER_DBSYNC);
}
/* To restore to former glory, don't use TP_CHANGE_REG, because we might mistakenly set cs_addrs and cs_data to NULL
 * if the region we are restoring to has been closed. Don't use tp_change_reg either, because we might be ripping out
 * the structures needed in tp_change_reg in gv_rundown. Hence the three globals are restored by direct assignment.
 */
gv_cur_region = save_region;
cs_addrs = save_csaddrs;
cs_data = save_csdata;
return;
}