/****************************************************************
 *								*
 * Copyright 2001, 2013 Fidelity Information Services, Inc	*
 *								*
 * This source code contains the intellectual property		*
 * of its copyright holder(s), and is made available		*
 * under a license.  If you do not know the terms of		*
 * the license, please stop and do not read further.		*
 *								*
 ****************************************************************/

#include "mdef.h"
|
|
|
|
#include "gtm_fcntl.h" /* needed for silly aix's expansion of open to open64 */
|
|
#include "gtm_unistd.h"
|
|
|
|
#include "gdsroot.h"
|
|
#include "gtm_facility.h"
|
|
#include "gdskill.h"
|
|
#include "fileinfo.h"
|
|
#include "gdsbt.h"
|
|
#include "gdsblk.h"
|
|
#include "gdsfhead.h"
|
|
#include "filestruct.h"
|
|
#include "gdscc.h"
|
|
#include "jnl.h"
|
|
#include "buddy_list.h" /* for tp.h */
|
|
#include "hashtab_int4.h" /* needed for tp.h */
|
|
#include "tp.h" /* for tp_region definition */
|
|
#include "gt_timer.h" /* for TID definition */
|
|
#include "timers.h" /* for TIM_DEFER_DBSYNC #define */
|
|
#include "gdsbgtr.h" /* for the BG_TRACE_PRO macros */
|
|
#include "gtmio.h" /* for the GET_LSEEK_FLAG macro */
|
|
#include "wcs_clean_dbsync.h"
|
|
#include "wcs_flu.h"
|
|
#include "lockconst.h"
|
|
|
|
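/* GTM_MALLOC_NO_RENT_ONLY(X) expands to nothing when GTM_MALLOC_RENT is defined and to X otherwise. It is used
 * below to compile in the gtmMallocDepth recursion check only in builds whose gtm_malloc is (presumably) not
 * safe to reenter.
 */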
#ifdef GTM_MALLOC_RENT
#	define GTM_MALLOC_NO_RENT_ONLY(X)
#else
#	define GTM_MALLOC_NO_RENT_ONLY(X)	X
#endif

NOPIO_ONLY(GBLREF boolean_t	*lseekIoInProgress_flags;)	/* needed for the LSEEK* macros in gtmio.h */
GBLREF	gd_region		*gv_cur_region;
GBLREF	sgmnt_addrs		*cs_addrs;
GBLREF	sgmnt_data_ptr_t	cs_data;
GBLREF	volatile int4		crit_count;
GBLREF	volatile boolean_t	in_mutex_deadlock_check;
GBLREF	volatile int4		db_fsync_in_prog, jnl_qio_in_prog;
GBLREF	volatile int4		fast_lock_count;
GBLREF	volatile int4		gtmMallocDepth;		/* Recursion indicator */
GBLREF	boolean_t		mupip_jnl_recover;
#ifdef DEBUG
GBLREF	unsigned int		t_tries;
GBLREF	volatile boolean_t	timer_in_handler;
#endif

/* Sync the filehdr (and write an epoch to the journal file if before-image journaling is in effect). The goal is to
 * sync the database, but if we find ourselves in a situation where we would have to block on someone else, we defer
 * the sync to the next round.
 */
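/* Note: this routine runs as a gt_timer handler (see START_DBSYNC_TIMER below), so "tid" and "hd_len" follow the
 * standard timer-callback signature, while "csaptr" points at the sgmnt_addrs recorded when the timer was started.
 */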
void wcs_clean_dbsync(TID tid, int4 hd_len, sgmnt_addrs **csaptr)
{
	boolean_t		dbsync_defer_timer;
	gd_region		*reg, *save_region;
	jnl_private_control	*jpc;
	node_local_ptr_t	cnl;
	sgmnt_addrs		*csa, *check_csaddrs, *save_csaddrs;
	sgmnt_data_ptr_t	csd, save_csdata;
	NOPIO_ONLY(boolean_t	lseekIoInProgress_flag;)
	DEBUG_ONLY(boolean_t	save_ok_to_call_wcs_recover;)
	boolean_t		is_mm;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	csa = *csaptr;
	assert(timer_in_handler);
	assert(csa->dbsync_timer);	/* to ensure no duplicate dbsync timers */
	CANCEL_DBSYNC_TIMER(csa);	/* reset csa->dbsync_timer now that the dbsync timer has popped */
	assert(!csa->dbsync_timer);
	reg = csa->region;
	/* Don't know how this can happen, but if the region is closed, just return in PRO. */
	if (!reg->open)
	{
		assert(FALSE);
		return;
	}
	is_mm = (dba_mm == reg->dyn.addr->acc_meth);
	save_region = gv_cur_region;	/* save for later restore; see the notes about the restore at the end */
	save_csaddrs = cs_addrs;
	save_csdata = cs_data;
	/* Save to see if we are in crit anywhere */
	check_csaddrs = ((NULL == save_region || FALSE == save_region->open) ? NULL : (&FILE_INFO(save_region)->s_addrs));
	/* Note the non-usage of the TP_CHANGE_REG_IF_NEEDED macros since this routine can be timer driven. */
	TP_CHANGE_REG(reg);
	csd = csa->hdr;
	cnl = csa->nl;
	jpc = csa->jnl;
	BG_TRACE_PRO_ANY(csa, n_dbsync_timers);
	assert(csa == cs_addrs);
	assert(!JNL_ALLOWED(csd) || NULL != jpc);
	/* Note that even if the active queue was empty when this routine was called, cnl->wcs_active_lvl can be
	 * non-zero by the time we reach here, due to concurrent update activity. In that case we defer the sync to
	 * the next time the active queue becomes empty (or, with before-imaging, to the next scheduled epoch_time),
	 * whichever comes earlier.
	 *
	 * Note also that if we are already in wcs_wtstart for this region, invoking wcs_flu() won't recurse on
	 * wcs_wtstart. In any case the interrupted wcs_wtstart invocation will take care of the dbsync timer once
	 * it is done, so in this case too there is no need to do the dbsync here.
	 */
	dbsync_defer_timer = FALSE;
	if (!cnl->wcs_active_lvl && !csa->in_wtstart)
	{	/* Similar to wcs_stale, defer expensive IO flushing if any of the following is true.
		 * 1) We are in the midst of lseek/read/write IO. This could reset an lseek.
		 * 2) We are acquiring/releasing crit in any region. (Strictly speaking it is enough to check this
		 *    in the current region, but checking all regions doesn't harm us much.)
		 *    Note that the function "mutex_deadlock_check" temporarily resets crit_count to 0 even though
		 *    we might actually be in the midst of acquiring crit. Therefore we should not interrupt
		 *    mainline code while in "mutex_deadlock_check", as that presents reentrancy issues.
		 * 3) We hold crit in the current region, OR are in the middle of a commit for this region (even
		 *    though we don't hold crit), OR are in wcs_wtstart (potentially holding the write interlock and
		 *    keeping another process waiting in crit), OR would need to wait to obtain crit. At least one
		 *    reason why we should not wait for crit is that the timeout mechanism for the critical section
		 *    is currently (as of 2004 May) driven by the heartbeat on Tru64, AIX, Solaris and HP-UX. The
		 *    periodic heartbeat handler cannot pop, since it is a SIGALRM handler and cannot nest while we
		 *    are already in a SIGALRM handler for wcs_clean_dbsync. Were this to happen, we could end up
		 *    waiting for crit with no way to interrupt the wait with a timeout, resulting in a hang until
		 *    crit became available.
		 * 4) We are in a "fast lock".
		 * 5) We are in gtm_malloc. We don't want to recurse on malloc.
		 * Other deadlock-causing conditions that need to be taken care of:
		 * 1) We already hold either the fsync_in_prog or the io_in_prog lock.
		 * 2) We are currently doing a db_fsync on some region.
		 */
		dbsync_defer_timer = TRUE;
		GET_LSEEK_FLAG(FILE_INFO(reg)->fd, lseekIoInProgress_flag);
		DEBUG_ONLY(
			/* We invoke grab_crit_immediate below, which can potentially do cache recoveries if
			 * cnl->wc_blocked is set. But wcs_recover asserts that it is never invoked in the final
			 * retry; this is to avoid restarts in the final retry. wcs_clean_dbsync invokes
			 * grab_crit_immediate only if we don't already hold crit, which means we have already
			 * finished the commit on this particular region (e.g. the commit is complete on all regions
			 * and crit has been released on all of them, but t_tries has not yet been reset to 0 in
			 * t_end/tp_tend), so it is okay to invoke wcs_recover in that case. Signal that to
			 * wcs_recover by setting ok_to_call_wcs_recover to TRUE. We need to save and restore the
			 * global since it could be TRUE or FALSE depending on where wcs_clean_dbsync interrupted
			 * mainline code.
			 */
			assert(CDB_STAGNATE >= t_tries || WBTEST_ENABLED(WBTEST_ANTIFREEZE_GVDATAFAIL));
			if (CDB_STAGNATE <= t_tries)
			{
				save_ok_to_call_wcs_recover = TREF(ok_to_call_wcs_recover);
				TREF(ok_to_call_wcs_recover) = TRUE;
			}
		)
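		/* The checks below implement the deferral conditions listed above: journal recovery in progress,
		 * in-flight lseek IO, active gtm_malloc, crit acquisition/release or mutex_deadlock_check, a held
		 * fast lock, an in-progress jnl qio or db fsync, a held fsync_in_prog latch, and being in crit,
		 * commit or wcs_wtstart in either the interrupted region or this one.
		 */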
		if (!mupip_jnl_recover NOPIO_ONLY(&& (FALSE == lseekIoInProgress_flag))
			GTM_MALLOC_NO_RENT_ONLY(&& 0 == gtmMallocDepth)
			&& (0 == crit_count) && !in_mutex_deadlock_check
			&& (0 == fast_lock_count)
			&& (!jnl_qio_in_prog) && (!db_fsync_in_prog)
			&& (!jpc || !jpc->jnl_buff || (LOCK_AVAILABLE == jpc->jnl_buff->fsync_in_prog_latch.u.parts.latch_pid))
			&& ((NULL == check_csaddrs) || !T_IN_CRIT_OR_COMMIT_OR_WRITE(check_csaddrs))
			&& !T_IN_CRIT_OR_COMMIT_OR_WRITE(csa)
			&& (FALSE != grab_crit_immediate(reg)))
		{	/* Note that grab_crit_immediate invokes wcs_recover in case cnl->wc_blocked is non-zero.
			 * This means we could be doing cache recovery even though we are in interrupt code. If this
			 * is found undesirable, the logic in grab_crit_immediate that invokes wcs_recover has to be
			 * re-examined.
			 */
			/* Note that if we are here, we have obtained crit using grab_crit_immediate. */
			assert(csa->ti->early_tn == csa->ti->curr_tn);
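			/* The assert above should hold because we now have crit: no commit can be in flight on this
			 * region, and early_tn is only ahead of curr_tn in the middle of a commit.
			 */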
			/* Do not invoke wcs_flu if the database has a newer journal file than what this process had
			 * open when the dbsync timer was started in wcs_wtstart. This is because mainline
			 * (non-interrupt) code in jnl_write_attempt/jnl_output_sp assumes that interrupt code will
			 * not update jpc structures to point to the latest journal file (i.e. will not do a
			 * jnl_ensure_open) but wcs_flu might invoke just that. It is ok not to do a wcs_flu since
			 * whichever process did the journal switch would have written the EPOCH record in the older
			 * generation journal file. Therefore there is no need to start a new dbsync timer in this
			 * case.
			 *
			 * If journaling and writing EPOCHs, do a wcs_flu only if there has been at least one
			 * transaction since the last time someone wrote an EPOCH.
			 *
			 * If NOT journaling or if NOT writing EPOCHs, do a wcs_flu only if there has been at least
			 * one transaction since the last time someone did a wcs_flu.
			 *
			 * This way wcs_flu is not redundantly invoked, and it ensures that the least number of
			 * epochs (only the necessary ones) are written OR the least number of db file header
			 * flushes are done.
			 *
			 * If MM and not writing EPOCHs, we need to flush the fileheader out as that is not mmap'ed.
			 */
			/* Write an idle/free EPOCH only if the db curr_tn did not change between when the last
			 * dirty cache record was written in wcs_wtstart and when the dbsync timer (5 seconds)
			 * popped. If curr_tn changed, some other update happened in between and things are no
			 * longer idle, so the previous idle dbsync timer can be stopped. A new timer will be
			 * started when the later updates finish and leave the db idle again. Note that there are
			 * some race conditions under which we might not be accurate in writing an idle EPOCH only
			 * when necessary (since we don't hold crit at the time we record csa->dbsync_timer_tn), but
			 * any error will always be on the side of caution, so we might end up writing more idle
			 * EPOCHs than necessary. Also, even if we don't write an idle EPOCH (for example because we
			 * found a later update, but that update turned out to be a duplicate SET, which will not
			 * start an idle EPOCH timer), journal recovery already knows how to handle the case where
			 * an idle EPOCH did not get written. So things will still work; it might just take a little
			 * longer than usual.
			 */
			if (csa->dbsync_timer_tn == csa->ti->curr_tn)
			{	/* Note that in rare cases an online rollback could have taken csa->ti->curr_tn
				 * back, with exactly enough concurrent updates happening afterwards to bring
				 * csa->ti->curr_tn back to where it matches csa->dbsync_timer_tn. In that case we
				 * will be writing an epoch unnecessarily, but this situation is rare enough that
				 * writing the epoch is considered acceptable, as it keeps the if check simple for
				 * the most frequent path.
				 */
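				/* If journaling with EPOCHs enabled: flush only if the journal file has not been
				 * switched since we opened it and at least one transaction has completed since the
				 * last EPOCH. Otherwise: flush only if at least one transaction has completed
				 * since the last wcs_flu.
				 */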
				if ((NULL != jpc) && JNL_HAS_EPOCH(jpc->jnl_buff)
					? (((NOJNL == jpc->channel) || !JNL_FILE_SWITCHED(jpc))
						&& (jpc->jnl_buff->epoch_tn < csa->ti->curr_tn))
					: (cnl->last_wcsflu_tn < csa->ti->curr_tn))
				{
					wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH
						| WCSFLU_CLEAN_DBSYNC | WCSFLU_SPEEDUP_NOBEFORE);
					BG_TRACE_PRO_ANY(csa, n_dbsync_writes);
					/* If MM, the file could have been remapped by the wcs_flu above.
					 * If so, cs_data needs to be reset.
					 */
					if (is_mm && (save_csaddrs == cs_addrs) && (save_csdata != cs_data))
						save_csdata = cs_addrs->hdr;
				}
			}
			dbsync_defer_timer = FALSE;
			assert(!csa->hold_onto_crit);	/* this ensures we can safely do an unconditional rel_crit */
			rel_crit(reg);
		}
		DEBUG_ONLY(
			if (CDB_STAGNATE <= t_tries)
				TREF(ok_to_call_wcs_recover) = save_ok_to_call_wcs_recover;
		)
	}
	if (dbsync_defer_timer)
	{
		assert(SIZEOF(INTPTR_T) == SIZEOF(csa));
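		/* The size assert above reflects that the timer interface carries the csa pointer as handler data,
		 * so the pointer must fit in an integer of pointer size (an assumption of START_DBSYNC_TIMER).
		 */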
		/* Adding a new dbsync timer should typically be done in a deferred zone to avoid duplicate timer
		 * additions for the same TID. But in this case we are guaranteed that timers won't pop, as we are
		 * already in a timer handler. As for external interrupts, they can safely interrupt us at this
		 * point since, unlike timer interrupts, control won't return to mainline code. So in either case
		 * we can safely add the new timer.
		 */
		if (!csa->dbsync_timer)
			START_DBSYNC_TIMER(csa, TIM_DEFER_DBSYNC);
	}
	/* To restore to former glory, don't use TP_CHANGE_REG, because we might mistakenly set cs_addrs and
	 * cs_data to NULL if the region we are restoring to has been closed. Don't use tp_change_reg either,
	 * because we might be ripping out the structures needed by tp_change_reg in gv_rundown.
	 */
	gv_cur_region = save_region;
	cs_addrs = save_csaddrs;
	cs_data = save_csdata;
	return;
}