fis-gtm/sr_port/wcs_recover.c

/****************************************************************
 *                                                              *
 *      Copyright 2001, 2011 Fidelity Information Services, Inc *
 *                                                              *
 *      This source code contains the intellectual property     *
 *      of its copyright holder(s), and is made available       *
 *      under a license. If you do not know the terms of        *
 *      the license, please stop and do not read further.       *
 *                                                              *
 ****************************************************************/
#include "mdef.h"
#include "gtm_string.h"
#include "gtm_time.h"
#include "gtmimagename.h"
#ifdef UNIX
# include <sys/mman.h>
# include "gtm_stat.h"
# include <errno.h>
# include <signal.h>
#elif defined(VMS)
# include <fab.h>
# include <iodef.h>
# include <ssdef.h>
#else
# error UNSUPPORTED PLATFORM
#endif
#include "ast.h" /* needed for DCLAST_WCS_WTSTART macro in gdsfhead.h */
#include "gdsroot.h"
#include "gdskill.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "gdsbgtr.h"
#include "gdsbml.h"
#include "filestruct.h"
#include "gdscc.h"
#include "interlock.h"
#include "jnl.h"
#include "testpt.h"
#include "sleep_cnt.h"
#include "mupipbckup.h"
#include "wbox_test_init.h"
#ifdef UNIX
# include "eintr_wrappers.h"
GBLREF sigset_t blockalrm;
#endif
#include "send_msg.h"
#include "bit_set.h"
#include "bit_clear.h"
#include "relqop.h"
#include "is_proc_alive.h"
#include "mmseg.h"
#include "format_targ_key.h"
#include "gds_map_moved.h"
#include "wcs_recover.h"
#include "wcs_sleep.h"
#include "wcs_mm_recover.h"
#include "add_inter.h"
#include "gtm_malloc.h" /* for verifyAllocatedStorage() prototype */
#include "cert_blk.h"
#include "shmpool.h"
#include "wcs_phase2_commit_wait.h"
#include "buddy_list.h" /* needed for tp.h */
#include "hashtab_int4.h" /* needed for tp.h */
#include "tp.h"
#include "memcoherency.h"
#include "gtm_c_stack_trace.h"
GBLREF boolean_t certify_all_blocks;
GBLREF sgmnt_addrs *cs_addrs;
GBLREF sgmnt_data_ptr_t cs_data;
GBLREF gd_region *gv_cur_region;
GBLREF gv_key *gv_currkey; /* needed in VMS for error logging in MM */
GBLREF uint4 process_id;
GBLREF testpt_struct testpoint;
GBLREF inctn_opcode_t inctn_opcode;
GBLREF boolean_t mupip_jnl_recover;
GBLREF jnl_gbls_t jgbl;
GBLREF enum gtmImageTypes image_type;
GBLREF boolean_t mu_rndwn_file_dbjnl_flush;
GBLREF uint4 gtmDebugLevel;
GBLREF unsigned int cr_array_index;
GBLREF uint4 dollar_tlevel;
GBLREF volatile boolean_t in_wcs_recover; /* TRUE if in "wcs_recover" */
#ifdef DEBUG
GBLREF unsigned int t_tries;
GBLREF int process_exiting;
#endif
error_def(ERR_BUFRDTIMEOUT);
error_def(ERR_DBADDRALIGN);
error_def(ERR_DBADDRANGE);
error_def(ERR_DBCCERR);
error_def(ERR_DBCNTRLERR);
error_def(ERR_DBCRERR);
error_def(ERR_DBDANGER);
error_def(ERR_DBFILERR);
error_def(ERR_ERRCALL);
error_def(ERR_INVALIDRIP);
error_def(ERR_GBLOFLOW);
error_def(ERR_GVIS);
error_def(ERR_STOPTIMEOUT);
error_def(ERR_TEXT);
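/* wcs_recover: rebuild the BG database cache of region "reg" after the cache has been marked suspect (wc_blocked).
 * The routine stops active writers, waits out pending phase2 commits and reads-in-progress, reinitializes the
 * blkque/state_que/bt structures, re-homes every cache record according to its state (stopped, clean, in_tend,
 * dirty, WIP), writes an INCTN journal record when appropriate, and finally lets the writers resume.
 * For MM databases it simply dispatches to wcs_mm_recover, which remaps the file after an extension.
 */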
void wcs_recover(gd_region *reg)
{
bt_rec_ptr_t bt;
cache_rec_ptr_t cr, cr_alt, cr_alt_new, cr_lo, cr_top, hash_hdr;
cache_que_head_ptr_t active_head, hq, wip_head, wq;
gd_region *save_reg;
que_ent_ptr_t back_link; /* should be crit & not need interlocked ops. */
sgmnt_addrs *csa;
sgmnt_data_ptr_t csd;
node_local_ptr_t cnl;
int4 bml_full, dummy_errno, blk_size;
uint4 jnl_status, epid, r_epid;
int4 bt_buckets, bufindx; /* should be the same type as "csd->bt_buckets" */
inctn_opcode_t save_inctn_opcode;
unsigned int bplmap, lcnt, total_blks, wait_in_rip;
sm_uc_ptr_t buffptr;
blk_hdr_ptr_t blk_ptr;
INTPTR_T bp_lo, bp_top, old_block;
boolean_t backup_block_saved, change_bmm;
jnl_private_control *jpc;
jnl_buffer_ptr_t jbp;
sgm_info *si;
DCL_THREADGBL_ACCESS;
SETUP_THREADGBL_ACCESS;
save_reg = gv_cur_region; /* protect against [at least] M LOCK code which doesn't maintain cs_addrs and cs_data */
TP_CHANGE_REG(reg); /* which are needed by called routines such as wcs_wtstart and wcs_mm_recover */
if (dba_mm == reg->dyn.addr->acc_meth) /* MM uses wcs_recover to remap the database in case of a file extension */
{
wcs_mm_recover(reg);
TP_CHANGE_REG(save_reg);
return;
}
csa = &FILE_INFO(reg)->s_addrs;
csd = csa->hdr;
cnl = csa->nl;
si = csa->sgm_info_ptr;
/* We are going to UNPIN (reset in_cw_set to 0) ALL cache-records so assert that we are not in the middle of a
* non-TP or TP transaction that has already PINNED a few buffers as otherwise we will create an out-of-design state.
* The only exception is if we are in the 2nd phase of KILL in a TP transaction. In this case si->cr_array_index
* could be non-zero as it is reset only in tp_clean_up which is invoked AFTER freeing up ALL the blocks in
* the function gvcst_expand_free_subtree. Work around this by checking for si->kip_csa to be NON NULL in this case.
*/
assert((!dollar_tlevel && !cr_array_index) || (dollar_tlevel && (!si->cr_array_index || (NULL != si->kip_csa))));
/* We should never invoke wcs_recover in the final retry as that could cause the transaction in progress to restart
* (which is an out-of-design situation). There are a few exceptions e.g. tp_restart/t_retry where we have not started
* the transaction so allow those. Such places set the variable ok_to_call_wcs_recover to TRUE. Also if we are in
* the process of exiting, we are guaranteed no transaction is in progress so it is ok to invoke wcs_recover
* even if the variable ok_to_call_wcs_recover is not set to TRUE.
*/
assert((CDB_STAGNATE > t_tries) || TREF(ok_to_call_wcs_recover) || process_exiting);
assert(csa->now_crit || csd->clustered);
CHECK_TN(csa, csd, csd->trans_hist.curr_tn); /* can issue rts_error TNTOOLARGE */
SIGNAL_WRITERS_TO_STOP(csd); /* to stop all active writers */
WAIT_FOR_WRITERS_TO_STOP(cnl, lcnt, MAXWTSTARTWAIT);
/* if the wait loop above hits the limit, or cnl->intent_wtstart goes negative, it is ok to proceed since
* wcs_verify (invoked below) reports and clears cnl->intent_wtstart and cnl->in_wtstart.
*/
assert(!csa->wcs_pidcnt_incremented); /* we should never have come here with a phase2 commit pending for ourself */
/* Wait for any pending phase2 commits to finish.
* In case of mupip rundown, we know no one else is accessing shared memory so no point waiting.
*/
if (!mu_rndwn_file_dbjnl_flush && cnl->wcs_phase2_commit_pidcnt)
{
wcs_phase2_commit_wait(csa, NULL); /* not checking return value since even if it fails we want to do recovery */
/* At this point we expect cnl->wcs_phase2_commit_pidcnt to be 0. But it is possible in case of crash tests that
* it is non-zero (e.g. in VMS, if the only GT.M process accessing the db was STOP/IDed while in the
* DECR_WCS_PHASE2_COMMIT_PIDCNT macro just after resetting csa->wcs_pidcnt_incremented to FALSE but just BEFORE
* decrementing cnl->wcs_phase2_commit_pidcnt). Anyways wcs_verify reports and clears this field so no problems.
*/
}
BG_TRACE_PRO_ANY(csa, wc_blocked_wcs_recover_invoked);
if (wcs_verify(reg, TRUE, TRUE)) /* expect_damage is TRUE, in_wcs_recover is TRUE */
{ /* if it passes verify, then recover can't help ??? what to do */
BG_TRACE_PRO_ANY(csa, wc_blocked_wcs_verify_passed);
send_msg(VARLSTCNT(4) ERR_DBCNTRLERR, 2, DB_LEN_STR(reg));
}
if (gtmDebugLevel)
verifyAllocatedStorage();
change_bmm = FALSE;
/* Before recovering the cache, set early_tn to curr_tn + 1 to indicate to have_crit that we are in a situation that
* is equivalent to being in the midst of a database commit and therefore defer exit handling in case of a MUPIP STOP.
* wc_blocked is anyways set to TRUE at this point so the next process to grab crit will anyways attempt another recovery.
*/
if (!mu_rndwn_file_dbjnl_flush)
csd->trans_hist.early_tn = csd->trans_hist.curr_tn + 1;
assert(!in_wcs_recover); /* should not be called if we are already in "wcs_recover" for another region */
in_wcs_recover = TRUE; /* used by bt_put() called below */
bt_refresh(csa); /* this resets all bt->cache_index links to CR_NOTVALID */
/* the following queue head initializations depend on the wc_blocked mechanism for protection from wcs_wtstart */
wip_head = &csa->acc_meth.bg.cache_state->cacheq_wip;
memset(wip_head, 0, SIZEOF(cache_que_head));
active_head = &csa->acc_meth.bg.cache_state->cacheq_active;
memset(active_head, 0, SIZEOF(cache_que_head));
UNIX_ONLY(wip_head = active_head); /* all inserts into wip_que in VMS should be done in active_que in UNIX */
UNIX_ONLY(SET_LATCH_GLOBAL(&active_head->latch, LOCK_AVAILABLE));
cnl->wcs_active_lvl = 0;
cnl->wc_in_free = 0;
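/* The shared cache is laid out as csd->bt_buckets cache-record headers (used only as hash-queue heads),
 * followed by csd->n_bts real cache records, followed by the OS-page-aligned block buffers themselves.
 * Compute pointers to each piece before walking the cache records.
 */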
bplmap = csd->bplmap;
hash_hdr = (cache_rec_ptr_t)csa->acc_meth.bg.cache_state->cache_array;
bt_buckets = csd->bt_buckets;
for (cr = hash_hdr, cr_top = cr + bt_buckets; cr < cr_top; cr++)
cr->blkque.fl = cr->blkque.bl = 0; /* take no chances that the blkques are messed up */
cr_lo = cr_top;
cr_top = cr_top + csd->n_bts;
blk_size = csd->blk_size;
buffptr = (sm_uc_ptr_t)ROUND_UP((sm_ulong_t)cr_top, OS_PAGE_SIZE);
backup_block_saved = FALSE;
if (BACKUP_NOT_IN_PROGRESS != cnl->nbb)
{ /* Online backup is in progress. Check if secshr_db_clnup has created any cache-records with pointers
* to before-images that need to be backed up. If so take care of that first before doing any cache recovery.
*/
bp_lo = (INTPTR_T)buffptr;
bp_top = bp_lo + ((gtm_uint64_t)csd->n_bts * csd->blk_size);
for (cr = cr_lo; cr < cr_top; cr++)
{
if (cr->stopped && (0 != cr->twin))
{ /* Check if cr->twin points to a valid buffer. Only in that case, do the backup */
old_block = (INTPTR_T)GDS_ANY_REL2ABS(csa, cr->twin);
if (!IS_PTR_IN_RANGE(old_block, bp_lo, bp_top))
{
send_msg(VARLSTCNT(11) ERR_DBADDRANGE, 9, DB_LEN_STR(reg),
cr, cr->blk, old_block, RTS_ERROR_TEXT("bkup_before_image_range"), bp_lo, bp_top);
assert(FALSE);
continue;
} else if (!IS_PTR_ALIGNED(old_block, bp_lo, csd->blk_size))
{
send_msg(VARLSTCNT(11) ERR_DBADDRALIGN, 9, DB_LEN_STR(reg), cr, cr->blk,
RTS_ERROR_TEXT("bkup_before_image_align"), old_block, bp_lo, csd->blk_size);
assert(FALSE);
continue;
}
bufindx = (int4)((old_block - bp_lo) / csd->blk_size);
assert(0 <= bufindx);
assert(bufindx < csd->n_bts);
cr_alt = &cr_lo[bufindx];
assert((sm_uc_ptr_t)old_block == (sm_uc_ptr_t)GDS_ANY_REL2ABS(csa, cr_alt->buffaddr));
/* Do other checks to validate before-image buffer */
if (cr_alt == cr)
{
send_msg(VARLSTCNT(13) ERR_DBCRERR, 11, DB_LEN_STR(reg), cr, cr->blk,
RTS_ERROR_TEXT("bkup_before_image_cr_same"), cr_alt, FALSE, CALLFROM);
assert(FALSE);
continue;
} else if (cr->blk != cr_alt->blk)
{
send_msg(VARLSTCNT(13) ERR_DBCRERR, 11, DB_LEN_STR(reg), cr, cr->blk,
RTS_ERROR_TEXT("bkup_before_image_blk"), cr_alt->blk, cr->blk, CALLFROM);
assert(FALSE);
continue;
} else if (!cr_alt->in_cw_set)
{
send_msg(VARLSTCNT(13) ERR_DBCRERR, 11, DB_LEN_STR(reg), cr_alt, cr_alt->blk,
RTS_ERROR_TEXT("bkup_before_image_in_cw_set"), cr_alt->in_cw_set, TRUE, CALLFROM);
assert(FALSE);
continue;
} else if (cr_alt->stopped)
{
send_msg(VARLSTCNT(13) ERR_DBCRERR, 11, DB_LEN_STR(reg), cr_alt, cr_alt->blk,
RTS_ERROR_TEXT("bkup_before_image_stopped"), cr_alt->stopped, FALSE, CALLFROM);
assert(FALSE);
continue;
}
VMS_ONLY(
/* At this point, it is possible cr_alt points to the older twin. In this case though, the
* commit should have errored out BEFORE the newer twin got built. This way we are
* guaranteed that the cache-record holding the proper before-image is indeed the older
* twin. This is asserted below.
*/
DEBUG_ONLY(
cr_alt_new = (cr_alt->twin)
? ((cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, cr_alt->twin)) : NULL;
)
assert(!cr_alt->twin || cr_alt->bt_index
|| cr_alt_new->bt_index && cr_alt_new->in_tend && cr_alt_new->in_cw_set);
)
/* The following check is similar to the one in BG_BACKUP_BLOCK and the one in
* secshr_db_clnup (where backup_block is invoked)
*/
blk_ptr = (blk_hdr_ptr_t)old_block;
if ((cr_alt->blk >= cnl->nbb)
&& (0 == csa->shmpool_buffer->failed)
&& (blk_ptr->tn < csa->shmpool_buffer->backup_tn)
&& (blk_ptr->tn >= csa->shmpool_buffer->inc_backup_tn))
{
cr_alt->buffaddr = cr->twin; /* reset it to what it should be just to be safe */
backup_block(csa, cr_alt->blk, cr_alt, NULL);
backup_block_saved = TRUE;
}
}
}
}
/* After recovering the cache, we normally increment the db curr_tn. But this should not be done if called from
* forward journal recovery, since we want the final database transaction number to match the journal file's
* eov_tn (to avoid JNLDBTNNOMATCH errors). Therefore in this case, make sure all "tn" fields in the bt and cache are set
* to one less than the final db tn. This is done by decrementing the database current transaction number at the
* start of the recovery and incrementing it at the end. To keep early_tn and curr_tn in sync, decrement early_tn as well.
*/
if (!mu_rndwn_file_dbjnl_flush && mupip_jnl_recover && !JNL_ENABLED(csd))
{
csd->trans_hist.curr_tn--;
csd->trans_hist.early_tn--;
assert(csd->trans_hist.early_tn == (csd->trans_hist.curr_tn + 1));
}
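/* Main recovery loop: visit every cache record exactly once. For each record, restore its buffer address, wait out
 * (or forcibly clear) any read-in-progress, reset latches and queue links, and then re-home the record based on its
 * state: buffers built by secshr_db_clnup (cr->stopped) are completed first, empty/clean records are freed,
 * interrupted updates (in_tend) are requeued or discarded, and the remaining dirty records are given a fresh bt
 * entry and placed back on the active (or, for VMS writes-in-progress, WIP) queue.
 */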
for (cr = cr_lo, wait_in_rip = 0; cr < cr_top; cr++, buffptr += blk_size)
{
cr->buffaddr = GDS_ANY_ABS2REL(csa, buffptr); /* reset it to what it should be just to be safe */
if (((int)(cr->blk) != CR_BLKEMPTY) && (((int)(cr->blk) < 0) || ((int)(cr->blk) >= csd->trans_hist.total_blks)))
{ /* bad block number. discard buffer for now. actually can do better by looking at cr->bt_index... */
cr->cycle++; /* increment cycle whenever blk number changes (tp_hist depends on this) */
cr->blk = CR_BLKEMPTY;
assert(FALSE);
SHMPOOL_FREE_CR_RFMT_BLOCK(reg, csa, cr);
}
/* fix bad values of cr->dirty and cr->flushed_dirty_tn */
assert(csa->ti == &csd->trans_hist);
if (cr->dirty > csd->trans_hist.curr_tn)
cr->dirty = csd->trans_hist.curr_tn; /* we assume csd->trans_hist.curr_tn is valid */
if (cr->flushed_dirty_tn >= cr->dirty)
cr->flushed_dirty_tn = 0;
UNIX_ONLY(
/* reset all fields that might be corrupt that wcs_verify() cares about */
cr->epid = 0;
cr->image_count = 0; /* image_count needs to be reset before its usage below in case it is corrupt */
)
/* wait for read-in-progress to complete */
for (lcnt = 1; (-1 != cr->read_in_progress); lcnt++)
{ /* very similar code appears elsewhere and perhaps should be common */
/* Since cr->r_epid can be changing concurrently, take a local copy before using it below,
* particularly before calling is_proc_alive as we don't want to call it with a 0 r_epid.
*/
r_epid = cr->r_epid;
if (cr->read_in_progress < -1)
{
send_msg(VARLSTCNT(4) ERR_INVALIDRIP, 2, DB_LEN_STR(reg));
INTERLOCK_INIT(cr);
cr->cycle++; /* increment cycle whenever blk number changes (tp_hist depends on this) */
cr->blk = CR_BLKEMPTY;
SHMPOOL_FREE_CR_RFMT_BLOCK(reg, csa, cr);
assert(cr->r_epid == 0);
assert(0 == cr->dirty);
} else if ((0 != r_epid)
&& ((r_epid == process_id) || (FALSE == is_proc_alive(r_epid, cr->image_count))))
{
INTERLOCK_INIT(cr); /* Process gone, release that process's lock */
cr->cycle++; /* increment cycle whenever blk number changes (tp_hist depends on this) */
cr->blk = CR_BLKEMPTY;
SHMPOOL_FREE_CR_RFMT_BLOCK(reg, csa, cr);
} else
{
if (1 == lcnt)
epid = r_epid;
else if ((BUF_OWNER_STUCK < lcnt) || (MAX_WAIT_FOR_RIP <= wait_in_rip))
{ /* If we have already waited for at least 4 minutes, stop waiting and instead fix up
 * all following cr's. If r_epid is 0 but read_in_progress is still set, we identify
 * this as corruption, fix up this cr and proceed to the next cr.
*/
assert(FALSE || (WBTEST_CRASH_SHUTDOWN_EXPECTED == gtm_white_box_test_case_number));
if ((0 != r_epid) && (epid != r_epid))
GTMASSERT;
/* process still active but not playing fair or cache is corrupted */
GET_C_STACK_FROM_SCRIPT("BUFRDTIMEOUT", process_id, r_epid, TWICE);
send_msg(VARLSTCNT(8) ERR_BUFRDTIMEOUT, 6, process_id, cr->blk, cr, r_epid,
DB_LEN_STR(reg));
send_msg(VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("Buffer forcibly seized"));
INTERLOCK_INIT(cr);
cr->cycle++; /* increment cycle whenever blk number changes (tp_hist depends on this) */
cr->blk = CR_BLKEMPTY;
SHMPOOL_FREE_CR_RFMT_BLOCK(reg, csa, cr);
if (BUF_OWNER_STUCK < lcnt)
wait_in_rip++;
break;
}
DEBUG_ONLY(
else if (((BUF_OWNER_STUCK / 2) == lcnt) || ((MAX_WAIT_FOR_RIP / 2) == wait_in_rip))
GET_C_STACK_FROM_SCRIPT("BUFRDTIMEOUT", process_id, r_epid, ONCE);)
wcs_sleep(lcnt);
}
}
/* reset cr->rip_latch. it is unused in VMS, but wcs_verify() checks it hence the reset in both Unix and VMS */
UNIX_ONLY(SET_LATCH_GLOBAL(&(cr->rip_latch), LOCK_AVAILABLE));
VMS_ONLY(memset((sm_uc_ptr_t)&cr->rip_latch, 0, SIZEOF(global_latch_t)));
cr->r_epid = 0; /* the processing above should make this appropriate */
cr->tn = csd->trans_hist.curr_tn;
cr->blkque.fl = cr->blkque.bl = 0; /* take no chances that the blkques are messed up */
cr->state_que.fl = cr->state_que.bl = 0; /* take no chances that the state_ques are messed up */
cr->in_cw_set = 0; /* this has crit and is here, so in_cw_set must no longer be non-zero */
UNIX_ONLY(cr->wip_stopped = FALSE;)
VMS_ONLY(
if (cr->wip_stopped)
{
for (lcnt = 1; (0 == cr->iosb.cond) && is_proc_alive(cr->epid, cr->image_count); lcnt++)
{
if (1 == lcnt)
epid = cr->epid;
else if (BUF_OWNER_STUCK < lcnt)
{
if ((0 != cr->epid) && (epid != cr->epid))
GTMASSERT;
if (0 != epid)
{ /* process still active, but not playing fair */
send_msg(VARLSTCNT(5) ERR_STOPTIMEOUT, 3, epid, DB_LEN_STR(reg));
send_msg(VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("Buffer forcibly seized"));
cr->epid = 0;
}
continue;
}
wcs_sleep(lcnt);
}
if (0 == cr->iosb.cond)
{ /* if it's abandoned wip_stopped, treat it as a WRT_STRT_PNDNG */
cr->iosb.cond = WRT_STRT_PNDNG;
cr->epid = 0;
cr->image_count = 0;
} /* otherwise the iosb.cond should suffice */
cr->wip_stopped = FALSE;
}
)
if (0 != cr->twin)
{ /* clean up any old twins. in unix twin is unused so reset it without examining its value */
VMS_ONLY(
cr_alt = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, cr->twin);
if (!CR_NOT_ALIGNED(cr_alt, cr_lo) && !CR_NOT_IN_RANGE(cr_alt, cr_lo, cr_top))
{
assert(((cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, cr_alt->twin)) == cr);
assert((0 == cr->bt_index) || (0 == cr_alt->bt_index)); /* at least one zero */
assert((0 != cr->bt_index) || (0 != cr_alt->bt_index)); /* at least one non-zero */
cr_alt->twin = 0;
}
)
cr->twin = 0;
}
if (JNL_ENABLED(csd) && cr->dirty)
{
if ((NULL != csa->jnl) && (NULL != csa->jnl->jnl_buff) && (cr->jnl_addr > csa->jnl->jnl_buff->freeaddr))
cr->jnl_addr = csa->jnl->jnl_buff->freeaddr;
} else
cr->jnl_addr = 0; /* just be safe */
assert(!cr->stopped || (CR_BLKEMPTY != cr->blk));
if (cr->stopped && (CR_BLKEMPTY != cr->blk))
{ /* cache record attached to a buffer built by secshr_db_clnup: finish work; clearest case: do it 1st */
assert(LATCH_CLEAR == WRITE_LATCH_VAL(cr));
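/* If this is a local bit map block, resynchronize the corresponding master map bit: clear it (map full)
 * if bml_find_free reports no free space in the local map, otherwise set it. Any master map change is
 * flushed to the file header after the loop (see change_bmm below).
 */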
if ((cr->blk / bplmap) * bplmap == cr->blk)
{ /* it's a bitmap */
if ((csd->trans_hist.total_blks / bplmap) * bplmap == cr->blk)
total_blks = csd->trans_hist.total_blks - cr->blk;
else
total_blks = bplmap;
bml_full = bml_find_free(0, (sm_uc_ptr_t)(GDS_ANY_REL2ABS(csa, cr->buffaddr)) + SIZEOF(blk_hdr),
total_blks);
if (NO_FREE_SPACE == bml_full)
{
bit_clear(cr->blk / bplmap, MM_ADDR(csd));
if (cr->blk > cnl->highest_lbm_blk_changed)
cnl->highest_lbm_blk_changed = cr->blk;
change_bmm = TRUE;
} else if (!(bit_set(cr->blk / bplmap, MM_ADDR(csd))))
{
if (cr->blk > cnl->highest_lbm_blk_changed)
cnl->highest_lbm_blk_changed = cr->blk;
change_bmm = TRUE;
}
} /* end of bitmap processing */
if (certify_all_blocks)
cert_blk(reg, cr->blk, (blk_hdr_ptr_t)GDS_REL2ABS(cr->buffaddr), 0, TRUE); /* GTMASSERT on error */
bt = bt_put(reg, cr->blk);
if (NULL == bt) /* NULL value is only possible if wcs_get_space in bt_put fails */
GTMASSERT; /* That is impossible here since we have called bt_refresh above */
bt->killtn = csd->trans_hist.curr_tn; /* be safe; don't know when was last kill after recover */
if (CR_NOTVALID != bt->cache_index)
{ /* the bt already identifies another cache entry with this block */
cr_alt = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index);
assert(((blk_hdr_ptr_t)GDS_ANY_REL2ABS(csa, cr->buffaddr))->tn
>= ((blk_hdr_ptr_t)GDS_ANY_REL2ABS(csa, cr_alt->buffaddr))->tn);
assert((bt_rec_ptr_t)GDS_ANY_REL2ABS(csa, cr_alt->bt_index) == bt);
cr_alt->bt_index = 0; /* cr is more recent */
assert(LATCH_CLEAR <= WRITE_LATCH_VAL(cr_alt) && LATCH_CONFLICT >= WRITE_LATCH_VAL(cr_alt));
if (UNIX_ONLY(FALSE &&) LATCH_CLEAR < WRITE_LATCH_VAL(cr_alt))
{ /* the previous entry is of interest to some process and therefore must be WIP:
* twin and make this (cr->stopped) cache record the active one */
assert(0 == cr_alt->twin);
cr->twin = GDS_ANY_ABS2REL(csa, cr_alt);
cr_alt->twin = GDS_ANY_ABS2REL(csa, cr);
WRITE_LATCH_VAL(cr_alt) = LATCH_CONFLICT; /* semaphore state of a wip twin */
} else
{ /* the other copy is less recent and not WIP, so discard it */
if ((cr_alt < cr) && cr_alt->state_que.fl)
{ /* cr_alt has already been processed and is in the state_que. hence remove it */
wq = (cache_que_head_ptr_t)((sm_uc_ptr_t)&cr_alt->state_que + cr_alt->state_que.fl);
assert(0 == (((UINTPTR_T)wq) % SIZEOF(cr_alt->state_que.fl)));
assert((UINTPTR_T)wq + wq->bl == (UINTPTR_T)&cr_alt->state_que);
back_link = (que_ent_ptr_t)remqt((que_ent_ptr_t)wq);
assert(EMPTY_QUEUE != back_link);
SUB_ENT_FROM_ACTIVE_QUE_CNT(&cnl->wcs_active_lvl, &cnl->wc_var_lock);
assert(0 <= cnl->wcs_active_lvl);
assert(back_link == (que_ent *)&cr_alt->state_que);
/* Now that back_link is out of the active queue, reset its links to 0.
* The queue operation functions (see gtm_relqueopi.c) and Unix wcs_get_space
* rely on this to determine if an element is IN the queue or not.
*/
back_link->fl = 0;
back_link->bl = 0;
}
UNIX_ONLY(assert(!cr_alt->twin));
UNIX_ONLY(cr_alt->twin = 0;)
cr->twin = cr_alt->twin; /* existing cache record may have a twin */
cr_alt->cycle++; /* increment cycle whenever blk number changes (tp_hist depends on this) */
cr_alt->blk = CR_BLKEMPTY;
cr_alt->dirty = 0;
cr_alt->flushed_dirty_tn = 0;
cr_alt->in_tend = 0;
SHMPOOL_FREE_CR_RFMT_BLOCK(reg, csa, cr_alt);
WRITE_LATCH_VAL(cr_alt) = LATCH_CLEAR;
VMS_ONLY(cr_alt->iosb.cond = 0;)
cr_alt->jnl_addr = 0;
cr_alt->refer = FALSE;
cr_alt->twin = 0;
cnl->wc_in_free++;
UNIX_ONLY(assert(!cr->twin));
if (0 != cr->twin)
{ /* inherited a WIP twin from cr_alt, transfer the twin's affections */
cr_alt = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, cr->twin);
assert(((blk_hdr_ptr_t)GDS_ANY_REL2ABS(csa, cr->buffaddr))->tn
> ((blk_hdr_ptr_t)GDS_ANY_REL2ABS(csa, cr_alt->buffaddr))->tn);
assert(LATCH_CONFLICT == WRITE_LATCH_VAL(cr_alt)); /* semaphore for wip twin */
assert(0 == cr_alt->bt_index);
cr_alt->twin = GDS_ANY_ABS2REL(csa, cr);
}
} /* if (LATCH_CLEAR < WRITE_LATCH_VAL(cr_alt)) */
} /* if (CR_NOTVALID == cr_alt) */
bt->cache_index = (int4)GDS_ANY_ABS2REL(csa, cr);
cr->bt_index = GDS_ANY_ABS2REL(csa, bt);
cr->dirty = csd->trans_hist.curr_tn;
cr->flushed_dirty_tn = 0; /* need to be less than cr->dirty. we choose 0. */
cr->epid = 0;
cr->image_count = 0;
cr->in_tend = 0;
cr->data_invalid = 0;
WRITE_LATCH_VAL(cr) = LATCH_CLEAR;
VMS_ONLY(assert(0 == cr->iosb.cond));
VMS_ONLY(cr->iosb.cond = 0;)
cr->refer = TRUE;
cr->stopped = FALSE;
hq = (cache_que_head_ptr_t)(hash_hdr + (cr->blk % bt_buckets));
insqh((que_ent_ptr_t)&cr->blkque, (que_ent_ptr_t)hq);
insqt((que_ent_ptr_t)&cr->state_que, (que_ent_ptr_t)active_head);
ADD_ENT_TO_ACTIVE_QUE_CNT(&cnl->wcs_active_lvl, &cnl->wc_var_lock);
continue;
}
if ((CR_BLKEMPTY == cr->blk) || (0 == cr->dirty) VMS_ONLY(|| ((0 != cr->iosb.cond) && (0 == cr->bt_index))))
{ /* cache record has no valid buffer attached, or its contents are in the database,
* or it has a more recent twin so we don't even have to care how its write terminated */
cr->cycle++; /* increment cycle whenever blk number changes (tp_hist depends on this) */
cr->blk = CR_BLKEMPTY;
cr->bt_index = 0;
cr->data_invalid = 0;
cr->dirty = 0;
cr->flushed_dirty_tn = 0;
cr->epid = 0;
cr->image_count = 0;
cr->in_tend = 0;
SHMPOOL_FREE_CR_RFMT_BLOCK(reg, csa, cr);
WRITE_LATCH_VAL(cr) = LATCH_CLEAR;
VMS_ONLY(cr->iosb.cond = 0;)
cr->jnl_addr = 0;
cr->refer = FALSE;
cr->stopped = FALSE; /* reset cr->stopped just in case it has a corrupt value */
cnl->wc_in_free++;
continue;
}
if (cr->data_invalid)
{ /* Some process was shot (kill -9 in Unix and STOP/ID in VMS) in the middle of an update.
* In VMS, the kernel extension routine secshr_db_clnup would have rebuilt the block nevertheless.
* In Unix, no rebuild would have been attempted since no kernel extension routine is currently available.
* In either case, we do not want to discard this buffer so send a warning to the user and proceed.
*/
send_msg(VARLSTCNT(7) ERR_DBDANGER, 5, cr->data_invalid, cr->data_invalid, DB_LEN_STR(reg), cr->blk);
cr->data_invalid = 0;
}
if (cr->in_tend)
{ /* caught by a failure while in bg_update, and less recent than a cache record created by secshr_db_clnup */
if (UNIX_ONLY(FALSE &&) (LATCH_CONFLICT == WRITE_LATCH_VAL(cr)) VMS_ONLY( && (0 == cr->iosb.cond)))
{ /* must be WIP, with a currently active write */
assert(LATCH_CONFLICT >= WRITE_LATCH_VAL(cr));
hq = (cache_que_head_ptr_t)(hash_hdr + (cr->blk % bt_buckets));
WRITE_LATCH_VAL(cr) = LATCH_SET;
bt = bt_put(reg, cr->blk);
if (NULL == bt) /* NULL value is only possible if wcs_get_space in bt_put fails */
GTMASSERT; /* That is impossible here since we have called bt_refresh above */
bt->killtn = csd->trans_hist.curr_tn; /* be safe; don't know when was last kill after recover */
if (CR_NOTVALID == bt->cache_index)
{ /* no previous entry for this block; more recent cache record will twin when processed */
cr->bt_index = GDS_ANY_ABS2REL(csa, bt);
bt->cache_index = (int4)GDS_ANY_ABS2REL(csa, cr);
insqh((que_ent_ptr_t)&cr->blkque, (que_ent_ptr_t)hq);
} else
{ /* form the twin with the previous (and more recent) cache record */
cr_alt = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index);
assert(((blk_hdr_ptr_t)GDS_ANY_REL2ABS(csa, cr->buffaddr))->tn
< ((blk_hdr_ptr_t)GDS_ANY_REL2ABS(csa, cr_alt->buffaddr))->tn);
assert((bt_rec_ptr_t)GDS_ANY_REL2ABS(csa, cr_alt->bt_index) == bt);
assert(0 == cr_alt->twin);
cr_alt->twin = GDS_ANY_ABS2REL(csa, cr);
cr->twin = GDS_ANY_ABS2REL(csa, cr_alt);
cr->bt_index = 0;
insqt((que_ent_ptr_t)&cr->blkque, (que_ent_ptr_t)hq);
}
assert(cr->epid); /* before inserting into WIP queue, ensure there is a writer process for this */
insqt((que_ent_ptr_t)&cr->state_que, (que_ent_ptr_t)wip_head); /* this should be VMS only code */
} else
{ /* the [current] in_tend cache record is no longer of value and can be discarded */
cr->cycle++; /* increment cycle whenever blk number changes (tp_hist depends on this) */
cr->blk = CR_BLKEMPTY;
cr->bt_index = 0;
cr->dirty = 0;
cr->flushed_dirty_tn = 0;
cr->epid = 0;
cr->image_count = 0;
SHMPOOL_FREE_CR_RFMT_BLOCK(reg, csa, cr);
WRITE_LATCH_VAL(cr) = LATCH_CLEAR;
VMS_ONLY(cr->iosb.cond = 0;)
cr->jnl_addr = 0;
cnl->wc_in_free++;
}
cr->in_tend = 0;
cr->refer = FALSE;
continue;
}
if ((LATCH_SET > WRITE_LATCH_VAL(cr)) VMS_ONLY(|| (WRT_STRT_PNDNG == cr->iosb.cond)))
{ /* no process has an interest */
bt = bt_put(reg, cr->blk);
if (NULL == bt) /* NULL value is only possible if wcs_get_space in bt_put fails */
GTMASSERT; /* That is impossible here since we have called bt_refresh above */
bt->killtn = csd->trans_hist.curr_tn; /* be safe; don't know when was last kill after recover */
if (CR_NOTVALID == bt->cache_index)
{ /* no previous entry for this block */
bt->cache_index = (int4)GDS_ANY_ABS2REL(csa, cr);
cr->bt_index = GDS_ANY_ABS2REL(csa, bt);
cr->refer = TRUE;
hq = (cache_que_head_ptr_t)(hash_hdr + (cr->blk % bt_buckets));
insqh((que_ent_ptr_t)&cr->blkque, (que_ent_ptr_t)hq);
insqt((que_ent_ptr_t)&cr->state_que, (que_ent_ptr_t)active_head);
ADD_ENT_TO_ACTIVE_QUE_CNT(&cnl->wcs_active_lvl, &cnl->wc_var_lock);
} else
{ /* the bt already has an entry for the block */
cr_alt = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index);
assert((bt_rec_ptr_t)GDS_ANY_REL2ABS(csa, cr_alt->bt_index) == bt);
if (UNIX_ONLY(FALSE &&) LATCH_CLEAR < WRITE_LATCH_VAL(cr_alt))
{ /* the previous cache record is WIP, and the current cache record is the more recent twin */
assert(((blk_hdr_ptr_t)GDS_ANY_REL2ABS(csa, cr->buffaddr))->tn
> ((blk_hdr_ptr_t)GDS_ANY_REL2ABS(csa, cr_alt->buffaddr))->tn);
VMS_ONLY(assert(WRT_STRT_PNDNG != cr->iosb.cond));
cr_alt->bt_index = 0;
WRITE_LATCH_VAL(cr_alt) = LATCH_CONFLICT;
cr_alt->twin = GDS_ANY_ABS2REL(csa, cr);
cr->twin = GDS_ANY_ABS2REL(csa, cr_alt);
bt->cache_index = (int4)GDS_ANY_ABS2REL(csa, cr);
cr->bt_index = GDS_ANY_ABS2REL(csa, bt);
cr->refer = TRUE;
hq = (cache_que_head_ptr_t)(hash_hdr + (cr->blk % bt_buckets));
insqh((que_ent_ptr_t)&cr->blkque, (que_ent_ptr_t)hq);
insqt((que_ent_ptr_t)&cr->state_que, (que_ent_ptr_t)active_head);
ADD_ENT_TO_ACTIVE_QUE_CNT(&cnl->wcs_active_lvl, &cnl->wc_var_lock);
} else
{ /* previous cache record is more recent, coming from a cr->stopped record made by secshr_db_clnup:
* discard this copy as it is old */
assert(((blk_hdr_ptr_t)GDS_ANY_REL2ABS(csa, cr->buffaddr))->tn
<= ((blk_hdr_ptr_t)GDS_ANY_REL2ABS(csa, cr_alt->buffaddr))->tn);
assert(LATCH_CLEAR == WRITE_LATCH_VAL(cr_alt));
cr->cycle++; /* increment cycle whenever blk number changes (tp_hist depends on this) */
cr->blk = CR_BLKEMPTY;
cr->bt_index = 0;
cr->dirty = 0;
cr->flushed_dirty_tn = 0;
cr->jnl_addr = 0;
cr->refer = FALSE;
SHMPOOL_FREE_CR_RFMT_BLOCK(reg, csa, cr);
cnl->wc_in_free++;
}
}
cr->epid = 0;
cr->image_count = 0;
WRITE_LATCH_VAL(cr) = LATCH_CLEAR;
VMS_ONLY(assert(0 == cr->iosb.cond || WRT_STRT_PNDNG == cr->iosb.cond));
VMS_ONLY(cr->iosb.cond = 0;)
continue;
}
/* not in_tend and interlock.semaphore is not LATCH_CLEAR so cache record must be WIP */
assert(LATCH_CONFLICT >= WRITE_LATCH_VAL(cr));
VMS_ONLY(WRITE_LATCH_VAL(cr) = LATCH_SET;)
UNIX_ONLY(WRITE_LATCH_VAL(cr) = LATCH_CLEAR;)
hq = (cache_que_head_ptr_t)(hash_hdr + (cr->blk % bt_buckets));
bt = bt_put(reg, cr->blk);
if (NULL == bt) /* NULL value is only possible if wcs_get_space in bt_put fails */
GTMASSERT; /* That is impossible here since we have called bt_refresh above */
bt->killtn = csd->trans_hist.curr_tn; /* be safe; don't know when was last kill after recover */
if (CR_NOTVALID == bt->cache_index)
{ /* no previous entry for this block */
bt->cache_index = (int4)GDS_ANY_ABS2REL(csa, cr);
cr->bt_index = GDS_ANY_ABS2REL(csa, bt);
cr->refer = TRUE;
insqh((que_ent_ptr_t)&cr->blkque, (que_ent_ptr_t)hq);
} else
{ /* previous cache record must be more recent as this one is WIP */
cr_alt = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index);
assert(((blk_hdr_ptr_t)GDS_ANY_REL2ABS(csa, cr->buffaddr))->tn
< ((blk_hdr_ptr_t)GDS_ANY_REL2ABS(csa, cr_alt->buffaddr))->tn);
assert((bt_rec_ptr_t)GDS_ANY_REL2ABS(csa, cr_alt->bt_index) == bt);
VMS_ONLY(
assert(WRT_STRT_PNDNG != cr->iosb.cond);
assert(FALSE == cr_alt->wip_stopped);
WRITE_LATCH_VAL(cr) = LATCH_CONFLICT;
cr_alt->twin = GDS_ANY_ABS2REL(csa, cr);
cr->twin = GDS_ANY_ABS2REL(csa, cr_alt);
)
cr->bt_index = 0;
cr->refer = FALSE;
insqt((que_ent_ptr_t)&cr->blkque, (que_ent_ptr_t)hq);
}
VMS_ONLY(assert(cr->epid)); /* before inserting into WIP queue, ensure there is a writer process for this */
insqt((que_ent_ptr_t)&cr->state_que, (que_ent_ptr_t)wip_head);
UNIX_ONLY(ADD_ENT_TO_ACTIVE_QUE_CNT(&cnl->wcs_active_lvl, &cnl->wc_var_lock));
/* end of processing for a single cache record */
} /* end of processing all cache records */
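/* If any master bit map bits were changed while processing stopped bit map blocks above, bump the master map
 * transaction number and flush the file header so the change is not lost.
 */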
if (change_bmm)
{
csd->trans_hist.mm_tn++;
if (!reg->read_only)
fileheader_sync(reg);
}
if (FALSE == wcs_verify(reg, FALSE, TRUE)) /* expect_damage is FALSE, in_wcs_recover is TRUE */
GTMASSERT;
/* skip INCTN processing in case called from mu_rndwn_file().
* if called from mu_rndwn_file(), we have standalone access to shared memory so no need to increment db curr_tn
* or write inctn (since no concurrent GT.M process is present in order to restart because of this curr_tn change)
*/
if (!mu_rndwn_file_dbjnl_flush)
{
jpc = csa->jnl;
if (JNL_ENABLED(csd) && (NULL != jpc) && (NULL != jpc->jnl_buff))
{
assert(&FILE_INFO(jpc->region)->s_addrs == csa);
if (!jgbl.dont_reset_gbl_jrec_time)
{
SET_GBL_JREC_TIME; /* needed for jnl_ensure_open, jnl_put_jrt_pini and jnl_write_inctn_rec */
}
assert(jgbl.gbl_jrec_time);
jbp = jpc->jnl_buff;
/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
* of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
* journal records (if it decides to switch to a new journal file).
*/
ADJUST_GBL_JREC_TIME(jgbl, jbp);
jnl_status = jnl_ensure_open();
if (0 == jnl_status)
{
if (0 == jpc->pini_addr)
jnl_put_jrt_pini(csa);
save_inctn_opcode = inctn_opcode; /* in case caller does not expect inctn_opcode
to be changed here */
inctn_opcode = inctn_wcs_recover;
jnl_write_inctn_rec(csa);
inctn_opcode = save_inctn_opcode;
} else
jnl_file_lost(jpc, jnl_status);
}
INCREMENT_CURR_TN(csd);
}
csa->wbuf_dqd = 0; /* reset this so the wcs_wtstart below will work */
SIGNAL_WRITERS_TO_RESUME(csd);
in_wcs_recover = FALSE;
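/* With the cache rebuilt and writers signalled to resume, kick off a write cycle (DCLAST_WCS_WTSTART dispatches
 * wcs_wtstart) so the reconstructed active queue starts flushing; on VMS also reap any completed writes from the
 * WIP queue via wcs_wtfini.
 */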
if (!reg->read_only)
{
DCLAST_WCS_WTSTART(reg, 0, dummy_errno);
VMS_ONLY(
wcs_wtfini(gv_cur_region); /* try to free as many buffers from the wip queue if write is done */
)
}
if (backup_block_saved)
backup_buffer_flush(reg);
TP_CHANGE_REG(save_reg);
return;
}
#ifdef UNIX
#ifdef MM_FILE_EXT_OK
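/* wcs_mm_recover (Unix, MM_FILE_EXT_OK): another process extended the memory-mapped database file, so the current
 * mapping is stale. Unmap the old address range, re-mmap the (now larger) file, and let gds_map_moved fix up all
 * pointers into the old mapping. SIGALRM is blocked across the remap so that no timer-driven activity sees a
 * half-moved map.
 */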
void wcs_mm_recover(gd_region *reg)
{
int mm_prot;
INTPTR_T status;
struct stat stat_buf;
sm_uc_ptr_t old_base[2];
sigset_t savemask;
boolean_t need_to_restore_mask = FALSE, was_crit;
unix_db_info *udi;
assert(&FILE_INFO(reg)->s_addrs == cs_addrs);
assert(cs_addrs->hdr == cs_data);
if (!(was_crit = cs_addrs->now_crit) && !(cs_addrs->hdr->clustered))
grab_crit(gv_cur_region);
SET_TRACEABLE_VAR(cs_addrs->hdr->wc_blocked, FALSE);
if (cs_addrs->total_blks == cs_addrs->ti->total_blks)
{
/* I am the one who actually did the extension, don't need to remap again */
if (!was_crit)
rel_crit(gv_cur_region);
return;
}
mm_prot = cs_addrs->read_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
/* Block SIGALRM to ensure cs_data and cs_addrs are always in-sync / No IO in this period */
sigprocmask(SIG_BLOCK, &blockalrm, &savemask);
old_base[0] = cs_addrs->db_addrs[0];
old_base[1] = cs_addrs->db_addrs[1];
status = (INTPTR_T)munmap((caddr_t)old_base[0], (size_t)(old_base[1] - old_base[0]));
if (-1 != status)
{
udi = FILE_INFO(gv_cur_region);
FSTAT_FILE(udi->fd, &stat_buf, status);
status = (sm_long_t)(cs_addrs->db_addrs[0] = (sm_uc_ptr_t)mmap((caddr_t)NULL, (size_t)stat_buf.st_size,
mm_prot, GTM_MM_FLAGS, udi->fd, (off_t)0));
}
if (-1 == status)
{
sigprocmask(SIG_SETMASK, &savemask, NULL);
if (!was_crit)
rel_crit(gv_cur_region);
rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), errno);
}
/* In addition to updating the internal map values, gds_map_moved also updates cs_data to point to the remapped file */
gds_map_moved(cs_addrs->db_addrs[0], old_base[0], old_base[1], (off_t)stat_buf.st_size);
cs_addrs->total_blks = cs_addrs->ti->total_blks;
if (!was_crit)
rel_crit(gv_cur_region);
sigprocmask(SIG_SETMASK, &savemask, NULL);
return;
}
#else /* !MM_FILE_EXT_OK */
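/* Without MM file-extension support, wcs_mm_recover can only report the situation as a global overflow
 * (GBLOFLOW) against the current global key.
 */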
void wcs_mm_recover(gd_region *reg)
{
unsigned char *end, buff[MAX_ZWR_KEY_SZ];
if (NULL == (end = format_targ_key(buff, MAX_ZWR_KEY_SZ, gv_currkey, TRUE)))
end = &buff[MAX_ZWR_KEY_SZ - 1];
rts_error(VARLSTCNT(6) ERR_GBLOFLOW, 0, ERR_GVIS, 2, end - buff, buff);
return;
}
#endif
#elif defined(VMS)
void wcs_mm_recover(gd_region *reg)
{
unsigned char *end, buff[MAX_ZWR_KEY_SZ];
assert(&FILE_INFO(reg)->s_addrs == cs_addrs);
assert(cs_addrs->now_crit);
assert(cs_addrs->hdr == cs_data);
assert(!cs_addrs->hold_onto_crit);
/* MM file-extension recovery (remap) isn't yet implemented on VMS */
rel_crit(gv_cur_region);
if (NULL == (end = format_targ_key(buff, MAX_ZWR_KEY_SZ, gv_currkey, TRUE)))
end = &buff[MAX_ZWR_KEY_SZ - 1];
rts_error(VARLSTCNT(6) ERR_GBLOFLOW, 0, ERR_GVIS, 2, end - buff, buff);
return;
}
#else
# error UNSUPPORTED PLATFORM
#endif