fis-gtm/sr_port/secshr_db_clnup.c

1967 lines
79 KiB
C

/****************************************************************
* *
* Copyright 2001, 2013 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#ifdef VMS
#include <ssdef.h>
#include <psldef.h>
#include <descrip.h>
#endif
#include "gtm_inet.h"
#include "gtm_string.h"
#include "gdsroot.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "gdscc.h"
#include "min_max.h"
#include "gdsblkops.h"
#include "gdsbml.h"
#include "gdskill.h"
#include "copy.h"
#ifdef VMS
#include "lockconst.h"
#endif
#include "interlock.h"
#include "jnl.h"
#include "probe.h"
#include "buddy_list.h" /* needed for tp.h */
#include "hashtab_int4.h" /* needed for tp.h */
#include "tp.h"
#include "io.h"
#include "gtmsecshr.h"
#include "repl_msg.h"
#include "gtmsource.h"
#include "is_proc_alive.h"
#include "aswp.h"
#include "util.h"
#include "compswap.h"
#ifdef UNIX
#include "mutex.h"
#include "repl_instance.h" /* needed for JNLDATA_BASE_OFF macro */
#include "mupipbckup.h" /* needed for backup_block prototype */
#include "cert_blk.h" /* for CERT_BLK_IF_NEEDED macro */
#include "relqueopi.h" /* for INSQTI and INSQHI macros */
#include "caller_id.h"
#endif
#include "sec_shr_blk_build.h"
#include "sec_shr_map_build.h"
#include "add_inter.h"
#include "send_msg.h" /* for send_msg prototype */
#include "secshr_db_clnup.h"
#include "gdsbgtr.h"
#include "memcoherency.h"
#include "shmpool.h"
#include "wbox_test_init.h"
#ifdef GTM_SNAPSHOT
#include "db_snapshot.h"
#endif
#include "muextr.h"
#include "mupip_reorg.h"
/* This section documents DOs and DONTs about code used by GTMSECSHR on Alpha VMS. Any module linked into GTMSECSHR (see
* secshrlink.axp for the current list) must follow certain rules as GTMSECSHR provides user-defined system services
* (privileged image that runs in kernel mode). See "Creating User Written System Service" chapter of the "Programming Concepts"
* OpenVMS manual and the "Shareable Images Cookbook" available from the OpenVMS Wizard's page. SYS$EXAMPLES:uwss*.* is also a
* good reference.
*
** DO NOT use modulo (%) operation. If % is used, GTMSECSHR links with LIBOTS.EXE - an external shared image. This will result
* in "-SYSTEM-F-NOSHRIMG, privileged shareable image cannot have outbound calls" errors when GTMSECSHR is invoked. We might as
* well avoid division too.
*
** The only library/system calls allowed are SYS$ calls.
*
** No I/O allowed - any device, including operator console.
*
** Always PROBE memory before accessing it. If not, should SECSHR access invalid memory (out of bounds for instance) the machine
* will crash (BUGCHECK in VMS parlance). Remember, SECSHR is running in kernel mode!
*
** Both secshr_db_clnup.c and sec_shr_blk_build.c are compiled with /prefix=except=memmove. If any of the other modules used
* memmove, they would need special treatment as well.
*/
/* Second argument handed to util_out_print() by the DEBUG_LATCH diagnostics in this file */
#define FLUSH 1
/* String literals tagging the places in secshr_db_clnup that set wc_blocked.
 * NOTE(review): presumably reported together with ERR_WCBLOCKED; the uses are not visible in this part of the file - confirm.
 */
#define WCBLOCKED_NOW_CRIT_LIT "wcb_secshr_db_clnup_now_crit"
#define WCBLOCKED_WBUF_DQD_LIT "wcb_secshr_db_clnup_wbuf_dqd"
#define WCBLOCKED_PHASE2_CLNUP_LIT "wcb_secshr_db_clnup_phase2_clnup"
/* SECSHR_PROBE_REGION(reg) : validate a region before the enclosing loop body uses it.
 *
 * Probes, in order, the gd_region, its gd_segment, its file_control and its GDS_INFO for READ access and the
 * resulting sgmnt_addrs for WRITE access. It executes "continue" (so it MUST be expanded directly inside a loop
 * body) as soon as a probe fails, if the region is not open or is flagged was_open, or if the access method is
 * neither dba_bg nor dba_mm.
 *
 * IMPORTANT : SECSHR_PROBE_REGION sets csa (a variable the expansion site must have in scope) to the region's
 * sgmnt_addrs.
 *
 * "reg" is evaluated several times, so pass an expression without side effects. Every use of the argument is
 * parenthesized so that any pointer-valued expression can be passed safely.
 */
#define SECSHR_PROBE_REGION(reg)										\
	if (!GTM_PROBE(SIZEOF(gd_region), (reg), READ))								\
		continue; /* would be nice to notify the world of a problem but where and how?? */		\
	if (!(reg)->open || (reg)->was_open)									\
		continue;											\
	if (!GTM_PROBE(SIZEOF(gd_segment), (reg)->dyn.addr, READ))						\
		continue; /* would be nice to notify the world of a problem but where and how? */		\
	if ((dba_bg != (reg)->dyn.addr->acc_meth) && (dba_mm != (reg)->dyn.addr->acc_meth))			\
		continue;											\
	if (!GTM_PROBE(SIZEOF(file_control), (reg)->dyn.addr->file_cntl, READ))					\
		continue; /* would be nice to notify the world of a problem but where and how? */		\
	if (!GTM_PROBE(SIZEOF(GDS_INFO), (reg)->dyn.addr->file_cntl->file_info, READ))				\
		continue; /* would be nice to notify the world of a problem but where and how? */		\
	csa = &(FILE_INFO((reg)))->s_addrs;									\
	if (!GTM_PROBE(SIZEOF(sgmnt_addrs), csa, WRITE))							\
		continue; /* would be nice to notify the world of a problem but where and how? */		\
	/* region and csa must agree on whether the database is read-only */					\
	assert(((reg)->read_only && !csa->read_write) || (!(reg)->read_only && csa->read_write));
/* DEBUG_LATCH(x) expands to "x" only when DEBUG_CHECK_LATCH is defined at compile time; otherwise it expands to
 * nothing, so the latch diagnostics wrapped in it are compiled out.
 */
#ifdef DEBUG_CHECK_LATCH
# define DEBUG_LATCH(x) x
#else
# define DEBUG_LATCH(x)
#endif
#ifdef VMS
/* Use compswap_secshr instead of compswap in our expansions */
# define compswap compswap_secshr
/* On VMS the latch salvage macro expands to nothing */
# define SALVAGE_UNIX_LATCH(X, is_exiting)
#else
/* SALVAGE_UNIX_LATCH_DBCRIT(X, is_exiting, wcblocked) : release latch X if it is ours or orphaned.
 *
 *	X		pointer to the latch; its u.parts.latch_pid identifies the current holder.
 *	is_exiting	non-zero if this process is terminating.
 *	wcblocked	lvalue that is set to TRUE when an orphaned latch is salvaged.
 *
 * If the holder is this process (rundown_process_id), the latch is reset to LOCK_AVAILABLE, but only when
 * is_exiting is set. If the holder is some other non-zero pid that is_proc_alive() reports as dead, "wcblocked" is
 * set to TRUE and the latch is then released with COMPSWAP_UNLOCK. A latch held by a live process is left alone.
 * X is evaluated more than once, so pass an expression without side effects.
 */
# define SALVAGE_UNIX_LATCH_DBCRIT(X, is_exiting, wcblocked)							\
{	/* "wcblocked" is relevant only if X is the database crit semaphore. In this case, BEFORE salvaging crit,	\
	 * (but AFTER ensuring the previous holder pid is dead) we need to set cnl->wc_blocked to TRUE to		\
	 * ensure whoever grabs crit next does a cache-recovery. This is necessary in case previous holder of crit	\
	 * had set some cr->in_cw_set to a non-zero value. Not doing cache recovery could cause incorrect GTMASSERTs	\
	 * in PIN_CACHE_RECORD macro in t_end/tp_tend.								\
	 */													\
	uint4 pid;												\
														\
	if ((pid = (X)->u.parts.latch_pid) == rundown_process_id)						\
	{													\
		if (is_exiting)											\
		{												\
			SET_LATCH_GLOBAL(X, LOCK_AVAILABLE);							\
			DEBUG_LATCH(util_out_print("Latch cleaned up", FLUSH));					\
		}												\
	} else if (0 != pid && FALSE == is_proc_alive(pid, UNIX_ONLY(0) VMS_ONLY((X)->u.parts.latch_image_count))) \
	{													\
		(wcblocked) = TRUE;										\
		DEBUG_LATCH(util_out_print("Orphaned latch cleaned up", TRUE));					\
		COMPSWAP_UNLOCK((X), pid, (X)->u.parts.latch_image_count, LOCK_AVAILABLE, 0);			\
	}													\
}
/* The SALVAGE_UNIX_LATCH macro needs to do exactly the same thing as done by the SALVAGE_UNIX_LATCH_DBCRIT
 * macro except that we dont need any special set of wc_blocked to TRUE. So we pass in a dummy variable
 * (instead of cnl->wc_blocked) to be set to TRUE in case the latch is salvaged.
 *
 * Note: this comment sits between two macro definitions, not inside one, so it deliberately carries no
 * line-continuation backslashes (a trailing backslash after the comment would splice it onto the #define below).
 */
#define SALVAGE_UNIX_LATCH(X, is_exiting)									\
{														\
	boolean_t dummy;											\
														\
	SALVAGE_UNIX_LATCH_DBCRIT(X, is_exiting, dummy);							\
}
GBLREF uint4 process_id; /* Used in xxx_SWAPLOCK macros .. has same value as rundown_process_id on UNIX */
GBLREF volatile int4 crit_count;
#endif
/* Globals defined by this module. Most are pointers to (or copies of) state owned elsewhere; secshr_db_clnup
 * dereferences the pointer-valued ones only after a GTM_PROBE on them succeeds.
 * NOTE(review): where these are filled in is not visible in this part of the file - confirm against the caller.
 */
GBLDEF gd_addr_fn_ptr get_next_gdr_addrs; /* called with NULL, then with the previous result, to walk the gd_addr list;
					    * secshr_db_clnup returns immediately if this is NULL */
GBLDEF cw_set_element *cw_set_addrs; /* base of the non-TP cw-set array */
GBLDEF sgm_info **first_sgm_info_addrs;
GBLDEF sgm_info **first_tp_si_by_ftok_addrs; /* points at the head of the ftok-ordered sgm_info list used for TP */
GBLDEF unsigned char *cw_depth_addrs; /* points at the number of elements in the non-TP cw-set array */
GBLDEF uint4 rundown_process_id; /* pid on whose behalf cleanup is done; compared against latch/cache-record owner pids */
GBLDEF uint4 rundown_image_count;
GBLDEF int4 rundown_os_page_size;
GBLDEF gd_region **jnlpool_reg_addrs;
GBLDEF inctn_opcode_t *inctn_opcode_addrs;
GBLDEF inctn_detail_t *inctn_detail_addrs;
GBLDEF uint4 *dollar_tlevel_addrs; /* non-zero target value selects the TP logic in secshr_db_clnup */
GBLDEF uint4 *update_trans_addrs; /* target is tested against UPDTRNS_TCOMMIT_STARTED_MASK in the non-TP case */
GBLDEF sgmnt_addrs **cs_addrs_addrs; /* target is the csa examined for a non-TP update being underway */
GBLDEF sgmnt_addrs **kip_csa_addrs;
GBLDEF boolean_t *need_kip_incr_addrs;
GBLDEF trans_num *start_tn_addrs; /* target supplies the transaction number for a non-TP commit already in phase2 */
#ifdef UNIX
/* Globals defined elsewhere that the UNIX build of this module references.
 * (locknl used to be declared twice in this list; the redundant declaration has been dropped.)
 */
GBLREF short crash_count;
GBLREF inctn_opcode_t inctn_opcode;
GBLREF inctn_detail_t inctn_detail; /* holds detail to fill in to inctn jnl record */
GBLREF boolean_t dse_running;
GBLREF boolean_t certify_all_blocks;
GBLREF gd_region *gv_cur_region; /* for the LOCK_HIST macro in the RELEASE_BUFF_UPDATE_LOCK macro */
GBLREF node_local_ptr_t locknl; /* set explicitly before invoking RELEASE_BUFF_UPDATE_LOCK macro */
GBLREF int4 strm_index;
GBLREF jnl_gbls_t jgbl;
#endif
/* cs_addrs is referenced only in DEBUG builds */
#ifdef DEBUG
GBLREF sgmnt_addrs *cs_addrs;
#endif
/* NOTE(review): presumably makes the ERR_WCBLOCKED message code available to this module; its use is not visible
 * in this part of the file - confirm.
 */
error_def(ERR_WCBLOCKED);
/* How far GT.M had progressed in committing the cw-set-elements of one region when secshr_db_clnup took over.
 * Tracked per region while rolling forward a TP transaction.
 */
typedef enum
{
	/* GT.M has not committed even one cse in this region */
	REG_COMMIT_UNSTARTED = 0,
	/* GT.M has committed at least one but not all cses for this region */
	REG_COMMIT_PARTIAL = 1,
	/* GT.M has already committed all cw-set-elements for this region */
	REG_COMMIT_COMPLETE = 2
} commit_type;
/* Forward declaration; the definition is not in this part of the file.
 * NOTE(review): judging by the name and signature this looks up a cw-set-element at position "depth" starting
 * from "cs" and hands it back through "cs1" - confirm against the definition.
 */
boolean_t secshr_tp_get_cw(cw_set_element *cs, int depth, cw_set_element **cs1);
void secshr_db_clnup(enum secshr_db_state secshr_state)
{
unsigned char *chain_ptr;
char *wcblocked_ptr;
uint4 dlr_tlevel;
boolean_t is_bg, jnlpool_reg, do_accounting, first_time = TRUE, is_exiting;
boolean_t kip_csa_usable, needkipincr;
uint4 upd_trans; /* a copy of the global variable "update_trans" which is needed for VMS STOP/ID case */
boolean_t tp_update_underway = FALSE; /* set to TRUE if TP commit was in progress or complete */
boolean_t non_tp_update_underway = FALSE; /* set to TRUE if non-TP commit was in progress or complete */
boolean_t update_underway = FALSE; /* set to TRUE if either TP or non-TP commit was underway */
boolean_t set_wc_blocked = FALSE; /* set to TRUE if cnl->wc_blocked needs to be set */
boolean_t dont_reset_data_invalid; /* set to TRUE in case cr->data_invalid was TRUE in phase2 */
int max_bts;
unsigned int lcnt;
cache_rec_ptr_t clru, cr, cr_alt, cr_top, start_cr, actual_cr;
cache_que_heads_ptr_t cache_state;
cw_set_element *cs, *cs_ptr, *cs_top, *first_cw_set, *nxt, *orig_cs;
gd_addr *gd_header;
gd_region *reg, *reg_top;
jnl_buffer_ptr_t jbp;
off_chain chain;
sgm_info *si, *firstsgminfo;
sgmnt_addrs *csa, *csaddrs;
sgmnt_data_ptr_t csd;
node_local_ptr_t cnl;
sm_uc_ptr_t blk_ptr;
blk_hdr_ptr_t blk_hdr_ptr;
jnlpool_ctl_ptr_t jpl;
jnldata_hdr_ptr_t jh;
uint4 cumul_jnl_rec_len, jsize, new_write, imgcnt;
pid_t pid;
sm_uc_ptr_t bufstart;
int4 bufindx; /* should be the same type as "csd->bt_buckets" */
commit_type this_reg_commit_type; /* indicate the type of commit of a given region in a TP transaction */
gv_namehead *gvt = NULL, *gvtarget;
srch_blk_status *t1;
trans_num currtn;
int4 n;
# ifdef VMS
uint4 process_id; /* needed for the UNPIN_CACHE_RECORD macro */
# endif
GTM_SNAPSHOT_ONLY(
snapshot_context_ptr_t lcl_ss_ctx;
cache_rec_ptr_t snapshot_cr;
)
# ifdef UNIX
DCL_THREADGBL_ACCESS;
SETUP_THREADGBL_ACCESS;
# endif
if (NULL == get_next_gdr_addrs)
return;
/*
* secshr_db_clnup can be called with one of the following three values for "secshr_state"
*
* a) NORMAL_TERMINATION --> We are called from the exit-handler for precautionary cleanup.
* We should NEVER be in the midst of a database update in this case.
* b) COMMIT_INCOMPLETE --> We are called from t_commit_cleanup.
* We should ALWAYS be in the midst of a database update in this case.
* c) ABNORMAL_TERMINATION --> This is currently VMS ONLY. This process received a STOP/ID.
* We can POSSIBLY be in the midst of a database update in this case.
* When UNIX boxes allow kernel extensions, this can be made to handle "kill -9" too.
*
* If we are in the midst of a database update, then depending on the stage of the commit we are in,
* we need to ROLL-BACK (undo the partial commit) or ROLL-FORWARD (complete the partial commit) the database update.
*
* t_commit_cleanup handles the ROLL-BACK and secshr_db_clnup handles the ROLL-FORWARD
*
* For all error conditions in the database commit logic, t_commit_cleanup gets control first.
* It then determines whether to do a ROLL-BACK or a ROLL-FORWARD.
* If a ROLL-BACK needs to be done, then t_commit_cleanup handles it all by itself and we will not come here.
* If a ROLL-FORWARD needs to be done, then t_commit_cleanup invokes secshr_db_clnup.
* In this case, secshr_db_clnup will be called with a "secshr_state" value of "COMMIT_INCOMPLETE".
*
* In case of a STOP/ID in VMS, secshr_db_clnup is directly invoked with a "secshr_state" value of "ABNORMAL_TERMINATION".
* Irrespective of whether we are in the midst of a database commit or not, t_commit_cleanup does not get control.
* Since the process can POSSIBLY be in the midst of a database update while it was STOP/IDed,
* the logic for determining whether it is a ROLL-BACK or a ROLL-FORWARD needs to also be in secshr_db_clnup.
* If it is determined that a ROLL-FORWARD needs to be done, secshr_db_clnup takes care of it by itself.
* But if a ROLL-BACK needs to be done, then secshr_db_clnup DOES NOT invoke t_commit_cleanup.
* Instead it sets cnl->wc_blocked to TRUE thereby ensuring the next process that gets CRIT does a cache recovery
* which will take care of doing more than the ROLL-BACK that t_commit_cleanup would have otherwise done.
*
* The logic for determining if it is a ROLL-BACK or ROLL-FORWARD is explained below.
* The commit logic flow in tp_tend and t_end can be captured as follows. Note that in t_end there is only one region.
*
* 1) Get crit on all regions
* 2) Get crit on jnlpool
* 3) jnlpool_ctl->early_write_addr += delta;
* For each participating region being UPDATED
* {
* 4) csd->trans_hist.early_tn++;
* Write journal records
* 5) csa->hdr->reg_seqno = jnlpool_ctl->jnl_seqno + 1;
* }
* For each participating region being UPDATED
* {
* 6) csa->t_commit_crit = T_COMMIT_CRIT_PHASE1;
* For every cw-set-element of this region
* {
* 6a) Commit this particular block PHASE1 (inside crit).
* }
* 7) csa->t_commit_crit = T_COMMIT_CRIT_PHASE2;
* 8) csd->trans_hist.curr_tn++;
* }
* 9) jnlpool_ctl->write_addr = jnlpool_ctl->early_write_addr;
* 10) jnlpool_ctl->jnl_seqno++;
* 11) Release crit on all db regions
* 12) Release crit on jnlpool
* For each participating region being UPDATED
* {
* For every cw-set-element of this region
* {
* 13) Commit this particular block PHASE2 (outside crit).
* 14) cs->mode = gds_t_committed;
* }
* 15) csa->t_commit_crit = FALSE;
* }
*
* If a TP transaction has proceeded to step (6) for at least one region, then "tp_update_underway" is set to TRUE
* and the transaction cannot be rolled back but has to be committed. Otherwise the transaction is rolled back.
*
* If a non-TP transaction has proceeded to step (6), then "non_tp_update_underway" is set to TRUE
* and the transaction cannot be rolled back but has to be committed. Otherwise the transaction is rolled back.
*/
UNIX_ONLY(assert(rundown_process_id == process_id);)
VMS_ONLY(assert(rundown_process_id);)
VMS_ONLY(process_id = rundown_process_id;) /* used by the UNPIN_CACHE_RECORD macro */
is_exiting = (ABNORMAL_TERMINATION == secshr_state) || (NORMAL_TERMINATION == secshr_state);
if (GTM_PROBE(SIZEOF(*dollar_tlevel_addrs), dollar_tlevel_addrs, READ))
dlr_tlevel = *dollar_tlevel_addrs;
else
{
assert(FALSE);
dlr_tlevel = FALSE;
}
if (dlr_tlevel && GTM_PROBE(SIZEOF(*first_tp_si_by_ftok_addrs), first_tp_si_by_ftok_addrs, READ))
{ /* Determine update_underway for TP transaction. A similar check is done in t_commit_cleanup as well.
* Regions are committed in the ftok order using "first_tp_si_by_ftok". Also crit is released on each region
* as the commit completes. Take that into account while determining if update is underway.
*/
for (si = *first_tp_si_by_ftok_addrs; NULL != si; si = si->next_tp_si_by_ftok)
{
if (GTM_PROBE(SIZEOF(sgm_info), si, READ))
{
assert(GTM_PROBE(SIZEOF(cw_set_element), si->first_cw_set, READ) || (NULL == si->first_cw_set));
if (UPDTRNS_TCOMMIT_STARTED_MASK & si->update_trans)
{ /* Two possibilities.
* (a) case of duplicate set not creating any cw-sets but updating db curr_tn++.
* (b) Have completed commit for this region and have released crit on this region.
* (in a potentially multi-region TP transaction).
* In either case, update is underway and the transaction cannot be rolled back.
*/
tp_update_underway = TRUE;
update_underway = TRUE;
break;
}
if (GTM_PROBE(SIZEOF(cw_set_element), si->first_cw_set, READ))
{ /* Note that SECSHR_PROBE_REGION does a "continue" if any probes fail. */
csa = si->tp_csa;
if (!GTM_PROBE(SIZEOF(sgmnt_addrs), csa, READ))
continue;
if (T_UPDATE_UNDERWAY(csa))
{
tp_update_underway = TRUE;
update_underway = TRUE;
break;
}
}
} else
{
assert(FALSE);
break;
}
}
}
if (!dlr_tlevel)
{ /* determine update_underway for non-TP transaction */
upd_trans = FALSE;
if (GTM_PROBE(SIZEOF(*update_trans_addrs), update_trans_addrs, READ))
upd_trans = *update_trans_addrs;
csaddrs = NULL;
if (GTM_PROBE(SIZEOF(*cs_addrs_addrs), cs_addrs_addrs, READ))
csaddrs = *cs_addrs_addrs;
if (GTM_PROBE(SIZEOF(sgmnt_addrs), csaddrs, READ))
{
if (csaddrs->now_crit && (UPDTRNS_TCOMMIT_STARTED_MASK & upd_trans) || T_UPDATE_UNDERWAY(csaddrs))
{
non_tp_update_underway = TRUE; /* non-tp update was underway */
update_underway = TRUE;
}
}
}
/* Assert that if we had been called from t_commit_cleanup, we independently concluded that update is underway
* (as otherwise t_commit_cleanup would not have called us)
*/
assert((COMMIT_INCOMPLETE != secshr_state) || update_underway);
for (gd_header = (*get_next_gdr_addrs)(NULL); NULL != gd_header; gd_header = (*get_next_gdr_addrs)(gd_header))
{
if (!GTM_PROBE(SIZEOF(gd_addr), gd_header, READ))
break; /* if gd_header is accessible */
for (reg = gd_header->regions, reg_top = reg + gd_header->n_regions; reg < reg_top; reg++)
{
SECSHR_PROBE_REGION(reg); /* SECSHR_PROBE_REGION sets csa */
csd = csa->hdr;
if (!GTM_PROBE(SIZEOF(sgmnt_data), csd, WRITE))
{
assert(FALSE);
continue; /* would be nice to notify the world of a problem but where and how? */
}
cnl = csa->nl;
if (!GTM_PROBE(NODE_LOCAL_SIZE_DBS, cnl, WRITE))
{
assert(FALSE);
continue; /* would be nice to notify the world of a problem but where and how? */
}
is_bg = (csd->acc_meth == dba_bg);
do_accounting = FALSE; /* used by SECSHR_ACCOUNTING macro */
/* do SECSHR_ACCOUNTING only if holding crit (to avoid another process' normal termination call
* to secshr_db_clnup from overwriting whatever important information we wrote. if we are in
* crit, for the next process to overwrite us it needs to get crit which in turn will invoke
* wcs_recover which in turn will send whatever we wrote to the operator log).
* also cannot update csd if MM and read-only. take care of that too. */
if (csa->now_crit && (csa->read_write || is_bg))
{ /* start accounting */
cnl->secshr_ops_index = 0;
do_accounting = TRUE; /* used by SECSHR_ACCOUNTING macro */
}
SECSHR_ACCOUNTING(4); /* 4 is the number of arguments following including self */
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING(rundown_process_id);
SECSHR_ACCOUNTING(secshr_state);
if (csa->ti != &csd->trans_hist)
{
SECSHR_ACCOUNTING(4);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)csa->ti);
SECSHR_ACCOUNTING((INTPTR_T)&csd->trans_hist);
csa->ti = &csd->trans_hist; /* better to correct and proceed than to stop */
}
SECSHR_ACCOUNTING(3); /* 3 is the number of arguments following including self */
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING(csd->trans_hist.curr_tn);
if (is_exiting)
{ /* If we hold any latches in the node_local area, release them. Note we do not check
db_latch here because it is never used by the compare and swap logic but rather
the aswp logic. Since it is only used for the 3 state cache record lock and
separate recovery exists for it, we do not do anything with it here.
*/
SALVAGE_UNIX_LATCH(&cnl->wc_var_lock, is_exiting);
if (ABNORMAL_TERMINATION == secshr_state)
{
if (csa->timer)
{
if (-1 < cnl->wcs_timers) /* private flag is optimistic: dont overdo */
CAREFUL_DECR_CNT(&cnl->wcs_timers, &cnl->wc_var_lock);
csa->timer = FALSE;
}
if (csa->read_write && csa->ref_cnt)
{
assert(0 < cnl->ref_cnt);
csa->ref_cnt--;
assert(!csa->ref_cnt);
CAREFUL_DECR_CNT(&cnl->ref_cnt, &cnl->wc_var_lock);
}
}
if ((csa->in_wtstart) && (0 < cnl->in_wtstart))
{
CAREFUL_DECR_CNT(&cnl->in_wtstart, &cnl->wc_var_lock);
assert(0 < cnl->intent_wtstart);
if (0 < cnl->intent_wtstart)
CAREFUL_DECR_CNT(&cnl->intent_wtstart, &cnl->wc_var_lock);
}
csa->in_wtstart = FALSE; /* Let wcs_wtstart run for exit processing */
if (cnl->wcsflu_pid == rundown_process_id)
cnl->wcsflu_pid = 0;
}
set_wc_blocked = FALSE;
if (is_bg)
{
if ((0 == cnl->sec_size) || !GTM_PROBE(cnl->sec_size VMS_ONLY(* OS_PAGELET_SIZE), cnl, WRITE))
{
SECSHR_ACCOUNTING(3);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING(cnl->sec_size VMS_ONLY(* OS_PAGELET_SIZE));
assert(FALSE);
continue;
}
cache_state = csa->acc_meth.bg.cache_state;
if (!GTM_PROBE(SIZEOF(cache_que_heads), cache_state, WRITE))
{
SECSHR_ACCOUNTING(3);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cache_state);
assert(FALSE);
continue;
}
SALVAGE_UNIX_LATCH(&cache_state->cacheq_active.latch, is_exiting);
start_cr = cache_state->cache_array + csd->bt_buckets;
max_bts = csd->n_bts;
if (!GTM_PROBE((uint4)(max_bts * SIZEOF(cache_rec)), start_cr, WRITE))
{
SECSHR_ACCOUNTING(3);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)start_cr);
assert(FALSE);
continue;
}
cr_top = start_cr + max_bts;
if (is_exiting)
{
for (cr = start_cr; cr < cr_top; cr++)
{ /* walk the cache looking for incomplete writes and reads issued by self */
VMS_ONLY(
if ((0 == cr->iosb.cond) && (cr->epid == rundown_process_id))
{
cr->shmpool_blk_off = 0; /* Cut link to reformat blk */
cr->wip_stopped = TRUE;
}
)
SALVAGE_UNIX_LATCH(&cr->rip_latch, is_exiting);
if ((cr->r_epid == rundown_process_id) && (0 == cr->dirty) && (0 == cr->in_cw_set))
{ /* increment cycle for blk number changes (for tp_hist) */
cr->cycle++;
cr->blk = CR_BLKEMPTY;
/* ensure no bt points to this cr for empty blk */
assert(0 == cr->bt_index);
/* don't mess with ownership the I/O may not yet be cancelled;
* ownership will be cleared by whoever gets stuck waiting
* for the buffer */
}
}
}
}
first_cw_set = cs = NULL;
/* If tp_update_underway has been determined to be TRUE, then we are guaranteed we have a well formed
* ftok ordered linked list ("first_tp_si_by_ftok") so we can safely use this.
*/
if (tp_update_underway)
{ /* this is constructed to deal with the issue of reg != si->gv_cur_region
* due to the possibility of multiple global directories pointing to regions
* that resolve to the same physical file; was_open prevents processing the segment
* more than once, so this code matches on the file rather than the region to make sure
* that it gets processed at least once */
for (si = *first_tp_si_by_ftok_addrs; NULL != si; si = si->next_tp_si_by_ftok)
{
if (!GTM_PROBE(SIZEOF(sgm_info), si, READ))
{
SECSHR_ACCOUNTING(3);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)si);
assert(FALSE);
break;
} else if (!GTM_PROBE(SIZEOF(gd_region), si->gv_cur_region, READ))
{
SECSHR_ACCOUNTING(3);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)si->gv_cur_region);
assert(FALSE);
continue;
} else if (!GTM_PROBE(SIZEOF(gd_segment), si->gv_cur_region->dyn.addr, READ))
{
SECSHR_ACCOUNTING(3);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)si->gv_cur_region->dyn.addr);
assert(FALSE);
continue;
} else if (si->gv_cur_region->dyn.addr->file_cntl == reg->dyn.addr->file_cntl)
{
cs = si->first_cw_set;
if (cs && GTM_PROBE(SIZEOF(cw_set_element), cs, READ))
{
while (cs->high_tlevel)
{
if (GTM_PROBE(SIZEOF(cw_set_element),
cs->high_tlevel, READ))
cs = cs->high_tlevel;
else
{
SECSHR_ACCOUNTING(3);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs->high_tlevel);
assert(FALSE);
first_cw_set = cs = NULL;
break;
}
}
}
first_cw_set = cs;
break;
}
}
} else if (!dlr_tlevel && csa->t_commit_crit)
{
if (!GTM_PROBE(SIZEOF(unsigned char), cw_depth_addrs, READ))
{
SECSHR_ACCOUNTING(3);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cw_depth_addrs);
assert(FALSE);
} else
{ /* csa->t_commit_crit being TRUE is a clear cut indication that we have
* reached stage (6). ROLL-FORWARD the commit unconditionally.
*/
if (0 != *cw_depth_addrs)
{
first_cw_set = cs = cw_set_addrs;
cs_top = cs + *cw_depth_addrs;
}
/* else is the case where we had a duplicate set that did not update any cw-set */
assert(!tp_update_underway);
assert(non_tp_update_underway); /* should have already determined update is underway */
if (!non_tp_update_underway)
{ /* This is a situation where we are in non-TP and have a region that we hold
* crit in and are in the midst of commit but this region was not the current
* region when we entered secshr_db_clnup. This is an out-of-design situation
* that we want to catch in Unix (not VMS because it runs in kernel mode).
*/
UNIX_ONLY(GTMASSERT;) /* in Unix we want to catch this situation even in pro */
}
non_tp_update_underway = TRUE; /* just in case */
update_underway = TRUE; /* just in case */
}
}
assert(!tp_update_underway || (NULL == first_cw_set) || (NULL != si));
/* It is possible that we were in the midst of a non-TP commit for this region at or past stage (7)
* but first_cw_set is NULL. This is a case of duplicate SET with zero cw_set_depth. In this case,
* dont have any cw-set-elements to commit. The only thing remaining to do is steps (9) through (12)
* which are done later in this function.
*/
assert((FALSE == csa->t_commit_crit) || (T_COMMIT_CRIT_PHASE1 == csa->t_commit_crit)
|| (T_COMMIT_CRIT_PHASE2 == csa->t_commit_crit));
assert(!csa->t_commit_crit || (NULL != first_cw_set)); /* dont miss out committing a region */
/* Skip processing region in case of a multi-region TP transaction where this region is already committed */
assert((NULL == first_cw_set) || csa->now_crit || csa->t_commit_crit || tp_update_underway);
if ((csa->now_crit || csa->t_commit_crit) && (NULL != first_cw_set))
{
SECSHR_ACCOUNTING(6);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING(csa->now_crit);
SECSHR_ACCOUNTING(csa->t_commit_crit);
SECSHR_ACCOUNTING(csd->trans_hist.early_tn);
SECSHR_ACCOUNTING(csd->trans_hist.curr_tn);
assert(non_tp_update_underway || tp_update_underway);
assert(!non_tp_update_underway || !tp_update_underway);
if (is_bg)
{
clru = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, cnl->cur_lru_cache_rec_off);
lcnt = 0;
}
assert((T_COMMIT_CRIT_PHASE2 == csa->t_commit_crit) || csa->now_crit);
if (T_COMMIT_CRIT_PHASE1 == csa->t_commit_crit)
{ /* in PHASE1 so hold crit AND have noted down valid value in csa->prev_free_blks */
assert(NORMAL_TERMINATION != secshr_state); /* for normal termination we should not
* have been in the midst of commit */
assert(csa->now_crit);
csd->trans_hist.free_blocks = csa->prev_free_blks;
}
SECSHR_ACCOUNTING(tp_update_underway ? 6 : 7);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)first_cw_set);
SECSHR_ACCOUNTING(tp_update_underway);
SECSHR_ACCOUNTING(non_tp_update_underway);
if (!tp_update_underway)
{
SECSHR_ACCOUNTING((INTPTR_T)cs_top);
SECSHR_ACCOUNTING(*cw_depth_addrs);
} else
{
SECSHR_ACCOUNTING(si->cw_set_depth);
this_reg_commit_type = REG_COMMIT_UNSTARTED; /* assume GT.M did no commits in this region */
/* Note that "this_reg_commit_type" is uninitialized if "tp_update_underway" is not TRUE
* so should always be used within an "if (tp_update_underway)" */
}
/* Determine transaction number to use for the gvcst_*_build functions.
* If not phase2, then we have crit, so it is the same as the current database transaction number.
* If phase2, then we dont have crit, so use value stored in "start_tn" or "si->start_tn".
*/
if (T_COMMIT_CRIT_PHASE2 != csa->t_commit_crit)
currtn = csd->trans_hist.curr_tn;
else
{
if (!tp_update_underway)
{
if (GTM_PROBE(SIZEOF(*start_tn_addrs), start_tn_addrs, READ))
currtn = *start_tn_addrs;
else
{
assert(FALSE);
/* dont know how this is possible, but in this case use curr db tn - 1 */
currtn = csd->trans_hist.curr_tn - 1;
}
} else
currtn = si->start_tn;
assert(currtn < csd->trans_hist.curr_tn);
}
for (; (tp_update_underway && NULL != cs) || (!tp_update_underway && cs < cs_top);
cs = tp_update_underway ? orig_cs->next_cw_set : (cs + 1))
{
dont_reset_data_invalid = FALSE;
if (tp_update_underway)
{
orig_cs = cs;
if (cs && GTM_PROBE(SIZEOF(cw_set_element), cs, READ))
{
while (cs->high_tlevel)
{
if (GTM_PROBE(SIZEOF(cw_set_element),
cs->high_tlevel, READ))
cs = cs->high_tlevel;
else
{
SECSHR_ACCOUNTING(3);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs->high_tlevel);
assert(FALSE);
cs = NULL;
break;
}
}
}
}
if (!GTM_PROBE(SIZEOF(cw_set_element), cs, WRITE))
{
SECSHR_ACCOUNTING(3);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
assert(FALSE);
break;
}
if (gds_t_committed < cs->mode)
{
assert(n_gds_t_op != cs->mode);
if (n_gds_t_op > cs->mode)
{ /* Currently there are only three possibilities and each is in NON-TP.
* In each case, no need to do any block update so simulate commit.
*/
assert(!tp_update_underway);
assert((gds_t_write_root == cs->mode) || (gds_t_busy2free == cs->mode)
|| (gds_t_recycled2free == cs->mode));
/* Check if BG AND gds_t_busy2free and if so UNPIN corresponding
* cache-record. This needs to be done only if we hold crit as otherwise
* it means we have already done it in t_end. But to do this we need to
* pass the global variable array "cr_array" from GTM to GTMSECSHR which
* is better avoided. Since anyways we have crit at this point, we are
* going to set wc_blocked later which is going to trigger cache recovery
* that is going to unpin all the cache-records so we dont take the
* trouble to do it here.
*/
} else
{ /* Currently there are only two possibilities and both are in TP.
* In either case, need to simulate what tp_tend would have done which
* is to build a private copy right now if this is the first phase of
* commit (i.e. we hold crit) as this could be needed in the 2nd phase
* of KILL.
*/
assert(tp_update_underway);
assert((kill_t_write == cs->mode) || (kill_t_create == cs->mode));
if (csa->now_crit && (!cs->done))
{
# ifdef UNIX
/* Initialize cs->new_buff to non-NULL since sec_shr_blk_build
* expects this. For VMS, tp_tend would have done this already.
*/
if (NULL == cs->new_buff)
cs->new_buff = (unsigned char *)
get_new_free_element(si->new_buff_list);
# endif
assert(NULL != cs->new_buff);
blk_ptr = (sm_uc_ptr_t)cs->new_buff;
/* No need to probe blk_ptr as sec_shr_blk_build does that */
if (FALSE == sec_shr_blk_build(csa, csd, is_bg, cs, blk_ptr,
currtn))
{
SECSHR_ACCOUNTING(10);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING(cs->blk);
SECSHR_ACCOUNTING(cs->level);
SECSHR_ACCOUNTING(cs->done);
SECSHR_ACCOUNTING(cs->forward_process);
SECSHR_ACCOUNTING(cs->first_copy);
SECSHR_ACCOUNTING((INTPTR_T)cs->upd_addr);
SECSHR_ACCOUNTING((INTPTR_T)cs->new_buff);
assert(FALSE);
continue;
} else if (cs->ins_off != 0)
{
if ((cs->ins_off
> ((blk_hdr *)blk_ptr)->bsiz - SIZEOF(block_id))
|| (cs->ins_off
< (SIZEOF(blk_hdr) + SIZEOF(rec_hdr))))
{
SECSHR_ACCOUNTING(7);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING(cs->blk);
SECSHR_ACCOUNTING(cs->index);
SECSHR_ACCOUNTING(cs->ins_off);
SECSHR_ACCOUNTING(
((blk_hdr *)blk_ptr)->bsiz);
assert(FALSE);
continue;
}
if (cs->first_off == 0)
cs->first_off = cs->ins_off;
chain_ptr = blk_ptr + cs->ins_off;
chain.flag = 1;
/* note: currently only assert check of cs->index */
assert(tp_update_underway || (0 <= (short)cs->index));
assert(tp_update_underway
|| (&first_cw_set[cs->index] < cs));
chain.cw_index = cs->index;
chain.next_off = cs->next_off;
if (!(GTM_PROBE(SIZEOF(int4), chain_ptr, WRITE)))
{
SECSHR_ACCOUNTING(5);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING(cs->ins_off);
SECSHR_ACCOUNTING((INTPTR_T)chain_ptr);
assert(FALSE);
continue;
}
GET_LONGP(chain_ptr, &chain);
cs->ins_off = cs->next_off = 0;
}
cs->done = TRUE;
assert(NULL != cs->blk_target);
/* cert_blk cannot be done in VMS as it is a heavyweight routine
* and cannot be pulled into GTMSECSHR. Hence do it only in Unix.
*/
UNIX_ONLY(assert(NULL == gvt);)
UNIX_ONLY(CERT_BLK_IF_NEEDED(certify_all_blocks, gv_cur_region,
cs, cs->new_buff, gvt);)
}
}
cs->old_mode = (int4)cs->mode;
assert(0 < cs->old_mode);
cs->mode = gds_t_committed;
continue;
}
if (gds_t_committed == cs->mode)
{ /* already processed */
assert(0 < cs->old_mode);
if (T_COMMIT_CRIT_PHASE1 == csa->t_commit_crit)
{
assert(csa->now_crit);
csd->trans_hist.free_blocks -= cs->reference_cnt;
}
if (tp_update_underway)
{ /* We have seen at least one already-committed cse. Assume GT.M has
* committed ALL cses if this is the first one we are seeing. This
* will be later overridden if we see an uncommitted cse in this region.
* If we have already decided that the region is only partially committed,
* do not change that. It is possible to see uncommitted cses followed by
* committed cses in case of an error during phase2 because bitmaps
* (later cses) are committed in phase1 while the rest (early cses)
* are completely committed only in phase2.
*/
if (REG_COMMIT_UNSTARTED == this_reg_commit_type)
this_reg_commit_type = REG_COMMIT_COMPLETE;
}
cr = cs->cr;
assert(!dlr_tlevel || (gds_t_write_root != cs->old_mode));
assert(gds_t_committed != cs->old_mode);
if (gds_t_committed > cs->old_mode)
{
if (!GTM_PROBE(SIZEOF(cache_rec), cr, WRITE))
{
SECSHR_ACCOUNTING(4);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING((INTPTR_T)cr);
assert(FALSE);
} else if (rundown_process_id == cr->in_tend)
{ /* Not sure how this is possible */
assert(FALSE);
}
} else
{ /* For the kill_t_* case, cs->cr will be NULL as bg_update was not invoked
* and the cw-set-elements were memset to 0 in TP. But for gds_t_write_root
* and gds_t_busy2free, they are non-TP ONLY modes and cses are not
* initialized so we cannot check for NULL cr. Thankfully "n_gds_t_op" demarcates
* the boundaries between non-TP only and TP only modes. So use that.
*/
assert((n_gds_t_op > cs->old_mode) || (NULL == cr));
}
continue;
}
/* Since we are going to build blocks at this point, unconditionally set wc_blocked
* (after finishing commits) to trigger wcs_recover even though we might not be
* holding crit at this point.
*/
set_wc_blocked = TRUE;
assert(NORMAL_TERMINATION != secshr_state); /* for normal termination we should not
* have been in the midst of commit */
if (tp_update_underway)
{ /* Since the current cse has not been committed, this is a partial
* GT.M commit in this region even if we have already seen committed cses.
*/
this_reg_commit_type = REG_COMMIT_PARTIAL;
}
if (is_bg)
{
if (T_COMMIT_CRIT_PHASE2 != csa->t_commit_crit)
{ /* We are not yet in phase2 which means we hold crit on this region,
* so have to find out a free cache-record we can dump our updates onto.
*/
for ( ; lcnt++ < max_bts; )
{ /* find any available cr */
if (++clru >= cr_top)
clru = start_cr;
assert(!clru->stopped);
if (!clru->stopped && (0 == clru->dirty)
&& (0 == clru->in_cw_set)
&& (!clru->in_tend)
&& (-1 == clru->read_in_progress)
&& GTM_PROBE(csd->blk_size,
GDS_ANY_REL2ABS(csa, clru->buffaddr), WRITE))
break;
}
if (lcnt >= max_bts)
{
SECSHR_ACCOUNTING(9);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING(cs->blk);
SECSHR_ACCOUNTING(cs->tn);
SECSHR_ACCOUNTING(cs->level);
SECSHR_ACCOUNTING(cs->done);
SECSHR_ACCOUNTING(cs->forward_process);
SECSHR_ACCOUNTING(cs->first_copy);
assert(FALSE);
continue;
}
cr = clru;
cr->cycle++; /* increment cycle for blk number changes (for tp_hist) */
assert(cs->blk < csd->trans_hist.total_blks);
cr->blk = cs->blk;
assert(CR_BLKEMPTY != cr->blk);
cr->jnl_addr = cs->jnl_freeaddr;
cr->stopped = TRUE;
/* Keep cs->cr and t1->cr uptodate to ensure clue will be accurate */
cs->cr = cr;
cs->cycle = cr->cycle;
if (!IS_BITMAP_BLK(cs->blk))
{ /* Not a bitmap block, update clue history to reflect new cr */
assert((0 <= cs->level) && (MAX_BT_DEPTH > cs->level));
gvtarget = cs->blk_target;
assert((MAX_BT_DEPTH + 1)
== (SIZEOF(gvtarget->hist.h)
/ SIZEOF(gvtarget->hist.h[0])));
if ((0 <= cs->level) && (MAX_BT_DEPTH > cs->level)
&& GTM_PROBE(SIZEOF(gv_namehead), gvtarget, WRITE)
&& (0 != gvtarget->clue.end))
{
t1 = &gvtarget->hist.h[cs->level];
if (t1->blk_num == cs->blk)
{
t1->cr = cr;
t1->cycle = cs->cycle;
t1->buffaddr = (sm_uc_ptr_t)
GDS_ANY_REL2ABS(csa, cr->buffaddr);
}
}
}
} else
{ /* We are in PHASE2 of the commit (i.e. have completed PHASE1 for ALL cses)
* We have already picked out a cr for the commit. Use that.
*/
cr = cs->cr;
if (!GTM_PROBE(SIZEOF(cache_rec), cr, WRITE))
{
SECSHR_ACCOUNTING(4);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING((INTPTR_T)cr);
assert(FALSE);
continue;
}
if (rundown_process_id != cr->in_tend)
{ /* phase2 commit is already complete for this cse but we got
* interrupted before setting cs->mode to gds_t_committed.
* Possible that this cache-record is not placed in the active
* queue properly. Any case set_wc_blocked is already set so that
* should take care of invoking wcs_recover to fix the queues.
*/
assert(rundown_process_id != cr->in_cw_set);
assert(rundown_process_id != cr->data_invalid);
continue;
}
assert(rundown_process_id == cr->in_cw_set);
assert(cr->blk == cs->cr->blk);
if (cr->data_invalid)
{ /* Buffer is already in middle of update. Since blk builds are
* not redoable, db is in danger whether or not we redo the build.
* Since, skipping the build is guaranteed to give us integrity
* errors, we redo the build hoping it will have at least a 50%
* chance of resulting in a clean block. Make sure data_invalid
* flag is set until the next cache-recovery (wcs_recover will
* send a DBDANGER syslog message for this block to alert of
* potential database damage) by setting dont_reset_data_invalid.
*/
SECSHR_ACCOUNTING(6);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING((INTPTR_T)cr);
SECSHR_ACCOUNTING(cr->blk);
SECSHR_ACCOUNTING(cr->data_invalid);
assert(FALSE);
dont_reset_data_invalid = TRUE;
}
}
/* Check if online backup is in progress and if there is a before-image to write.
* If so need to store link to it so wcs_recover can back it up later. Cannot
* rely on precomputed value csa->backup_in_prog since it is not initialized
* if (cw_depth == 0) (see t_end.c). Hence using cnl->nbb explicitly in check.
* However, for snapshots we can rely on csa as it is computed under
* if (update_trans). Use cs->blk_prior_state's free status to ensure that FREE
* blocks are not back'ed up either by secshr_db_clnup or wcs_recover.
*/
if ((SNAPSHOTS_IN_PROG(csa) || (BACKUP_NOT_IN_PROGRESS != cnl->nbb))
&& (NULL != cs->old_block))
{
DEBUG_ONLY(GTM_SNAPSHOT_ONLY(snapshot_cr = NULL;)) /* Will be set below */
if (T_COMMIT_CRIT_PHASE2 != csa->t_commit_crit)
{ /* Set "cr->twin" to point to "cs->old_block". This is not normal
* usage since "twin" usually points to a cache-record. But this
* is a special case where we want to record the before-image
* somewhere for wcs_recover to see and we are not allowed division
* operations in secshr_db_clnup (which is required to find out the
* corresponding cache-record). Hence we store the relative offset
* of "cs->old_block". This is a special case where "cr->twin" can
* be non-zero even in Unix. wcs_recover will recognize this special
* usage of "twin" (since cr->stopped is non-zero as well) and fix
* it. Note that in VMS, it is possible to have two other crs for
* the same block cr1, cr2 which are each twinned so we could end
* up with the following twin configuration.
* cr1 <---> cr2 <--- cr
* Note cr->twin = cr2 is a one way link and stores "cs->old_block",
* while "cr1->twin" and "cr2->twin" store each other's cacherecord
* pointers.
*/
# ifdef UNIX
bufstart = (sm_uc_ptr_t)GDS_ANY_REL2ABS(csa, start_cr->buffaddr);
bufindx = (int4)(cs->old_block - bufstart) / csd->blk_size;
assert(0 <= bufindx);
assert(bufindx < csd->n_bts);
cr_alt = &start_cr[bufindx];
assert(cr_alt != cr);
assert(cs->blk == cr_alt->blk);
assert(rundown_process_id == cr_alt->in_cw_set);
snapshot_cr = cr_alt;
# endif
/* wcs_recover need not copy before images of FREE blocks
* to the backup buffer */
if (!WAS_FREE(cs->blk_prior_state))
cr->twin = GDS_ANY_ABS2REL(csa, cs->old_block);
} else
{ /* We have to finish phase2 update.
* If Unix, we backup the block right here instead of waiting for
* wcs_recover to do it. If VMS, we dont need to do anything as
* the block has already been backed up in phase1. See end of
* bg_update_phase1 for comment on why.
*/
# ifdef UNIX
/* The following check is similar to the one in BG_BACKUP_BLOCK
* and the one in wcs_recover (where backup_block is invoked)
*/
blk_hdr_ptr = (blk_hdr_ptr_t)cs->old_block;
assert(GDS_ANY_REL2ABS(csa, cr->buffaddr)
== (sm_uc_ptr_t)blk_hdr_ptr);
if (!WAS_FREE(cs->blk_prior_state) && (cr->blk >= cnl->nbb)
&& (0 == csa->shmpool_buffer->failed)
&& (blk_hdr_ptr->tn < csa->shmpool_buffer->backup_tn)
&& (blk_hdr_ptr->tn >= csa->shmpool_buffer->inc_backup_tn))
{
backup_block(csa, cr->blk, cr, NULL);
/* No need for us to flush the backup buffer.
* MUPIP BACKUP will anyways flush it at the end.
*/
}
snapshot_cr = cr;
# endif
}
# ifdef GTM_SNAPSHOT
if (SNAPSHOTS_IN_PROG(csa))
{
lcl_ss_ctx = SS_CTX_CAST(csa->ss_ctx);
assert(NULL != snapshot_cr);
assert((snapshot_cr == cr) || (snapshot_cr == cr_alt));
WRITE_SNAPSHOT_BLOCK(csa, snapshot_cr, NULL, snapshot_cr->blk,
lcl_ss_ctx);
}
# endif
}
if (T_COMMIT_CRIT_PHASE2 != csa->t_commit_crit)
{ /* Adjust blks_to_upgrd counter if not already done in phase1. The value of
* cs->old_mode if negative implies phase1 is complete on this cse so we
* dont need to do this adjustment again. If not we do the adjustment.
*/
assert((0 <= cs->old_mode) || (cs->old_mode == -cs->mode));
if (0 <= cs->old_mode)
{ /* the following code is very similar to that in bg_update */
if (gds_t_acquired == cs->mode)
{
if (GDSV4 == csd->desired_db_format)
{
INCR_BLKS_TO_UPGRD(csa, csd, 1);
}
} else
{
# ifdef DEBUG
/* secshr_db_clnup relies on the fact that cs->ondsk_blkver
* accurately reflects the on-disk block version of the
* block and therefore can be used to set cr->ondsk_blkver.
* Confirm this by checking that if a cr exists for this
* block, then that cr's ondsk_blkver matches with the cs.
* db_csh_get uses the global variable cs_addrs to determine
* the region. So make it uptodate temporarily holding its
* value in the local variable csaddrs.
*/
csaddrs = cs_addrs; /* save cs_addrs in local */
cs_addrs = csa; /* set cs_addrs for db_csh_get */
actual_cr = db_csh_get(cs->blk);
cs_addrs = csaddrs; /* restore cs_addrs */
/* actual_cr can be NULL if the block is NOT in the cache.
* It can be CR_NOTVALID if the cache record originally
* containing this block got reused for a different block
* (i.e. cr->stopped = 1) as part of secshr_db_clnup.
*/
assert((NULL == actual_cr)
|| ((cache_rec_ptr_t)CR_NOTVALID == actual_cr)
|| (cs->ondsk_blkver == actual_cr->ondsk_blkver));
# endif
cr->ondsk_blkver = cs->ondsk_blkver;
if (cr->ondsk_blkver != csd->desired_db_format)
{
if (GDSV4 == csd->desired_db_format)
{
if (gds_t_write_recycled != cs->mode)
INCR_BLKS_TO_UPGRD(csa, csd, 1);
} else
{
if (gds_t_write_recycled != cs->mode)
DECR_BLKS_TO_UPGRD(csa, csd, 1);
}
}
}
}
}
/* Before resetting cr->ondsk_blkver, ensure db_format in file header did not
* change in between phase1 (inside of crit) and phase2 (outside of crit).
* This is needed to ensure the correctness of the blks_to_upgrd counter.
*/
assert(currtn > csd->desired_db_format_tn);
cr->ondsk_blkver = csd->desired_db_format;
/* else we are in phase2 and all blks_to_upgrd manipulation is already done */
blk_ptr = (sm_uc_ptr_t)GDS_ANY_REL2ABS(csa, cr->buffaddr);
} else
{ /* access method is MM */
blk_ptr = MM_BASE_ADDR(csa) + (off_t)csd->blk_size * cs->blk;
if (!GTM_PROBE(csd->blk_size, blk_ptr, WRITE))
{
SECSHR_ACCOUNTING(7);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING(cs->blk);
SECSHR_ACCOUNTING((INTPTR_T)blk_ptr);
SECSHR_ACCOUNTING(csd->blk_size);
SECSHR_ACCOUNTING((INTPTR_T)(MM_BASE_ADDR(csa)));
assert(FALSE);
continue;
}
}
/* The following block of code rolls forward steps (6a) and/or (13) of the commit */
if (cs->mode == gds_t_writemap)
{
if (!GTM_PROBE(csd->blk_size, cs->old_block, READ))
{
SECSHR_ACCOUNTING(11);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING(cs->blk);
SECSHR_ACCOUNTING(cs->tn);
SECSHR_ACCOUNTING(cs->level);
SECSHR_ACCOUNTING(cs->done);
SECSHR_ACCOUNTING(cs->forward_process);
SECSHR_ACCOUNTING(cs->first_copy);
SECSHR_ACCOUNTING((INTPTR_T)cs->old_block);
SECSHR_ACCOUNTING(csd->blk_size);
assert(FALSE);
continue;
}
memmove(blk_ptr, cs->old_block, csd->blk_size);
if (FALSE == sec_shr_map_build(csa, (uint4*)cs->upd_addr, blk_ptr, cs,
currtn, BM_SIZE(csd->bplmap)))
{
SECSHR_ACCOUNTING(11);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING(cs->blk);
SECSHR_ACCOUNTING(cs->tn);
SECSHR_ACCOUNTING(cs->level);
SECSHR_ACCOUNTING(cs->done);
SECSHR_ACCOUNTING(cs->forward_process);
SECSHR_ACCOUNTING(cs->first_copy);
SECSHR_ACCOUNTING((INTPTR_T)cs->upd_addr);
SECSHR_ACCOUNTING((INTPTR_T)blk_ptr);
assert(FALSE);
continue;
}
} else
{
if (!tp_update_underway)
{
if (FALSE == sec_shr_blk_build(csa, csd, is_bg, cs, blk_ptr, currtn))
{
SECSHR_ACCOUNTING(10);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING(cs->blk);
SECSHR_ACCOUNTING(cs->level);
SECSHR_ACCOUNTING(cs->done);
SECSHR_ACCOUNTING(cs->forward_process);
SECSHR_ACCOUNTING(cs->first_copy);
SECSHR_ACCOUNTING((INTPTR_T)cs->upd_addr);
SECSHR_ACCOUNTING((INTPTR_T)blk_ptr);
assert(FALSE);
continue;
} else if (cs->ins_off)
{
if ((cs->ins_off >
((blk_hdr *)blk_ptr)->bsiz - SIZEOF(block_id))
|| (cs->ins_off < (SIZEOF(blk_hdr)
+ SIZEOF(rec_hdr)))
|| (0 > (short)cs->index)
|| ((cs - cw_set_addrs) <= cs->index))
{
SECSHR_ACCOUNTING(7);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING(cs->blk);
SECSHR_ACCOUNTING(cs->index);
SECSHR_ACCOUNTING(cs->ins_off);
SECSHR_ACCOUNTING(((blk_hdr *)blk_ptr)->bsiz);
assert(FALSE);
continue;
}
PUT_LONG((blk_ptr + cs->ins_off),
((cw_set_element *)(cw_set_addrs + cs->index))->blk);
if (((nxt = cs + 1) < cs_top)
&& (gds_t_write_root == nxt->mode))
{
if ((nxt->ins_off >
((blk_hdr *)blk_ptr)->bsiz - SIZEOF(block_id))
|| (nxt->ins_off < (SIZEOF(blk_hdr)
+ SIZEOF(rec_hdr)))
|| (0 > (short)nxt->index)
|| ((cs - cw_set_addrs) <= nxt->index))
{
SECSHR_ACCOUNTING(7);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)nxt);
SECSHR_ACCOUNTING(cs->blk);
SECSHR_ACCOUNTING(nxt->index);
SECSHR_ACCOUNTING(nxt->ins_off);
SECSHR_ACCOUNTING(
((blk_hdr *)blk_ptr)->bsiz);
assert(FALSE);
continue;
}
PUT_LONG((blk_ptr + nxt->ins_off),
((cw_set_element *)
(cw_set_addrs + nxt->index))->blk);
}
}
} else
{ /* TP */
if (cs->done == 0)
{
if (FALSE == sec_shr_blk_build(csa, csd, is_bg, cs, blk_ptr,
currtn))
{
SECSHR_ACCOUNTING(10);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING(cs->blk);
SECSHR_ACCOUNTING(cs->level);
SECSHR_ACCOUNTING(cs->done);
SECSHR_ACCOUNTING(cs->forward_process);
SECSHR_ACCOUNTING(cs->first_copy);
SECSHR_ACCOUNTING((INTPTR_T)cs->upd_addr);
SECSHR_ACCOUNTING((INTPTR_T)blk_ptr);
assert(FALSE);
continue;
}
if (cs->ins_off != 0)
{
if ((cs->ins_off
> ((blk_hdr *)blk_ptr)->bsiz
- SIZEOF(block_id))
|| (cs->ins_off
< (SIZEOF(blk_hdr) + SIZEOF(rec_hdr))))
{
SECSHR_ACCOUNTING(7);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING(cs->blk);
SECSHR_ACCOUNTING(cs->index);
SECSHR_ACCOUNTING(cs->ins_off);
SECSHR_ACCOUNTING(
((blk_hdr *)blk_ptr)->bsiz);
assert(FALSE);
continue;
}
if (cs->first_off == 0)
cs->first_off = cs->ins_off;
chain_ptr = blk_ptr + cs->ins_off;
chain.flag = 1;
chain.cw_index = cs->index;
/* note: currently no verification of cs->index */
chain.next_off = cs->next_off;
GET_LONGP(chain_ptr, &chain);
cs->ins_off = cs->next_off = 0;
}
} else
{
memmove(blk_ptr, cs->new_buff,
((blk_hdr *)cs->new_buff)->bsiz);
((blk_hdr *)blk_ptr)->tn = currtn;
}
if (cs->first_off)
{
for (chain_ptr = blk_ptr + cs->first_off; ;
chain_ptr += chain.next_off)
{
GET_LONGP(&chain, chain_ptr);
if ((1 == chain.flag)
&& ((chain_ptr - blk_ptr + SIZEOF(block_id))
<= ((blk_hdr *)blk_ptr)->bsiz)
&& (chain.cw_index < si->cw_set_depth)
&& (FALSE != secshr_tp_get_cw(
first_cw_set, chain.cw_index, &cs_ptr)))
{
PUT_LONG(chain_ptr, cs_ptr->blk);
if (0 == chain.next_off)
break;
} else
{
SECSHR_ACCOUNTING(11);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cs);
SECSHR_ACCOUNTING(cs->blk);
SECSHR_ACCOUNTING(cs->index);
SECSHR_ACCOUNTING((INTPTR_T)blk_ptr);
SECSHR_ACCOUNTING((INTPTR_T)chain_ptr);
SECSHR_ACCOUNTING(chain.next_off);
SECSHR_ACCOUNTING(chain.cw_index);
SECSHR_ACCOUNTING(si->cw_set_depth);
SECSHR_ACCOUNTING(
((blk_hdr *)blk_ptr)->bsiz);
assert(FALSE);
break;
}
}
}
} /* TP */
} /* non-map processing */
if (0 > cs->reference_cnt)
{ /* blocks were freed up */
assert(non_tp_update_underway);
UNIX_ONLY(
assert((&inctn_opcode == inctn_opcode_addrs)
&& (&inctn_detail == inctn_detail_addrs)
&& ((inctn_bmp_mark_free_gtm == inctn_opcode)
|| (inctn_bmp_mark_free_mu_reorg == inctn_opcode)
|| (inctn_blkmarkfree == inctn_opcode)
|| dse_running));
)
/* Check if we are freeing a V4 format block and if so decrement the
* blks_to_upgrd counter. Do not do this in case MUPIP REORG UPGRADE/DOWNGRADE
* is marking a recycled block as free (inctn_opcode is inctn_blkmarkfree).
*/
if ((NULL != inctn_opcode_addrs)
&& (GTM_PROBE(SIZEOF(*inctn_opcode_addrs), inctn_opcode_addrs, READ))
&& ((inctn_bmp_mark_free_gtm == *inctn_opcode_addrs)
|| (inctn_bmp_mark_free_mu_reorg == *inctn_opcode_addrs))
&& (NULL != inctn_detail_addrs)
&& (GTM_PROBE(SIZEOF(*inctn_detail_addrs), inctn_detail_addrs, READ))
&& (0 != inctn_detail_addrs->blknum_struct.blknum))
{
DECR_BLKS_TO_UPGRD(csa, csd, 1);
}
}
assert(!cs->reference_cnt || (T_COMMIT_CRIT_PHASE2 != csa->t_commit_crit));
if (csa->now_crit)
{ /* Even though we know cs->reference_cnt is guaranteed to be 0 if we are in
* phase2 of commit (see above assert), we still do not want to be touching
* free_blocks in the file header outside of crit as it could potentially
* result in an incorrect value of the free_blocks counter. This is because
* in between the time we note down the current value of free_blocks on the
* right hand side of the below expression and assign the same value to the
* left side, it is possible that a concurrent process holding crit could
* have updated the free_blocks counter. In that case, our update would
* result in incorrect values. Hence dont touch this field if phase2.
*/
csd->trans_hist.free_blocks -= cs->reference_cnt;
}
cs->old_mode = (int4)cs->mode;
assert(0 < cs->old_mode);
cs->mode = gds_t_committed; /* rolls forward step (14) */
UNIX_ONLY(
/* Do not do a cert_blk of bitmap here since it could give a DBBMMSTR error. The
* bitmap block build is COMPLETE only in wcs_recover so do the cert_blk there.
* Assert that the bitmap buffer will indeed go through cert_blk there.
*/
assert((cs->old_mode != gds_t_writemap) || !is_bg || cr->stopped);
if (cs->old_mode != gds_t_writemap)
{
assert(NULL == gvt);
CERT_BLK_IF_NEEDED(certify_all_blocks, reg, cs, blk_ptr, gvt);
}
)
if (is_bg && (rundown_process_id == cr->in_tend))
{ /* Reset cr->in_tend now that cr is uptodate. This way if at all wcs_recover
* sees cr->in_tend set, it can be sure that was leftover from an interrupted
* phase1 commit for which the complete commit happened in another cache-record
* which will have cr->stopped set so the in_tend cache-record can be discarded.
* Take this opportunity to reset data_invalid, in_cw_set and the write interlock
* as well thereby simulating exactly what bg_update_phase2 would have done.
* This is easily done in Unix using the INSQ*I macros. But in VMS, these macros
* will pull in extra routines (including wcs_sleep) into the privileged image
* GTMSECSHR which we want to avoid. Therefore in VMS, we decide to skip the
* part about re-inserting the dirty cache-record into the active queue.
* The VMS version of wcs_get_space.c needs to take this into account while
* it is waiting for a dirty cache-record (i.e. that the cache-record might not be in any queue).
*/
assert(T_COMMIT_CRIT_PHASE2 == csa->t_commit_crit);
if (!dont_reset_data_invalid)
cr->data_invalid = 0;
if (PROBE_EVEN(cr))
{ /* Release write interlock. The following code is very similar to that
* at the end of the function "bg_update_phase2".
*/
UNIX_ONLY(
/* Avoid using gv_cur_region in the LOCK_HIST macro that is
* used by the RELEASE_BUFF_UPDATE_LOCK macro by setting locknl
*/
locknl = cnl;
)
if (!cr->tn)
{
cr->jnl_addr = cs->jnl_freeaddr;
assert(LATCH_SET == WRITE_LATCH_VAL(cr));
# ifdef UNIX
/* cache-record was not dirty BEFORE this update.
* insert this in the active queue. See comment above for
* why this is done only in Unix and not VMS.
*/
n = INSQTI((que_ent_ptr_t)&cr->state_que,
(que_head_ptr_t)&cache_state->cacheq_active);
if (INTERLOCK_FAIL == n)
{
SECSHR_ACCOUNTING(7);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cr);
SECSHR_ACCOUNTING(cr->blk);
SECSHR_ACCOUNTING(n);
SECSHR_ACCOUNTING(cache_state->cacheq_active.fl);
SECSHR_ACCOUNTING(cache_state->cacheq_active.bl);
assert(FALSE);
}
ADD_ENT_TO_ACTIVE_QUE_CNT(&cnl->wcs_active_lvl, &cnl->wc_var_lock);
# endif
}
RELEASE_BUFF_UPDATE_LOCK(cr, n, &cnl->db_latch);
/* "n" holds the pre-release value in Unix and post-release value in VMS,
* so check that we did hold the lock before releasing it above */
UNIX_ONLY(assert(LATCH_CONFLICT >= n);)
UNIX_ONLY(assert(LATCH_CLEAR < n);)
VMS_ONLY(assert(LATCH_SET >= n);)
VMS_ONLY(assert(LATCH_CLEAR <= n);)
if (WRITER_BLOCKED_BY_PROC(n))
{
VMS_ONLY(
assert(LATCH_SET == WRITE_LATCH_VAL(cr));
RELEASE_BUFF_UPDATE_LOCK(cr, n, &cnl->db_latch);
assert(LATCH_CLEAR == n);
assert(0 != cr->epid);
assert(WRT_STRT_PNDNG == cr->iosb.cond);
cr->epid = 0;
cr->iosb.cond = 0;
cr->wip_stopped = FALSE;
)
# ifdef UNIX
n = INSQHI((que_ent_ptr_t)&cr->state_que,
(que_head_ptr_t)&cache_state->cacheq_active);
if (INTERLOCK_FAIL == n)
{
SECSHR_ACCOUNTING(7);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cr);
SECSHR_ACCOUNTING(cr->blk);
SECSHR_ACCOUNTING(n);
SECSHR_ACCOUNTING(cache_state->cacheq_active.fl);
SECSHR_ACCOUNTING(cache_state->cacheq_active.bl);
assert(FALSE);
}
# endif
}
}
assert(process_id == cr->in_cw_set);
UNPIN_CACHE_RECORD(cr);
assert(!cr->in_cw_set);
SECSHR_SHM_WRITE_MEMORY_BARRIER;
cr->in_tend = 0;
}
} /* for all cw_set entries */
/* Check if kill_in_prog flag in file header has to be incremented. */
if (tp_update_underway)
{ /* TP : Do this only if GT.M has not already completed the commit on this region. */
assert((REG_COMMIT_COMPLETE == this_reg_commit_type)
|| (REG_COMMIT_PARTIAL == this_reg_commit_type)
|| (REG_COMMIT_UNSTARTED == this_reg_commit_type));
/* We have already checked that "si" is READABLE. Check that it is WRITABLE since
* we might need to set "si->kip_csa" in the CAREFUL_INCR_KIP macro.
*/
if (GTM_PROBE(SIZEOF(sgm_info), si, WRITE))
{
kip_csa_usable = TRUE;
/* Take this opportunity to reset si->cr_array_index */
si->cr_array_index = 0;
} else
{
kip_csa_usable = FALSE;
assert(FALSE);
}
if (REG_COMMIT_COMPLETE != this_reg_commit_type)
{
if (kip_csa_usable && (NULL != si->kill_set_head) && (NULL == si->kip_csa))
CAREFUL_INCR_KIP(csd, csa, si->kip_csa);
} else
assert((NULL == si->kill_set_head) || (NULL != si->kip_csa));
assert((NULL == si->kill_set_head) || (NULL != si->kip_csa));
} else
{ /* Non-TP. Check need_kip_incr and value pointed to by kip_csa. */
assert(non_tp_update_underway);
/* Note that *kip_csa_addrs could be NULL if we are in the
* 1st phase of the M-kill and NON NULL if we are in the 2nd phase of the kill.
* Only if it is NULL, should we increment the kill_in_prog flag.
*/
kip_csa_usable =
(GTM_PROBE(SIZEOF(*kip_csa_addrs), kip_csa_addrs, WRITE))
? TRUE : FALSE;
assert(kip_csa_usable);
if (GTM_PROBE(SIZEOF(*need_kip_incr_addrs), need_kip_incr_addrs, WRITE))
needkipincr = *need_kip_incr_addrs;
else
{
needkipincr = FALSE;
assert(FALSE);
}
if (needkipincr && kip_csa_usable && (NULL == *kip_csa_addrs))
{
CAREFUL_INCR_KIP(csd, csa, *kip_csa_addrs);
*need_kip_incr_addrs = FALSE;
}
# ifdef UNIX
if (MUSWP_INCR_ROOT_CYCLE == TREF(in_mu_swap_root_state))
cnl->root_search_cycle++;
# endif
}
} /* if (NULL != first_cw_set) */
/* If the process is about to exit AND any kills are in progress (bitmap freeup phase of kill), mark
* kill_in_prog as abandoned. Non-TP and TP maintain kill_in_prog information in different structures
* so access them appropriately. Note that even for a TP transaction, the bitmap freeup happens as a
* non-TP transaction so checking dollar_tlevel is not enough to determine if we are in TP or non-TP.
* Thankfully first_sgm_info is guaranteed to be non-NULL in the case of a TP transaction that is
* temporarily running its bitmap freeup phase as a non-TP transaction. And for true non-TP
* transactions, first_sgm_info is guaranteed to be NULL. So we use this for the determination.
* But this global variable value is obtained by dereferencing first_sgm_info_addrs (due to the way
* GTMSECSHR runs as a separate privileged image in VMS). If the probe of first_sgm_info_addrs does
* not succeed (due to some corruption), then we have no clue about the nullness of first_sgm_info.
* Therefore we also check dlr_tlevel, since if that is TRUE we are guaranteed it is a TP
* transaction irrespective of the value of first_sgm_info. Note that we store the value of the global
* variable first_sgm_info in a local variable firstsgminfo (slightly different name) for clarity's sake.
*/
if (is_exiting)
{
if (GTM_PROBE(SIZEOF(*first_sgm_info_addrs), first_sgm_info_addrs, READ))
firstsgminfo = *first_sgm_info_addrs;
else
{
assert(FALSE);
firstsgminfo = NULL;
}
if (dlr_tlevel || (NULL != firstsgminfo))
{
si = csa->sgm_info_ptr;
kip_csa_usable = (GTM_PROBE(SIZEOF(sgm_info), si, WRITE)) ? TRUE : FALSE;
assert(kip_csa_usable);
/* Since the kill process cannot be completed, we need to decrement the KIP count
* and increment the abandoned_kills count.
*/
if (kip_csa_usable && (NULL != si->kill_set_head) && (NULL != si->kip_csa))
{
assert(csa == si->kip_csa);
CAREFUL_DECR_KIP(csd, csa, si->kip_csa);
CAREFUL_INCR_ABANDONED_KILLS(csd, csa);
} else
assert((NULL == si->kill_set_head) || (NULL == si->kip_csa));
} else if (!dlr_tlevel)
{
kip_csa_usable =
(GTM_PROBE(SIZEOF(*kip_csa_addrs), kip_csa_addrs, WRITE))
? TRUE : FALSE;
assert(kip_csa_usable);
if (kip_csa_usable && (NULL != *kip_csa_addrs) && (csa == *kip_csa_addrs))
{
assert(0 < (*kip_csa_addrs)->hdr->kill_in_prog);
CAREFUL_DECR_KIP(csd, csa, *kip_csa_addrs);
CAREFUL_INCR_ABANDONED_KILLS(csd, csa);
}
}
}
if (JNL_ENABLED(csd))
{
if (GTM_PROBE(SIZEOF(jnl_private_control), csa->jnl, WRITE))
{
jbp = csa->jnl->jnl_buff;
if (GTM_PROBE(SIZEOF(jnl_buffer), jbp, WRITE) && is_exiting)
{
SALVAGE_UNIX_LATCH(&jbp->fsync_in_prog_latch, is_exiting);
if (VMS_ONLY(csa->jnl->qio_active)
UNIX_ONLY(jbp->io_in_prog_latch.u.parts.latch_pid \
== rundown_process_id))
{
if (csa->jnl->dsk_update_inprog)
{
jbp->dsk = csa->jnl->new_dsk;
jbp->dskaddr = csa->jnl->new_dskaddr;
}
VMS_ONLY(
bci(&jbp->io_in_prog);
csa->jnl->qio_active = FALSE;
)
UNIX_ONLY(RELEASE_SWAPLOCK(&jbp->io_in_prog_latch));
}
if (jbp->free_update_pid == rundown_process_id)
{ /* Got shot in the midst of updating freeaddr/free in jnl_write.c
* Fix the values (possible only in VMS where we have kernel extension).
*/
UNIX_ONLY(assert(FALSE);)
assert(csa->now_crit);
jbp->free = csa->jnl->temp_free;
jbp->freeaddr = csa->jnl->new_freeaddr;
jbp->free_update_pid = 0;
DBG_CHECK_JNL_BUFF_FREEADDR(jbp);
}
if (jbp->blocked == rundown_process_id)
{
assert(csa->now_crit);
jbp->blocked = 0;
}
}
} else
{
SECSHR_ACCOUNTING(4);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)csa->jnl);
SECSHR_ACCOUNTING(SIZEOF(jnl_private_control));
assert(FALSE);
}
}
if (is_exiting && csa->freeze && csd->freeze == rundown_process_id && !csa->persistent_freeze)
{
csd->image_count = 0;
csd->freeze = 0;
}
if (is_bg && (csa->wbuf_dqd || csa->now_crit || csa->t_commit_crit || set_wc_blocked))
{ /* if csa->wbuf_dqd == TRUE, most likely failed during REMQHI in wcs_wtstart
* or db_csh_get. cache corruption is suspected so set wc_blocked.
* if csa->now_crit is TRUE, someone else should clean the cache, so set wc_blocked.
* if csa->t_commit_crit is TRUE, even if csa->now_crit is FALSE, we might need cache
* cleanup (e.g. cleanup of orphaned cnl->wcs_phase2_commit_pidcnt counter in case
* a process gets shot in the midst of DECR_WCS_PHASE2_COMMIT_PIDCNT macro before
* decrementing the shared counter but after committing the transaction otherwise)
* so set wc_blocked. This case is folded into phase2 cleanup case below.
* if set_wc_blocked is TRUE, need to clean up queues after phase2 commits.
*/
SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
if (csa->now_crit)
{
wcblocked_ptr = WCBLOCKED_NOW_CRIT_LIT;
BG_TRACE_PRO_ANY(csa, wcb_secshr_db_clnup_now_crit);
} else if (csa->wbuf_dqd)
{
wcblocked_ptr = WCBLOCKED_WBUF_DQD_LIT;
BG_TRACE_PRO_ANY(csa, wcb_secshr_db_clnup_wbuf_dqd);
} else
{
wcblocked_ptr = WCBLOCKED_PHASE2_CLNUP_LIT;
BG_TRACE_PRO_ANY(csa, wcb_secshr_db_clnup_phase2_clnup);
}
UNIX_ONLY(
/* cannot send oplog message in VMS as privileged routines cannot do I/O */
send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_STR(wcblocked_ptr),
rundown_process_id, &csd->trans_hist.curr_tn, DB_LEN_STR(reg));
)
}
csa->wbuf_dqd = 0; /* We can clear the flag now */
if (csa->wcs_pidcnt_incremented)
CAREFUL_DECR_WCS_PHASE2_COMMIT_PIDCNT(csa, cnl);
if (csa->now_crit)
{
if (csd->trans_hist.curr_tn == csd->trans_hist.early_tn - 1)
{ /* there can be at most one region in non-TP with different curr_tn and early_tn */
assert(!non_tp_update_underway || first_time);
assert(NORMAL_TERMINATION != secshr_state); /* for normal termination we should not
* have been in the midst of commit */
DEBUG_ONLY(first_time = FALSE;)
if (update_underway)
{
INCREMENT_CURR_TN(csd); /* roll forward step (8) */
} else
csd->trans_hist.early_tn = csd->trans_hist.curr_tn;
}
assert(csd->trans_hist.early_tn == csd->trans_hist.curr_tn);
if (GTM_PROBE(CRIT_SPACE(NUM_CRIT_ENTRY(csd)), csa->critical, WRITE))
{
/* ONLINE ROLLBACK can come here holding crit ONLY due to commit errors but NOT during
* process exiting as secshr_db_clnup during process exiting is always preceded by
* mur_close_files which does the rel_crit anyways. Assert that.
*/
UNIX_ONLY(assert(!csa->hold_onto_crit || !jgbl.onlnrlbk || !is_exiting));
if (!csa->hold_onto_crit || is_exiting)
{ /* Release crit but since it involves modifying more than one field, make sure
* we prevent interrupts while in this code. The global variable "crit_count"
* does this for us. See similar usage in rel_crit.c. We currently use this here
* only for Unix because in VMS, a global variable in GTMSHR is not accessible
* in GTMSECSHR image easily unless passed through init_secshr_addrs. Since in
* VMS, if we are here, we are already in a kernel level routine, we will not be
* interrupted by user level timer handlers (wcs_stale or wcs_clean_dbsync_ast)
* that care about the consistency of the crit values so it is okay not to
* explicitly prevent interrupts using "crit_count" in VMS.
*/
UNIX_ONLY(
assert(0 == crit_count);
crit_count++; /* prevent interrupts */
CRIT_TRACE(crit_ops_rw); /* see gdsbt.h for comment on placement */
)
if (cnl->in_crit == rundown_process_id)
cnl->in_crit = 0;
UNIX_ONLY(
csa->hold_onto_crit = FALSE;
DEBUG_ONLY(locknl = cnl;) /* for DEBUG_ONLY LOCK_HIST macro */
mutex_unlockw(reg, crash_count);/* roll forward step (11) */
assert(!csa->now_crit);
DEBUG_ONLY(locknl = NULL;) /* restore "locknl" to default value */
crit_count = 0;
)
VMS_ONLY(
mutex_stoprelw(csa->critical); /* roll forward step (11) */
csa->now_crit = FALSE;
)
UNSUPPORTED_PLATFORM_CHECK;
}
} else
{
SECSHR_ACCOUNTING(6);
SECSHR_ACCOUNTING(__LINE__);
SECSHR_ACCOUNTING((INTPTR_T)cnl);
SECSHR_ACCOUNTING(NODE_LOCAL_SIZE_DBS);
SECSHR_ACCOUNTING((INTPTR_T)csa->critical);
SECSHR_ACCOUNTING(CRIT_SPACE(NUM_CRIT_ENTRY(csd)));
assert(FALSE);
}
}
csa->t_commit_crit = FALSE; /* ensure we don't process this region again (rolls forward step (15)) */
if ((NORMAL_TERMINATION == secshr_state || ABNORMAL_TERMINATION == secshr_state)
&& GTM_PROBE(SHMPOOL_BUFFER_SIZE, csa->shmpool_buffer, WRITE))
{
if ((pid = csa->shmpool_buffer->shmpool_crit_latch.u.parts.latch_pid)
== rundown_process_id VMS_ONLY(&&)
VMS_ONLY((imgcnt = csa->shmpool_buffer->shmpool_crit_latch.u.parts.latch_image_count) \
== rundown_image_count))
{
if (is_exiting)
{ /* Tiz our lock. Force recovery to run and release */
csa->shmpool_buffer->shmpool_blocked = TRUE;
BG_TRACE_PRO_ANY(csa, shmpool_blkd_by_sdc);
SET_LATCH_GLOBAL(&csa->shmpool_buffer->shmpool_crit_latch, LOCK_AVAILABLE);
DEBUG_LATCH(util_out_print("Latch cleaned up", FLUSH));
}
} else if (0 != pid && FALSE == is_proc_alive(pid, 0))
{
/* Attempt to make it our lock so we can set blocked */
if (COMPSWAP_LOCK(&csa->shmpool_buffer->shmpool_crit_latch, pid, imgcnt,
rundown_process_id, rundown_image_count))
{ /* Now our lock .. set blocked and release. */
csa->shmpool_buffer->shmpool_blocked = TRUE;
BG_TRACE_PRO_ANY(csa, shmpool_blkd_by_sdc);
DEBUG_LATCH(util_out_print("Orphaned latch cleaned up", TRUE));
COMPSWAP_UNLOCK(&csa->shmpool_buffer->shmpool_crit_latch, rundown_process_id,
rundown_image_count, LOCK_AVAILABLE, 0);
} /* Else someone else took care of it */
}
}
#ifdef UNIX
/* All releases done now. Double check latch is really cleared */
if (GTM_PROBE(CRIT_SPACE(NUM_CRIT_ENTRY(csd)), csa->critical, WRITE))
{
/* as long as csa->hold_onto_crit is FALSE, we should have released crit if we held it at entry */
assert(!csa->now_crit || csa->hold_onto_crit);
SALVAGE_UNIX_LATCH_DBCRIT(&csa->critical->semaphore, is_exiting, cnl->wc_blocked);
SALVAGE_UNIX_LATCH(&csa->critical->crashcnt_latch, is_exiting);
SALVAGE_UNIX_LATCH(&csa->critical->prochead.latch, is_exiting);
SALVAGE_UNIX_LATCH(&csa->critical->freehead.latch, is_exiting);
}
#endif
} /* For all regions */
} /* For all glds */
if (jnlpool_reg_addrs && (GTM_PROBE(SIZEOF(*jnlpool_reg_addrs), jnlpool_reg_addrs, READ)))
{ /* although there is only one jnlpool reg, SECSHR_PROBE_REGION macro might do a "continue" and hence the for loop */
for (reg = *jnlpool_reg_addrs, jnlpool_reg = TRUE; jnlpool_reg && reg; jnlpool_reg = FALSE) /* only jnlpool reg */
{
SECSHR_PROBE_REGION(reg); /* SECSHR_PROBE_REGION sets csa */
if (csa->now_crit)
{
jpl = (jnlpool_ctl_ptr_t)((sm_uc_ptr_t)csa->critical - JNLPOOL_CTL_SIZE); /* see jnlpool_init() for
* relationship between
* critical and jpl */
if (GTM_PROBE(SIZEOF(jnlpool_ctl_struct), jpl, WRITE))
{
if ((jpl->early_write_addr > jpl->write_addr) && (update_underway))
{ /* we need to update journal pool to reflect the increase in jnl-seqno */
cumul_jnl_rec_len = (uint4)(jpl->early_write_addr - jpl->write_addr);
jh = (jnldata_hdr_ptr_t)((sm_uc_ptr_t)jpl + JNLDATA_BASE_OFF + jpl->write);
if (GTM_PROBE(SIZEOF(*jh), jh, WRITE) && 0 != (jsize = jpl->jnlpool_size))
{ /* Below chunk of code mirrors what is done in t_end/tp_tend */
/* Begin atomic stmnts. Follow same order as in t_end/tp_tend */
jh->jnldata_len = cumul_jnl_rec_len;
jh->prev_jnldata_len = jpl->lastwrite_len;
# ifdef UNIX
if (INVALID_SUPPL_STRM != strm_index)
{ /* Need to also update supplementary stream seqno */
assert(0 <= strm_index);
/* assert(strm_index < ARRAYSIZE(tjpl->strm_seqno)); */
ASSERT_INST_FILE_HDR_HAS_HISTREC_FOR_STRM(strm_index);
jpl->strm_seqno[strm_index]++;
}
# endif
jpl->lastwrite_len = cumul_jnl_rec_len;
SECSHR_SHM_WRITE_MEMORY_BARRIER;
/* Emulate
* jpl->write = (jpl->write + cumul_jnl_rec_len) % jsize;
* See note in DOs and DONTs about using % operator
*/
for (new_write = jpl->write + cumul_jnl_rec_len;
new_write >= jsize;
new_write -= jsize)
;
jpl->write = new_write;
jpl->write_addr += cumul_jnl_rec_len;
jpl->jnl_seqno++;
/* End atomic stmts */
/* the above takes care of rolling forward steps (9) and (10) of the
* commit flow */
}
}
#ifdef DEBUG
else if (jpl->early_write_addr > jpl->write_addr)
{ /* PRO code will do the right thing by overwriting that exact space in the jnlpool with
* the current transaction's journal records. For dbg though, it is better if
* secshr_db_clnup (which is invoked as part of exit handling) does the cleanup.
*/
assert(!update_underway);
jpl->early_write_addr = jpl->write_addr;
}
#endif
}
cnl = csa->nl;
if ((GTM_PROBE(NODE_LOCAL_SIZE_DBS, cnl, WRITE)) &&
(GTM_PROBE(JNLPOOL_CRIT_SPACE, csa->critical, WRITE)))
{
/* ONLINE ROLLBACK can come here holding crit ONLY due to commit errors but NOT during
* process exiting as secshr_db_clnup during process exiting is always preceded by
* mur_close_files which does the rel_crit anyways. Assert that.
*/
UNIX_ONLY(assert(!csa->hold_onto_crit || !jgbl.onlnrlbk || !is_exiting));
if (!csa->hold_onto_crit || is_exiting)
{
UNIX_ONLY(CRIT_TRACE(crit_ops_rw)); /* see gdsbt.h for comment on placement */
if (cnl->in_crit == rundown_process_id)
cnl->in_crit = 0;
UNIX_ONLY(
csa->hold_onto_crit = FALSE;
DEBUG_ONLY(locknl = cnl;) /* for DEBUG_ONLY LOCK_HIST macro */
mutex_unlockw(reg, 0); /* roll forward step (12) */
assert(!csa->now_crit);
DEBUG_ONLY(locknl = NULL;) /* restore "locknl" to default value */
)
VMS_ONLY(
mutex_stoprelw(csa->critical); /* roll forward step (12) */
csa->now_crit = FALSE;
)
/* the above takes care of rolling forward step (12) of the commit flow */
}
}
}
/* as long as csa->hold_onto_crit is FALSE, we should have released crit if we held it at entry */
UNIX_ONLY(assert(!csa->now_crit || csa->hold_onto_crit));
}
}
return;
}
/* Position *cs1 on a cw-set-element reached from "cs" in two phases:
 *   1) follow the "next_cw_set" link "depth" times (no links are followed if depth is zero or negative);
 *   2) from the element reached, keep following "high_tlevel" links until an element whose "high_tlevel" is NULL.
 * Every element is checked with GTM_PROBE (READ access) before any of its fields is examined.
 *
 * Parameters
 *   cs    : element at which the walk begins.
 *   depth : number of "next_cw_set" links to follow in phase 1.
 *   cs1   : output; updated in place as the walk advances.
 *
 * Returns
 *   FALSE (with *cs1 reset to NULL) if a probe fails while walking "next_cw_set" in phase 1 or if a non-NULL
 *         "high_tlevel" link fails its probe in phase 2.
 *   TRUE  otherwise. Note that *cs1 can be NULL in this case (the "next_cw_set" chain ended), and that if the
 *         element reached by phase 1 is non-NULL but fails its probe, phase 2 is skipped and *cs1 is left
 *         pointing at that element.
 */
boolean_t secshr_tp_get_cw(cw_set_element *cs, int depth, cw_set_element **cs1)
{
	int	hops_left;

	*cs1 = cs;
	/* Phase 1 : horizontal walk. Each element is probed before its "next_cw_set" field is read. */
	for (hops_left = depth; 0 < hops_left; hops_left--)
	{
		if (GTM_PROBE(SIZEOF(cw_set_element), *cs1, READ))
		{
			*cs1 = (*cs1)->next_cw_set;
			continue;
		}
		*cs1 = NULL;
		return FALSE;
	}
	/* Nothing to climb if the chain ended or the element reached is not readable; *cs1 stays as it is */
	if ((NULL == *cs1) || !(GTM_PROBE(SIZEOF(cw_set_element), *cs1, READ)))
		return TRUE;
	/* Phase 2 : vertical walk. The target of each "high_tlevel" link is probed before we step onto it. */
	for ( ; NULL != (*cs1)->high_tlevel; *cs1 = (*cs1)->high_tlevel)
	{
		if (!(GTM_PROBE(SIZEOF(cw_set_element), (*cs1)->high_tlevel, READ)))
		{
			*cs1 = NULL;
			return FALSE;
		}
	}
	return TRUE;
}