fis-gtm/sr_port/tp_tend.c

2320 lines
97 KiB
C

/****************************************************************
* *
* Copyright 2001, 2012 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include <stddef.h> /* for offsetof macro */
#include <signal.h> /* for VSIG_ATOMIC_T type */
#ifdef UNIX
#include "gtm_stdio.h"
#endif
#include "gtm_time.h"
#include "gtm_inet.h" /* Required for gtmsource.h */
#include "gtm_string.h"
#ifdef VMS
#include <descrip.h> /* Required for gtmsource.h */
#endif
#include "gtm_ctype.h"
#include "cdb_sc.h"
#include "gdsroot.h"
#include "gdskill.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "gdscc.h"
#include "gdsbml.h"
#include "min_max.h" /* needed for gdsblkops.h */
#include "gdsblkops.h" /* needed for recompute_upd_array routine */
#include "ccp.h"
#include "copy.h"
#include "error.h"
#include "iosp.h"
#include "jnl.h"
#include "jnl_typedef.h"
#include "buddy_list.h" /* needed for tp.h */
#include "hashtab_int4.h" /* needed for tp.h */
#include "tp.h"
#include "interlock.h"
#include "gdsbgtr.h"
#include "repl_msg.h"
#include "gtmsource.h"
#include "t_commit_cleanup.h"
#include "mupipbckup.h"
#include "gvcst_blk_build.h"
#include "gvcst_protos.h" /* for gvcst_search_blk prototype */
#include "cache.h"
#include "rc_cpt_ops.h"
#include "wcs_flu.h"
#include "jnl_write_pblk.h"
#include "jnl_write.h"
#include "process_deferred_stale.h"
#include "wcs_backoff.h"
#include "mm_update.h"
#include "bg_update.h"
#include "wcs_get_space.h"
#include "wcs_timer_start.h"
#include "send_msg.h"
#include "add_inter.h"
#include "t_qread.h"
#include "memcoherency.h"
#include "jnl_get_checksum.h"
#include "wbox_test_init.h"
#include "cert_blk.h"
#include "have_crit.h"
#include "bml_status_check.h"
#include "gtmimagename.h"
#ifdef UNIX
#include "gtmrecv.h"
#include "deferred_signal_handler.h"
#include "repl_instance.h"
#endif
#include "shmpool.h"
#ifdef GTM_SNAPSHOT
#include "db_snapshot.h"
#endif
#include "is_proc_alive.h"
GBLREF uint4 dollar_tlevel;
GBLREF uint4 dollar_trestart;
GBLREF gd_region *gv_cur_region;
GBLREF sgmnt_addrs *cs_addrs;
GBLREF sgmnt_data_ptr_t cs_data;
GBLREF sgm_info *first_sgm_info, *sgm_info_ptr;
GBLREF sgm_info *first_tp_si_by_ftok; /* List of participating regions in the TP transaction sorted on ftok order */
GBLREF tp_region *tp_reg_list;
GBLREF boolean_t tp_kill_bitmaps;
GBLREF unsigned char t_fail_hist[CDB_MAX_TRIES];
GBLREF int4 n_pvtmods, n_blkmods;
GBLREF unsigned int t_tries;
GBLREF jnl_fence_control jnl_fence_ctl;
GBLREF jnlpool_addrs jnlpool;
GBLREF jnlpool_ctl_ptr_t jnlpool_ctl, temp_jnlpool_ctl;
GBLREF boolean_t is_updproc;
GBLREF seq_num seq_num_zero;
GBLREF seq_num seq_num_one;
GBLREF int gv_fillfactor;
GBLREF char *update_array, *update_array_ptr;
GBLREF int rc_set_fragment;
GBLREF uint4 update_array_size, cumul_update_array_size;
GBLREF boolean_t unhandled_stale_timer_pop;
GBLREF jnl_gbls_t jgbl;
GBLREF struct_jrec_tcom tcom_record;
GBLREF boolean_t certify_all_blocks;
GBLREF boolean_t gvdupsetnoop; /* if TRUE, duplicate SETs update journal but not database (except for curr_tn++) */
GBLREF gv_namehead *gv_target;
GBLREF trans_num local_tn; /* transaction number for THIS PROCESS */
GBLREF uint4 process_id;
#ifdef UNIX
GBLREF recvpool_addrs recvpool;
GBLREF int4 strm_index;
#endif
#ifdef VMS
GBLREF boolean_t tp_has_kill_t_cse; /* cse->mode of kill_t_write or kill_t_create got created in this transaction */
#endif
#ifdef GTM_TRIGGER
GBLREF boolean_t skip_dbtriggers; /* see gbldefs.c for description of this global */
GBLREF int4 gtm_trigger_depth;
#endif
#ifdef DEBUG
GBLREF boolean_t mupip_jnl_recover;
#endif
error_def(ERR_DLCKAVOIDANCE);
error_def(ERR_JNLFILOPN);
error_def(ERR_JNLFLUSH);
error_def(ERR_JNLTRANS2BIG);
error_def(ERR_REPLOFFJNLON);
error_def(ERR_TEXT);
/* SET_REG_SEQNO_IF_REPLIC(CSA, TJPL, SUPPLEMENTARY, NEXT_STRM_SEQNO)
 *
 * Updates the sequence number fields in the file header of region CSA, but only if REPL_ALLOWED(CSA) is true:
 *   - asserts that CSA->hdr->reg_seqno is still below TJPL->jnl_seqno and then sets reg_seqno to TJPL->jnl_seqno;
 *   - inside UNIX_ONLY: if SUPPLEMENTARY is non-zero, sets CSA->hdr->strm_reg_seqno[strm_index] to NEXT_STRM_SEQNO
 *     ("strm_index" is not a macro parameter; it resolves to whatever is in scope at the point of use);
 *   - inside VMS_ONLY: if is_updproc is set, copies jgbl.max_resync_seqno into CSA->hdr->resync_seqno.
 * NOTE(review): UNIX_ONLY/VMS_ONLY presumably keep their argument only on the named platform -- confirm in mdef.h.
 *
 * Usage notes:
 *   - The macro expands to a bare { ... } block (not do/while(0)), so take care when using it as the body of an
 *     unbraced if/else.
 *   - CSA and TJPL are substituted textually, unparenthesized, and more than once. Pass simple lvalue
 *     expressions without side effects.
 */
#define SET_REG_SEQNO_IF_REPLIC(CSA, TJPL, SUPPLEMENTARY, NEXT_STRM_SEQNO) \
{ \
GBLREF jnl_gbls_t jgbl; \
GBLREF boolean_t is_updproc; \
UNIX_ONLY(GBLREF recvpool_addrs recvpool;) \
\
if (REPL_ALLOWED(CSA)) \
{ \
assert(CSA->hdr->reg_seqno < TJPL->jnl_seqno); \
CSA->hdr->reg_seqno = TJPL->jnl_seqno; \
UNIX_ONLY( \
if (SUPPLEMENTARY) \
{ \
CSA->hdr->strm_reg_seqno[strm_index] = NEXT_STRM_SEQNO; \
} \
) \
VMS_ONLY( \
if (is_updproc) \
CSA->hdr->resync_seqno = jgbl.max_resync_seqno; \
) \
} \
}
/* Prototypes for helper routines, declared ahead of the functions in this file that call them. */
boolean_t reallocate_bitmap(sgm_info *si, cw_set_element *bml_cse);
/* recompute_upd_array is called from the history validation loop of tp_tend for a block that has changed since
 * it was read, when its cw-set element carries a recompute list; any status other than cdb_sc_normal is treated
 * there as a block-modified restart.
 */
enum cdb_sc recompute_upd_array(srch_blk_status *hist1, cw_set_element *cse);
boolean_t tp_crit_all_regions()
{
	int			pass;
	boolean_t		all_locked;
	tp_region		*cur, *release_end;
	sgmnt_addrs		*reg_csa;
	sgm_info		*reg_si;
	sgmnt_data_ptr_t	reg_csd;
	gd_region		*cur_reg;

	assert(dollar_tlevel);
	/* Obtain crit on every region that participates in the current TP transaction.
	 *
	 * This routine lives next to tp_tend because both use the same technique and should be kept in step.
	 * The walk is over tp_reg_list (every region touched by the transaction) rather than first_tp_si_by_ftok,
	 * because the latter is only built later, inside tp_tend. tp_reg_list is sorted by the ftok of the
	 * underlying database file, so all processes acquire crit in the same order; that is why it is safe
	 * to block in grab_crit on each region without fear of deadlock.
	 *
	 * If a region that this transaction updates turns out to be frozen, everything grabbed so far (including
	 * crit on the frozen region itself) is released, we wait for the freeze to lift, and the whole sequence is
	 * retried from the head of the list. The routine only ever returns TRUE.
	 */
	pass = 0;
	do
	{
		all_locked = TRUE;	/* optimistic until a frozen region that we update is found */
		release_end = NULL;
		cur = tp_reg_list;
		while (NULL != cur)
		{
			cur_reg = cur->reg;
			reg_csa = &FILE_INFO(cur_reg)->s_addrs;
			reg_csd = reg_csa->hdr;
			reg_si = (sgm_info *)(reg_csa->sgm_info_ptr);
			DEBUG_ONLY(
				/* Count retries (second and later passes) in debug builds */
				if (0 != pass)
				{
					BG_TRACE_ANY(reg_csa, tp_crit_retries);
				}
			)
			assert(!reg_csa->hold_onto_crit);
			grab_crit(cur_reg);
			assert(!(reg_si->update_trans & ~UPDTRNS_VALID_MASK));
			if (reg_csd->freeze && reg_si->update_trans)
			{	/* Frozen region that we intend to update: give up this pass. The release boundary is
				 * the element AFTER this one so that the crit just obtained here is released as well.
				 */
				release_end = cur->fPtr;
				all_locked = FALSE;
				break;
			}
			cur = cur->fPtr;
		}
		if (!all_locked)
		{
			for (cur = tp_reg_list; release_end != cur; cur = cur->fPtr)
				rel_crit(cur->reg);
			/* Wait for the frozen region to thaw before re-grabbing crit on ALL regions */
			WAIT_FOR_REGION_TO_UNFREEZE(reg_csa, reg_csd);
			pass++;
		}
	} while (!all_locked);
	return TRUE;
}
boolean_t tp_tend()
{
block_id tp_blk;
boolean_t is_mm, release_crit, was_crit, x_lock, do_validation;
boolean_t replication = FALSE, region_is_frozen;
# ifdef UNIX
boolean_t supplementary = FALSE; /* this variable is initialized ONLY if "replication" is TRUE. */
seq_num strm_seqno, next_strm_seqno;
# endif
bt_rec_ptr_t bt;
cache_rec_ptr_t cr;
cw_set_element *cse, *first_cw_set, *bmp_begin_cse;
file_control *fc;
jnl_private_control *jpc;
jnl_buffer_ptr_t jbp;
jnl_format_buffer *jfb;
sgm_info *si, *si_last, *tmpsi, *si_not_validated;
tp_region *tr, *tr_last;
sgmnt_addrs *csa, *repl_csa = NULL;
sgmnt_data_ptr_t csd;
node_local_ptr_t cnl;
srch_blk_status *t1;
trans_num ctn, tnque_earliest_tn, epoch_tn, old_block_tn;
trans_num valid_thru; /* buffers touched by this transaction will be valid thru this tn */
enum cdb_sc status;
gd_region *save_gv_cur_region;
int lcnt, jnl_participants, replay_jnl_participants;
jnldata_hdr_ptr_t jnl_header;
jnl_record *rec;
boolean_t yes_jnl_no_repl, recompute_cksum, cksum_needed;
boolean_t save_dont_reset_gbl_jrec_time;
uint4 jnl_status, leafmods, indexmods;
uint4 total_jnl_rec_size, in_tend;
uint4 update_trans;
jnlpool_ctl_ptr_t jpl, tjpl;
boolean_t read_before_image; /* TRUE if before-image journaling or online backup in progress */
blk_hdr_ptr_t old_block;
unsigned int bsiz;
cache_rec_ptr_t *tp_cr_array;
unsigned int tp_cr_array_index;
sgm_info **prev_tp_si_by_ftok, *tmp_first_tp_si_by_ftok;
gv_namehead *prev_target, *curr_target;
jnl_tm_t save_gbl_jrec_time;
enum gds_t_mode mode;
# ifdef GTM_CRYPT
DEBUG_ONLY(
blk_hdr_ptr_t save_old_block;
)
# endif
boolean_t ss_need_to_restart, new_bkup_started;
DEBUG_ONLY(
int tmp_jnl_participants;
uint4 upd_num;
uint4 max_upd_num;
uint4 prev_upd_num;
uint4 upd_num_start;
uint4 upd_num_end;
char upd_num_seen[256];
)
DCL_THREADGBL_ACCESS;
SETUP_THREADGBL_ACCESS;
assert(dollar_tlevel);
assert(0 == jnl_fence_ctl.level);
status = cdb_sc_normal;
/* if the transaction does no updates and the transaction history has not changed, we do not need any more validation */
do_validation = FALSE; /* initially set to FALSE, but set to TRUE below */
jnl_status = 0;
assert(NULL == first_tp_si_by_ftok);
first_tp_si_by_ftok = NULL; /* just in case it is not set */
prev_tp_si_by_ftok = &tmp_first_tp_si_by_ftok;
yes_jnl_no_repl = FALSE;
jnl_participants = 0; /* # of regions that had a LOGICAL journal record written for this TP */
assert(!IS_DSE_IMAGE UNIX_ONLY (&& !TREF(in_gvcst_redo_root_search))); /* DSE and gvcst_redo_root_search work in Non-TP */
for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr)
{
TP_CHANGE_REG_IF_NEEDED(tr->reg);
csa = cs_addrs;
csd = cs_data;
UNIX_ONLY(
assert(!csa->hold_onto_crit || jgbl.onlnrlbk); /* In TP, hold_onto_crit is set ONLY by online rollback */
assert(!jgbl.onlnrlbk || (csa->hold_onto_crit && csa->now_crit));
)
si = (sgm_info *)(csa->sgm_info_ptr);
sgm_info_ptr = si;
*prev_tp_si_by_ftok = si;
prev_tp_si_by_ftok = &si->next_tp_si_by_ftok;
if ((csd->wc_blocked) || /* If blocked, or.. */
((dba_mm == csd->acc_meth) && /* we have MM and.. */
(csa->total_blks != csd->trans_hist.total_blks))) /* and file has been extended */
{ /* Force repair */
status = cdb_sc_helpedout; /* special status to prevent punishing altruism */
TP_TRACE_HIST(CR_BLKEMPTY, NULL);
goto failed_skip_revert;
}
/* Note that there are three ways a deadlock can occur.
* (a) If we are not in the final retry and we already hold crit on some region.
* (b) If we are in the final retry and we don't hold crit on some region.
* (c) If we are in the final retry and we hold crit on a frozen region that we want to update.
* This is possible if:
* (1) We did a tp_grab_crit through one of the gvcst_* routines when we first encountered the region
* in the TP transaction and it wasn't locked down although it was frozen then.
* (2) tp_crit_all_regions notices that at least one of the participating regions did ONLY READs, it
* will not wait for any freeze on THAT region to complete before grabbing crit. Later, in the
* final retry, if THAT region did an update which caused op_tcommit to invoke bm_getfree ->
* gdsfilext, then we would have come here with a frozen region on which we hold crit.
* The first two cases, (a) and (b), we don't know of any way they can happen. Case (c) though can happen.
* Nevertheless, we restart for all the three and in dbg version assert so we get some information.
*
* Note that in case of an online mupip journal rollback/recover, we will hold onto crit for the entire life
* of the process so that needs to be taken into account below.
*/
update_trans = si->update_trans;
assert(!(update_trans & ~UPDTRNS_VALID_MASK));
assert((UPDTRNS_JNL_LOGICAL_MASK & update_trans) || (NULL == si->jnl_head));
assert(!(UPDTRNS_JNL_LOGICAL_MASK & update_trans) || (NULL != si->jnl_head));
assert(!tr->reg->read_only || !update_trans);
region_is_frozen = (update_trans && csd->freeze);
if ((CDB_STAGNATE > t_tries)
? (csa->now_crit && !csa->hold_onto_crit)
: (!csa->now_crit || region_is_frozen))
{
assert(!csa->hold_onto_crit);
send_msg(VARLSTCNT(8) ERR_DLCKAVOIDANCE, 6, DB_LEN_STR(tr->reg),
&csd->trans_hist.curr_tn, t_tries, dollar_trestart, csa->now_crit);
/* The only possible case we know of is (c). assert to that effect. Use local variable region_is_frozen
* instead of csd->freeze as it could be concurrently changed even though we hold crit (freeze holding
* pid can clear it in secshr_db_clnup as part of exit processing).
*/
assert((CDB_STAGNATE <= t_tries) && csa->now_crit && region_is_frozen);
status = cdb_sc_needcrit; /* break the possible deadlock by signalling a restart */
TP_TRACE_HIST(CR_BLKEMPTY, NULL);
goto failed_skip_revert;
}
/* Whenever si->first_cw_set is non-NULL, ensure that update_trans is non-zero */
assert((NULL == si->first_cw_set) || update_trans);
/* Whenever si->first_cw_set is NULL, ensure that si->update_trans is FALSE. See op_tcommit.c for exceptions */
assert((NULL != si->first_cw_set) || !si->update_trans || (UPDTRNS_ZTRIGGER_MASK & si->update_trans)
|| (gvdupsetnoop && (!JNL_ENABLED(csa) || (NULL != si->jnl_head))));
if (!update_trans)
{
/* See if we can take a fast path for read transactions based on the following conditions :
* 1. If the transaction number hasn't changed since we read the blocks from the disk or cache
* 2. If NO concurrent online rollback is running. This is needed because we don't want read transactions
* to succeed. The issue with this check is that for a rollback that was killed, the PID will be non-
* zero. In that case, we might skip the fast path and go ahead and do the validation. The validation
* logic gets crit anyways and so will salvage the lock and do the necessary recovery and issue
* DBFLCORRP if it notices that csd->file_corrupt is TRUE.
*/
if ((si->start_tn == csd->trans_hist.early_tn) UNIX_ONLY(&& (0 == csa->nl->onln_rlbk_pid)))
{ /* read with no change to the transaction history. ensure we haven't overrun
* our history buffer and we have reasonable values for first and last */
assert(si->last_tp_hist - si->first_tp_hist <= si->tp_hist_size);
continue;
} else
do_validation = TRUE;
} else
{
do_validation = TRUE;
is_mm = (dba_mm == cs_data->acc_meth);
/* We are still out of crit if this is not our last attempt. If so, run the region list and check
* that we have sufficient free blocks for our update. If not, get them now while we can.
* We will repeat this check later in crit but it will hopefully have little or nothing to do.
* bypass 1st check if already in crit -- check later
*/
if (!csa->now_crit && !is_mm && (csa->nl->wc_in_free < si->cw_set_depth + 1)
&& !wcs_get_space(gv_cur_region, si->cw_set_depth + 1, NULL))
assert(FALSE); /* wcs_get_space should have returned TRUE unconditionally in this case */
if (JNL_ENABLED(csa))
{ /* compute the total journal record size requirements before grab_crit.
* there is code later that will check for state changes from now to then
*/
TOTAL_TPJNL_REC_SIZE(total_jnl_rec_size, si, csa);
/* compute current transaction's maximum journal space needs in number of disk blocks */
si->tot_jrec_size = MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size);
GTM_WHITE_BOX_TEST(WBTEST_TP_TEND_TRANS2BIG, si->tot_jrec_size, (2 * csd->autoswitchlimit));
/* check if current TP transaction's jnl size needs are greater than max jnl file size */
if (si->tot_jrec_size > csd->autoswitchlimit)
/* can't fit in current transaction's journal records into one journal file */
rts_error(VARLSTCNT(6) ERR_JNLTRANS2BIG, 4, si->tot_jrec_size,
JNL_LEN_STR(csd), csd->autoswitchlimit);
}
if (REPL_ALLOWED(csa))
{
assert(JNL_ENABLED(csa) || REPL_WAS_ENABLED(csa));
replication = TRUE;
jnl_participants++;
} else if (JNL_ENABLED(csa))
{
yes_jnl_no_repl = TRUE;
save_gv_cur_region = gv_cur_region; /* save the region for later error reporting */
jnl_participants++;
}
}
if (region_is_frozen)
{ /* Wait for it to be unfrozen before proceeding to commit. This reduces the
* chances that we find it frozen after we grab crit further down below.
*/
WAIT_FOR_REGION_TO_UNFREEZE(csa, csd);
}
} /* for (tr... ) */
*prev_tp_si_by_ftok = NULL;
if (replication || yes_jnl_no_repl)
{ /* The SET_GBL_JREC_TIME done below should be done before any journal writing activity
* on ANY region's journal file. This is because all the jnl record writing routines assume
* jgbl.gbl_jrec_time is initialized appropriately.
*/
assert(!jgbl.forw_phase_recovery || jgbl.dont_reset_gbl_jrec_time);
if (!jgbl.dont_reset_gbl_jrec_time)
SET_GBL_JREC_TIME; /* initializes jgbl.gbl_jrec_time */
assert(jgbl.gbl_jrec_time);
/* If any one DB that we are updating has replication turned on and another has only journaling, issue error */
if (replication && yes_jnl_no_repl)
rts_error(VARLSTCNT(4) ERR_REPLOFFJNLON, 2, DB_LEN_STR(save_gv_cur_region));
}
if (!do_validation)
{
if ((CDB_STAGNATE <= t_tries) UNIX_ONLY(&& !jgbl.onlnrlbk))
{
for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr)
{
# ifdef DEBUG
csa = &FILE_INFO(tr->reg)->s_addrs;
assert(!csa->hold_onto_crit);
# endif
rel_crit(tr->reg);
}
} /* else we are online rollback and we already hold crit on all regions */
/* Must be done after REVERT since we are no longer in crit */
if (unhandled_stale_timer_pop)
process_deferred_stale();
for (si = first_sgm_info; (NULL != si); si = si->next_sgm_info)
{
csa = si->tp_csa;
cnl = csa->nl;
INCR_GVSTATS_COUNTER(csa, cnl, n_tp_blkread, si->num_of_blks);
INCR_GVSTATS_COUNTER(csa, cnl, n_tp_readonly, 1);
}
return TRUE;
}
/* Because secshr_db_clnup uses first_tp_si_by_ftok to determine if a TP transaction is underway and expects
* a well-formed linked list if it is non-zero, the following assignment to the head of the region list must occur
* after the loop above
*/
first_tp_si_by_ftok = tmp_first_tp_si_by_ftok;
DEBUG_ONLY(
/* Cross-check the validity of the ftok sorted sgm_info list with "tp_reg_list" */
tr = tp_reg_list;
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
{
tmpsi = (sgm_info *)(FILE_INFO(tr->reg)->s_addrs.sgm_info_ptr);
assert(tmpsi == si);
tr = tr->fPtr;
}
assert(NULL == tr);
)
assert(cdb_sc_normal == status);
/* The following section of code (initial part of the for loop) is similar to the function "tp_crit_all_regions".
* The duplication is there only because of performance reasons. The latter function has to go through tp_reg_list
* linked list while here we can go through first_tp_si_by_ftok list which offers a performance advantage.
*
* The following section grabs crit in all regions touched by the transaction. We use a different
* structure here for grabbing crit. The tp_reg_list region list contains all the regions that
* were touched by this transaction. Since this array is sorted by the ftok value of the database
* file being operated on, the obtains will always occur in a consistent order. Therefore, we
* will grab crit on each file with wait since deadlock should not be able to occur.
*/
ESTABLISH_RET(t_ch, FALSE);
for (lcnt = 0; ; lcnt++)
{
x_lock = TRUE; /* Assume success */
DEBUG_ONLY(tmp_jnl_participants = 0;)
/* The following loop grabs crit, does validations and prepares for commit on ALL participating regions */
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
{
sgm_info_ptr = si;
TP_TEND_CHANGE_REG(si);
csa = cs_addrs;
csd = cs_data;
assert(!si->cr_array_index);
DEBUG_ONLY(
/* Track retries in debug mode */
if (0 != lcnt)
{
BG_TRACE_ANY(csa, tp_crit_retries);
}
)
update_trans = si->update_trans;
assert(!(update_trans & ~UPDTRNS_VALID_MASK));
first_cw_set = si->first_cw_set;
/* whenever si->first_cw_set is non-NULL, ensure that si->update_trans is non-zero */
assert((NULL == first_cw_set) || update_trans);
/* When si->first_cw_set is NULL, ensure that si->update_trans is FALSE. See op_tcommit.c for exceptions */
assert((NULL != si->first_cw_set) || !si->update_trans || (UPDTRNS_ZTRIGGER_MASK & si->update_trans)
|| (gvdupsetnoop && (!JNL_ENABLED(csa) || (NULL != si->jnl_head))));
leafmods = indexmods = 0;
is_mm = (dba_mm == csd->acc_meth);
if (TREF(tprestart_syslog_delta))
n_blkmods = n_pvtmods = 0;
/* If we already hold crit (possible if we are in the final retry), do not invoke grab_crit as it will
* invoke wcs_recover unconditionally if csd->wc_blocked is set to TRUE. In that case, we want to
* restart with a helped out code because the cache recovery will most likely result in a restart of
* the current transaction which we want to avoid if we are in the final retry.
*/
if (!csa->now_crit)
grab_crit(gv_cur_region);
else if (csd->wc_blocked)
{
status = cdb_sc_helpedout;
goto failed;
}
if (is_mm && ((csa->hdr != csd) || (csa->total_blks != csd->trans_hist.total_blks)))
{ /* If MM, check if wcs_mm_recover was invoked as part of the grab_crit done above OR if
* the file has been extended. If so, restart.
*/
status = cdb_sc_helpedout; /* force retry with special status so philanthropy isn't punished */
goto failed;
}
# ifdef GTM_TRUNCATE
assert(csa->total_blks);
if (csa->ti->total_blks < csa->total_blks)
{
/* File has been truncated since this process entered op_tcommit or last called gdsfilext on csa.
* I.e., the file is smaller than its last known size and we might have allocated blocks
* beyond csa->ti->total_blks. Restart. */
assert(dba_mm != csd->acc_meth);
assert(CDB_STAGNATE > t_tries); /* On the final retry, should have crit and truncate can't happen */
status = cdb_sc_truncate;
goto failed;
}
# endif
/* Note that even though we ensured that regions are not frozen outside of crit, it is still possible
* that they become frozen just before we grab crit. In this case (should be rare though) release
* crit on ALL regions that we have grabbed up until this point and wait for the freeze to be removed.
*/
if (csd->freeze && update_trans)
{
x_lock = FALSE;
break;
}
CHECK_TN(csa, csd, csd->trans_hist.curr_tn); /* can issue rts_error TNTOOLARGE */
if (!is_mm)
tnque_earliest_tn = ((th_rec_ptr_t)((sm_uc_ptr_t)csa->th_base + csa->th_base->tnque.fl))->tn;
# ifdef UNIX
/* We never expect to come here with file_corrupt set to TRUE (in case of an online rollback) because
* grab_crit done above will make sure of that. The only exception is RECOVER/ROLLBACK itself coming
* here in the forward phase
*/
assert(!csd->file_corrupt || mupip_jnl_recover);
/* only_reset_clues_if_onln_rlbk is set ONLY for gvcst_bmp_mark_free which operates completely in non-tp
* and so we should never come here. This also ensures that we never try to commit a TP transaction when
* this flag is set
*/
assert(!TREF(only_reset_clues_if_onln_rlbk));
if (csa->onln_rlbk_cycle != csa->nl->onln_rlbk_cycle)
{ /* A concurrent Online Rollback occurred. Restart to be safe. */
assert(!mupip_jnl_recover);
/* Note: We don't assert that CDB_STAGNATE > t_tries because we can detect an online rollback even
* in the final retry.
*/
status = cdb_sc_onln_rlbk1;
if (csa->db_onln_rlbkd_cycle != csa->nl->db_onln_rlbkd_cycle)
status = cdb_sc_onln_rlbk2; /* database was rolled back to a different logical state */
SYNC_ONLN_RLBK_CYCLES;
if ((CDB_STAGNATE - 1) == t_tries)
release_crit = TRUE;
goto failed;
}
# endif
# ifdef GTM_TRIGGER
if (!skip_dbtriggers && si->tp_set_sgm_done && (csa->db_trigger_cycle != csd->db_trigger_cycle))
{ /* The process' view of the triggers could be potentially stale. restart to be safe.
* Note: We need to validate triggers ONLY if the region (pointed to by si) was actually referenced
* in this retry of the transaction. Hence the si->tp_set_sgm_done check.
*/
/* Triggers can be invoked only by GT.M and Update process. Out of these, we expect only
* GT.M to see restarts due to concurrent trigger changes. Update process is the only
* updater on the secondary so we dont expect it to see any concurrent trigger changes.
* The only exception is if this is a supplementary root primary instance. In that case,
* the update process coexists with GT.M processes and hence can see restarts due to
* concurrent trigger changes. Assert accordingly.
*/
assert(CDB_STAGNATE > t_tries);
assert(!is_updproc || (jnlpool.repl_inst_filehdr->is_supplementary
&& !jnlpool.jnlpool_ctl->upd_disabled));
assert(csd->db_trigger_cycle > csa->db_trigger_cycle);
/* csa->db_trigger_cycle will be set to csd->db_trigger_cycle for all participating
* regions when they are each first referenced in the next retry (in tp_set_sgm).
*/
status = cdb_sc_triggermod;
goto failed;
}
# endif
if (update_trans)
{
assert((NULL == first_cw_set) || (0 != si->cw_set_depth));
DEBUG_ONLY(
/* Recompute # of replicated regions inside of crit */
if (REPL_ALLOWED(csa))
{
tmp_jnl_participants++;
} else if (JNL_ENABLED(csa))
{
assert(!replication); /* should have issued a REPLOFFJNLON error outside of crit */
tmp_jnl_participants++;
}
)
if (JNL_ALLOWED(csa))
{
if ((csa->jnl_state != csd->jnl_state) || (csa->jnl_before_image != csd->jnl_before_image))
{ /* Take this opportunity to check/sync ALL regions where csa/csd dont match */
for (tmpsi = first_tp_si_by_ftok;
(NULL != tmpsi);
tmpsi = tmpsi->next_tp_si_by_ftok)
{
csa = tmpsi->tp_csa;
csd = csa->hdr;
csa->jnl_state = csd->jnl_state;
csa->jnl_before_image = csd->jnl_before_image;
/* jnl_file_lost causes a jnl_state transition from jnl_open to jnl_closed
* and additionally causes a repl_state transition from repl_open to
* repl_closed all without standalone access. This means that
* csa->repl_state might be repl_open while csd->repl_state might be
* repl_closed. update csa->repl_state in this case as otherwise the rest
* of the code might look at csa->repl_state and incorrectly conclude
* replication is on and generate sequence numbers when actually no journal
* records are being generated. [C9D01-002219]
*/
csa->repl_state = csd->repl_state;
}
status = cdb_sc_jnlstatemod;
goto failed;
}
}
/* Flag retry, if other mupip activities like BACKUP, INTEG or FREEZE are in progress.
* If in final retry, go ahead with kill. BACKUP/INTEG/FREEZE will wait for us to be done.
*/
if ((NULL != si->kill_set_head) && (0 < csa->nl->inhibit_kills) && (CDB_STAGNATE > t_tries))
{
status = cdb_sc_inhibitkills;
goto failed;
}
/* Caution : since csa->backup_in_prog was initialized in op_tcommit only if si->first_cw_set was
* non-NULL, it should be used in tp_tend only within an if (NULL != si->first_cw_set)
*/
if (NULL != first_cw_set)
{
ss_need_to_restart = new_bkup_started = FALSE;
GTM_SNAPSHOT_ONLY(
CHK_AND_UPDATE_SNAPSHOT_STATE_IF_NEEDED(csa, csa->nl, ss_need_to_restart);
)
CHK_AND_UPDATE_BKUP_STATE_IF_NEEDED(csa->nl, csa, new_bkup_started);
if (ss_need_to_restart
|| (new_bkup_started && !(JNL_ENABLED(csa) && csa->jnl_before_image)))
{
/* If online backup is in progress now and before-image journaling is
* not enabled, we would not have read before-images for created blocks.
* Although it is possible that this transaction might not have blocks
* with gds_t_create at all, we expect this backup_in_prog state change
* to be so rare that it is ok to restart.
*/
status = cdb_sc_bkupss_statemod;
goto failed;
}
}
/* recalculate based on the new values of snapshot_in_prog and backup_in_prog */
read_before_image = ((JNL_ENABLED(csa) && csa->jnl_before_image)
|| csa->backup_in_prog
|| SNAPSHOTS_IN_PROG(csa));
if (!is_mm)
{ /* in crit, ensure cache-space is available.
* the out-of-crit check done above might not be enough
*/
if (csa->nl->wc_in_free < si->cw_set_depth + 1)
{
if (!wcs_get_space(gv_cur_region, si->cw_set_depth + 1, NULL))
{
assert(csd->wc_blocked); /* only reason we currently know
* why wcs_get_space could fail */
assert(gtm_white_box_test_case_enabled);
SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_wcsgetspace);
status = cdb_sc_cacheprob;
TP_TRACE_HIST(CR_BLKEMPTY, NULL);
goto failed;
}
}
VMS_ONLY(
if (csd->clustered && !CCP_SEGMENT_STATE(csa->nl, CCST_MASK_HAVE_DIRTY_BUFFERS))
{
CCP_FID_MSG(gv_cur_region, CCTR_FLUSHLK);
ccp_userwait(gv_cur_region, CCST_MASK_HAVE_DIRTY_BUFFERS,
NULL, csa->nl->ccp_cycle);
}
)
}
if (JNL_ENABLED(csa))
{ /* Since we got the system time (jgbl.gbl_jrec_time) outside of crit, it is possible that
* journal records were written concurrently to this file with a timestamp that is future
* relative to what we recorded. In that case, adjust our recorded time to match this.
* This is necessary to ensure that timestamps of successive journal records for each
* database file are in non-decreasing order. A side-effect of this is that our recorded
* time might not accurately reflect the current system time but that is considered not
* an issue since we dont expect to be off by more than a second or two if at all.
* Another side effect is that even if the system time went back, we will never write
* out-of-order timestamped journal records in the lifetime of this database shared memory.
*/
jpc = csa->jnl;
jbp = jpc->jnl_buff;
/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
* of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
* journal records (if it decides to switch to a new journal file).
*/
ADJUST_GBL_JREC_TIME(jgbl, jbp);
/* Note that jnl_ensure_open can call cre_jnl_file which in turn assumes
* jgbl.gbl_jrec_time is set. Also jnl_file_extend can call jnl_write_epoch_rec
* which in turn assumes jgbl.gbl_jrec_time is set. In case of forw-phase-recovery,
* mur_output_record would have already set this.
*/
assert(jgbl.gbl_jrec_time);
jnl_status = jnl_ensure_open();
GTM_WHITE_BOX_TEST(WBTEST_TP_TEND_JNLFILOPN, jnl_status, ERR_JNLFILOPN);
if (jnl_status != 0)
{
ctn = csd->trans_hist.curr_tn;
assert(csd->trans_hist.early_tn == ctn);
if (SS_NORMAL != jpc->status)
rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd),
DB_LEN_STR(gv_cur_region), jpc->status);
else
rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd),
DB_LEN_STR(gv_cur_region));
}
if (DISK_BLOCKS_SUM(jbp->freeaddr, si->total_jnl_rec_size) > jbp->filesize)
{ /* Moved here to prevent jnlrecs split across multiple generation journal files. */
if (SS_NORMAL != (jnl_status = jnl_flush(jpc->region)))
{
send_msg(VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd),
ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush in tp_tend"),
jnl_status);
assert((!JNL_ENABLED(csd)) && JNL_ENABLED(csa));
status = cdb_sc_jnlclose;
TP_TRACE_HIST(CR_BLKEMPTY, NULL);
goto failed;
} else if (EXIT_ERR == jnl_file_extend(jpc, si->total_jnl_rec_size))
{
assert((!JNL_ENABLED(csd)) && JNL_ENABLED(csa));
assert(csd == csa->hdr); /* If MM, csd shouldn't have been reset */
status = cdb_sc_jnlclose;
TP_TRACE_HIST(CR_BLKEMPTY, NULL);
goto failed;
}
assert(csd == csa->hdr); /* If MM, csd shouldn't have been reset */
}
if (JNL_HAS_EPOCH(jbp)
&& ((jbp->next_epoch_time <= jgbl.gbl_jrec_time) UNCONDITIONAL_EPOCH_ONLY(|| TRUE)))
{ /* Flush the cache. Since we are in crit, defer syncing the epoch */
/* Note that at this point, jgbl.gbl_jrec_time has been computed taking into
* account the current system time & the last journal record timestamp of ALL
* regions involved in this TP transaction. To prevent wcs_flu from inadvertently
* setting this BACK in time (poses out-of-order timestamp issues for backward
* recovery and is asserted later in tp_tend) set jgbl.dont_reset_gbl_jrec_time
* to TRUE for the duration of the wcs_flu.
* Also, in case of rts_error from wcs_flu, t_ch will be invoked which will take
* care of restoring this variable to FALSE. Any new codepath in mumps that sets
* this variable for the duration of wcs_flu should take care of resetting this
* back to FALSE in an existing condition handler (or by creating a new one if not
* already present)
* Since, this global is set to TRUE explicitly by forward recovery, we should NOT
* reset this to FALSE unconditionally. But, instead of checking if forward recovery
* is TRUE, save and restore this variable unconditionally thereby saving a few
* CPU cycles.
*/
save_dont_reset_gbl_jrec_time = jgbl.dont_reset_gbl_jrec_time;
jgbl.dont_reset_gbl_jrec_time = TRUE;
if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_IN_COMMIT))
{
assert(csd == csa->hdr);
jgbl.dont_reset_gbl_jrec_time = save_dont_reset_gbl_jrec_time;
SET_WCS_FLU_FAIL_STATUS(status, csd);
SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_jnl_wcsflu);
TP_TRACE_HIST(CR_BLKEMPTY, NULL);
goto failed;
}
jgbl.dont_reset_gbl_jrec_time = save_dont_reset_gbl_jrec_time;
assert(csd == csa->hdr);
}
assert(jgbl.gbl_jrec_time >= jbp->prev_jrec_time);
} /* if (journaling) */
}
/* the following section verifies that the optimistic concurrency was justified */
assert(cdb_sc_normal == status);
for (t1 = si->first_tp_hist; t1 != si->last_tp_hist; t1++)
{
assert(NULL != t1->blk_target);
cse = t1->cse;
if (is_mm)
{ /* the check below is different from the one for BG (i.e. doesn't have the killtn check)
* because there is no BT equivalent in MM. there is a mmblk_rec which is more or
* less the same as a BT. when the mmblk_rec becomes as fully functional as BT, we
* can use the killtn optimization for MM also.
*/
if (t1->tn <= ((blk_hdr_ptr_t)t1->buffaddr)->tn)
{
assert(CDB_STAGNATE > t_tries);
assert(!cse || !cse->high_tlevel);
if (!cse || !cse->recompute_list_head || cse->write_type
|| (cdb_sc_normal != recompute_upd_array(t1, cse)) || !++leafmods)
{
status = cdb_sc_blkmod;
TP_TRACE_HIST(t1->blk_num, t1->blk_target);
DEBUG_ONLY(continue;)
PRO_ONLY(goto failed;)
}
}
} else
{
bt = bt_get(t1->blk_num);
if (NULL != bt)
{
if (t1->tn <= bt->tn)
{
assert(!cse || !cse->high_tlevel);
assert(CDB_STAGNATE > t_tries);
/* "indexmods" and "leafmods" are to monitor number of blocks that used
* indexmod and noisolation optimizations respectively. Note that once
* in this part of the code, at least one of them will be non-zero and
* if both of them turn out to be non-zero, then we need to restart.
* See gdscc.h for a description of the indexmod optimization.
*/
if (t1->level)
{
if (cse || t1->tn <= bt->killtn)
status = cdb_sc_blkmod;
else
{
indexmods++;
t1->blk_target->clue.end = 0;
if (leafmods)
status = cdb_sc_blkmod;
}
} else
{ /* For a non-isolated global, if the leaf block isn't part of the
* cw-set, this means that it was involved in an M-kill that freed
* the data-block from the B-tree. In this case, if the leaf-block
* has changed since we did our read of the block, we have to redo
* the M-kill. But since redo of that M-kill might involve much
* more than just leaf-level block changes, we will be safe and do
* a restart. If the need for NOISOLATION optimization for M-kills
* is felt, we need to revisit this.
*/
if (!t1->blk_target->noisolation || !cse)
status = cdb_sc_blkmod;
else
{
assert(cse->write_type || cse->recompute_list_head);
leafmods++;
if (indexmods || cse->write_type
|| (cdb_sc_normal !=
recompute_upd_array(t1, cse)))
status = cdb_sc_blkmod;
}
}
if (cdb_sc_normal != status)
{
if (TREF(tprestart_syslog_delta))
{
n_blkmods++;
if (cse)
n_pvtmods++;
if (1 != n_blkmods)
continue;
}
assert(CDB_STAGNATE > t_tries);
status = cdb_sc_blkmod;
TP_TRACE_HIST_MOD(t1->blk_num, t1->blk_target,
tp_blkmod_tp_tend, csd, t1->tn, bt->tn, t1->level);
DEBUG_ONLY(continue;)
PRO_ONLY(goto failed;)
}
}
} else if (t1->tn <= tnque_earliest_tn)
{
assert(CDB_STAGNATE > t_tries);
status = cdb_sc_losthist;
TP_TRACE_HIST(t1->blk_num, t1->blk_target);
DEBUG_ONLY(continue;)
PRO_ONLY(goto failed;)
}
assert(CYCLE_PVT_COPY != t1->cycle);
if (cse)
{ /* Do cycle check only if blk has cse and hasn't been built (if it has, then tp_hist
* would have done the cdb_sc_lostcr check soon after it got built) or if we have BI
* journaling or online backup is currently running. The BI-journaling/online-backup
* check is to ensure that the before-image/pre-update-copy we write hasn't been
* recycled.
*/
if ((NULL == bt) || (CR_NOTVALID == bt->cache_index))
cr = db_csh_get(t1->blk_num);
else
{
cr = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index);
if ((NULL != cr) && (cr->blk != bt->blk))
{
assert(FALSE);
SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_crbtmismatch1);
status = cdb_sc_crbtmismatch;
TP_TRACE_HIST(t1->blk_num, t1->blk_target);
goto failed;
}
}
if ((cache_rec_ptr_t)CR_NOTVALID == cr)
{
SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_t1);
status = cdb_sc_cacheprob;
TP_TRACE_HIST(t1->blk_num, t1->blk_target);
goto failed;
}
assert(update_trans); /* ensure read_before_image was computed above */
if (!cse->new_buff || read_before_image)
{
if ((NULL == cr) || (cr->cycle != t1->cycle)
|| ((sm_long_t)GDS_ANY_REL2ABS(csa, cr->buffaddr)
!= (sm_long_t)t1->buffaddr))
{
if ((NULL != cr) && (NULL != bt) && (cr->blk != bt->blk))
{
assert(FALSE);
SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_crbtmismatch2);
status = cdb_sc_crbtmismatch;
TP_TRACE_HIST(t1->blk_num, t1->blk_target);
goto failed;
}
assert(CDB_STAGNATE > t_tries);
status = cdb_sc_lostcr;
TP_TRACE_HIST(t1->blk_num, t1->blk_target);
DEBUG_ONLY(continue;)
PRO_ONLY(goto failed;)
}
/* Now that we know the cache-record is still valid, pin it.
*
* It is possible that t1->cse is non-NULL even though we eventually
* decided NOT to update that particular block e.g. if t1->cse->mode
* was originally t_write but later got set to kill_t_write. In such
* cases, don't set in_cw_set as we don't need this buffer pinned at all.
*/
assert(n_gds_t_op != cse->mode);
assert(kill_t_create > n_gds_t_op);
assert(kill_t_write > n_gds_t_op);
if (n_gds_t_op > cse->mode)
TP_PIN_CACHE_RECORD(cr, si);
}
/* The only case cr can be NULL at this point of code is when
* a) cse->new_buff is non-NULL
* b) AND the block is not in cache
* c) AND we don't have before-image-journaling
* d) AND online backup is not running.
* In this case bg_update will do a db_csh_getn and appropriately set in_cw_set
* field to be TRUE so no need to pin the cache-record here.
*/
}
}
} /* for (t1 ... ) */
DEBUG_ONLY(
if (cdb_sc_normal != status)
goto failed;
else
{ /* Now that we have successfully validated all histories, check that there is no
* gv_target mismatch between history and corresponding cse.
*/
for (t1 = si->first_tp_hist; t1 != si->last_tp_hist; t1++)
{
cse = t1->cse;
if (NULL != cse)
assert(t1->blk_target == cse->blk_target);
}
}
)
if (DIVIDE_ROUND_UP(si->num_of_blks, 4) < leafmods) /* if status == cdb_sc_normal, then leafmods */
{
status = cdb_sc_toomanyrecompute; /* is exactly the number of recomputed blocks */
goto failed;
}
assert(cdb_sc_normal == status);
if (NULL == first_cw_set)
continue;
/* Check bit maps for usage */
for (cse = si->first_cw_bitmap; NULL != cse; cse = cse->next_cw_set)
{
assert(0 == cse->jnl_freeaddr); /* ensure haven't missed out resetting jnl_freeaddr for any cse in
* t_write/t_create/{t,mu}_write_map/t_write_root [D9B11-001991] */
TRAVERSE_TO_LATEST_CSE(cse);
assert(0 == ((off_chain *)&cse->blk)->flag);
assert(!cse->high_tlevel);
if (is_mm)
{
if ((cse->tn <= ((blk_hdr_ptr_t)cse->old_block)->tn) && !reallocate_bitmap(si, cse))
{
assert(CDB_STAGNATE > t_tries);
status = cdb_sc_bmlmod;
TP_TRACE_HIST(cse->blk, NULL);
goto failed;
}
} else
{
tp_blk = cse->blk;
bt = bt_get(tp_blk);
if (NULL != bt)
{
if ((cse->tn <= bt->tn) && !reallocate_bitmap(si, cse))
{
assert(CDB_STAGNATE > t_tries);
status = cdb_sc_bmlmod;
TP_TRACE_HIST(tp_blk, NULL);
goto failed;
}
} else if (cse->tn <= tnque_earliest_tn)
{
assert(CDB_STAGNATE > t_tries);
status = cdb_sc_lostbmlhist;
TP_TRACE_HIST(tp_blk, NULL);
goto failed;
}
assert(NULL == cse->new_buff);
if ((NULL == bt) || (CR_NOTVALID == bt->cache_index))
{
cr = db_csh_get(tp_blk);
if ((cache_rec_ptr_t)CR_NOTVALID == cr)
{
status = cdb_sc_cacheprob;
TP_TRACE_HIST(tp_blk, NULL);
SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_bitmap);
goto failed;
}
} else
{
cr = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index);
if (cr->blk != bt->blk)
{
assert(FALSE);
SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_crbtmismatch3);
status = cdb_sc_crbtmismatch;
TP_TRACE_HIST(tp_blk, NULL);
goto failed;
}
}
if ((NULL == cr) || (cr->cycle != cse->cycle) ||
((sm_long_t)GDS_ANY_REL2ABS(csa, cr->buffaddr) != (sm_long_t)cse->old_block))
{
assert(CDB_STAGNATE > t_tries);
status = cdb_sc_lostbmlcr;
TP_TRACE_HIST(tp_blk, NULL);
goto failed;
}
TP_PIN_CACHE_RECORD(cr, si);
}
} /* for (all bitmaps written) */
si->backup_block_saved = FALSE;
jbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
/* Caution : since csa->backup_in_prog was initialized in op_tcommit only if si->first_cw_set was
* non-NULL, it should be used in tp_tend only within an if (NULL != si->first_cw_set)
*/
if (!is_mm && ((NULL != jbp) || csa->backup_in_prog || SNAPSHOTS_IN_PROG(csa)))
{
for (cse = first_cw_set; NULL != cse; cse = cse->next_cw_set)
{ /* have already read old block for creates before we got crit, make sure
* cache record still has correct block. if not, reset "cse" fields to
* point to correct cache-record. this is ok to do since we only need the
* prior content of the block (for online backup or before-image journaling)
* and did not rely on it for constructing the transaction. Restart if
* block is not present in cache now or is being read in currently.
*/
TRAVERSE_TO_LATEST_CSE(cse);
if (gds_t_acquired == cse->mode && (NULL != cse->old_block))
{
assert(CYCLE_PVT_COPY != cse->cycle);
cr = db_csh_get(cse->blk);
if ((cache_rec_ptr_t)CR_NOTVALID == cr)
{
TP_TRACE_HIST(cse->blk, cse->blk_target);
status = cdb_sc_cacheprob;
SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_jnl_cwset);
goto failed;
}
/* It is possible that cr->in_cw_set is non-zero in case a concurrent MUPIP REORG
* UPGRADE/DOWNGRADE is in PHASE2 touching this very same block. In that case,
* we cannot reuse this block so we restart. We could try finding a different block
* to acquire instead and avoid a restart (tracked as part of C9E11-002651).
* Note that in_cw_set is set to 0 ahead of in_tend in bg_update_phase2. Therefore
* it is possible that we see in_cw_set 0 but in_tend is still non-zero. In that
* case, we cannot proceed with pinning this cache-record as the cr is still locked
* by the other process. We can choose to wait here but instead decide to restart.
*/
if ((NULL == cr) || (0 <= cr->read_in_progress)
|| (0 != cr->in_cw_set) || (0 != cr->in_tend))
{
TP_TRACE_HIST(cse->blk, cse->blk_target);
assert(CDB_STAGNATE > t_tries);
status = cdb_sc_lostbefor;
goto failed;
}
TP_PIN_CACHE_RECORD(cr, si);
cse->ondsk_blkver = cr->ondsk_blkver;
old_block = (blk_hdr_ptr_t)GDS_REL2ABS(cr->buffaddr);
assert((cse->cr != cr) || (cse->old_block == (sm_uc_ptr_t)old_block));
old_block_tn = old_block->tn;
/* Need checksums if before imaging and if a PBLK record is going to be written. */
cksum_needed = (!cse->was_free && (NULL != jbp) && (old_block_tn < jbp->epoch_tn));
if ((cse->cr != cr) || (cse->cycle != cr->cycle))
{ /* Block has relocated in the cache. Adjust pointers to new location. */
cse->cr = cr;
cse->cycle = cr->cycle;
cse->old_block = (sm_uc_ptr_t)old_block;
/* PBLK checksum was computed outside-of-crit when block was read but
* block has relocated in the cache since then so recompute the checksum
* if this block needs a checksum in the first place (cksum_needed is TRUE).
*/
recompute_cksum = cksum_needed;
} else if (cksum_needed)
{ /* We have determined that a checksum is needed for this block. If we
* have not previously computed one outside crit OR if the block contents
* have changed since the checksum was previously computed, we need to
* recompute it. Otherwise, the out-of-crit computed value can be safely
* used. Note that cse->tn is valid only if a checksum was computed outside
* of crit. So make sure it is used only if checksum is non-zero. There is
* a rare chance that the computed checksum could be zero in which case we
* will recompute unnecessarily. Since that is expected to be very rare,
* it is considered ok for now.
*/
recompute_cksum = (!cse->blk_checksum || (cse->tn <= old_block_tn));
}
if (!cksum_needed)
cse->blk_checksum = 0; /* zero any out-of-crit computed checksum */
else if (recompute_cksum)
{ /* We hold crit at this point so we are guaranteed valid bsiz field.
* Hence we do not need to verify if bsiz is less than csd->blk_size
* like we did in an earlier call to jnl_get_checksum (in op_tcommit.c).
*/
assert(NULL != jbp);
assert(SIZEOF(bsiz) == SIZEOF(old_block->bsiz));
bsiz = old_block->bsiz;
assert(bsiz <= csd->blk_size);
cse->blk_checksum = jnl_get_checksum((uint4*)old_block, csa, bsiz);
}
DEBUG_ONLY(
else
assert(cse->blk_checksum ==
jnl_get_checksum((uint4 *)old_block, csa, old_block->bsiz));
)
assert(cse->cr->blk == cse->blk);
} /* end if acquired block */
} /* end cse for loop */
} /* end if !mm && before-images need to be written */
# ifdef VMS
/* Check if this TP transaction created any cses of mode kill_t_write or kill_t_create. If so, ensure
* the corresponding cse->new_buff has already been initialized. This is necessary in case an error
* occurs in the midst of commit and we end up invoking secshr_db_clnup which in turn will invoke
* sec_shr_blk_build for these cses and that will expect that cse->new_buff be non-zero. For Unix
* we invoke get_new_free_element to initialize cse->new_buff in secshr_db_clnup itself. But for VMS
* we dont want this routine invoked in the privileged GTMSECSHR image so we do this beforehand.
*/
if (tp_has_kill_t_cse)
{
for (cse = first_cw_set; NULL != cse; cse = cse->next_cw_set)
{
TRAVERSE_TO_LATEST_CSE(cse);
if ((n_gds_t_op < cse->mode) && (NULL == cse->new_buff))
{
assert(!cse->done);
cse->new_buff = get_new_free_element(si->new_buff_list);
cse->first_copy = TRUE;
}
}
}
DEBUG_ONLY(
for (cse = first_cw_set; NULL != cse; cse = cse->next_cw_set)
{
TRAVERSE_TO_LATEST_CSE(cse);
assert((n_gds_t_op > cse->mode) || (NULL != cse->new_buff));
}
)
# endif
assert(cdb_sc_normal == status);
}
if (x_lock)
break;
assert(csd == si->tp_csd);
si = si->next_tp_si_by_ftok; /* Increment so we release the lock we actually got */
si_last = si;
for (si = first_tp_si_by_ftok; (si_last != si); si = si->next_tp_si_by_ftok)
{
assert(si->tp_csa->now_crit);
tp_cr_array = si->cr_array;
UNPIN_CR_ARRAY_ON_RETRY(tp_cr_array, si->cr_array_index);
assert(!si->cr_array_index);
if (!si->tp_csa->hold_onto_crit)
rel_crit(si->gv_cur_region);
}
/* Check that we DONT own crit/commit on ANY region. The only exception is online mupip journal rollback/recovery
* which holds crit for the entire process lifetime.
*/
assert(UNIX_ONLY(jgbl.onlnrlbk || ) (0 == have_crit(CRIT_HAVE_ANY_REG | CRIT_IN_COMMIT)));
/* Wait for it to be unfrozen before re-grabbing crit on ALL regions */
WAIT_FOR_REGION_TO_UNFREEZE(csa, csd);
assert(CDB_STAGNATE > t_tries);
} /* for (;;) */
/* Validate the correctness of the calculation of # of replication/journaled regions inside & outside of crit */
assert(tmp_jnl_participants == jnl_participants);
assert(cdb_sc_normal == status);
if (replication)
{
jpl = jnlpool_ctl;
tjpl = temp_jnlpool_ctl;
repl_csa = &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs;
if (!repl_csa->hold_onto_crit)
GRAB_LOCK(jnlpool.jnlpool_dummy_reg, ASSERT_NO_ONLINE_ROLLBACK);
tjpl->write_addr = jpl->write_addr;
tjpl->write = jpl->write;
tjpl->jnl_seqno = jpl->jnl_seqno;
# ifdef UNIX
if (INVALID_SUPPL_STRM != strm_index)
{ /* Need to also update supplementary stream seqno */
supplementary = TRUE;
assert(0 <= strm_index);
/* assert(strm_index < ARRAYSIZE(tjpl->strm_seqno)); */
strm_seqno = jpl->strm_seqno[strm_index];
ASSERT_INST_FILE_HDR_HAS_HISTREC_FOR_STRM(strm_index);
} else
supplementary = FALSE;
# endif
INT8_ONLY(assert(tjpl->write == tjpl->write_addr % tjpl->jnlpool_size));
tjpl->write += SIZEOF(jnldata_hdr_struct);
if (tjpl->write >= tjpl->jnlpool_size)
{
assert(tjpl->write == tjpl->jnlpool_size);
tjpl->write = 0;
}
assert(jgbl.cumul_jnl_rec_len);
jgbl.cumul_jnl_rec_len += TCOM_RECLEN * jnl_participants + SIZEOF(jnldata_hdr_struct);
DEBUG_ONLY(jgbl.cumul_index += jnl_participants;)
assert(jgbl.cumul_jnl_rec_len % JNL_REC_START_BNDRY == 0);
/* Make sure timestamp of this seqno is >= timestamp of previous seqno. Note: The below macro
* invocation should be done AFTER the ADJUST_GBL_JREC_TIME call as the below resets
* jpl->prev_jnlseqno_time. Doing it the other way around would mean the reset will happen
* with a potentially lower value than the final adjusted time written in the jnl record.
*/
ADJUST_GBL_JREC_TIME_JNLPOOL(jgbl, jpl);
assert(jpl->early_write_addr == jpl->write_addr);
jpl->early_write_addr = jpl->write_addr + jgbl.cumul_jnl_rec_len;
/* Source server does not read in crit. It relies on early_write_addr, the transaction
* data, lastwrite_len, write_addr being updated in that order. To ensure this order,
* we have to force out early_write_addr to its coherency point now. If not, the source
* server may read data that is overwritten (or stale). This is true only on
* architectures and OSes that allow unordered memory access
*/
SHM_WRITE_MEMORY_BARRIER;
}
/* There are two possible approaches that can be taken from now onwards.
* a) Write journal and database records together for a region and move onto the next region.
* b) Write journal records for all regions and only then move onto writing database updates for all regions.
* If journal and database updates are done together region by region, there is a problem in that if an error
* occurs after one region's updates are committed (to jnl and db) or if the process gets STOP/IDed in VMS,
* secshr_db_clnup should then commit BOTH the journal and database updates of the remaining regions.
* committing journal updates is not trivial in secshr_db_clnup since it can also be invoked as a user termination
* handler in VMS in which case it cannot do any I/O.
*
* We therefore take approach (b) below. Write journal records for all regions in one loop. Write database updates
* for all regions in another loop. This way if any error occurs before database updates for any region begins in
* the second loop, we cleanup the structures as if the transaction is rolled back (there is an exception to this in
* that currently the journal buffers are not rolled back to undo the write of journal records but currently
* MUPIP RECOVER knows to handle such records and TR C9905-001072 exists to make the source-server handle such records).
* If any error occurs (or if the process gets STOP/IDed in VMS) while we are committing database updates,
* secshr_db_clnup will be invoked and will complete the updates for this TP transaction.
*/
/* the following section writes journal records in all regions */
DEBUG_ONLY(save_gbl_jrec_time = jgbl.gbl_jrec_time;)
DEBUG_ONLY(tmp_jnl_participants = 0;)
assert(!TREF(donot_commit)); /* We should never commit a transaction that was determined restartable */
# ifdef DEBUG
/* Check that upd_num in jnl records got set in increasing order (not necessarily contiguous) within each region.
* This is true for GT.M and journal recovery. Take the chance to also check that jnl_head & update_trans are in sync.
*/
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
{
prev_upd_num = 0;
jfb = si->jnl_head;
/* If we have formatted journal records for this transaction, ensure update_trans is TRUE. Not doing so
* would mean we miss out on writing journal records. This might be ok since the database was not seen as
* needing any update at all but in tp_clean_up, we will not free up si->jnl_head structures so there might
* be a memory leak. In addition, want to know if such a situation happens so assert accordingly.
*/
assert((NULL == jfb) || si->update_trans);
for ( ; NULL != jfb; jfb = jfb->next)
{
upd_num = ((struct_jrec_upd *)(jfb->buff))->update_num;
assert((prev_upd_num < upd_num)
GTMTRIG_ONLY(|| ((prev_upd_num == upd_num)
&& IS_ZTWORM(jfb->prev->rectype) && !IS_ZTWORM(jfb->rectype))));
assert(upd_num);
prev_upd_num = upd_num;
}
}
/* Check that tp_ztp_jnl_upd_num got set in contiguous increasing order across all regions.
* In case of forward processing phase of journal recovery, multi-region TP transactions are
* played as multi-region transactions only after resolve-time is reached and that too in
* region-by-region order (not necessarily upd_num order across all regions). Until then they
* are played as multiple single-region transactions. Also if -fences=none is specified, then
* ALL multi-region TP transactions (even those after resolve time) are played as multiple
* single-region TP transactions. Assert accordingly.
*/
max_upd_num = jgbl.tp_ztp_jnl_upd_num;
if (jgbl.forw_phase_recovery)
max_upd_num = jgbl.max_tp_ztp_jnl_upd_num;
if (max_upd_num)
{
upd_num_end = 0;
for (upd_num = 0; upd_num < ARRAYSIZE(upd_num_seen); upd_num++)
upd_num_seen[upd_num] = FALSE;
upd_num_seen[0] = TRUE; /* 0 will never be seen but set it to TRUE to simplify below logic */
do
{
upd_num_start = upd_num_end;
upd_num_end += ARRAYSIZE(upd_num_seen);
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
{
for (jfb = si->jnl_head; NULL != jfb; jfb = jfb->next)
{
/* ZTWORMHOLE will have same update_num as following SET/KILL record so dont double count */
if (IS_ZTWORM(jfb->rectype))
continue;
upd_num = ((struct_jrec_upd *)(jfb->buff))->update_num;
if ((upd_num >= upd_num_start) && (upd_num < upd_num_end))
{
assert(FALSE == upd_num_seen[upd_num - upd_num_start]);
upd_num_seen[upd_num - upd_num_start] = TRUE;
}
assert(upd_num <= max_upd_num);
}
}
for (upd_num = 0; upd_num < ARRAYSIZE(upd_num_seen); upd_num++)
{
if (upd_num <= (max_upd_num - upd_num_start))
{
assert((TRUE == upd_num_seen[upd_num])
|| (jgbl.forw_phase_recovery && ((jgbl.gbl_jrec_time < jgbl.mur_tp_resolve_time)
|| jgbl.mur_fences_none)));
upd_num_seen[upd_num] = FALSE;
} else
assert(FALSE == upd_num_seen[upd_num]);
}
} while (upd_num_end <= max_upd_num);
}
# endif
if (!jgbl.forw_phase_recovery)
{
jnl_fence_ctl.token = 0;
replay_jnl_participants = jnl_participants;
} else
replay_jnl_participants = jgbl.mur_jrec_participants;
/* In case of journal recovery, token would be initialized to a non-zero value */
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
{
if (!si->update_trans)
continue;
assert((NULL == si->first_cw_set) || (0 != si->cw_set_depth));
TP_TEND_CHANGE_REG(si);
csa = cs_addrs;
csd = cs_data;
ctn = csd->trans_hist.curr_tn;
ASSERT_CURR_TN_EQUALS_EARLY_TN(csa, ctn);
csd->trans_hist.early_tn = ctn + 1;
/* Write non-logical records (PBLK) if applicable */
if (JNL_ENABLED(csa))
{
jpc = csa->jnl;
jbp = jpc->jnl_buff;
/* si->tmp_cw_set_depth is a copy of si->cw_set_depth at TOTAL_TPJNL_REC_SIZE calculation time;
* ensure it has not changed until now when the actual jnl record write occurs.
* same case with csa->jnl_before_images & jbp->before_images.
*/
assert(si->cw_set_depth == si->tmp_cw_set_depth);
assert(jbp->before_images == csa->jnl_before_image);
assert(jgbl.gbl_jrec_time >= jbp->prev_jrec_time);
if (0 == jpc->pini_addr)
jnl_put_jrt_pini(csa);
if (jbp->before_images)
{
epoch_tn = jbp->epoch_tn; /* store in a local variable as it is used in a loop below */
for (cse = si->first_cw_set; NULL != cse; cse = cse->next_cw_set)
{ /* Write out before-update journal image records */
TRAVERSE_TO_LATEST_CSE(cse);
if (cse->was_free)
continue;
old_block = (blk_hdr_ptr_t)cse->old_block;
ASSERT_IS_WITHIN_SHM_BOUNDS((sm_uc_ptr_t)old_block, csa);
assert((n_gds_t_op != cse->mode) && (gds_t_committed != cse->mode));
assert(n_gds_t_op < kill_t_create);
assert(n_gds_t_op < kill_t_write);
if (n_gds_t_op <= cse->mode)
continue;
DEBUG_ONLY(is_mm = (dba_mm == csd->acc_meth));
DBG_ENSURE_OLD_BLOCK_IS_VALID(cse, is_mm, csa, csd);
if ((NULL != old_block) && (old_block->tn < epoch_tn))
{ /* For acquired blocks, we should have computed checksum already.
* The only exception is if we found no need to compute checksum
* outside of crit but before we got crit, an EPOCH got written
* concurrently so we have to write a PBLK (and hence compute the
* checksum as well) when earlier we thought none was necessary.
* An easy way to check this is that an EPOCH was written AFTER
* we started this transaction.
*/
assert((gds_t_acquired != cse->mode) || cse->blk_checksum
|| (epoch_tn >= si->start_tn));
assert(old_block->bsiz <= csd->blk_size);
if (!cse->blk_checksum)
cse->blk_checksum = jnl_get_checksum((uint4 *)old_block,
csa,
old_block->bsiz);
else
assert(cse->blk_checksum == jnl_get_checksum((uint4 *)old_block,
csa,
old_block->bsiz));
# ifdef GTM_CRYPT
if (csd->is_encrypted)
{
DBG_ENSURE_PTR_IS_VALID_GLOBUFF(csa, csd, (sm_uc_ptr_t)old_block);
DEBUG_ONLY(save_old_block = old_block;)
old_block = (blk_hdr_ptr_t)GDS_ANY_ENCRYPTGLOBUF(old_block, csa);
/* Ensure that the unencrypted block and its twin counterpart are in
* sync. */
assert(save_old_block->tn == old_block->tn);
assert(save_old_block->bsiz == old_block->bsiz);
assert(save_old_block->levl == old_block->levl);
DBG_ENSURE_PTR_IS_VALID_ENCTWINGLOBUFF(csa, csd, (sm_uc_ptr_t)old_block);
}
# endif
jnl_write_pblk(csa, cse, old_block);
cse->jnl_freeaddr = jbp->freeaddr;
} else
cse->jnl_freeaddr = 0;
}
}
}
/* Write logical journal records if applicable. */
if (JNL_WRITE_LOGICAL_RECS(csa))
{
if (0 == jnl_fence_ctl.token)
{
assert(!jgbl.forw_phase_recovery);
if (replication)
{
jnl_fence_ctl.token = tjpl->jnl_seqno;
UNIX_ONLY(
if (supplementary)
jnl_fence_ctl.strm_seqno = SET_STRM_INDEX(strm_seqno, strm_index);
)
} else
TOKEN_SET(&jnl_fence_ctl.token, local_tn, process_id);
}
/* else : jnl_fence_ctl.token would be pre-filled by journal recovery */
assert(0 != jnl_fence_ctl.token);
jfb = si->jnl_head;
assert(NULL != jfb);
/* Fill in "num_participants" field in TSET/TKILL/TZKILL/TZTRIG/TZTWORM record.
* The rest of the records (USET/UKILL/UZKILL/UZTRIG/UZTWORM) dont have this initialized.
* Recovery looks at this field only in the T* records.
*/
rec = (jnl_record *)jfb->buff;
assert(IS_TUPD(jfb->rectype));
assert(IS_SET_KILL_ZKILL_ZTRIG_ZTWORM(jfb->rectype));
assert(&rec->jrec_set_kill.num_participants == &rec->jrec_ztworm.num_participants);
rec->jrec_set_kill.num_participants = replay_jnl_participants;
DEBUG_ONLY(++tmp_jnl_participants;)
do
{
jnl_write_logical(csa, jfb);
jfb = jfb->next;
} while (NULL != jfb);
}
}
assert(tmp_jnl_participants == jnl_participants);
/* the next section marks the transaction complete in the journal by writing TCOM record in all regions */
tcom_record.prefix.time = jgbl.gbl_jrec_time;
tcom_record.num_participants = replay_jnl_participants;
assert((JNL_FENCE_LIST_END == jnl_fence_ctl.fence_list) || (0 != jnl_fence_ctl.token));
tcom_record.token_seq.token = jnl_fence_ctl.token;
tcom_record.strm_seqno = jnl_fence_ctl.strm_seqno;
if (replication)
{
assert(!jgbl.forw_phase_recovery);
tjpl->jnl_seqno++;
UNIX_ONLY(
if (supplementary)
next_strm_seqno = strm_seqno + 1;
)
VMS_ONLY(
if (is_updproc)
jgbl.max_resync_seqno++;
)
}
/* Note that only those regions that are actively journaling will appear in the following list: */
DEBUG_ONLY(tmp_jnl_participants = 0;)
for (csa = jnl_fence_ctl.fence_list; JNL_FENCE_LIST_END != csa; csa = csa->next_fenced)
{
jpc = csa->jnl;
DEBUG_ONLY(update_trans = ((sgm_info *)(csa->sgm_info_ptr))->update_trans;)
assert(!(update_trans & ~UPDTRNS_VALID_MASK));
assert(UPDTRNS_DB_UPDATED_MASK & update_trans);
tcom_record.prefix.pini_addr = jpc->pini_addr;
tcom_record.prefix.tn = csa->ti->curr_tn;
tcom_record.prefix.checksum = INIT_CHECKSUM_SEED;
UNIX_ONLY(SET_REG_SEQNO_IF_REPLIC(csa, tjpl, supplementary, next_strm_seqno);)
VMS_ONLY(SET_REG_SEQNO_IF_REPLIC(csa, tjpl, supplementary, 0);)
/* Switch to current region. Not using TP_CHANGE_REG macros since we already have csa and csa->hdr available. */
gv_cur_region = jpc->region;
cs_addrs = csa;
cs_data = csa->hdr;
/* Note tcom_record.jnl_tid was set in op_tstart or updproc */
JNL_WRITE_APPROPRIATE(csa, jpc, JRT_TCOM, (jnl_record *)&tcom_record, NULL, NULL);
DEBUG_ONLY(tmp_jnl_participants++;)
}
assert(jnl_participants == tmp_jnl_participants);
/* Ensure jgbl.gbl_jrec_time did not get reset by any of the jnl writing functions */
assert(save_gbl_jrec_time == jgbl.gbl_jrec_time);
/* the following section is the actual commitment of the changes in the database (phase1 for BG) */
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
{
if (update_trans = si->update_trans)
{
assert((NULL == si->first_cw_set) || (0 != si->cw_set_depth));
sgm_info_ptr = si;
TP_TEND_CHANGE_REG(si);
csa = cs_addrs;
csd = cs_data;
is_mm = (dba_mm == csd->acc_meth);
ctn = csd->trans_hist.curr_tn;
assert((ctn + 1) == csd->trans_hist.early_tn);
csa->prev_free_blks = csd->trans_hist.free_blocks;
csa->t_commit_crit = T_COMMIT_CRIT_PHASE1;
if (csd->dsid && tp_kill_bitmaps)
rc_cpt_inval();
cse = si->first_cw_set;
if (NULL != cse)
{
if (!is_mm) /* increment counter of # of processes that are actively doing two-phase commit */
{
cnl = csa->nl;
INCR_WCS_PHASE2_COMMIT_PIDCNT(csa, cnl);
}
# ifdef DEBUG
/* Assert that cse->old_mode, if uninitialized, never contains a negative value
* (this is relied upon by secshr_db_clnup)
*/
do
{
TRAVERSE_TO_LATEST_CSE(cse);
assert(0 <= cse->old_mode);
cse = cse->next_cw_set;
} while (NULL != cse);
cse = si->first_cw_set;
# endif
do
{
TRAVERSE_TO_LATEST_CSE(cse);
mode = cse->mode;
assert((n_gds_t_op != mode) && (gds_t_committed != mode));
assert(n_gds_t_op < kill_t_create);
assert(n_gds_t_op < kill_t_write);
assert(gds_t_committed < gds_t_write_root);
assert(gds_t_committed < gds_t_busy2free);
assert(gds_t_write_root < n_gds_t_op);
assert(gds_t_busy2free < n_gds_t_op);
assert(gds_t_write_root != mode);
assert(gds_t_busy2free != mode);
cse->old_mode = (int4)mode; /* note down before being reset to gds_t_committed */
if (n_gds_t_op > mode)
{
DEBUG_ONLY(bml_status_check(cse));
if (csd->dsid && !tp_kill_bitmaps && (0 == cse->level))
{
assert(!is_mm);
rc_cpt_entry(cse->blk);
}
/* Do phase1 of bg_update while holding crit on the database.
* This will lock the buffers that need to be changed.
* Once crit is released, invoke phase2 which will update those locked buffers.
* There are two exceptions.
* 1) If it is a bitmap block. In that case we also do phase2
* while holding crit so the next process to use this bitmap will see a
* consistent copy of this bitmap when it gets crit for commit. This avoids
* the reallocate_bitmap routine from restarting or having to wait for a
* concurrent phase2 construction to finish. When the change request C9E11-002651
* (to reduce restarts due to bitmap collisions) is addressed, we can reexamine
* whether it makes sense to move bitmap block builds back to phase2.
* 2) If the block has a recompute update array. This means it is a global that
* has NOISOLATION turned on. In this case, we have seen that deferring the
* updates to phase2 can cause lots of restarts in the "recompute_upd_array"
* function (where cr->in_tend check fails) in a highly contentious environment.
* Hence build such blocks in phase1 while holding crit and avoid such restarts.
*/
if (is_mm)
status = mm_update(cse, ctn, ctn, si);
else
{
status = bg_update_phase1(cse, ctn, si);
if ((cdb_sc_normal == status)
&& ((gds_t_writemap == mode)
|| cse->recompute_list_head && (gds_t_write == cse->mode)))
{
status = bg_update_phase2(cse, ctn, ctn, si);
if (cdb_sc_normal == status)
cse->mode = gds_t_committed;
}
}
if (cdb_sc_normal != status)
{ /* the database is probably in trouble */
TP_TRACE_HIST(cse->blk, cse->blk_target);
INVOKE_T_COMMIT_CLEANUP(status, csa);
assert(cdb_sc_normal == status);
/* At this time "si->cr_array_index" could be non-zero for one or more
* regions and a few cache-records might have their "in_cw_set" field set
* to TRUE. We should not reset "in_cw_set" as we don't hold crit at this
* point and also because we might still need those buffers pinned until
* their before-images are backed up in wcs_recover (in case an online
* backup was running while secshr_db_clnup did its job). The variable
* "si->cr_array_index" is reset to 0 by secshr_db_clnup.
*/
assert(0 == si->cr_array_index);
goto skip_failed; /* do not do "failed:" processing as we dont hold crit */
}
} else
{
if (!cse->done)
{ /* This block is needed in the 2nd-phase of KILL. Build a private
* copy right now while we hold crit and the update array points
* to validated buffer contents.
*/
gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, ctn);
cse->done = TRUE;
assert(NULL != cse->blk_target);
CERT_BLK_IF_NEEDED(certify_all_blocks, gv_cur_region,
cse, cse->new_buff, gv_target);
}
cse->mode = gds_t_committed;
}
cse = cse->next_cw_set;
} while (NULL != cse);
}
/* signal secshr_db_clnup/t_commit_cleanup, roll-back is no longer possible */
assert(!(update_trans & ~UPDTRNS_VALID_MASK));
assert(!(UPDTRNS_TCOMMIT_STARTED_MASK & update_trans));
si->update_trans = update_trans | UPDTRNS_TCOMMIT_STARTED_MASK;
csa->t_commit_crit = T_COMMIT_CRIT_PHASE2; /* set this BEFORE releasing crit */
assert(!csd->freeze); /* should never increment curr_tn on a frozen database */
INCREMENT_CURR_TN(csd);
# ifdef GTM_TRIGGER
if (csa->incr_db_trigger_cycle)
{
csd->db_trigger_cycle++;
if (0 == csd->db_trigger_cycle)
csd->db_trigger_cycle = 1; /* Don't allow cycle set to 0 which means uninitialized */
/* Update the process private view of trigger cycle also since we are the ones
* who updated csd->db_trigger_cycle so we can safely keep csa in sync as well.
* Not doing this would cause an unnecessary cdb_sc_triggermod restart for the
* next transaction. In fact this restart will create an out-of-design situation
* for recovery (which operates in the final-retry) and cause an unnecessary
* replication pipe drain for the update process (a costly operation). So it
* is in fact a necessary step (considering recovery).
*/
csa->db_trigger_cycle = csd->db_trigger_cycle;
csa->incr_db_trigger_cycle = FALSE;
}
# endif
/* If db is journaled, then db header is flushed periodically when writing the EPOCH record,
* otherwise do it here every HEADER_UPDATE_COUNT transactions.
*/
if ((!JNL_ENABLED(csa) || !JNL_HAS_EPOCH(csa->jnl->jnl_buff))
&& !(csd->trans_hist.curr_tn & (HEADER_UPDATE_COUNT - 1)))
fileheader_sync(gv_cur_region);
if (NULL != si->kill_set_head)
INCR_KIP(csd, csa, si->kip_csa);
} else
ctn = si->tp_csd->trans_hist.curr_tn;
si->start_tn = ctn; /* start_tn used temporarily to store currtn (for bg_update_phase2) before releasing crit */
if (!si->tp_csa->hold_onto_crit)
rel_crit(si->gv_cur_region); /* should use si->gv_cur_region (not gv_cur_region) as the latter is not
* set in case we are not updating this region */
} /* for (si ... ) */
assert(cdb_sc_normal == status);
if (replication)
{
assert(jgbl.cumul_index == jgbl.cu_jnl_index);
assert((jpl->write + jgbl.cumul_jnl_rec_len) % jpl->jnlpool_size == tjpl->write);
assert(jpl->early_write_addr > jpl->write_addr);
jnl_header = (jnldata_hdr_ptr_t)(jnlpool.jnldata_base + jpl->write); /* Begin atomic stmnts */
jnl_header->jnldata_len = jgbl.cumul_jnl_rec_len;
jnl_header->prev_jnldata_len = jpl->lastwrite_len;
UNIX_ONLY(
if (supplementary)
jpl->strm_seqno[strm_index] = next_strm_seqno;
)
jpl->lastwrite_len = jnl_header->jnldata_len;
/* For systems with UNORDERED memory access (example, ALPHA, POWER4, PA-RISC 2.0), on a multi
* processor system, it is possible that the source server notices the change in write_addr
* before seeing the change to jnlheader->jnldata_len, leading it to read an invalid
* transaction length. To avoid such conditions, we should commit the order of shared
* memory updates before we update write_addr. This ensures that the source server sees all
* shared memory updates related to a transaction before the change in write_addr
*/
SHM_WRITE_MEMORY_BARRIER;
jpl->write = tjpl->write;
/* jpl->write_addr should be updated before updating jpl->jnl_seqno as secshr_db_clnup relies on this */
jpl->write_addr += jnl_header->jnldata_len;
jpl->jnl_seqno = tjpl->jnl_seqno; /* End atomic stmnts */
assert(jpl->early_write_addr == jpl->write_addr);
assert(NULL != repl_csa);
if (!repl_csa->hold_onto_crit)
rel_lock(jnlpool.jnlpool_dummy_reg);
}
/* Check that we DONT own crit on ANY region. The only exception is online mupip journal rollback/recovery
* which holds crit for the entire process lifetime. */
assert(UNIX_ONLY(jgbl.onlnrlbk || ) (0 == have_crit(CRIT_HAVE_ANY_REG)));
/* the following section is the actual commitment of the changes in the database (phase2 for BG) */
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
{
cse = si->first_cw_set;
if (NULL != cse)
{
sgm_info_ptr = si;
TP_TEND_CHANGE_REG(si);
ctn = si->start_tn;
is_mm = (dba_mm == cs_data->acc_meth);
/* If BG, check that we have not pinned any more buffers than we are updating */
DBG_CHECK_PINNED_CR_ARRAY_CONTENTS(is_mm, si->cr_array, si->cr_array_index, si->tp_csd->bplmap);
do
{
TRAVERSE_TO_LATEST_CSE(cse);
if (gds_t_committed > cse->mode)
{ /* Finish 2nd phase of commit for BG (updating the buffers locked in phase1) now that CRIT
* has been released. For MM, only thing needed is to set cs->mode to gds_t_committed.
*/
if (!is_mm)
{ /* Validate old_mode noted down in first phase is the same as the current mode.
* Note that cs->old_mode is negated by bg_update_phase1 (to help secshr_db_clnup).
*/
assert(-cse->old_mode == (int4)cse->mode);
status = bg_update_phase2(cse, ctn, ctn, si);
if (cdb_sc_normal != status)
{ /* the database is probably in trouble */
TP_TRACE_HIST(cse->blk, cse->blk_target);
INVOKE_T_COMMIT_CLEANUP(status, si->tp_csa);
assert(cdb_sc_normal == status);
/* At this time "si->cr_array_index" could be non-zero for one or more
* regions and a few cache-records might have their "in_cw_set" field set
* to TRUE. We should not reset "in_cw_set" as we don't hold crit at this
* point and also because we might still need those buffers pinned until
* their before-images are backed up in wcs_recover (in case an online
* backup was running while secshr_db_clnup did its job). The local
* variable "si->cr_array_index" is reset to 0 by secshr_db_clnup.
*/
assert(0 == si->cr_array_index);
/* Note that secshr_db_clnup (invoked by t_commit_cleanup above) would have
* done a lot of cleanup for us including decrementing the counter
* "wcs_phase2_commit_pidcnt" so it is ok to skip all that processing
* below and go directly to skip_failed.
*/
goto skip_failed; /* do not do "failed:" processing as we dont hold crit */
}
}
cse->mode = gds_t_committed;
} else
{ /* blk build should have been completed in phase1 for kill_t_* modes */
assert((n_gds_t_op > cse->mode) || cse->done);
assert(gds_t_committed == cse->mode);
}
cse = cse->next_cw_set;
} while (NULL != cse);
/* Free up all pinned cache-records */
tp_cr_array = si->cr_array;
UNPIN_CR_ARRAY_ON_COMMIT(tp_cr_array, si->cr_array_index);
if (!is_mm)
{ /* In BG, now that two-phase commit is done, decrement counter */
csa = cs_addrs;
cnl = csa->nl;
DECR_WCS_PHASE2_COMMIT_PIDCNT(csa, cnl);
/* Phase 2 commits are completed for the current region. See if we had done a snapshot
* init (csa->snapshot_in_prog == TRUE). If so, try releasing the resources obtained
* while snapshot init.
*/
if (SNAPSHOTS_IN_PROG(csa))
{
assert(NULL != si->first_cw_set);
SS_RELEASE_IF_NEEDED(csa, cnl);
}
}
}
assert(!si->cr_array_index);
si->tp_csa->t_commit_crit = FALSE;
}
si_not_validated = NULL; /* all "si" have been validated at this point */
/* Caution: followthrough, cleanup for normal and abnormal "status" */
failed:
if (cdb_sc_normal != status)
{
si_not_validated = si;
si_last = (NULL == si_not_validated) ? NULL : si_not_validated->next_tp_si_by_ftok;
/* Free up all pinned cache-records and release crit */
release_crit = (NEED_TO_RELEASE_CRIT(t_tries) UNIX_ONLY(&& !jgbl.onlnrlbk));
for (si = first_tp_si_by_ftok; (si_last != si); si = si->next_tp_si_by_ftok)
{
assert(si->tp_csa->now_crit);
tp_cr_array = si->cr_array;
UNPIN_CR_ARRAY_ON_RETRY(tp_cr_array, si->cr_array_index);
assert(!si->cr_array_index);
si->start_tn = si->tp_csd->trans_hist.curr_tn; /* start_tn used temporarily to store currtn
* before releasing crit */
if (release_crit)
{
assert(!si->tp_csa->hold_onto_crit);
rel_crit(si->gv_cur_region);
}
}
/* Check that we DONT own crit/commit on ANY region. The only exception is online mupip journal rollback/recovery
* which holds crit for the entire process lifetime.
*/
assert(UNIX_ONLY(jgbl.onlnrlbk || ) !release_crit || (0 == have_crit(CRIT_HAVE_ANY_REG | CRIT_IN_COMMIT)));
}
/* We have finished validation on this region. Reset transaction numbers in the gv_target
* histories so they will be valid for a future access utilizing the clue field. This occurs
* to improve performance (of next tn in case of commit of current tn) or the chances of commit
* (of current tn in case of a restart/retry).
*/
for (si = first_tp_si_by_ftok; (si_not_validated != si); si = si->next_tp_si_by_ftok)
{
if ((cdb_sc_normal == status) && (si->update_trans))
valid_thru = si->start_tn + 1; /* curr_tn of db AFTER incrementing it but before releasing crit */
else
valid_thru = si->start_tn;
assert(valid_thru <= si->tp_csd->trans_hist.curr_tn);
is_mm = (dba_mm == si->tp_csd->acc_meth);
bmp_begin_cse = si->first_cw_bitmap;
prev_target = NULL;
for (cse = si->first_cw_set; bmp_begin_cse != cse; cse = cse->next_cw_set)
{
TRAVERSE_TO_LATEST_CSE(cse);
curr_target = cse->blk_target;
/* Avoid redundant updates to gv_target's history using a simplistic scheme (check previous iteration) */
if ((prev_target != curr_target) && (0 != curr_target->clue.end))
{
prev_target = curr_target;
for (t1 = curr_target->hist.h; t1->blk_num; t1++)
{ /* If MM, the history can be safely updated. In BG, phase2 of commit happens outside of
* crit. So we need to check if the global buffer corresponding to this block is
* in the process of being updated concurrently by another process. If so, we have no
* guarantee that the concurrent update started AFTER valid_thru db-tn so we cannot safely
* reset t1->tn in this case. If no update is in progress, we can safely update our history
* to reflect the fact that all updates to this block before the current transaction number
* are complete as of this point. Note that it is ok to do the cr->in_tend check outside
* of crit. If this update started after we released crit, t1->tn will still be lesser than
* the transaction at which this update occurred so a cdb_sc_blkmod check is guaranteed to
* be signalled. If this update started and ended after we released crit but before we
* reached here, it is ok to set t1->tn to valid_thru as the concurrent update corresponds
* to a higher transaction number and will still fail the cdb_sc_blkmod check in the next
* validation. Also note that because of this selective updation of t1->tn, it is possible
* that for a given gv_target->hist, hist[0].tn is not guaranteed to be GREATER than
* hist[1].tn. Therefore t_begin has to now determine the minimum and use that as the
* start_tn instead of looking at hist[MAXDEPTH].tn and using that.
*/
cr = !is_mm ? t1->cr : NULL;
in_tend = (NULL != cr) ? cr->in_tend : 0;
assert(process_id != in_tend);
if (!in_tend)
t1->tn = valid_thru;
}
}
}
}
skip_failed:
REVERT;
DEFERRED_EXIT_HANDLING_CHECK; /* now that all crits are released, check if deferred signal/exit handling needs to be done */
/* Must be done after REVERT since we are no longer in crit */
if (cdb_sc_normal == status)
{ /* keep this out of the loop above so crits of all regions are released without delay */
/* Take this moment of non-critness to check if we had an unhandled IO timer pop. */
if (unhandled_stale_timer_pop)
process_deferred_stale();
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
{
csa = si->tp_csa;
cnl = csa->nl;
INCR_GVSTATS_COUNTER(csa, cnl, n_tp_blkread, si->num_of_blks);
if (!si->update_trans)
{
INCR_GVSTATS_COUNTER(csa, cnl, n_tp_readonly, 1);
continue;
}
INCR_GVSTATS_COUNTER(csa, cnl, n_tp_readwrite, 1);
INCR_GVSTATS_COUNTER(csa, cnl, n_tp_blkwrite, si->cw_set_depth);
GVSTATS_SET_CSA_STATISTIC(csa, db_curr_tn, si->start_tn);
TP_TEND_CHANGE_REG(si);
wcs_timer_start(gv_cur_region, TRUE);
if (si->backup_block_saved)
backup_buffer_flush(gv_cur_region);
}
first_tp_si_by_ftok = NULL; /* Signal t_commit_cleanup/secshr_db_clnup that TP transaction is NOT underway */
return TRUE;
}
failed_skip_revert:
assert(cdb_sc_normal != status);
t_fail_hist[t_tries] = status;
SET_WC_BLOCKED_FINAL_RETRY_IF_NEEDED(csa, status);
TP_RETRY_ACCOUNTING(csa, csa->nl, status);
first_tp_si_by_ftok = NULL; /* Signal t_commit_cleanup/secshr_db_clnup that TP transaction is NOT underway */
return FALSE;
}
/* --------------------------------------------------------------------------------------------
* This code is very similar to the code in gvcst_put for the non-block-split case. Any changes
* in either place should be reflected in the other.
* --------------------------------------------------------------------------------------------
*/
/* Recompute the update array of a leaf-level (level 0) cw-set element against the CURRENT contents of its block.
 *
 * For every key/value pair on cse->recompute_list_head, the key is searched afresh in the block and a new
 * update array (list of block segments) is laid out that inserts or replaces that record. On success the new
 * update array is stored in cse->upd_addr, cse->done is reset to FALSE and cse->new_buff is reset to NULL.
 *
 * Parameters:
 *	bh  - search history element of the block; updated in place (cr, cycle, buffaddr, curr_rec/prev_rec via
 *	      gvcst_search_blk, and tn on success).
 *	cse - cw-set element whose update array is recomputed. Asserted to be level 0, to have a blk_target and
 *	      to have no first_off/write_type.
 *
 * Returns:
 *	cdb_sc_normal   - update array successfully recomputed.
 *	cdb_sc_lostcr   - (BG) no usable cache-record was found for the block.
 *	cdb_sc_blkmod   - (BG) cache-record has in_tend set, i.e. a concurrent update is in progress.
 *	cdb_sc_mkblk    - block/record contents were inconsistent or BLK_FINI failed.
 *	cdb_sc_blksplit - the updated block would exceed the fill size.
 *	any non-normal status returned by gvcst_search_blk.
 *
 * The function asserts that crit is held (csa->now_crit), that we are inside TP (dollar_tlevel) and that
 * sgm_info_ptr is set.
 */
enum cdb_sc recompute_upd_array(srch_blk_status *bh, cw_set_element *cse)
{
blk_segment *bs1, *bs_ptr;
boolean_t new_rec;
cache_rec_ptr_t cr;
char *va;
enum cdb_sc status;
gv_key *pKey = NULL;
int4 blk_size, blk_fill_size, cur_blk_size, blk_seg_cnt, delta ;
int4 n, new_rec_size, next_rec_shrink;
int4 rec_cmpc, target_key_size;
uint4 segment_update_array_size;
key_cum_value *kv, *kvhead;
mstr value;
off_chain chain1;
rec_hdr_ptr_t curr_rec_hdr, next_rec_hdr, rp;
sm_uc_ptr_t cp1, buffaddr;
unsigned short rec_size;
sgmnt_addrs *csa;
blk_hdr_ptr_t old_block;
gv_namehead *gvt;
srch_blk_status *t1;
csa = cs_addrs;
BG_TRACE_PRO_ANY(csa, recompute_upd_array_calls);
assert(csa->now_crit && dollar_tlevel && sgm_info_ptr);
assert(!cse->level && cse->blk_target && !cse->first_off && !cse->write_type);
blk_size = cs_data->blk_size;
/* Upper bound on the block size after the update: fill-factor percentage of the block less the reserved bytes */
blk_fill_size = (blk_size * gv_fillfactor) / 100 - cs_data->reserved_bytes;
cse->first_copy = TRUE;
if (dba_bg == csa->hdr->acc_meth)
{ /* For BG method, modify history with uptodate cache-record, buffer and cycle information.
* Also modify cse->old_block and cse->ondsk_blkver to reflect the updated buffer.
* This is necessary in case history contains an older twin cr or a cr which has since been recycled
*/
cr = db_csh_get(bh->blk_num);
assert(CR_NOTVALID != (sm_long_t)cr);
/* NOTE(review): a non-negative read_in_progress presumably means the buffer is still being read in - confirm */
if (NULL == cr || CR_NOTVALID == (sm_long_t)cr || 0 <= cr->read_in_progress)
{
BG_TRACE_PRO_ANY(csa, recompute_upd_array_rip);
assert(CDB_STAGNATE > t_tries);
return cdb_sc_lostcr;
}
if (cr->in_tend)
{ /* Possible if this cache-record is being modified concurrently by another process in bg_update_phase2.
* In this case, we cannot determine if recomputation is possible. Have to restart.
*/
assert(CDB_STAGNATE > t_tries);
BG_TRACE_PRO_ANY(csa, recompute_upd_array_in_tend);
return cdb_sc_blkmod;
}
bh->cr = cr;
bh->cycle = cr->cycle;
cse->old_block = (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
cse->ondsk_blkver = cr->ondsk_blkver;
/* old_block needs to be repointed to the NEW buffer but the fact that this block was free does not change in this
* entire function. So cse->was_free can stay as it is.
*/
bh->buffaddr = (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
}
buffaddr = bh->buffaddr;
assert(NULL != cse->recompute_list_head);
/* Apply each key/value pair of the recompute list in turn; every iteration produces a complete update array */
for (kvhead = kv = cse->recompute_list_head; (NULL != kv); kv = kv->next)
{
pKey = &kv->key;
value = kv->value;
target_key_size = pKey->end + 1;
if (kvhead != kv)
{ /* Second or later key in the list. Materialize the update array produced by the previous iteration into the
* private buffer cse->new_buff and do the search/layout for this key against that private image.
* Note that 0 is passed as the transaction number for this intermediate build.
*/
assert(FALSE == cse->done);
assert(0 == cse->reference_cnt);
assert(0 == cse->ins_off); /* because leaf-level block */
assert(0 == cse->level);
assert(0 == cse->index);
assert(FALSE == cse->forward_process); /* because no kills should have taken place in this block */
gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0);
bh->buffaddr = buffaddr = cse->new_buff;
}
/* Locate the key in the block image; fills in bh->curr_rec and bh->prev_rec (offset and match) */
if (cdb_sc_normal != (status = gvcst_search_blk(pKey, bh)))
{
BG_TRACE_PRO_ANY(csa, recompute_upd_array_search_blk);
return status;
}
cur_blk_size = ((blk_hdr_ptr_t)buffaddr)->bsiz;
/* An exact match of the whole key means an existing record is replaced; anything else is an insertion */
new_rec = (target_key_size != bh->curr_rec.match);
rp = (rec_hdr_ptr_t)(buffaddr + bh->curr_rec.offset);
if (bh->curr_rec.offset == cur_blk_size)
{ /* Search position is at the end of the block so there is no current record to read */
if (FALSE == new_rec)
{ /* An exact key match at end-of-block is inconsistent */
assert(CDB_STAGNATE > t_tries);
BG_TRACE_PRO_ANY(csa, recompute_upd_array_new_rec);
return cdb_sc_mkblk;
}
rec_cmpc = 0;
rec_size = 0;
} else
{
GET_USHORT(rec_size, &rp->rsiz);
rec_cmpc = rp->cmpc;
/* Current record must lie entirely within the block */
if ((sm_uc_ptr_t)rp + rec_size > (sm_uc_ptr_t)buffaddr + cur_blk_size)
{
assert(CDB_STAGNATE > t_tries);
BG_TRACE_PRO_ANY(csa, recompute_upd_array_rec_size);
return cdb_sc_mkblk;
}
}
if (new_rec)
{ /* Insertion : new record stores the key bytes past what it shares with the previous record (prev_rec.match).
* The following record (if any) shrinks by the extra bytes it now shares with the inserted key.
*/
new_rec_size = SIZEOF(rec_hdr) + target_key_size - bh->prev_rec.match + value.len;
if (cur_blk_size <= (int)bh->curr_rec.offset)
next_rec_shrink = 0;
else
next_rec_shrink = bh->curr_rec.match - rec_cmpc;
delta = new_rec_size - next_rec_shrink;
} else
{ /* Replacement : compression count of the existing record has to agree with the search result */
if (rec_cmpc != bh->prev_rec.match)
{
assert(CDB_STAGNATE > t_tries);
BG_TRACE_PRO_ANY(csa, recompute_upd_array_rec_cmpc);
return cdb_sc_mkblk;
}
new_rec_size = SIZEOF(rec_hdr) + (target_key_size - rec_cmpc) + value.len;
delta = new_rec_size - rec_size;
next_rec_shrink = 0;
}
/* NOTE(review): chain1.flag presumably marks a chained (not yet assigned) block number, which is not expected
* here - confirm against the off_chain definition.
*/
chain1 = *(off_chain *)&bh->blk_num;
assert(0 == chain1.flag);
if (cur_blk_size + delta <= blk_fill_size)
{ /* Updated block fits within the fill size. Lay out the new update array as a sequence of segments :
*	1) existing block contents (past the block header) up to the current record
*	2) header of the new/replaced record
*	3) the non-compressed part of the key
*	4) the value (if non-empty)
*	5) the rest of the block, with an adjusted record header for the following record in case of insertion
*/
segment_update_array_size = UA_NON_BM_SIZE(cs_data);
ENSURE_UPDATE_ARRAY_SPACE(segment_update_array_size);
BLK_INIT(bs_ptr, bs1);
if (0 != rc_set_fragment)
GTMASSERT;
BLK_SEG(bs_ptr, buffaddr + SIZEOF(blk_hdr), bh->curr_rec.offset - SIZEOF(blk_hdr));
BLK_ADDR(curr_rec_hdr, SIZEOF(rec_hdr), rec_hdr);
curr_rec_hdr->rsiz = new_rec_size;
curr_rec_hdr->cmpc = bh->prev_rec.match;
BLK_SEG(bs_ptr, (sm_uc_ptr_t)curr_rec_hdr, SIZEOF(rec_hdr));
BLK_ADDR(cp1, target_key_size - bh->prev_rec.match, unsigned char);
memcpy(cp1, pKey->base + bh->prev_rec.match, target_key_size - bh->prev_rec.match);
BLK_SEG(bs_ptr, cp1, target_key_size - bh->prev_rec.match);
if (0 != value.len)
{
BLK_ADDR(va, value.len, char);
memcpy(va, value.addr, value.len);
BLK_SEG(bs_ptr, (unsigned char *)va, value.len);
}
/* In case of replacement, skip over the record being replaced */
if (!new_rec)
rp = (rec_hdr_ptr_t)((sm_uc_ptr_t)rp + rec_size);
/* n is the number of bytes of the original block that follow the insertion/replacement point */
n = (int)(cur_blk_size - ((sm_uc_ptr_t)rp - buffaddr));
if (n > 0)
{
if (new_rec)
{ /* The record following the inserted one gets a fresh header reflecting its increased compression;
* the bytes of it that are skipped below are its old header plus the newly compressed key bytes.
*/
BLK_ADDR(next_rec_hdr, SIZEOF(rec_hdr), rec_hdr);
next_rec_hdr->rsiz = rec_size - next_rec_shrink;
next_rec_hdr->cmpc = bh->curr_rec.match;
BLK_SEG(bs_ptr, (sm_uc_ptr_t)next_rec_hdr, SIZEOF(rec_hdr));
next_rec_shrink += SIZEOF(rec_hdr);
}
BLK_SEG(bs_ptr, (sm_uc_ptr_t)rp + next_rec_shrink, n - next_rec_shrink);
}
if (0 == BLK_FINI(bs_ptr, bs1))
{
assert(CDB_STAGNATE > t_tries);
BG_TRACE_PRO_ANY(csa, recompute_upd_array_blk_fini);
return cdb_sc_mkblk;
}
/* Install the new update array; the block has not been built from it yet hence reset "done" */
cse->upd_addr = (unsigned char *)bs1;
cse->done = FALSE;
} else
{ /* Updated block would exceed the fill size. Recomputation does not handle block splits. */
BG_TRACE_PRO_ANY(csa, recompute_upd_array_blk_split);
return cdb_sc_blksplit;
}
}
/* Update bh->tn to reflect the fact that it is uptodate as of the current database transaction.
* Not doing so could actually cause unnecessary restarts.
*/
bh->tn = csa->hdr->trans_hist.curr_tn;
/* If block in this history element is the same as gv_target's leaf block and it has a non-zero clue, update it */
gvt = bh->blk_target;
assert(!bh->level); /* this is why it is safe to access 0th array index in the next line */
t1 = gvt->hist.h;
if (gvt->clue.end && (t1->blk_num == bh->blk_num))
{
*t1 = *bh;
/* Update clue to reflect last key in recompute list. No need to update gvt->first_rec and gvt->last_rec
* as they are guaranteed to be the same as what it was when the clue was filled in by gvcst_search (if
* they are different, an index block would have changed which means we would restart this transaction
* anyways and the clue would be reset to 0).
*/
assert(NULL != pKey);
COPY_CURRKEY_TO_GVTARGET_CLUE(gvt, pKey);
if (new_rec)
t1->curr_rec.match = gvt->clue.end + 1; /* Keep srch_hist and clue in sync for NEXT gvcst_search */
/* Now that the clue is known to be non-zero, we have the potential for the first_rec part of it to be
* unreliable. Reset it to be safe. See comment in similar section in tp_hist for details on why.
*/
GVT_CLUE_INVALIDATE_FIRST_REC(gvt);
}
/* At this point, cse->new_buff could be non-NULL either because the same variable was being updated multiple times
* inside of the TP transaction or because cse->recompute_list_head contained more than one variable (in which case
* cse->new_buff will be set by the invocation of gvcst_blk_build (above) for the second element in the list). In
* either case, the final update-array contents rely on the shared memory buffer (in case of BG access method) and
* not on cse->new_buff. Therefore we need to PIN the corresponding cache-record in tp_tend. So reset cse->new_buff.
*/
cse->new_buff = NULL;
/* Since cse->old_block could have been repointed above, recompute the before-image checksum if before-image
* journaling is on. A checksum of 0 is stored if the block's tn is not less than the journal buffer's epoch_tn.
*/
if (!cse->was_free && (NULL != cse->old_block) && JNL_ENABLED(csa) && csa->jnl_before_image)
{
old_block = (blk_hdr_ptr_t)cse->old_block;
assert(old_block->bsiz <= csa->hdr->blk_size);
if (old_block->tn < csa->jnl->jnl_buff->epoch_tn)
cse->blk_checksum = jnl_get_checksum((uint4 *)old_block, csa, old_block->bsiz);
else
cse->blk_checksum = 0;
}
return cdb_sc_normal;
}
/* This function does not update "bml_cse->tn" (to reflect that the reallocation is valid as of the current database tn).
* See similar comment before the function definition of "recompute_upd_array". For the same reasons, it is considered
* ok to do the reallocation since frozen regions are considered relatively rare.
*/
/* Redo the block allocations recorded against one local bitmap using the bitmap's current contents.
 *
 * Every cw-set element of "si" (before si->first_cw_bitmap) that is in gds_t_acquired mode and whose block
 * belongs to the local bitmap "bml_cse->blk" is assigned a new free block found by bm_find_blk in
 * bml_cse->old_block. For each such element cse->blk and cse->was_free are reset, before-image related
 * fields (old_block, cr, cycle, blk_checksum) are repointed as needed, and the corresponding entry in the
 * bitmap's update array (bml_cse->upd_addr) is overwritten with the new offset within the bitmap.
 *
 * Parameters:
 *	si      - sgm_info of the region whose cw-set is examined.
 *	bml_cse - cw-set element of the local bitmap block. Asserted to have reference_cnt >= 0
 *	          i.e. blocks are being allocated (not freed) in this bitmap.
 *
 * Returns TRUE if all affected elements were reallocated; FALSE if reallocation was not possible, in
 * which case the transaction has to restart.
 */
boolean_t reallocate_bitmap(sgm_info *si, cw_set_element *bml_cse)
{
boolean_t blk_used;
block_id_ptr_t b_ptr;
block_id bml, free_bit;
cache_rec_ptr_t cr;
cw_set_element *cse, *bmp_begin_cse;
int4 offset;
uint4 total_blks, map_size;
boolean_t read_before_image; /* TRUE if before-image journaling or online backup in progress */
sgmnt_addrs *csa;
sgmnt_data_ptr_t csd;
boolean_t is_mm;
jnl_buffer_ptr_t jbp; /* jbp is non-NULL only if before-image journaling */
blk_hdr_ptr_t old_block;
unsigned int bsiz;
boolean_t before_image_needed;
DCL_THREADGBL_ACCESS;
SETUP_THREADGBL_ACCESS;
csa = cs_addrs;
csd = csa->hdr;
is_mm = (dba_mm == csd->acc_meth);
/* This optimization should only be used if blocks are being allocated (not if freed) in this bitmap. */
assert(0 <= bml_cse->reference_cnt);
bml = bml_cse->blk;
if (!is_mm && bml_cse->cr->in_tend)
{ /* Possible if this cache-record no longer contains the bitmap block we think it does. In this case restart.
* Since we hold crit at this point, the block that currently resides should not be a bitmap block since
* all updates to the bitmap (both phase1 and phase2) happen inside of crit.
*/
assert(csa->now_crit && (bml != bml_cse->cr->blk) && (bml_cse->cr->blk % csd->bplmap));
return FALSE;
}
assert(is_mm || (FALSE == bml_cse->cr->in_tend));
assert(is_mm || (FALSE == bml_cse->cr->data_invalid));
/* b_ptr walks the bitmap's update array which holds one block offset (relative to bml) per allocated block */
b_ptr = (block_id_ptr_t)bml_cse->upd_addr;
offset = 0;
total_blks = is_mm ? csa->total_blks : csa->ti->total_blks;
/* The last local bitmap of the database covers only the blocks up to total_blks */
if (ROUND_DOWN2(total_blks, BLKS_PER_LMAP) == bml)
map_size = total_blks - bml;
else
map_size = BLKS_PER_LMAP;
assert(bml >= 0 && bml < total_blks);
bmp_begin_cse = si->first_cw_bitmap; /* stored in a local to avoid pointer de-referencing within the loop below */
jbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
/* Before-images are also of interest when a backup or a snapshot is in progress */
read_before_image = ((NULL != jbp) || csa->backup_in_prog || SNAPSHOTS_IN_PROG(csa));
for (cse = si->first_cw_set; cse != bmp_begin_cse; cse = cse->next_cw_set)
{
TRAVERSE_TO_LATEST_CSE(cse);
/* Only newly acquired blocks that belong to this local bitmap need to be reallocated */
if ((gds_t_acquired != cse->mode) || (ROUND_DOWN2(cse->blk, BLKS_PER_LMAP) != bml))
continue;
/* Update array entries are expected in the same order as the acquired blocks appear in the cw-set */
assert(*b_ptr == (cse->blk - bml));
/* Find the next free block in the current bitmap contents starting the search at "offset" */
free_bit = bm_find_blk(offset, (sm_uc_ptr_t)bml_cse->old_block + SIZEOF(blk_hdr), map_size, &blk_used);
if (MAP_RD_FAIL == free_bit || NO_FREE_SPACE == free_bit)
return FALSE;
cse->blk = bml + free_bit;
assert(cse->blk < total_blks);
cse->was_free = !blk_used;
/* re-point before-images into cse->old_block if necessary; if not available restart by returning FALSE */
BEFORE_IMAGE_NEEDED(read_before_image, cse, csa, csd, cse->blk, before_image_needed);
if (!before_image_needed)
{
cse->old_block = NULL;
cse->blk_checksum = 0;
} else if (!is_mm)
{ /* BG : the before-image has to be present in a valid global buffer */
cr = db_csh_get(cse->blk);
assert(CR_NOTVALID != (sm_long_t)cr);
if ((NULL == cr) || (CR_NOTVALID == (sm_long_t)cr) || (0 <= cr->read_in_progress))
return FALSE; /* if one block was freed a long time ago, most probably were; so just give up */
/* Reset cse->cr, cycle, old_block and checksums if we had not read a before-image previously (because
* cse->blk was not a reused block previously) OR if old cse->cr and cse->cycle dont match current cr
*/
assert((NULL == cse->old_block) || (cse->cr != cr)
|| cse->old_block == (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr));
if ((NULL == cse->old_block) || (cse->cr != cr) || (cse->cycle != cr->cycle)
|| (cse->tn <= ((blk_hdr_ptr_t)GDS_REL2ABS(cr->buffaddr))->tn))
{ /* Bitmap reallocation has resulted in a situation where checksums etc. have to be recomputed */
cse->cr = cr;
cse->cycle = cr->cycle;
cse->old_block = (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
old_block = (blk_hdr_ptr_t)cse->old_block;
if (!cse->was_free && (NULL != jbp))
{ /* Checksum is computed only if the block's tn is less than the epoch_tn; else it is 0 */
assert(old_block->bsiz <= csd->blk_size);
if (old_block->tn < jbp->epoch_tn)
{
bsiz = old_block->bsiz;
JNL_GET_CHECKSUM_ACQUIRED_BLK(cse, csd, csa, old_block, bsiz);
} else
cse->blk_checksum = 0;
}
}
assert(gds_t_acquired == cse->mode);
assert(GDSVCURR == cse->ondsk_blkver);
} else
{ /* in MM, although mm_update does not use cse->old_block, tp_tend uses it to write before-images.
* therefore, fix it to point to the reallocated block's buffer address
*/
cse->old_block = t_qread(cse->blk, (sm_int_ptr_t)&cse->cycle, &cse->cr);
assert(GDSVCURR == cse->ondsk_blkver); /* should have been already initialized in t_write_map */
old_block = (blk_hdr_ptr_t)cse->old_block;
if (NULL == old_block)
return FALSE;
assert(NULL == jbp); /* this means we dont need to have any JNL_GET_CHECKSUM_ACQUIRED_BLK logic */
}
/* Record the new offset in the bitmap update array and continue the next search just past it */
*b_ptr++ = free_bit;
offset = free_bit + 1;
if (offset >= map_size)
{ /* If bm_find_blk is passed a hint (first arg) it assumes it is less than map_size
* and gives invalid results (like values >= map_size). Instead of changing bm_find_blk
* we do the check here and assert that "hint" < "map_size" in bm_find_blk.
*/
assert(offset == map_size);
return FALSE;
}
}
/* Every early exit from the loop above returns FALSE so reaching here with cse == bmp_begin_cse means that
* all the acquired blocks of this bitmap were successfully reallocated.
*/
if (cse == bmp_begin_cse)
{
/* All entries of the update array have been consumed; the next one is expected to be the 0 terminator */
assert(0 == *b_ptr);
/* since bitmap block got modified, copy latest "ondsk_blkver" status from cache-record to bml_cse */
assert((NULL != bml_cse->cr) || is_mm);
old_block = (blk_hdr_ptr_t)bml_cse->old_block;
assert(!bml_cse->was_free); /* Bitmap blocks are never of type gds_t_acquired or gds_t_create */
if (NULL != jbp)
{ /* recompute CHECKSUM for the modified bitmap block before-image */
if (old_block->tn < jbp->epoch_tn)
bml_cse->blk_checksum = jnl_get_checksum((uint4 *)old_block, csa, old_block->bsiz);
else
bml_cse->blk_checksum = 0;
}
if (!is_mm)
bml_cse->ondsk_blkver = bml_cse->cr->ondsk_blkver;
return TRUE;
} else
return FALSE;
}