2346 lines
97 KiB
C
2346 lines
97 KiB
C
/****************************************************************
|
|
* *
|
|
* Copyright 2001, 2012 Fidelity Information Services, Inc *
|
|
* *
|
|
* This source code contains the intellectual property *
|
|
* of its copyright holder(s), and is made available *
|
|
* under a license. If you do not know the terms of *
|
|
* the license, please stop and do not read further. *
|
|
* *
|
|
****************************************************************/
|
|
|
|
#include "mdef.h"
|
|
|
|
#include <stddef.h> /* for offsetof macro */
|
|
#include <signal.h> /* for VSIG_ATOMIC_T type */
|
|
|
|
#ifdef UNIX
|
|
#include "gtm_stdio.h"
|
|
#endif
|
|
|
|
#include "gtm_time.h"
|
|
#include "gtm_inet.h" /* Required for gtmsource.h */
|
|
#include "gtm_string.h"
|
|
|
|
#ifdef VMS
|
|
#include <descrip.h> /* Required for gtmsource.h */
|
|
#endif
|
|
|
|
#include "gtm_ctype.h"
|
|
#include "cdb_sc.h"
|
|
#include "gdsroot.h"
|
|
#include "gdskill.h"
|
|
#include "gtm_facility.h"
|
|
#include "fileinfo.h"
|
|
#include "gdsbt.h"
|
|
#include "gdsblk.h"
|
|
#include "gdsfhead.h"
|
|
#include "filestruct.h"
|
|
#include "gdscc.h"
|
|
#include "gdsbml.h"
|
|
#include "min_max.h" /* needed for gdsblkops.h */
|
|
#include "gdsblkops.h" /* needed for recompute_upd_array routine */
|
|
#include "ccp.h"
|
|
#include "copy.h"
|
|
#include "error.h"
|
|
#include "iosp.h"
|
|
#include "jnl.h"
|
|
#include "jnl_typedef.h"
|
|
#include "buddy_list.h" /* needed for tp.h */
|
|
#include "hashtab_int4.h" /* needed for tp.h */
|
|
#include "tp.h"
|
|
#include "interlock.h"
|
|
#include "gdsbgtr.h"
|
|
#include "repl_msg.h"
|
|
#include "gtmsource.h"
|
|
#include "t_commit_cleanup.h"
|
|
#include "mupipbckup.h"
|
|
#include "gvcst_blk_build.h"
|
|
#include "gvcst_protos.h" /* for gvcst_search_blk prototype */
|
|
#include "cache.h"
|
|
#include "rc_cpt_ops.h"
|
|
#include "wcs_flu.h"
|
|
#include "jnl_write_pblk.h"
|
|
#include "jnl_write.h"
|
|
#include "process_deferred_stale.h"
|
|
#include "wcs_backoff.h"
|
|
#include "mm_update.h"
|
|
#include "bg_update.h"
|
|
#include "wcs_get_space.h"
|
|
#include "wcs_timer_start.h"
|
|
#include "send_msg.h"
|
|
#include "add_inter.h"
|
|
#include "t_qread.h"
|
|
#include "memcoherency.h"
|
|
#include "jnl_get_checksum.h"
|
|
#include "wbox_test_init.h"
|
|
#include "cert_blk.h"
|
|
#include "have_crit.h"
|
|
#include "bml_status_check.h"
|
|
#include "gtmimagename.h"
|
|
#ifdef DEBUG
|
|
#include "anticipatory_freeze.h" /* needed for IS_REPL_INST_FROZEN macro */
|
|
#endif
|
|
|
|
#ifdef UNIX
|
|
#include "gtmrecv.h"
|
|
#include "deferred_signal_handler.h"
|
|
#include "repl_instance.h"
|
|
#endif
|
|
#include "shmpool.h"
|
|
#ifdef GTM_SNAPSHOT
|
|
#include "db_snapshot.h"
|
|
#endif
|
|
#include "is_proc_alive.h"
|
|
|
|
GBLREF uint4 dollar_tlevel;
|
|
GBLREF uint4 dollar_trestart;
|
|
GBLREF gd_region *gv_cur_region;
|
|
GBLREF sgmnt_addrs *cs_addrs;
|
|
GBLREF sgmnt_data_ptr_t cs_data;
|
|
GBLREF sgm_info *first_sgm_info, *sgm_info_ptr;
|
|
GBLREF sgm_info *first_tp_si_by_ftok; /* List of participating regions in the TP transaction sorted on ftok order */
|
|
GBLREF tp_region *tp_reg_list;
|
|
GBLREF boolean_t tp_kill_bitmaps;
|
|
GBLREF unsigned char t_fail_hist[CDB_MAX_TRIES];
|
|
GBLREF int4 n_pvtmods, n_blkmods;
|
|
GBLREF unsigned int t_tries;
|
|
GBLREF jnl_fence_control jnl_fence_ctl;
|
|
GBLREF jnlpool_addrs jnlpool;
|
|
GBLREF jnlpool_ctl_ptr_t jnlpool_ctl, temp_jnlpool_ctl;
|
|
GBLREF boolean_t is_updproc;
|
|
GBLREF seq_num seq_num_zero;
|
|
GBLREF seq_num seq_num_one;
|
|
GBLREF int gv_fillfactor;
|
|
GBLREF char *update_array, *update_array_ptr;
|
|
GBLREF int rc_set_fragment;
|
|
GBLREF uint4 update_array_size, cumul_update_array_size;
|
|
GBLREF boolean_t unhandled_stale_timer_pop;
|
|
GBLREF jnl_gbls_t jgbl;
|
|
GBLREF struct_jrec_tcom tcom_record;
|
|
GBLREF boolean_t certify_all_blocks;
|
|
GBLREF boolean_t gvdupsetnoop; /* if TRUE, duplicate SETs update journal but not database (except for curr_tn++) */
|
|
GBLREF gv_namehead *gv_target;
|
|
GBLREF trans_num local_tn; /* transaction number for THIS PROCESS */
|
|
GBLREF uint4 process_id;
|
|
#ifdef UNIX
|
|
GBLREF recvpool_addrs recvpool;
|
|
GBLREF int4 strm_index;
|
|
GBLREF uint4 update_trans;
|
|
#endif
|
|
#ifdef VMS
|
|
GBLREF boolean_t tp_has_kill_t_cse; /* cse->mode of kill_t_write or kill_t_create got created in this transaction */
|
|
#endif
|
|
#ifdef GTM_TRIGGER
|
|
GBLREF boolean_t skip_dbtriggers; /* see gbldefs.c for description of this global */
|
|
GBLREF int4 gtm_trigger_depth;
|
|
#endif
|
|
#ifdef DEBUG
|
|
GBLREF boolean_t mupip_jnl_recover;
|
|
#endif
|
|
|
|
error_def(ERR_DLCKAVOIDANCE);
|
|
error_def(ERR_JNLFILOPN);
|
|
error_def(ERR_JNLFLUSH);
|
|
error_def(ERR_JNLTRANS2BIG);
|
|
error_def(ERR_REPLOFFJNLON);
|
|
error_def(ERR_TEXT);
|
|
|
|
/* SET_REG_SEQNO_IF_REPLIC : record the journal sequence number in a region's file header.
 *
 *	CSA		- pointer to the region's sgmnt_addrs; CSA->hdr is the file header that gets updated.
 *	TJPL		- pointer to the journal pool control structure whose jnl_seqno is copied.
 *	SUPPLEMENTARY	- (UNIX only) if non-zero, the per-stream sequence number is also updated.
 *	NEXT_STRM_SEQNO	- (UNIX only) value stored into hdr->strm_reg_seqno[strm_index] when SUPPLEMENTARY.
 *
 * Nothing is done unless REPL_ALLOWED(CSA) is true. When it is:
 *	- asserts that hdr->reg_seqno is strictly less than TJPL->jnl_seqno and then sets it to that value;
 *	- UNIX : if SUPPLEMENTARY, sets hdr->strm_reg_seqno[strm_index] (strm_index is a global);
 *	- VMS  : if is_updproc, sets hdr->resync_seqno from jgbl.max_resync_seqno.
 *
 * Every macro parameter is parenthesized in the expansion so that callers may pass arbitrary expressions
 * (e.g. "&array[i]"). CSA and TJPL are evaluated more than once, so arguments must be free of side effects.
 * The expansion is a brace-enclosed block (not an expression).
 */
#define SET_REG_SEQNO_IF_REPLIC(CSA, TJPL, SUPPLEMENTARY, NEXT_STRM_SEQNO)			\
{												\
	GBLREF	jnl_gbls_t		jgbl;							\
	GBLREF	boolean_t		is_updproc;						\
	UNIX_ONLY(GBLREF recvpool_addrs	recvpool;)						\
												\
	if (REPL_ALLOWED(CSA))									\
	{											\
		assert((CSA)->hdr->reg_seqno < (TJPL)->jnl_seqno);				\
		(CSA)->hdr->reg_seqno = (TJPL)->jnl_seqno;					\
		UNIX_ONLY(									\
		if (SUPPLEMENTARY)								\
		{										\
			(CSA)->hdr->strm_reg_seqno[strm_index] = (NEXT_STRM_SEQNO);		\
		}										\
		)										\
		VMS_ONLY(									\
		if (is_updproc)									\
			(CSA)->hdr->resync_seqno = jgbl.max_resync_seqno;			\
		)										\
	}											\
}
|
|
|
|
/* Forward declaration: takes the per-region TP info (si) and a bitmap cw-set element (bml_cse) and returns
 * a boolean_t. NOTE(review): the definition and callers are not visible in this part of the file - confirm
 * the exact contract there.
 */
boolean_t reallocate_bitmap(sgm_info *si, cw_set_element *bml_cse);
|
|
/* Forward declaration: takes a block search-history entry (hist1) and a cw-set element (cse) and returns an
 * enum cdb_sc status code. NOTE(review): the definition is not visible in this part of the file.
 */
enum cdb_sc recompute_upd_array(srch_blk_status *hist1, cw_set_element *cse);
|
|
|
|
boolean_t tp_crit_all_regions()
|
|
{
|
|
int lcnt;
|
|
boolean_t x_lock;
|
|
tp_region *tr;
|
|
sgmnt_addrs *tmpcsa, *frozen_csa;
|
|
sgm_info *tmpsi;
|
|
sgmnt_data_ptr_t tmpcsd;
|
|
gd_region *reg;
|
|
DCL_THREADGBL_ACCESS;
|
|
|
|
SETUP_THREADGBL_ACCESS;
|
|
assert(dollar_tlevel);
|
|
/* This function is in tp_tend because its technique and structures should be maintained in parallel with tp_tend.
|
|
* The following section grabs crit in all regions touched by the transaction. We use a different
|
|
* structure here for grabbing crit. The tp_reg_list region list contains all the regions that
|
|
* were touched by this transaction. Since this array is sorted by the ftok value of the database
|
|
* file being operated on, the obtains will always occurr in a consistent manner. Therefore, we
|
|
* will grab crit on each file with wait since deadlock should not be able to occurr. We cannot
|
|
* use first_tp_si_by_ftok list because it will be setup only in tp_tend which is further down the line.
|
|
*/
|
|
for (lcnt = 0; ;lcnt++)
|
|
{
|
|
x_lock = TRUE; /* Assume success */
|
|
for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr)
|
|
{
|
|
reg = tr->reg;
|
|
tmpcsa = &FILE_INFO(reg)->s_addrs;
|
|
tmpcsd = tmpcsa->hdr;
|
|
tmpsi = (sgm_info *)(tmpcsa->sgm_info_ptr);
|
|
DEBUG_ONLY(
|
|
/* Track retries in debug mode */
|
|
if (0 != lcnt)
|
|
{
|
|
BG_TRACE_ANY(tmpcsa, tp_crit_retries);
|
|
}
|
|
)
|
|
assert(!tmpcsa->hold_onto_crit);
|
|
if (!tmpcsa->now_crit)
|
|
grab_crit(reg);
|
|
assert(!(tmpsi->update_trans & ~UPDTRNS_VALID_MASK));
|
|
if (tmpcsd->freeze && tmpsi->update_trans)
|
|
{
|
|
tr = tr->fPtr; /* Increment so we release the lock we actually got */
|
|
x_lock = FALSE;
|
|
frozen_csa = tmpcsa;
|
|
break;
|
|
}
|
|
}
|
|
if (x_lock)
|
|
break;
|
|
for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr)
|
|
{ /* We may already have crit, if we're entering the final retry or an extra retry. So we need to
|
|
* release crit on all before waiting for freezes.
|
|
*/
|
|
reg = tr->reg;
|
|
tmpcsa = &FILE_INFO(reg)->s_addrs;
|
|
if (tmpcsa->now_crit)
|
|
rel_crit(tr->reg);
|
|
}
|
|
/* Wait for region to be unfrozen before re-grabbing crit on ALL regions */
|
|
WAIT_FOR_REGION_TO_UNFREEZE(frozen_csa, tmpcsd);
|
|
} /* for (;;) */
|
|
return TRUE;
|
|
}
|
|
|
|
boolean_t tp_tend()
|
|
{
|
|
block_id tp_blk;
|
|
boolean_t is_mm, release_crit, was_crit, x_lock, do_validation;
|
|
boolean_t replication = FALSE, region_is_frozen;
|
|
# ifdef UNIX
|
|
boolean_t supplementary = FALSE; /* this variable is initialized ONLY if "replication" is TRUE. */
|
|
seq_num strm_seqno, next_strm_seqno;
|
|
# endif
|
|
bt_rec_ptr_t bt;
|
|
cache_rec_ptr_t cr;
|
|
cw_set_element *cse, *first_cw_set, *bmp_begin_cse;
|
|
file_control *fc;
|
|
jnl_private_control *jpc;
|
|
jnl_buffer_ptr_t jbp;
|
|
jnl_format_buffer *jfb;
|
|
sgm_info *si, *si_last, *tmpsi, *si_not_validated;
|
|
tp_region *tr;
|
|
sgmnt_addrs *csa, *repl_csa = NULL;
|
|
sgmnt_data_ptr_t csd;
|
|
node_local_ptr_t cnl;
|
|
srch_blk_status *t1;
|
|
trans_num ctn, oldest_hist_tn, epoch_tn, old_block_tn;
|
|
trans_num valid_thru; /* buffers touched by this transaction will be valid thru this tn */
|
|
enum cdb_sc status;
|
|
gd_region *save_gv_cur_region;
|
|
int lcnt, jnl_participants, replay_jnl_participants;
|
|
jnldata_hdr_ptr_t jnl_header;
|
|
jnl_record *rec;
|
|
boolean_t yes_jnl_no_repl, recompute_cksum, cksum_needed;
|
|
boolean_t save_dont_reset_gbl_jrec_time;
|
|
uint4 jnl_status, leafmods, indexmods;
|
|
uint4 total_jnl_rec_size, in_tend;
|
|
uint4 update_trans;
|
|
jnlpool_ctl_ptr_t jpl, tjpl;
|
|
boolean_t read_before_image; /* TRUE if before-image journaling or online backup in progress */
|
|
blk_hdr_ptr_t old_block;
|
|
unsigned int bsiz;
|
|
cache_rec_ptr_t *tp_cr_array;
|
|
unsigned int tp_cr_array_index;
|
|
sgm_info **prev_tp_si_by_ftok, *tmp_first_tp_si_by_ftok;
|
|
gv_namehead *prev_target, *curr_target;
|
|
jnl_tm_t save_gbl_jrec_time;
|
|
enum gds_t_mode mode;
|
|
# ifdef GTM_CRYPT
|
|
DEBUG_ONLY(
|
|
blk_hdr_ptr_t save_old_block;
|
|
)
|
|
# endif
|
|
boolean_t ss_need_to_restart, new_bkup_started;
|
|
uint4 com_csum;
|
|
DEBUG_ONLY(
|
|
int tmp_jnl_participants;
|
|
uint4 upd_num;
|
|
uint4 max_upd_num;
|
|
uint4 prev_upd_num;
|
|
uint4 upd_num_start;
|
|
uint4 upd_num_end;
|
|
char upd_num_seen[256];
|
|
)
|
|
DCL_THREADGBL_ACCESS;
|
|
|
|
SETUP_THREADGBL_ACCESS;
|
|
assert(dollar_tlevel);
|
|
assert(0 == jnl_fence_ctl.level);
|
|
status = cdb_sc_normal;
|
|
/* if the transaction does no updates and the transaction history has not changed, we do not need any more validation */
|
|
do_validation = FALSE; /* initially set to FALSE, but set to TRUE below */
|
|
jnl_status = 0;
|
|
assert(NULL == first_tp_si_by_ftok);
|
|
first_tp_si_by_ftok = NULL; /* just in case it is not set */
|
|
prev_tp_si_by_ftok = &tmp_first_tp_si_by_ftok;
|
|
yes_jnl_no_repl = FALSE;
|
|
jnl_participants = 0; /* # of regions that had a LOGICAL journal record written for this TP */
|
|
assert(!IS_DSE_IMAGE UNIX_ONLY (&& !TREF(in_gvcst_redo_root_search))); /* DSE and gvcst_redo_root_search work in Non-TP */
|
|
for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr)
|
|
{
|
|
TP_CHANGE_REG_IF_NEEDED(tr->reg);
|
|
csa = cs_addrs;
|
|
csd = cs_data;
|
|
cnl = csa->nl;
|
|
UNIX_ONLY(
|
|
assert(!csa->hold_onto_crit || jgbl.onlnrlbk); /* In TP, hold_onto_crit is set ONLY by online rollback */
|
|
assert(!jgbl.onlnrlbk || (csa->hold_onto_crit && csa->now_crit));
|
|
)
|
|
si = (sgm_info *)(csa->sgm_info_ptr);
|
|
sgm_info_ptr = si;
|
|
*prev_tp_si_by_ftok = si;
|
|
prev_tp_si_by_ftok = &si->next_tp_si_by_ftok;
|
|
if ((cnl->wc_blocked) || /* If blocked, or.. */
|
|
((dba_mm == csd->acc_meth) && /* we have MM and.. */
|
|
(csa->total_blks != csd->trans_hist.total_blks))) /* and file has been extended */
|
|
{ /* Force repair */
|
|
status = cdb_sc_helpedout; /* special status to prevent punishing altruism */
|
|
TP_TRACE_HIST(CR_BLKEMPTY, NULL);
|
|
goto failed_skip_revert;
|
|
}
|
|
/* Note that there are three ways a deadlock can occur.
|
|
* (a) If we are not in the final retry and we already hold crit on some region.
|
|
* (b) If we are in the final retry and we don't hold crit on some region.
|
|
* (c) If we are in the final retry and we hold crit on a frozen region that we want to update.
|
|
* This is possible if:
|
|
* (1) We did a tp_grab_crit through one of the gvcst_* routines when we first encountered the region
|
|
* in the TP transaction and it wasn't locked down although it was frozen then.
|
|
* (2) tp_crit_all_regions notices that at least one of the participating regions did ONLY READs, it
|
|
* will not wait for any freeze on THAT region to complete before grabbing crit. Later, in the
|
|
* final retry, if THAT region did an update which caused op_tcommit to invoke bm_getfree ->
|
|
* gdsfilext, then we would have come here with a frozen region on which we hold crit.
|
|
* The first two cases, (a) and (b), we don't know of any way they can happen. Case (c) though can happen.
|
|
* Nevertheless, we restart for all the three and in dbg version assert so we get some information.
|
|
*
|
|
* Note that in case of an online mupip journal rollback/recover, we will hold onto crit for the entire life
|
|
* of the process so that needs to be taken into account below.
|
|
*/
|
|
update_trans = si->update_trans;
|
|
assert(!(update_trans & ~UPDTRNS_VALID_MASK));
|
|
assert((UPDTRNS_JNL_LOGICAL_MASK & update_trans) || (NULL == si->jnl_head));
|
|
assert(!(UPDTRNS_JNL_LOGICAL_MASK & update_trans) || (NULL != si->jnl_head));
|
|
assert(!tr->reg->read_only || !update_trans);
|
|
region_is_frozen = (update_trans && csd->freeze);
|
|
if ((CDB_STAGNATE > t_tries)
|
|
? (csa->now_crit && !csa->hold_onto_crit)
|
|
: (!csa->now_crit || region_is_frozen))
|
|
{
|
|
assert(!csa->hold_onto_crit);
|
|
send_msg(VARLSTCNT(8) ERR_DLCKAVOIDANCE, 6, DB_LEN_STR(tr->reg),
|
|
&csd->trans_hist.curr_tn, t_tries, dollar_trestart, csa->now_crit);
|
|
/* The only possible case we know of is (c). assert to that effect. Use local variable region_is_frozen
|
|
* instead of csd->freeze as it could be concurrently changed even though we hold crit (freeze holding
|
|
* pid can clear it in secshr_db_clnup as part of exit processing).
|
|
*/
|
|
assert((CDB_STAGNATE <= t_tries) && csa->now_crit && region_is_frozen);
|
|
status = cdb_sc_needcrit; /* break the possible deadlock by signalling a restart */
|
|
TP_TRACE_HIST(CR_BLKEMPTY, NULL);
|
|
goto failed_skip_revert;
|
|
}
|
|
/* Whenever si->first_cw_set is non-NULL, ensure that update_trans is non-zero */
|
|
assert((NULL == si->first_cw_set) || update_trans);
|
|
/* Whenever si->first_cw_set is NULL, ensure that si->update_trans is FALSE. See op_tcommit.c for exceptions */
|
|
assert((NULL != si->first_cw_set) || !si->update_trans || (UPDTRNS_ZTRIGGER_MASK & si->update_trans)
|
|
|| (gvdupsetnoop && (!JNL_ENABLED(csa) || (NULL != si->jnl_head))));
|
|
if (!update_trans)
|
|
{
|
|
/* See if we can take a fast path for read transactions based on the following conditions :
|
|
* 1. If the transaction number hasn't changed since we read the blocks from the disk or cache
|
|
* 2. If NO concurrent online rollback is running. This is needed because we don't want read transactions
|
|
* to succeed. The issue with this check is that for a rollback that was killed, the PID will be non-
|
|
* zero. In that case, we might skip the fast path and go ahead and do the validation. The validation
|
|
* logic gets crit anyways and so will salvage the lock and do the necessary recovery and issue
|
|
* DBFLCORRP if it notices that csd->file_corrupt is TRUE.
|
|
*/
|
|
if ((si->start_tn == csd->trans_hist.early_tn) UNIX_ONLY(&& (0 == cnl->onln_rlbk_pid)))
|
|
{ /* read with no change to the transaction history. ensure we haven't overrun
|
|
* our history buffer and we have reasonable values for first and last */
|
|
assert(si->last_tp_hist - si->first_tp_hist <= si->tp_hist_size);
|
|
continue;
|
|
} else
|
|
do_validation = TRUE;
|
|
} else
|
|
{
|
|
do_validation = TRUE;
|
|
is_mm = (dba_mm == cs_data->acc_meth);
|
|
/* We are still out of crit if this is not our last attempt. If so, run the region list and check
|
|
* that we have sufficient free blocks for our update. If not, get them now while we can.
|
|
* We will repeat this check later in crit but it will hopefully have little or nothing to do.
|
|
* bypass 1st check if already in crit -- check later
|
|
*/
|
|
if (!csa->now_crit && !is_mm && (cnl->wc_in_free < si->cw_set_depth + 1)
|
|
&& !wcs_get_space(gv_cur_region, si->cw_set_depth + 1, NULL))
|
|
assert(FALSE); /* wcs_get_space should have returned TRUE unconditionally in this case */
|
|
if (JNL_ENABLED(csa))
|
|
{ /* compute the total journal record size requirements before grab_crit.
|
|
* there is code later that will check for state changes from now to then
|
|
*/
|
|
TOTAL_TPJNL_REC_SIZE(total_jnl_rec_size, si, csa);
|
|
/* compute current transaction's maximum journal space needs in number of disk blocks */
|
|
si->tot_jrec_size = MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size);
|
|
GTM_WHITE_BOX_TEST(WBTEST_TP_TEND_TRANS2BIG, si->tot_jrec_size, (2 * csd->autoswitchlimit));
|
|
/* check if current TP transaction's jnl size needs are greater than max jnl file size */
|
|
if (si->tot_jrec_size > csd->autoswitchlimit)
|
|
/* can't fit in current transaction's journal records into one journal file */
|
|
rts_error(VARLSTCNT(6) ERR_JNLTRANS2BIG, 4, si->tot_jrec_size,
|
|
JNL_LEN_STR(csd), csd->autoswitchlimit);
|
|
}
|
|
if (REPL_ALLOWED(csa))
|
|
{
|
|
assert(JNL_ENABLED(csa) || REPL_WAS_ENABLED(csa));
|
|
replication = TRUE;
|
|
repl_csa = &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs;
|
|
jnl_participants++;
|
|
} else if (JNL_ENABLED(csa))
|
|
{
|
|
yes_jnl_no_repl = TRUE;
|
|
save_gv_cur_region = gv_cur_region; /* save the region for later error reporting */
|
|
jnl_participants++;
|
|
}
|
|
}
|
|
if (region_is_frozen)
|
|
{ /* Wait for it to be unfrozen before proceeding to commit. This reduces the
|
|
* chances that we find it frozen after we grab crit further down below.
|
|
*/
|
|
WAIT_FOR_REGION_TO_UNFREEZE(csa, csd);
|
|
}
|
|
} /* for (tr... ) */
|
|
*prev_tp_si_by_ftok = NULL;
|
|
if (replication || yes_jnl_no_repl)
|
|
{ /* The SET_GBL_JREC_TIME done below should be done before any journal writing activity
|
|
* on ANY region's journal file. This is because all the jnl record writing routines assume
|
|
* jgbl.gbl_jrec_time is initialized appropriately.
|
|
*/
|
|
assert(!jgbl.forw_phase_recovery || jgbl.dont_reset_gbl_jrec_time);
|
|
if (!jgbl.dont_reset_gbl_jrec_time)
|
|
SET_GBL_JREC_TIME; /* initializes jgbl.gbl_jrec_time */
|
|
assert(jgbl.gbl_jrec_time);
|
|
/* If any one DB that we are updating has replication turned on and another has only journaling, issue error */
|
|
if (replication && yes_jnl_no_repl)
|
|
rts_error(VARLSTCNT(4) ERR_REPLOFFJNLON, 2, DB_LEN_STR(save_gv_cur_region));
|
|
}
|
|
if (!do_validation)
|
|
{
|
|
if ((CDB_STAGNATE <= t_tries) UNIX_ONLY(&& !jgbl.onlnrlbk))
|
|
{
|
|
for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr)
|
|
{
|
|
# ifdef DEBUG
|
|
csa = &FILE_INFO(tr->reg)->s_addrs;
|
|
assert(!csa->hold_onto_crit);
|
|
# endif
|
|
rel_crit(tr->reg);
|
|
}
|
|
} /* else we are online rollback and we already hold crit on all regions */
|
|
/* Must be done after REVERT since we are no longer in crit */
|
|
if (unhandled_stale_timer_pop)
|
|
process_deferred_stale();
|
|
for (si = first_sgm_info; (NULL != si); si = si->next_sgm_info)
|
|
{
|
|
csa = si->tp_csa;
|
|
cnl = csa->nl;
|
|
INCR_GVSTATS_COUNTER(csa, cnl, n_tp_blkread, si->num_of_blks);
|
|
INCR_GVSTATS_COUNTER(csa, cnl, n_tp_readonly, 1);
|
|
}
|
|
return TRUE;
|
|
}
|
|
/* Because secshr_db_clnup uses first_tp_si_by_ftok to determine if a TP transaction is underway and expects
|
|
* a well-formed linked list if it is non-zero, the following assignment to the head of the region list must occur
|
|
* after the loop above
|
|
*/
|
|
first_tp_si_by_ftok = tmp_first_tp_si_by_ftok;
|
|
DEBUG_ONLY(
|
|
/* Cross-check the validity of the ftok sorted sgm_info list with "tp_reg_list" */
|
|
tr = tp_reg_list;
|
|
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
|
|
{
|
|
tmpsi = (sgm_info *)(FILE_INFO(tr->reg)->s_addrs.sgm_info_ptr);
|
|
assert(tmpsi == si);
|
|
tr = tr->fPtr;
|
|
}
|
|
assert(NULL == tr);
|
|
)
|
|
assert(cdb_sc_normal == status);
|
|
/* The following section of code (initial part of the for loop) is similar to the function "tp_crit_all_regions".
|
|
* The duplication is there only because of performance reasons. The latter function has to go through tp_reg_list
|
|
* linked list while here we can go through first_tp_si_by_ftok list which offers a performance advantage.
|
|
*
|
|
* The following section grabs crit in all regions touched by the transaction. We use a different
|
|
* structure here for grabbing crit. The tp_reg_list region list contains all the regions that
|
|
* were touched by this transaction. Since this array is sorted by the ftok value of the database
|
|
* file being operated on, the obtains will always occur in a consistent manner. Therefore, we
|
|
* will grab crit on each file with wait since deadlock should not be able to occur.
|
|
*/
|
|
ESTABLISH_RET(t_ch, FALSE);
|
|
for (lcnt = 0; ; lcnt++)
|
|
{
|
|
x_lock = TRUE; /* Assume success */
|
|
DEBUG_ONLY(tmp_jnl_participants = 0;)
|
|
/* The following loop grabs crit, does validations and prepares for commit on ALL participating regions */
|
|
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
|
|
{
|
|
sgm_info_ptr = si;
|
|
TP_TEND_CHANGE_REG(si);
|
|
csa = cs_addrs;
|
|
cnl = csa->nl;
|
|
csd = cs_data;
|
|
assert(!si->cr_array_index);
|
|
DEBUG_ONLY(
|
|
/* Track retries in debug mode */
|
|
if (0 != lcnt)
|
|
{
|
|
BG_TRACE_ANY(csa, tp_crit_retries);
|
|
}
|
|
)
|
|
update_trans = si->update_trans;
|
|
assert(!(update_trans & ~UPDTRNS_VALID_MASK));
|
|
first_cw_set = si->first_cw_set;
|
|
/* whenever si->first_cw_set is non-NULL, ensure that si->update_trans is non-zero */
|
|
assert((NULL == first_cw_set) || update_trans);
|
|
/* When si->first_cw_set is NULL, ensure that si->update_trans is FALSE. See op_tcommit.c for exceptions */
|
|
assert((NULL != si->first_cw_set) || !si->update_trans || (UPDTRNS_ZTRIGGER_MASK & si->update_trans)
|
|
|| (gvdupsetnoop && (!JNL_ENABLED(csa) || (NULL != si->jnl_head))));
|
|
leafmods = indexmods = 0;
|
|
is_mm = (dba_mm == csd->acc_meth);
|
|
if (TREF(tprestart_syslog_delta))
|
|
n_blkmods = n_pvtmods = 0;
|
|
/* If we already hold crit (possible if we are in the final retry), do not invoke grab_crit as it will
|
|
* invoke wcs_recover unconditionally if cnl->wc_blocked is set to TRUE. In that case, we want to
|
|
* restart with a helped out code because the cache recovery will most likely result in a restart of
|
|
* the current transaction which we want to avoid if we are in the final retry.
|
|
*/
|
|
if (!csa->now_crit)
|
|
grab_crit(gv_cur_region);
|
|
else if (cnl->wc_blocked)
|
|
{
|
|
status = cdb_sc_helpedout;
|
|
goto failed;
|
|
}
|
|
if (is_mm && ((csa->hdr != csd) || (csa->total_blks != csd->trans_hist.total_blks)))
|
|
{ /* If MM, check if wcs_mm_recover was invoked as part of the grab_crit done above OR if
|
|
* the file has been extended. If so, restart.
|
|
*/
|
|
status = cdb_sc_helpedout; /* force retry with special status so philanthropy isn't punished */
|
|
goto failed;
|
|
}
|
|
/* Note that even though we ensured that regions are not frozen outside of crit, it is still possible
|
|
* that they become frozen just before we grab crit. In this case (should be rare though) release
|
|
* crit on ALL regions that we have grabbed up until this point and wait for the freeze to be removed.
|
|
*/
|
|
if (csd->freeze && update_trans)
|
|
{
|
|
x_lock = FALSE;
|
|
break;
|
|
}
|
|
CHECK_TN(csa, csd, csd->trans_hist.curr_tn); /* can issue rts_error TNTOOLARGE */
|
|
if (!is_mm)
|
|
oldest_hist_tn = OLDEST_HIST_TN(csa);
|
|
# ifdef UNIX
|
|
/* We never expect to come here with file_corrupt set to TRUE (in case of an online rollback) because
|
|
* grab_crit done above will make sure of that. The only exception is RECOVER/ROLLBACK itself coming
|
|
* here in the forward phase
|
|
*/
|
|
assert(!csd->file_corrupt || mupip_jnl_recover);
|
|
if (MISMATCH_ROOT_CYCLES(csa, cnl))
|
|
{
|
|
status = cdb_sc_gvtrootmod2;
|
|
if (MISMATCH_ONLN_RLBK_CYCLES(csa, cnl))
|
|
{
|
|
assert(!mupip_jnl_recover);
|
|
status = ONLN_RLBK_STATUS(csa, cnl);
|
|
SYNC_ONLN_RLBK_CYCLES;
|
|
SYNC_ROOT_CYCLES(NULL);
|
|
} else
|
|
SYNC_ROOT_CYCLES(csa);
|
|
goto failed;
|
|
}
|
|
# endif
|
|
# ifdef GTM_TRIGGER
|
|
if (!skip_dbtriggers && si->tp_set_sgm_done && (csa->db_trigger_cycle != csd->db_trigger_cycle))
|
|
{ /* The process' view of the triggers could be potentially stale. restart to be safe.
|
|
* Note: We need to validate triggers ONLY if the region (pointed to by si) was actually referenced
|
|
* in this retry of the transaction. Hence the si->tp_set_sgm_done check.
|
|
*/
|
|
/* Triggers can be invoked only by GT.M and Update process. Out of these, we expect only
|
|
* GT.M to see restarts due to concurrent trigger changes. Update process is the only
|
|
* updater on the secondary so we dont expect it to see any concurrent trigger changes.
|
|
* The only exception is if this is a supplementary root primary instance. In that case,
|
|
* the update process coexists with GT.M processes and hence can see restarts due to
|
|
* concurrent trigger changes. Assert accordingly.
|
|
*/
|
|
assert(CDB_STAGNATE > t_tries);
|
|
assert(!is_updproc || (jnlpool.repl_inst_filehdr->is_supplementary
|
|
&& !jnlpool.jnlpool_ctl->upd_disabled));
|
|
assert(csd->db_trigger_cycle > csa->db_trigger_cycle);
|
|
/* csa->db_trigger_cycle will be set to csd->db_trigger_cycle for all participating
|
|
* regions when they are each first referenced in the next retry (in tp_set_sgm).
|
|
*/
|
|
status = cdb_sc_triggermod;
|
|
goto failed;
|
|
}
|
|
# endif
|
|
if (update_trans)
|
|
{
|
|
assert((NULL == first_cw_set) || (0 != si->cw_set_depth));
|
|
DEBUG_ONLY(
|
|
/* Recompute # of replicated regions inside of crit */
|
|
if (REPL_ALLOWED(csa))
|
|
{
|
|
tmp_jnl_participants++;
|
|
} else if (JNL_ENABLED(csa))
|
|
{
|
|
assert(!replication); /* should have issued a REPLOFFJNLON error outside of crit */
|
|
tmp_jnl_participants++;
|
|
}
|
|
)
|
|
if (JNL_ALLOWED(csa))
|
|
{
|
|
if ((csa->jnl_state != csd->jnl_state) || (csa->jnl_before_image != csd->jnl_before_image))
|
|
{ /* Take this opportunity to check/sync ALL regions where csa/csd dont match */
|
|
for (tmpsi = first_tp_si_by_ftok;
|
|
(NULL != tmpsi);
|
|
tmpsi = tmpsi->next_tp_si_by_ftok)
|
|
{
|
|
csa = tmpsi->tp_csa;
|
|
csd = csa->hdr;
|
|
cnl = csa->nl;
|
|
csa->jnl_state = csd->jnl_state;
|
|
csa->jnl_before_image = csd->jnl_before_image;
|
|
/* jnl_file_lost causes a jnl_state transition from jnl_open to jnl_closed
|
|
* and additionally causes a repl_state transition from repl_open to
|
|
* repl_closed all without standalone access. This means that
|
|
* csa->repl_state might be repl_open while csd->repl_state might be
|
|
* repl_closed. update csa->repl_state in this case as otherwise the rest
|
|
* of the code might look at csa->repl_state and incorrectly conclude
|
|
* replication is on and generate sequence numbers when actually no journal
|
|
* records are being generated. [C9D01-002219]
|
|
*/
|
|
csa->repl_state = csd->repl_state;
|
|
}
|
|
status = cdb_sc_jnlstatemod;
|
|
goto failed;
|
|
}
|
|
}
|
|
/* Flag retry, if other mupip activities like BACKUP, INTEG or FREEZE are in progress.
|
|
* If in final retry, go ahead with kill. BACKUP/INTEG/FREEZE will wait for us to be done.
|
|
*/
|
|
if ((NULL != si->kill_set_head) && (0 < cnl->inhibit_kills) && (CDB_STAGNATE > t_tries))
|
|
{
|
|
status = cdb_sc_inhibitkills;
|
|
goto failed;
|
|
}
|
|
/* Caution : since csa->backup_in_prog was initialized in op_tcommit only if si->first_cw_set was
|
|
* non-NULL, it should be used in tp_tend only within an if (NULL != si->first_cw_set)
|
|
*/
|
|
if (NULL != first_cw_set)
|
|
{
|
|
ss_need_to_restart = new_bkup_started = FALSE;
|
|
GTM_SNAPSHOT_ONLY(
|
|
CHK_AND_UPDATE_SNAPSHOT_STATE_IF_NEEDED(csa, cnl, ss_need_to_restart);
|
|
)
|
|
CHK_AND_UPDATE_BKUP_STATE_IF_NEEDED(cnl, csa, new_bkup_started);
|
|
if (ss_need_to_restart
|
|
|| (new_bkup_started && !(JNL_ENABLED(csa) && csa->jnl_before_image)))
|
|
{
|
|
/* If online backup is in progress now and before-image journaling is
|
|
* not enabled, we would not have read before-images for created blocks.
|
|
* Although it is possible that this transaction might not have blocks
|
|
* with gds_t_create at all, we expect this backup_in_prog state change
|
|
* to be so rare that it is ok to restart.
|
|
*/
|
|
status = cdb_sc_bkupss_statemod;
|
|
goto failed;
|
|
}
|
|
}
|
|
/* recalculate based on the new values of snapshot_in_prog and backup_in_prog */
|
|
read_before_image = ((JNL_ENABLED(csa) && csa->jnl_before_image)
|
|
|| csa->backup_in_prog
|
|
|| SNAPSHOTS_IN_PROG(csa));
|
|
if (!is_mm)
|
|
{ /* in crit, ensure cache-space is available.
|
|
* the out-of-crit check done above might not be enough
|
|
*/
|
|
if (cnl->wc_in_free < si->cw_set_depth + 1)
|
|
{
|
|
if (!wcs_get_space(gv_cur_region, si->cw_set_depth + 1, NULL))
|
|
{
|
|
assert(cnl->wc_blocked); /* only reason we currently know
|
|
* why wcs_get_space could fail */
|
|
assert(gtm_white_box_test_case_enabled);
|
|
SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
|
|
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_wcsgetspace);
|
|
SET_CACHE_FAIL_STATUS(status, csd);
|
|
TP_TRACE_HIST(CR_BLKEMPTY, NULL);
|
|
goto failed;
|
|
}
|
|
}
|
|
VMS_ONLY(
|
|
if (csd->clustered && !CCP_SEGMENT_STATE(cnl, CCST_MASK_HAVE_DIRTY_BUFFERS))
|
|
{
|
|
CCP_FID_MSG(gv_cur_region, CCTR_FLUSHLK);
|
|
ccp_userwait(gv_cur_region, CCST_MASK_HAVE_DIRTY_BUFFERS,
|
|
NULL, cnl->ccp_cycle);
|
|
}
|
|
)
|
|
}
|
|
if (JNL_ENABLED(csa))
|
|
{ /* Since we got the system time (jgbl.gbl_jrec_time) outside of crit, it is possible that
|
|
* journal records were written concurrently to this file with a timestamp that is future
|
|
* relative to what we recorded. In that case, adjust our recorded time to match this.
|
|
* This is necessary to ensure that timestamps of successive journal records for each
|
|
* database file are in non-decreasing order. A side-effect of this is that our recorded
|
|
* time might not accurately reflect the current system time but that is considered not
|
|
* an issue since we dont expect to be off by more than a second or two if at all.
|
|
* Another side effect is that even if the system time went back, we will never write
|
|
* out-of-order timestamped journal records in the lifetime of this database shared memory.
|
|
*/
|
|
jpc = csa->jnl;
|
|
jbp = jpc->jnl_buff;
|
|
/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
|
|
* of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
|
|
* journal records (if it decides to switch to a new journal file).
|
|
*/
|
|
ADJUST_GBL_JREC_TIME(jgbl, jbp);
|
|
/* Note that jnl_ensure_open can call cre_jnl_file which in turn assumes
|
|
* jgbl.gbl_jrec_time is set. Also jnl_file_extend can call jnl_write_epoch_rec
|
|
* which in turn assumes jgbl.gbl_jrec_time is set. In case of forw-phase-recovery,
|
|
* mur_output_record would have already set this.
|
|
*/
|
|
assert(jgbl.gbl_jrec_time);
|
|
jnl_status = jnl_ensure_open();
|
|
GTM_WHITE_BOX_TEST(WBTEST_TP_TEND_JNLFILOPN, jnl_status, ERR_JNLFILOPN);
|
|
if (jnl_status != 0)
|
|
{
|
|
ctn = csd->trans_hist.curr_tn;
|
|
assert(csd->trans_hist.early_tn == ctn);
|
|
if (SS_NORMAL != jpc->status)
|
|
rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd),
|
|
DB_LEN_STR(gv_cur_region), jpc->status);
|
|
else
|
|
rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd),
|
|
DB_LEN_STR(gv_cur_region));
|
|
}
|
|
if (DISK_BLOCKS_SUM(jbp->freeaddr, si->total_jnl_rec_size) > jbp->filesize)
|
|
{ /* Moved here to prevent jnlrecs split across multiple generation journal files. */
|
|
if (SS_NORMAL != (jnl_status = jnl_flush(jpc->region)))
|
|
{
|
|
send_msg(VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd),
|
|
ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush in tp_tend"),
|
|
jnl_status);
|
|
assert((!JNL_ENABLED(csd)) && JNL_ENABLED(csa));
|
|
status = cdb_sc_jnlclose;
|
|
TP_TRACE_HIST(CR_BLKEMPTY, NULL);
|
|
goto failed;
|
|
} else if (EXIT_ERR == jnl_file_extend(jpc, si->total_jnl_rec_size))
|
|
{
|
|
assert((!JNL_ENABLED(csd)) && JNL_ENABLED(csa));
|
|
assert(csd == csa->hdr); /* If MM, csd shouldn't have been reset */
|
|
status = cdb_sc_jnlclose;
|
|
TP_TRACE_HIST(CR_BLKEMPTY, NULL);
|
|
goto failed;
|
|
}
|
|
assert(csd == csa->hdr); /* If MM, csd shouldn't have been reset */
|
|
}
|
|
if (JNL_HAS_EPOCH(jbp)
|
|
&& ((jbp->next_epoch_time <= jgbl.gbl_jrec_time) UNCONDITIONAL_EPOCH_ONLY(|| TRUE)))
|
|
{ /* Flush the cache. Since we are in crit, defer syncing the epoch */
|
|
/* Note that at this point, jgbl.gbl_jrec_time has been computed taking into
|
|
* account the current system time & the last journal record timestamp of ALL
|
|
* regions involved in this TP transaction. To prevent wcs_flu from inadvertently
|
|
* setting this BACK in time (poses out-of-order timestamp issues for backward
|
|
* recovery and is asserted later in tp_tend) set jgbl.dont_reset_gbl_jrec_time
|
|
* to TRUE for the duration of the wcs_flu.
|
|
* Also, in case of rts_error from wcs_flu, t_ch will be invoked which will take
|
|
* care of restoring this variable to FALSE. Any new codepath in mumps that sets
|
|
* this variable for the duration of wcs_flu should take care of resetting this
|
|
* back to FALSE in an existing condition handler (or by creating a new one if not
|
|
* already present)
|
|
* Since, this global is set to TRUE explicitly by forward recovery, we should NOT
|
|
* reset this to FALSE unconditionally. But, instead of checking if forward recovery
|
|
* is TRUE, save and restore this variable unconditionally thereby saving a few
|
|
* CPU cycles.
|
|
*/
|
|
save_dont_reset_gbl_jrec_time = jgbl.dont_reset_gbl_jrec_time;
|
|
jgbl.dont_reset_gbl_jrec_time = TRUE;
|
|
if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_IN_COMMIT
|
|
| WCSFLU_SPEEDUP_NOBEFORE))
|
|
{
|
|
assert(csd == csa->hdr);
|
|
jgbl.dont_reset_gbl_jrec_time = save_dont_reset_gbl_jrec_time;
|
|
SET_WCS_FLU_FAIL_STATUS(status, csd);
|
|
SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
|
|
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_jnl_wcsflu);
|
|
TP_TRACE_HIST(CR_BLKEMPTY, NULL);
|
|
goto failed;
|
|
}
|
|
jgbl.dont_reset_gbl_jrec_time = save_dont_reset_gbl_jrec_time;
|
|
assert(csd == csa->hdr);
|
|
}
|
|
assert(jgbl.gbl_jrec_time >= jbp->prev_jrec_time);
|
|
} /* if (journaling) */
|
|
}
|
|
/* the following section verifies that the optimistic concurrency was justified */
|
|
assert(cdb_sc_normal == status);
|
|
for (t1 = si->first_tp_hist; t1 != si->last_tp_hist; t1++)
|
|
{
|
|
assert(NULL != t1->blk_target);
|
|
cse = t1->cse;
|
|
if (is_mm)
|
|
{ /* the check below is different from the one for BG (i.e. doesn't have the killtn check)
|
|
* because there is no BT equivalent in MM. there is a mmblk_rec which is more or
|
|
* less the same as a BT. when the mmblk_rec becomes as fully functional as BT, we
|
|
* can use the killtn optimization for MM also.
|
|
*/
|
|
if (t1->tn <= ((blk_hdr_ptr_t)t1->buffaddr)->tn)
|
|
{
|
|
assert(CDB_STAGNATE > t_tries);
|
|
assert(!cse || !cse->high_tlevel);
|
|
if (!cse || !cse->recompute_list_head || cse->write_type
|
|
|| (cdb_sc_normal != recompute_upd_array(t1, cse)) || !++leafmods)
|
|
{
|
|
status = cdb_sc_blkmod;
|
|
TP_TRACE_HIST(t1->blk_num, t1->blk_target);
|
|
DEBUG_ONLY(continue;)
|
|
PRO_ONLY(goto failed;)
|
|
}
|
|
}
|
|
} else
|
|
{
|
|
bt = bt_get(t1->blk_num);
|
|
if (NULL != bt)
|
|
{
|
|
if (t1->tn <= bt->tn)
|
|
{
|
|
assert(!cse || !cse->high_tlevel);
|
|
assert(CDB_STAGNATE > t_tries);
|
|
/* "indexmods" and "leafmods" are to monitor number of blocks that used
|
|
* indexmod and noisolation optimizations respectively. Note that once
|
|
* in this part of the code, at least one of them will be non-zero and
|
|
* if both of them turn out to be non-zero, then we need to restart.
|
|
* See gdscc.h for a description of the indexmod optimization.
|
|
*/
|
|
if (t1->level)
|
|
{
|
|
if (cse || t1->tn <= bt->killtn)
|
|
status = cdb_sc_blkmod;
|
|
else
|
|
{
|
|
indexmods++;
|
|
t1->blk_target->clue.end = 0;
|
|
if (leafmods)
|
|
status = cdb_sc_blkmod;
|
|
}
|
|
} else
|
|
{ /* For a non-isolated global, if the leaf block isn't part of the
|
|
* cw-set, this means that it was involved in an M-kill that freed
|
|
* the data-block from the B-tree. In this case, if the leaf-block
|
|
* has changed since we did our read of the block, we have to redo
|
|
* the M-kill. But since redo of that M-kill might involve much
|
|
* more than just leaf-level block changes, we will be safe and do
|
|
* a restart. If the need for NOISOLATION optimization for M-kills
|
|
* is felt, we need to revisit this.
|
|
*/
|
|
if (!t1->blk_target->noisolation || !cse)
|
|
status = cdb_sc_blkmod;
|
|
else
|
|
{
|
|
assert(cse->write_type || cse->recompute_list_head);
|
|
leafmods++;
|
|
if (indexmods || cse->write_type
|
|
|| (cdb_sc_normal !=
|
|
recompute_upd_array(t1, cse)))
|
|
status = cdb_sc_blkmod;
|
|
}
|
|
}
|
|
if (cdb_sc_normal != status)
|
|
{
|
|
if (TREF(tprestart_syslog_delta))
|
|
{
|
|
n_blkmods++;
|
|
if (cse)
|
|
n_pvtmods++;
|
|
if (1 != n_blkmods)
|
|
continue;
|
|
}
|
|
assert(CDB_STAGNATE > t_tries);
|
|
status = cdb_sc_blkmod;
|
|
TP_TRACE_HIST_MOD(t1->blk_num, t1->blk_target,
|
|
tp_blkmod_tp_tend, csd, t1->tn, bt->tn, t1->level);
|
|
DEBUG_ONLY(continue;)
|
|
PRO_ONLY(goto failed;)
|
|
}
|
|
}
|
|
} else if (t1->tn <= oldest_hist_tn)
|
|
{
|
|
assert(CDB_STAGNATE > t_tries);
|
|
status = cdb_sc_losthist;
|
|
TP_TRACE_HIST(t1->blk_num, t1->blk_target);
|
|
DEBUG_ONLY(continue;)
|
|
PRO_ONLY(goto failed;)
|
|
}
|
|
assert(CYCLE_PVT_COPY != t1->cycle);
|
|
if (cse)
|
|
{ /* Do cycle check only if blk has cse and hasn't been built (if it has, then tp_hist
|
|
* would have done the cdb_sc_lostcr check soon after it got built) or if we have BI
|
|
* journaling or online backup is currently running. The BI-journaling/online-backup
|
|
* check is to ensure that the before-image/pre-update-copy we write hasn't been
|
|
* recycled.
|
|
*/
|
|
if ((NULL == bt) || (CR_NOTVALID == bt->cache_index))
|
|
cr = db_csh_get(t1->blk_num);
|
|
else
|
|
{
|
|
cr = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index);
|
|
if ((NULL != cr) && (cr->blk != bt->blk))
|
|
{
|
|
assert(FALSE);
|
|
SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
|
|
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_crbtmismatch1);
|
|
status = cdb_sc_crbtmismatch;
|
|
TP_TRACE_HIST(t1->blk_num, t1->blk_target);
|
|
goto failed;
|
|
}
|
|
}
|
|
if ((cache_rec_ptr_t)CR_NOTVALID == cr)
|
|
{
|
|
SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
|
|
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_t1);
|
|
SET_CACHE_FAIL_STATUS(status, csd);
|
|
TP_TRACE_HIST(t1->blk_num, t1->blk_target);
|
|
goto failed;
|
|
}
|
|
assert(update_trans); /* ensure read_before_image was computed above */
|
|
if (!cse->new_buff || read_before_image)
|
|
{
|
|
if ((NULL == cr) || (cr->cycle != t1->cycle)
|
|
|| ((sm_long_t)GDS_ANY_REL2ABS(csa, cr->buffaddr)
|
|
!= (sm_long_t)t1->buffaddr))
|
|
{
|
|
if ((NULL != cr) && (NULL != bt) && (cr->blk != bt->blk))
|
|
{
|
|
assert(FALSE);
|
|
SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
|
|
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_crbtmismatch2);
|
|
status = cdb_sc_crbtmismatch;
|
|
TP_TRACE_HIST(t1->blk_num, t1->blk_target);
|
|
goto failed;
|
|
}
|
|
assert(CDB_STAGNATE > t_tries);
|
|
status = cdb_sc_lostcr;
|
|
TP_TRACE_HIST(t1->blk_num, t1->blk_target);
|
|
DEBUG_ONLY(continue;)
|
|
PRO_ONLY(goto failed;)
|
|
}
|
|
/* Now that we know the cache-record is still valid, pin it.
|
|
*
|
|
* It is possible that t1->cse is non-NULL even though we eventually
|
|
* decided NOT to update that particular block e.g. if t1->cse->mode
|
|
* was originally t_write but later got set to kill_t_write. In such
|
|
* cases, don't set in_cw_set as we don't need this buffer pinned at all.
|
|
*/
|
|
assert(n_gds_t_op != cse->mode);
|
|
assert(kill_t_create > n_gds_t_op);
|
|
assert(kill_t_write > n_gds_t_op);
|
|
if (n_gds_t_op > cse->mode)
|
|
TP_PIN_CACHE_RECORD(cr, si);
|
|
}
|
|
/* The only case cr can be NULL at this point of code is when
|
|
* a) cse->new_buff is non-NULL
|
|
* b) AND the block is not in cache
|
|
* c) AND we don't have before-image-journaling
|
|
* d) AND online backup is not running.
|
|
* In this case bg_update will do a db_csh_getn and appropriately set in_cw_set
|
|
* field to be TRUE so no need to pin the cache-record here.
|
|
*/
|
|
}
|
|
}
|
|
} /* for (t1 ... ) */
|
|
DEBUG_ONLY(
|
|
if (cdb_sc_normal != status)
|
|
goto failed;
|
|
else
|
|
{ /* Now that we have successfully validated all histories, check that there is no
|
|
* gv_target mismatch between history and corresponding cse.
|
|
*/
|
|
for (t1 = si->first_tp_hist; t1 != si->last_tp_hist; t1++)
|
|
{
|
|
cse = t1->cse;
|
|
if (NULL != cse)
|
|
assert(t1->blk_target == cse->blk_target);
|
|
}
|
|
}
|
|
)
|
|
if (DIVIDE_ROUND_UP(si->num_of_blks, 4) < leafmods) /* if status == cdb_sc_normal, then leafmods */
|
|
{
|
|
status = cdb_sc_toomanyrecompute; /* is exactly the number of recomputed blocks */
|
|
goto failed;
|
|
}
|
|
assert(cdb_sc_normal == status);
|
|
if (NULL == first_cw_set)
|
|
continue;
|
|
/* Check bit maps for usage */
|
|
for (cse = si->first_cw_bitmap; NULL != cse; cse = cse->next_cw_set)
|
|
{
|
|
assert(0 == cse->jnl_freeaddr); /* ensure haven't missed out resetting jnl_freeaddr for any cse in
|
|
* t_write/t_create/{t,mu}_write_map/t_write_root [D9B11-001991] */
|
|
TRAVERSE_TO_LATEST_CSE(cse);
|
|
assert(0 == ((off_chain *)&cse->blk)->flag);
|
|
assert(!cse->high_tlevel);
|
|
if (is_mm)
|
|
{
|
|
if ((cse->tn <= ((blk_hdr_ptr_t)cse->old_block)->tn) && !reallocate_bitmap(si, cse))
|
|
{
|
|
assert(CDB_STAGNATE > t_tries);
|
|
status = cdb_sc_bmlmod;
|
|
TP_TRACE_HIST(cse->blk, NULL);
|
|
goto failed;
|
|
}
|
|
} else
|
|
{
|
|
tp_blk = cse->blk;
|
|
bt = bt_get(tp_blk);
|
|
if (NULL != bt)
|
|
{
|
|
if ((cse->tn <= bt->tn) && !reallocate_bitmap(si, cse))
|
|
{
|
|
assert(CDB_STAGNATE > t_tries);
|
|
status = cdb_sc_bmlmod;
|
|
TP_TRACE_HIST(tp_blk, NULL);
|
|
goto failed;
|
|
}
|
|
} else if (cse->tn <= oldest_hist_tn)
|
|
{
|
|
assert(CDB_STAGNATE > t_tries);
|
|
status = cdb_sc_lostbmlhist;
|
|
TP_TRACE_HIST(tp_blk, NULL);
|
|
goto failed;
|
|
}
|
|
assert(NULL == cse->new_buff);
|
|
if ((NULL == bt) || (CR_NOTVALID == bt->cache_index))
|
|
{
|
|
cr = db_csh_get(tp_blk);
|
|
if ((cache_rec_ptr_t)CR_NOTVALID == cr)
|
|
{
|
|
SET_CACHE_FAIL_STATUS(status, csd);
|
|
TP_TRACE_HIST(tp_blk, NULL);
|
|
SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
|
|
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_bitmap);
|
|
goto failed;
|
|
}
|
|
} else
|
|
{
|
|
cr = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index);
|
|
if (cr->blk != bt->blk)
|
|
{
|
|
assert(FALSE);
|
|
SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
|
|
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_crbtmismatch3);
|
|
status = cdb_sc_crbtmismatch;
|
|
TP_TRACE_HIST(tp_blk, NULL);
|
|
goto failed;
|
|
}
|
|
}
|
|
if ((NULL == cr) || (cr->cycle != cse->cycle) ||
|
|
((sm_long_t)GDS_ANY_REL2ABS(csa, cr->buffaddr) != (sm_long_t)cse->old_block))
|
|
{
|
|
assert(CDB_STAGNATE > t_tries);
|
|
status = cdb_sc_lostbmlcr;
|
|
TP_TRACE_HIST(tp_blk, NULL);
|
|
goto failed;
|
|
}
|
|
TP_PIN_CACHE_RECORD(cr, si);
|
|
}
|
|
} /* for (all bitmaps written) */
|
|
si->backup_block_saved = FALSE;
|
|
jbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
|
|
/* Caution : since csa->backup_in_prog was initialized in op_tcommit only if si->first_cw_set was
|
|
* non-NULL, it should be used in tp_tend only within an if (NULL != si->first_cw_set)
|
|
*/
|
|
if (!is_mm && ((NULL != jbp) || csa->backup_in_prog || SNAPSHOTS_IN_PROG(csa)))
|
|
{
|
|
for (cse = first_cw_set; NULL != cse; cse = cse->next_cw_set)
|
|
{ /* have already read old block for creates before we got crit, make sure
|
|
* cache record still has correct block. if not, reset "cse" fields to
|
|
* point to correct cache-record. this is ok to do since we only need the
|
|
* prior content of the block (for online backup or before-image journaling)
|
|
* and did not rely on it for constructing the transaction. Restart if
|
|
* block is not present in cache now or is being read in currently.
|
|
*/
|
|
TRAVERSE_TO_LATEST_CSE(cse);
|
|
if (gds_t_acquired == cse->mode && (NULL != cse->old_block))
|
|
{
|
|
assert(CYCLE_PVT_COPY != cse->cycle);
|
|
cr = db_csh_get(cse->blk);
|
|
if ((cache_rec_ptr_t)CR_NOTVALID == cr)
|
|
{
|
|
TP_TRACE_HIST(cse->blk, cse->blk_target);
|
|
SET_CACHE_FAIL_STATUS(status, csd);
|
|
SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
|
|
BG_TRACE_PRO_ANY(csa, wc_blocked_tp_tend_jnl_cwset);
|
|
goto failed;
|
|
}
|
|
/* It is possible that cr->in_cw_set is non-zero in case a concurrent MUPIP REORG
|
|
* UPGRADE/DOWNGRADE is in PHASE2 touching this very same block. In that case,
|
|
* we cannot reuse this block so we restart. We could try finding a different block
|
|
* to acquire instead and avoid a restart (tracked as part of C9E11-002651).
|
|
* Note that in_cw_set is set to 0 ahead of in_tend in bg_update_phase2. Therefore
|
|
* it is possible that we see in_cw_set 0 but in_tend is still non-zero. In that
|
|
* case, we cannot proceed with pinning this cache-record as the cr is still locked
|
|
* by the other process. We can choose to wait here but instead decide to restart.
|
|
*/
|
|
if ((NULL == cr) || (0 <= cr->read_in_progress)
|
|
|| (0 != cr->in_cw_set) || (0 != cr->in_tend))
|
|
{
|
|
TP_TRACE_HIST(cse->blk, cse->blk_target);
|
|
assert(CDB_STAGNATE > t_tries);
|
|
status = cdb_sc_lostbefor;
|
|
goto failed;
|
|
}
|
|
TP_PIN_CACHE_RECORD(cr, si);
|
|
cse->ondsk_blkver = cr->ondsk_blkver;
|
|
old_block = (blk_hdr_ptr_t)GDS_REL2ABS(cr->buffaddr);
|
|
assert((cse->cr != cr) || (cse->old_block == (sm_uc_ptr_t)old_block));
|
|
old_block_tn = old_block->tn;
|
|
/* Need checksums if before imaging and if a PBLK record is going to be written. */
|
|
cksum_needed = (!WAS_FREE(cse) && (NULL != jbp) && (old_block_tn < jbp->epoch_tn));
|
|
if ((cse->cr != cr) || (cse->cycle != cr->cycle))
|
|
{ /* Block has relocated in the cache. Adjust pointers to new location. */
|
|
cse->cr = cr;
|
|
cse->cycle = cr->cycle;
|
|
cse->old_block = (sm_uc_ptr_t)old_block;
|
|
/* PBLK checksum was computed outside-of-crit when block was read but
|
|
* block has relocated in the cache since then so recompute the checksum
|
|
* if this block needs a checksum in the first place (cksum_needed is TRUE).
|
|
*/
|
|
recompute_cksum = cksum_needed;
|
|
} else if (cksum_needed)
|
|
{ /* We have determined that a checksum is needed for this block. If we
|
|
* have not previously computed one outside crit OR if the block contents
|
|
* have changed since the checksum was previously computed, we need to
|
|
* recompute it. Otherwise, the out-of-crit computed value can be safely
|
|
* used. Note that cse->tn is valid only if a checksum was computed outside
|
|
* of crit. So make sure it is used only if checksum is non-zero. There is
|
|
* a rare chance that the computed checksum could be zero in which case we
|
|
* will recompute unnecessarily. Since that is expected to be very rare,
|
|
* it is considered ok for now.
|
|
*/
|
|
recompute_cksum = (!cse->blk_checksum || (cse->tn <= old_block_tn));
|
|
}
|
|
if (!cksum_needed)
|
|
cse->blk_checksum = 0; /* zero any out-of-crit computed checksum */
|
|
else if (recompute_cksum)
|
|
{ /* We hold crit at this point so we are guaranteed valid bsiz field.
|
|
* Hence we do not need to verify if bsiz is lesser than csd->blk_size
|
|
* like we did in an earlier call to jnl_get_checksum (in op_tcommit.c).
|
|
*/
|
|
assert(NULL != jbp);
|
|
assert(SIZEOF(bsiz) == SIZEOF(old_block->bsiz));
|
|
bsiz = old_block->bsiz;
|
|
assert(bsiz <= csd->blk_size);
|
|
cse->blk_checksum = jnl_get_checksum((uint4*)old_block, csa, bsiz);
|
|
}
|
|
DEBUG_ONLY(
|
|
else
|
|
assert(cse->blk_checksum ==
|
|
jnl_get_checksum((uint4 *)old_block, csa, old_block->bsiz));
|
|
)
|
|
assert(cse->cr->blk == cse->blk);
|
|
} /* end if acquired block */
|
|
} /* end cse for loop */
|
|
} /* end if !mm && before-images need to be written */
|
|
# ifdef VMS
|
|
/* Check if this TP transaction created any cses of mode kill_t_write or kill_t_create. If so, ensure
|
|
* the corresponding cse->new_buff has already been initialized. This is necessary in case an error
|
|
* occurs in the midst of commit and we end up invoking secshr_db_clnup which in turn will invoke
|
|
* sec_shr_blk_build for these cses and that will expect that cse->new_buff be non-zero. For Unix
|
|
* we invoke get_new_free_element to initialize cse->new_buff in secshr_db_clnup itself. But for VMS
|
|
* we dont want this routine invoked in the privileged GTMSECSHR image so we do this beforehand.
|
|
*/
|
|
if (tp_has_kill_t_cse)
|
|
{
|
|
for (cse = first_cw_set; NULL != cse; cse = cse->next_cw_set)
|
|
{
|
|
TRAVERSE_TO_LATEST_CSE(cse);
|
|
if ((n_gds_t_op < cse->mode) && (NULL == cse->new_buff))
|
|
{
|
|
assert(!cse->done);
|
|
cse->new_buff = get_new_free_element(si->new_buff_list);
|
|
cse->first_copy = TRUE;
|
|
}
|
|
}
|
|
}
|
|
DEBUG_ONLY(
|
|
for (cse = first_cw_set; NULL != cse; cse = cse->next_cw_set)
|
|
{
|
|
TRAVERSE_TO_LATEST_CSE(cse);
|
|
assert((n_gds_t_op > cse->mode) || (NULL != cse->new_buff));
|
|
}
|
|
)
|
|
# endif
|
|
assert(cdb_sc_normal == status);
|
|
}
|
|
if (x_lock)
|
|
break;
|
|
assert(csd == si->tp_csd);
|
|
si = si->next_tp_si_by_ftok; /* Increment so we release the lock we actually got */
|
|
si_last = si;
|
|
for (si = first_tp_si_by_ftok; (si_last != si); si = si->next_tp_si_by_ftok)
|
|
{
|
|
assert(si->tp_csa->now_crit);
|
|
tp_cr_array = si->cr_array;
|
|
UNPIN_CR_ARRAY_ON_RETRY(tp_cr_array, si->cr_array_index);
|
|
assert(!si->cr_array_index);
|
|
if (!si->tp_csa->hold_onto_crit)
|
|
rel_crit(si->gv_cur_region);
|
|
}
|
|
/* Check that we DONT own crit/commit on ANY region. The only exception is online mupip journal rollback/recovery
|
|
* which holds crit for the entire process lifetime.
|
|
*/
|
|
assert(UNIX_ONLY(jgbl.onlnrlbk || ) (0 == have_crit(CRIT_HAVE_ANY_REG | CRIT_IN_COMMIT)));
|
|
/* Wait for it to be unfrozen before re-grabbing crit on ALL regions */
|
|
WAIT_FOR_REGION_TO_UNFREEZE(csa, csd);
|
|
assert(CDB_STAGNATE > t_tries);
|
|
} /* for (;;) */
|
|
/* Validate the correctness of the calculation of # of replication/journaled regions inside & outside of crit */
|
|
assert(tmp_jnl_participants == jnl_participants);
|
|
assert(cdb_sc_normal == status);
|
|
if (replication)
|
|
{
|
|
jpl = jnlpool_ctl;
|
|
tjpl = temp_jnlpool_ctl;
|
|
if (!repl_csa->hold_onto_crit)
|
|
grab_lock(jnlpool.jnlpool_dummy_reg, ASSERT_NO_ONLINE_ROLLBACK);
|
|
# ifdef UNIX
|
|
if (jnlpool.jnlpool_ctl->freeze)
|
|
{
|
|
status = cdb_sc_instancefreeze; /* break the possible deadlock by signalling a restart */
|
|
TP_TRACE_HIST(CR_BLKEMPTY, NULL);
|
|
goto failed;
|
|
}
|
|
# endif
|
|
tjpl->write_addr = jpl->write_addr;
|
|
tjpl->write = jpl->write;
|
|
tjpl->jnl_seqno = jpl->jnl_seqno;
|
|
# ifdef UNIX
|
|
if (INVALID_SUPPL_STRM != strm_index)
|
|
{ /* Need to also update supplementary stream seqno */
|
|
supplementary = TRUE;
|
|
assert(0 <= strm_index);
|
|
/* assert(strm_index < ARRAYSIZE(tjpl->strm_seqno)); */
|
|
strm_seqno = jpl->strm_seqno[strm_index];
|
|
ASSERT_INST_FILE_HDR_HAS_HISTREC_FOR_STRM(strm_index);
|
|
} else
|
|
supplementary = FALSE;
|
|
# endif
|
|
INT8_ONLY(assert(tjpl->write == tjpl->write_addr % tjpl->jnlpool_size));
|
|
tjpl->write += SIZEOF(jnldata_hdr_struct);
|
|
if (tjpl->write >= tjpl->jnlpool_size)
|
|
{
|
|
assert(tjpl->write == tjpl->jnlpool_size);
|
|
tjpl->write = 0;
|
|
}
|
|
assert(jgbl.cumul_jnl_rec_len);
|
|
jgbl.cumul_jnl_rec_len += TCOM_RECLEN * jnl_participants + SIZEOF(jnldata_hdr_struct);
|
|
DEBUG_ONLY(jgbl.cumul_index += jnl_participants;)
|
|
assert(jgbl.cumul_jnl_rec_len % JNL_REC_START_BNDRY == 0);
|
|
/* Make sure timestamp of this seqno is >= timestamp of previous seqno. Note: The below macro
|
|
* invocation should be done AFTER the ADJUST_GBL_JREC_TIME call as the below resets
|
|
* jpl->prev_jnlseqno_time. Doing it the other way around would mean the reset will happen
|
|
* with a potentially lower value than the final adjusted time written in the jnl record.
|
|
*/
|
|
ADJUST_GBL_JREC_TIME_JNLPOOL(jgbl, jpl);
|
|
assert(jpl->early_write_addr == jpl->write_addr);
|
|
jpl->early_write_addr = jpl->write_addr + jgbl.cumul_jnl_rec_len;
|
|
/* Source server does not read in crit. It relies on early_write_addr, the transaction
|
|
* data, lastwrite_len, write_addr being updated in that order. To ensure this order,
|
|
* we have to force out early_write_addr to its coherency point now. If not, the source
|
|
* server may read data that is overwritten (or stale). This is true only on
|
|
* architectures and OSes that allow unordered memory access
|
|
*/
|
|
SHM_WRITE_MEMORY_BARRIER;
|
|
}
|
|
/* There are two possible approaches that can be taken from now onwards.
|
|
* a) Write journal and database records together for a region and move onto the next region.
|
|
* b) Write journal records for all regions and only then move onto writing database updates for all regions.
|
|
* If journal and database updates are done together region by region, there is a problem in that if an error
|
|
* occurs after one region's updates are committed (to jnl and db) or if the process gets STOP/IDed in VMS,
|
|
* secshr_db_clnup should then commit BOTH the journal and database updates of the remaining regions.
|
|
* committing journal updates is not trivial in secshr_db_clnup since it can also be invoked as a user termination
|
|
* handler in VMS in which case it cannot do any I/O.
|
|
*
|
|
* We therefore take approach (b) below. Write journal records for all regions in one loop. Write database updates
|
|
* for all regions in another loop. This way if any error occurs before database updates for any region begins in
|
|
* the second loop, we cleanup the structures as if the transaction is rolled back (there is an exception to this in
|
|
* that currently the journal buffers are not rolled back to undo the write of journal records but currently
|
|
* MUPIP RECOVER knows to handle such records and TR C9905-001072 exists to make the source-server handle such records).
|
|
* If any error occurs (or if the process gets STOP/IDed in VMS) while we are committing database updates,
|
|
* secshr_db_clnup will be invoked and will complete the updates for this TP transaction.
|
|
*/
|
|
/* the following section writes journal records in all regions */
|
|
DEBUG_ONLY(save_gbl_jrec_time = jgbl.gbl_jrec_time;)
|
|
DEBUG_ONLY(tmp_jnl_participants = 0;)
|
|
assert(!TREF(donot_commit)); /* We should never commit a transaction that was determined restartable */
|
|
# ifdef DEBUG
|
|
/* Check that upd_num in jnl records got set in increasing order (not necessarily contiguous) within each region.
|
|
* This is true for GT.M and journal recovery. Take the chance to also check that jnl_head & update_trans are in sync.
|
|
*/
|
|
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
|
|
{
|
|
prev_upd_num = 0;
|
|
jfb = si->jnl_head;
|
|
/* If we have formatted journal records for this transaction, ensure update_trans is TRUE. Not doing so
|
|
* would mean we miss out on writing journal records. This might be ok since the database was not seen as
|
|
* needing any update at all but in tp_clean_up, we will not free up si->jnl_head structures so there might
|
|
* be a memory leak. In addition, want to know if such a situation happens so assert accordingly.
|
|
*/
|
|
assert((NULL == jfb) || si->update_trans);
|
|
for ( ; NULL != jfb; jfb = jfb->next)
|
|
{
|
|
upd_num = ((struct_jrec_upd *)(jfb->buff))->update_num;
|
|
assert((prev_upd_num < upd_num)
|
|
GTMTRIG_ONLY(|| ((prev_upd_num == upd_num)
|
|
&& IS_ZTWORM(jfb->prev->rectype) && !IS_ZTWORM(jfb->rectype))));
|
|
assert(upd_num);
|
|
prev_upd_num = upd_num;
|
|
}
|
|
}
|
|
/* Check that tp_ztp_jnl_upd_num got set in contiguous increasing order across all regions.
|
|
* In case of forward processing phase of journal recovery, multi-region TP transactions are
|
|
* played as multi-region transactions only after resolve-time is reached and that too in
|
|
* region-by-region order (not necessarily upd_num order across all regions). Until then they
|
|
* are played as multiple single-region transactions. Also if -fences=none is specified, then
|
|
* ALL multi-region TP transactions (even those after resolve time) are played as multiple
|
|
* single-region TP transactions. Assert accordingly.
|
|
*/
|
|
max_upd_num = jgbl.tp_ztp_jnl_upd_num;
|
|
if (jgbl.forw_phase_recovery)
|
|
max_upd_num = jgbl.max_tp_ztp_jnl_upd_num;
|
|
if (max_upd_num)
|
|
{
|
|
upd_num_end = 0;
|
|
for (upd_num = 0; upd_num < ARRAYSIZE(upd_num_seen); upd_num++)
|
|
upd_num_seen[upd_num] = FALSE;
|
|
upd_num_seen[0] = TRUE; /* 0 will never be seen but set it to TRUE to simplify below logic */
|
|
do
|
|
{
|
|
upd_num_start = upd_num_end;
|
|
upd_num_end += ARRAYSIZE(upd_num_seen);
|
|
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
|
|
{
|
|
for (jfb = si->jnl_head; NULL != jfb; jfb = jfb->next)
|
|
{
|
|
/* ZTWORMHOLE will have same update_num as following SET/KILL record so dont double count */
|
|
if (IS_ZTWORM(jfb->rectype))
|
|
continue;
|
|
upd_num = ((struct_jrec_upd *)(jfb->buff))->update_num;
|
|
if ((upd_num >= upd_num_start) && (upd_num < upd_num_end))
|
|
{
|
|
assert(FALSE == upd_num_seen[upd_num - upd_num_start]);
|
|
upd_num_seen[upd_num - upd_num_start] = TRUE;
|
|
}
|
|
assert(upd_num <= max_upd_num);
|
|
}
|
|
}
|
|
for (upd_num = 0; upd_num < ARRAYSIZE(upd_num_seen); upd_num++)
|
|
{
|
|
if (upd_num <= (max_upd_num - upd_num_start))
|
|
{
|
|
assert((TRUE == upd_num_seen[upd_num])
|
|
|| (jgbl.forw_phase_recovery && ((jgbl.gbl_jrec_time < jgbl.mur_tp_resolve_time)
|
|
|| jgbl.mur_fences_none)));
|
|
upd_num_seen[upd_num] = FALSE;
|
|
} else
|
|
assert(FALSE == upd_num_seen[upd_num]);
|
|
}
|
|
} while (upd_num_end <= max_upd_num);
|
|
}
|
|
# endif
|
|
if (!jgbl.forw_phase_recovery)
|
|
{
|
|
jnl_fence_ctl.token = 0;
|
|
replay_jnl_participants = jnl_participants;
|
|
} else
|
|
replay_jnl_participants = jgbl.mur_jrec_participants;
|
|
|
|
/* In case of journal recovery, token would be initialized to a non-zero value */
|
|
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
|
|
{
|
|
if (!si->update_trans)
|
|
continue;
|
|
assert((NULL == si->first_cw_set) || (0 != si->cw_set_depth));
|
|
TP_TEND_CHANGE_REG(si);
|
|
csa = cs_addrs;
|
|
csd = cs_data;
|
|
ctn = csd->trans_hist.curr_tn;
|
|
ASSERT_CURR_TN_EQUALS_EARLY_TN(csa, ctn);
|
|
csd->trans_hist.early_tn = ctn + 1;
|
|
com_csum = 0;
|
|
/* Write non-logical records (PBLK) if applicable */
|
|
if (JNL_ENABLED(csa))
|
|
{
|
|
jpc = csa->jnl;
|
|
jbp = jpc->jnl_buff;
|
|
/* si->tmp_cw_set_depth is a copy of si->cw_set_depth at TOTAL_TPJNL_REC_SIZE calculation time;
|
|
* ensure it has not changed until now when the actual jnl record write occurs.
|
|
* same case with csa->jnl_before_images & jbp->before_images.
|
|
*/
|
|
assert(si->cw_set_depth == si->tmp_cw_set_depth);
|
|
assert(jbp->before_images == csa->jnl_before_image);
|
|
assert(jgbl.gbl_jrec_time >= jbp->prev_jrec_time);
|
|
if (0 == jpc->pini_addr)
|
|
jnl_put_jrt_pini(csa);
|
|
if(!com_csum)
|
|
{
|
|
ADJUST_CHECKSUM_TN(INIT_CHECKSUM_SEED, &ctn, com_csum);
|
|
ADJUST_CHECKSUM(com_csum, jpc->pini_addr, com_csum);
|
|
ADJUST_CHECKSUM(com_csum, jgbl.gbl_jrec_time, com_csum);
|
|
}
|
|
if (jbp->before_images)
|
|
{
|
|
epoch_tn = jbp->epoch_tn; /* store in a local variable as it is used in a loop below */
|
|
for (cse = si->first_cw_set; NULL != cse; cse = cse->next_cw_set)
|
|
{ /* Write out before-update journal image records */
|
|
TRAVERSE_TO_LATEST_CSE(cse);
|
|
if (WAS_FREE(cse))
|
|
continue;
|
|
old_block = (blk_hdr_ptr_t)cse->old_block;
|
|
ASSERT_IS_WITHIN_SHM_BOUNDS((sm_uc_ptr_t)old_block, csa);
|
|
assert((n_gds_t_op != cse->mode) && (gds_t_committed != cse->mode));
|
|
assert(n_gds_t_op < kill_t_create);
|
|
assert(n_gds_t_op < kill_t_write);
|
|
if (n_gds_t_op <= cse->mode)
|
|
continue;
|
|
DEBUG_ONLY(is_mm = (dba_mm == csd->acc_meth));
|
|
DBG_ENSURE_OLD_BLOCK_IS_VALID(cse, is_mm, csa, csd);
|
|
if ((NULL != old_block) && (old_block->tn < epoch_tn))
|
|
{ /* For acquired blocks, we should have computed checksum already.
|
|
* The only exception is if we found no need to compute checksum
|
|
* outside of crit but before we got crit, an EPOCH got written
|
|
* concurrently so we have to write a PBLK (and hence compute the
|
|
* checksum as well) when earlier we thought none was necessary.
|
|
* An easy way to check this is that an EPOCH was written AFTER
|
|
* we started this transaction.
|
|
*/
|
|
assert((gds_t_acquired != cse->mode) || cse->blk_checksum
|
|
|| (epoch_tn >= si->start_tn));
|
|
assert(old_block->bsiz <= csd->blk_size);
|
|
if (!cse->blk_checksum)
|
|
cse->blk_checksum = jnl_get_checksum((uint4 *)old_block,
|
|
csa,
|
|
old_block->bsiz);
|
|
else
|
|
assert(cse->blk_checksum == jnl_get_checksum((uint4 *)old_block,
|
|
csa,
|
|
old_block->bsiz));
|
|
# ifdef GTM_CRYPT
|
|
if (csd->is_encrypted)
|
|
{
|
|
DBG_ENSURE_PTR_IS_VALID_GLOBUFF(csa, csd, (sm_uc_ptr_t)old_block);
|
|
DEBUG_ONLY(save_old_block = old_block;)
|
|
old_block = (blk_hdr_ptr_t)GDS_ANY_ENCRYPTGLOBUF(old_block, csa);
|
|
/* Ensure that the unencrypted block and its twin counterpart are in
|
|
* sync. */
|
|
assert(save_old_block->tn == old_block->tn);
|
|
assert(save_old_block->bsiz == old_block->bsiz);
|
|
assert(save_old_block->levl == old_block->levl);
|
|
DBG_ENSURE_PTR_IS_VALID_ENCTWINGLOBUFF(csa, csd, (sm_uc_ptr_t)old_block);
|
|
}
|
|
# endif
|
|
jnl_write_pblk(csa, cse, old_block, com_csum);
|
|
cse->jnl_freeaddr = jbp->freeaddr;
|
|
} else
|
|
cse->jnl_freeaddr = 0;
|
|
}
|
|
}
|
|
}
|
|
/* Write logical journal records if applicable. */
|
|
if (JNL_WRITE_LOGICAL_RECS(csa))
|
|
{
|
|
if (0 == jnl_fence_ctl.token)
|
|
{
|
|
assert(!jgbl.forw_phase_recovery);
|
|
if (replication)
|
|
{
|
|
jnl_fence_ctl.token = tjpl->jnl_seqno;
|
|
UNIX_ONLY(
|
|
if (supplementary)
|
|
jnl_fence_ctl.strm_seqno = SET_STRM_INDEX(strm_seqno, strm_index);
|
|
)
|
|
} else
|
|
TOKEN_SET(&jnl_fence_ctl.token, local_tn, process_id);
|
|
}
|
|
/* else : jnl_fence_ctl.token would be pre-filled by journal recovery */
|
|
assert(0 != jnl_fence_ctl.token);
|
|
jfb = si->jnl_head;
|
|
assert(NULL != jfb);
|
|
/* Fill in "num_participants" field in TSET/TKILL/TZKILL/TZTRIG/TZTWORM record.
|
|
* The rest of the records (USET/UKILL/UZKILL/UZTRIG/UZTWORM) dont have this initialized.
|
|
* Recovery looks at this field only in the T* records.
|
|
*/
|
|
rec = (jnl_record *)jfb->buff;
|
|
assert(IS_TUPD(jfb->rectype));
|
|
assert(IS_SET_KILL_ZKILL_ZTRIG_ZTWORM(jfb->rectype));
|
|
assert(&rec->jrec_set_kill.num_participants == &rec->jrec_ztworm.num_participants);
|
|
rec->jrec_set_kill.num_participants = replay_jnl_participants;
|
|
DEBUG_ONLY(++tmp_jnl_participants;)
|
|
do
|
|
{
|
|
jnl_write_logical(csa, jfb, com_csum);
|
|
jfb = jfb->next;
|
|
} while (NULL != jfb);
|
|
}
|
|
}
|
|
assert(tmp_jnl_participants == jnl_participants);
|
|
/* the next section marks the transaction complete in the journal by writing TCOM record in all regions */
|
|
tcom_record.prefix.time = jgbl.gbl_jrec_time;
|
|
tcom_record.num_participants = replay_jnl_participants;
|
|
assert((JNL_FENCE_LIST_END == jnl_fence_ctl.fence_list) || (0 != jnl_fence_ctl.token));
|
|
tcom_record.token_seq.token = jnl_fence_ctl.token;
|
|
tcom_record.strm_seqno = jnl_fence_ctl.strm_seqno;
|
|
if (replication)
|
|
{
|
|
assert(!jgbl.forw_phase_recovery);
|
|
tjpl->jnl_seqno++;
|
|
UNIX_ONLY(
|
|
if (supplementary)
|
|
next_strm_seqno = strm_seqno + 1;
|
|
)
|
|
VMS_ONLY(
|
|
if (is_updproc)
|
|
jgbl.max_resync_seqno++;
|
|
)
|
|
}
|
|
/* Note that only those regions that are actively journaling will appear in the following list: */
|
|
DEBUG_ONLY(tmp_jnl_participants = 0;)
|
|
for (csa = jnl_fence_ctl.fence_list; JNL_FENCE_LIST_END != csa; csa = csa->next_fenced)
|
|
{
|
|
jpc = csa->jnl;
|
|
DEBUG_ONLY(update_trans = ((sgm_info *)(csa->sgm_info_ptr))->update_trans;)
|
|
assert(!(update_trans & ~UPDTRNS_VALID_MASK));
|
|
assert(UPDTRNS_DB_UPDATED_MASK & update_trans);
|
|
tcom_record.prefix.pini_addr = jpc->pini_addr;
|
|
tcom_record.prefix.tn = csa->ti->curr_tn;
|
|
tcom_record.prefix.checksum = INIT_CHECKSUM_SEED;
|
|
UNIX_ONLY(SET_REG_SEQNO_IF_REPLIC(csa, tjpl, supplementary, next_strm_seqno);)
|
|
VMS_ONLY(SET_REG_SEQNO_IF_REPLIC(csa, tjpl, supplementary, 0);)
|
|
/* Switch to current region. Not using TP_CHANGE_REG macros since we already have csa and csa->hdr available. */
|
|
gv_cur_region = jpc->region;
|
|
cs_addrs = csa;
|
|
cs_data = csa->hdr;
|
|
/* Note tcom_record.jnl_tid was set in op_tstart or updproc */
|
|
tcom_record.prefix.checksum = compute_checksum(INIT_CHECKSUM_SEED, (uint4 *)&tcom_record, SIZEOF(struct_jrec_tcom));
|
|
JNL_WRITE_APPROPRIATE(csa, jpc, JRT_TCOM, (jnl_record *)&tcom_record, NULL, NULL);
|
|
DEBUG_ONLY(tmp_jnl_participants++;)
|
|
}
|
|
assert(jnl_participants == tmp_jnl_participants);
|
|
/* Ensure jgbl.gbl_jrec_time did not get reset by any of the jnl writing functions */
|
|
assert(save_gbl_jrec_time == jgbl.gbl_jrec_time);
|
|
/* the following section is the actual commitment of the changes in the database (phase1 for BG) */
|
|
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
|
|
{
|
|
if (update_trans = si->update_trans)
|
|
{
|
|
assert((NULL == si->first_cw_set) || (0 != si->cw_set_depth));
|
|
sgm_info_ptr = si;
|
|
TP_TEND_CHANGE_REG(si);
|
|
csa = cs_addrs;
|
|
csd = cs_data;
|
|
is_mm = (dba_mm == csd->acc_meth);
|
|
ctn = csd->trans_hist.curr_tn;
|
|
assert((ctn + 1) == csd->trans_hist.early_tn);
|
|
csa->prev_free_blks = csd->trans_hist.free_blocks;
|
|
csa->t_commit_crit = T_COMMIT_CRIT_PHASE1;
|
|
if (csd->dsid && tp_kill_bitmaps)
|
|
rc_cpt_inval();
|
|
cse = si->first_cw_set;
|
|
if (NULL != cse)
|
|
{
|
|
if (!is_mm) /* increment counter of # of processes that are actively doing two-phase commit */
|
|
{
|
|
cnl = csa->nl;
|
|
INCR_WCS_PHASE2_COMMIT_PIDCNT(csa, cnl);
|
|
}
|
|
# ifdef DEBUG
|
|
/* Assert that cse->old_mode, if uninitialized, never contains a negative value
|
|
* (this is relied upon by secshr_db_clnup)
|
|
*/
|
|
do
|
|
{
|
|
TRAVERSE_TO_LATEST_CSE(cse);
|
|
assert(0 <= cse->old_mode);
|
|
cse = cse->next_cw_set;
|
|
} while (NULL != cse);
|
|
cse = si->first_cw_set;
|
|
# endif
|
|
do
|
|
{
|
|
TRAVERSE_TO_LATEST_CSE(cse);
|
|
mode = cse->mode;
|
|
assert((n_gds_t_op != mode) && (gds_t_committed != mode));
|
|
assert(n_gds_t_op < kill_t_create);
|
|
assert(n_gds_t_op < kill_t_write);
|
|
assert(gds_t_committed < gds_t_write_root);
|
|
assert(gds_t_committed < gds_t_busy2free);
|
|
assert(gds_t_write_root < n_gds_t_op);
|
|
assert(gds_t_busy2free < n_gds_t_op);
|
|
assert(gds_t_write_root != mode);
|
|
assert(gds_t_busy2free != mode);
|
|
cse->old_mode = (int4)mode; /* note down before being reset to gds_t_committed */
|
|
if (n_gds_t_op > mode)
|
|
{
|
|
DEBUG_ONLY(bml_status_check(cse));
|
|
if (csd->dsid && !tp_kill_bitmaps && (0 == cse->level))
|
|
{
|
|
assert(!is_mm);
|
|
rc_cpt_entry(cse->blk);
|
|
}
|
|
/* Do phase1 of bg_update while holding crit on the database.
|
|
* This will lock the buffers that need to be changed.
|
|
* Once crit is released, invoke phase2 which will update those locked buffers.
|
|
* There are two exceptions.
|
|
* 1) If it is a bitmap block. In that case we also do phase2
|
|
* while holding crit so the next process to use this bitmap will see a
|
|
* consistent copy of this bitmap when it gets crit for commit. This avoids
|
|
* the reallocate_bitmap routine from restarting or having to wait for a
|
|
* concurrent phase2 construction to finish. When the change request C9E11-002651
|
|
* (to reduce restarts due to bitmap collisions) is addressed, we can reexamine
|
|
* whether it makes sense to move bitmap block builds back to phase2.
|
|
* 2) If the block has a recompute update array. This means it is a global that
|
|
* has NOISOLATION turned on. In this case, we have seen that deferring the
|
|
* updates to phase2 can cause lots of restarts in the "recompute_upd_array"
|
|
* function (where cr->in_tend check fails) in a highly contentious environment.
|
|
* Hence build such blocks in phase1 while holding crit and avoid such restarts.
|
|
*/
|
|
if (is_mm)
|
|
status = mm_update(cse, ctn, ctn, si);
|
|
else
|
|
{
|
|
status = bg_update_phase1(cse, ctn, si);
|
|
if ((cdb_sc_normal == status)
|
|
&& ((gds_t_writemap == mode)
|
|
|| cse->recompute_list_head && (gds_t_write == cse->mode)))
|
|
{
|
|
status = bg_update_phase2(cse, ctn, ctn, si);
|
|
if (cdb_sc_normal == status)
|
|
cse->mode = gds_t_committed;
|
|
}
|
|
}
|
|
if (cdb_sc_normal != status)
|
|
{ /* the database is probably in trouble */
|
|
TP_TRACE_HIST(cse->blk, cse->blk_target);
|
|
INVOKE_T_COMMIT_CLEANUP(status, csa);
|
|
assert(cdb_sc_normal == status);
|
|
/* At this time "si->cr_array_index" could be non-zero for one or more
|
|
* regions and a few cache-records might have their "in_cw_set" field set
|
|
* to TRUE. We should not reset "in_cw_set" as we don't hold crit at this
|
|
* point and also because we might still need those buffers pinned until
|
|
* their before-images are backed up in wcs_recover (in case an online
|
|
* backup was running while secshr_db_clnup did its job). The variable
|
|
* "si->cr_array_index" is reset to 0 by secshr_db_clnup.
|
|
*/
|
|
assert(0 == si->cr_array_index);
|
|
goto skip_failed; /* do not do "failed:" processing as we dont hold crit */
|
|
}
|
|
} else
|
|
{
|
|
if (!cse->done)
|
|
{ /* This block is needed in the 2nd-phase of KILL. Build a private
|
|
* copy right now while we hold crit and the update array points
|
|
* to validated buffer contents.
|
|
*/
|
|
gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, ctn);
|
|
cse->done = TRUE;
|
|
assert(NULL != cse->blk_target);
|
|
CERT_BLK_IF_NEEDED(certify_all_blocks, gv_cur_region,
|
|
cse, cse->new_buff, gv_target);
|
|
}
|
|
cse->mode = gds_t_committed;
|
|
}
|
|
cse = cse->next_cw_set;
|
|
} while (NULL != cse);
|
|
}
|
|
/* signal secshr_db_clnup/t_commit_cleanup, roll-back is no longer possible */
|
|
assert(!(update_trans & ~UPDTRNS_VALID_MASK));
|
|
assert(!(UPDTRNS_TCOMMIT_STARTED_MASK & update_trans));
|
|
si->update_trans = update_trans | UPDTRNS_TCOMMIT_STARTED_MASK;
|
|
csa->t_commit_crit = T_COMMIT_CRIT_PHASE2; /* set this BEFORE releasing crit */
|
|
/* should never increment curr_tn on a frozen database */
|
|
assert(!(csd->freeze UNIX_ONLY(|| (replication && IS_REPL_INST_FROZEN))));
|
|
INCREMENT_CURR_TN(csd);
|
|
# ifdef GTM_TRIGGER
|
|
if (csa->incr_db_trigger_cycle)
|
|
{
|
|
csd->db_trigger_cycle++;
|
|
if (0 == csd->db_trigger_cycle)
|
|
csd->db_trigger_cycle = 1; /* Don't allow cycle set to 0 which means uninitialized */
|
|
/* Update the process private view of trigger cycle also since we are the ones
|
|
* who updated csd->db_trigger_cycle so we can safely keep csa in sync as well.
|
|
* Not doing this would cause an unnecessary cdb_sc_triggermod restart for the
|
|
* next transaction. In fact this restart will create an out-of-design situation
|
|
* for recovery (which operates in the final-retry) and cause an unnecessary
|
|
* replication pipe drain for the update process (a costly operation). So it
|
|
* is in fact a necessary step (considering recovery).
|
|
*/
|
|
csa->db_trigger_cycle = csd->db_trigger_cycle;
|
|
csa->incr_db_trigger_cycle = FALSE;
|
|
}
|
|
# endif
|
|
/* If db is journaled, then db header is flushed periodically when writing the EPOCH record,
|
|
* otherwise do it here every HEADER_UPDATE_COUNT transactions.
|
|
*/
|
|
if ((!JNL_ENABLED(csa) || !JNL_HAS_EPOCH(csa->jnl->jnl_buff))
|
|
&& !(csd->trans_hist.curr_tn & (HEADER_UPDATE_COUNT - 1)))
|
|
fileheader_sync(gv_cur_region);
|
|
if (NULL != si->kill_set_head)
|
|
INCR_KIP(csd, csa, si->kip_csa);
|
|
} else
|
|
ctn = si->tp_csd->trans_hist.curr_tn;
|
|
si->start_tn = ctn; /* start_tn used temporarily to store currtn (for bg_update_phase2) before releasing crit */
|
|
if (!si->tp_csa->hold_onto_crit)
|
|
rel_crit(si->gv_cur_region); /* should use si->gv_cur_region (not gv_cur_region) as the latter is not
|
|
* set in case we are not updating this region */
|
|
} /* for (si ... ) */
|
|
assert(cdb_sc_normal == status);
|
|
if (replication)
|
|
{
|
|
assert(jgbl.cumul_index == jgbl.cu_jnl_index);
|
|
assert((jpl->write + jgbl.cumul_jnl_rec_len) % jpl->jnlpool_size == tjpl->write);
|
|
assert(jpl->early_write_addr > jpl->write_addr);
|
|
jnl_header = (jnldata_hdr_ptr_t)(jnlpool.jnldata_base + jpl->write); /* Begin atomic stmnts */
|
|
jnl_header->jnldata_len = jgbl.cumul_jnl_rec_len;
|
|
jnl_header->prev_jnldata_len = jpl->lastwrite_len;
|
|
UNIX_ONLY(
|
|
if (supplementary)
|
|
jpl->strm_seqno[strm_index] = next_strm_seqno;
|
|
)
|
|
jpl->lastwrite_len = jnl_header->jnldata_len;
|
|
/* For systems with UNORDERED memory access (example, ALPHA, POWER4, PA-RISC 2.0), on a multi
|
|
* processor system, it is possible that the source server notices the change in write_addr
|
|
* before seeing the change to jnlheader->jnldata_len, leading it to read an invalid
|
|
* transaction length. To avoid such conditions, we should commit the order of shared
|
|
* memory updates before we update write_addr. This ensures that the source server sees all
|
|
* shared memory updates related to a transaction before the change in write_addr
|
|
*/
|
|
SHM_WRITE_MEMORY_BARRIER;
|
|
jpl->write = tjpl->write;
|
|
/* jpl->write_addr should be updated before updating jpl->jnl_seqno as secshr_db_clnup relies on this */
|
|
jpl->write_addr += jnl_header->jnldata_len;
|
|
jpl->jnl_seqno = tjpl->jnl_seqno; /* End atomic stmnts */
|
|
assert(jpl->early_write_addr == jpl->write_addr);
|
|
assert(NULL != repl_csa);
|
|
if (!repl_csa->hold_onto_crit)
|
|
rel_lock(jnlpool.jnlpool_dummy_reg);
|
|
}
|
|
/* Check that we DONT own crit on ANY region. The only exception is online mupip journal rollback/recovery
|
|
* which holds crit for the entire process lifetime. */
|
|
assert(UNIX_ONLY(jgbl.onlnrlbk || ) (0 == have_crit(CRIT_HAVE_ANY_REG)));
|
|
/* the following section is the actual commitment of the changes in the database (phase2 for BG) */
|
|
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
|
|
{
|
|
cse = si->first_cw_set;
|
|
if (NULL != cse)
|
|
{
|
|
sgm_info_ptr = si;
|
|
TP_TEND_CHANGE_REG(si);
|
|
ctn = si->start_tn;
|
|
is_mm = (dba_mm == cs_data->acc_meth);
|
|
/* If BG, check that we have not pinned any more buffers than we are updating */
|
|
DBG_CHECK_PINNED_CR_ARRAY_CONTENTS(is_mm, si->cr_array, si->cr_array_index, si->tp_csd->bplmap);
|
|
do
|
|
{
|
|
TRAVERSE_TO_LATEST_CSE(cse);
|
|
if (gds_t_committed > cse->mode)
|
|
{ /* Finish 2nd phase of commit for BG (updating the buffers locked in phase1) now that CRIT
|
|
* has been released. For MM, only thing needed is to set cs->mode to gds_t_committed.
|
|
*/
|
|
if (!is_mm)
|
|
{ /* Validate old_mode noted down in first phase is the same as the current mode.
|
|
* Note that cs->old_mode is negated by bg_update_phase1 (to help secshr_db_clnup).
|
|
*/
|
|
assert(-cse->old_mode == (int4)cse->mode);
|
|
status = bg_update_phase2(cse, ctn, ctn, si);
|
|
if (cdb_sc_normal != status)
|
|
{ /* the database is probably in trouble */
|
|
TP_TRACE_HIST(cse->blk, cse->blk_target);
|
|
INVOKE_T_COMMIT_CLEANUP(status, si->tp_csa);
|
|
assert(cdb_sc_normal == status);
|
|
/* At this time "si->cr_array_index" could be non-zero for one or more
|
|
* regions and a few cache-records might have their "in_cw_set" field set
|
|
* to TRUE. We should not reset "in_cw_set" as we don't hold crit at this
|
|
* point and also because we might still need those buffers pinned until
|
|
* their before-images are backed up in wcs_recover (in case an online
|
|
* backup was running while secshr_db_clnup did its job). The local
|
|
* variable "si->cr_array_index" is reset to 0 by secshr_db_clnup.
|
|
*/
|
|
assert(0 == si->cr_array_index);
|
|
/* Note that seshr_db_clnup (invoked by t_commit_cleanup above) would have
|
|
* done a lot of cleanup for us including decrementing the counter
|
|
* "wcs_phase2_commit_pidcnt" so it is ok to skip all that processing
|
|
* below and go directly to skip_failed.
|
|
*/
|
|
goto skip_failed; /* do not do "failed:" processing as we dont hold crit */
|
|
}
|
|
}
|
|
cse->mode = gds_t_committed;
|
|
} else
|
|
{ /* blk build should have been completed in phase1 for kill_t_* modes */
|
|
assert((n_gds_t_op > cse->mode) || cse->done);
|
|
assert(gds_t_committed == cse->mode);
|
|
}
|
|
cse = cse->next_cw_set;
|
|
} while (NULL != cse);
|
|
/* Free up all pinnned cache-records */
|
|
tp_cr_array = si->cr_array;
|
|
UNPIN_CR_ARRAY_ON_COMMIT(tp_cr_array, si->cr_array_index);
|
|
if (!is_mm)
|
|
{ /* In BG, now that two-phase commit is done, decrement counter */
|
|
csa = cs_addrs;
|
|
cnl = csa->nl;
|
|
DECR_WCS_PHASE2_COMMIT_PIDCNT(csa, cnl);
|
|
/* Phase 2 commits are completed for the current region. See if we had done a snapshot
|
|
* init (csa->snapshot_in_prog == TRUE). If so, try releasing the resources obtained
|
|
* while snapshot init.
|
|
*/
|
|
if (SNAPSHOTS_IN_PROG(csa))
|
|
{
|
|
assert(NULL != si->first_cw_set);
|
|
SS_RELEASE_IF_NEEDED(csa, cnl);
|
|
}
|
|
}
|
|
}
|
|
assert(!si->cr_array_index);
|
|
si->tp_csa->t_commit_crit = FALSE;
|
|
}
|
|
si_not_validated = NULL; /* all "si" have been validated at this point */
|
|
/* Caution: followthrough, cleanup for normal and abnormal "status" */
|
|
failed:
|
|
if (cdb_sc_normal != status)
|
|
{
|
|
si_not_validated = si;
|
|
si_last = (NULL == si_not_validated) ? NULL : si_not_validated->next_tp_si_by_ftok;
|
|
/* Free up all pinnned cache-records and release crit */
|
|
release_crit = (NEED_TO_RELEASE_CRIT(t_tries, status) UNIX_ONLY(&& !jgbl.onlnrlbk));
|
|
for (si = first_tp_si_by_ftok; (si_last != si); si = si->next_tp_si_by_ftok)
|
|
{
|
|
assert(si->tp_csa->now_crit);
|
|
tp_cr_array = si->cr_array;
|
|
UNPIN_CR_ARRAY_ON_RETRY(tp_cr_array, si->cr_array_index);
|
|
assert(!si->cr_array_index);
|
|
si->start_tn = si->tp_csd->trans_hist.curr_tn; /* start_tn used temporarily to store currtn
|
|
* before releasing crit */
|
|
if (release_crit)
|
|
{
|
|
assert(!si->tp_csa->hold_onto_crit);
|
|
rel_crit(si->gv_cur_region);
|
|
}
|
|
}
|
|
#ifdef UNIX
|
|
if (replication && repl_csa->now_crit && release_crit)
|
|
{ /* The only restart that is possible once we acquired the journal pool lock is due to instance freeze */
|
|
assert(cdb_sc_instancefreeze == status);
|
|
rel_lock(jnlpool.jnlpool_dummy_reg);
|
|
}
|
|
#endif
|
|
/* Check that we DONT own crit/commit on ANY region. The only exception is online mupip journal rollback/recovery
|
|
* which holds crit for the entire process lifetime.
|
|
*/
|
|
assert(UNIX_ONLY(jgbl.onlnrlbk || ) !release_crit || (0 == have_crit(CRIT_HAVE_ANY_REG | CRIT_IN_COMMIT)));
|
|
}
|
|
/* We have finished validation on this region. Reset transaction numbers in the gv_target
|
|
* histories so they will be valid for a future access utilizing the clue field. This occurs
|
|
* to improve performance (of next tn in case of commit of current tn) or the chances of commit
|
|
* (of current tn in case of a restart/retry).
|
|
*/
|
|
for (si = first_tp_si_by_ftok; (si_not_validated != si); si = si->next_tp_si_by_ftok)
|
|
{
|
|
if ((cdb_sc_normal == status) && (si->update_trans))
|
|
valid_thru = si->start_tn + 1; /* curr_tn of db AFTER incrementing it but before releasing crit */
|
|
else
|
|
valid_thru = si->start_tn;
|
|
assert(valid_thru <= si->tp_csd->trans_hist.curr_tn);
|
|
is_mm = (dba_mm == si->tp_csd->acc_meth);
|
|
bmp_begin_cse = si->first_cw_bitmap;
|
|
prev_target = NULL;
|
|
for (cse = si->first_cw_set; bmp_begin_cse != cse; cse = cse->next_cw_set)
|
|
{
|
|
TRAVERSE_TO_LATEST_CSE(cse);
|
|
curr_target = cse->blk_target;
|
|
/* Avoid redundant updates to gv_target's history using a simplistic scheme (check previous iteration) */
|
|
if ((prev_target != curr_target) && (0 != curr_target->clue.end))
|
|
{
|
|
prev_target = curr_target;
|
|
for (t1 = curr_target->hist.h; t1->blk_num; t1++)
|
|
{ /* If MM, the history can be safely updated. In BG, phase2 of commit happens outside of
|
|
* crit. So we need to check if the global buffer corresponding to this block is
|
|
* in the process of being updated concurrently by another process. If so, we have no
|
|
* guarantee that the concurrent update started AFTER valid_thru db-tn so we cannot safely
|
|
* reset t1->tn in this case. If no update is in progress, we can safely update our history
|
|
* to reflect the fact that all updates to this block before the current transaction number
|
|
* are complete as of this point. Note that it is ok to do the cr->in_tend check outside
|
|
* of crit. If this update started after we released crit, t1->tn will still be lesser than
|
|
* the transaction at which this update occurred so a cdb_sc_blkmod check is guaranteed to
|
|
* be signalled. If this update started and ended after we released crit but before we
|
|
* reached here, it is ok to set t1->tn to valid_thru as the concurrent update corresponds
|
|
* to a higher transaction number and will still fail the cdb_sc_blkmod check in the next
|
|
* validation. Also note that because of this selective updation of t1->tn, it is possible
|
|
* that for a given gv_target->hist, hist[0].tn is not guaranteed to be GREATER than
|
|
* hist[1].tn. Therefore t_begin has to now determine the minimum and use that as the
|
|
* start_tn instead of looking at hist[MAXDEPTH].tn and using that.
|
|
*/
|
|
cr = !is_mm ? t1->cr : NULL;
|
|
in_tend = (NULL != cr) ? cr->in_tend : 0;
|
|
assert(process_id != in_tend);
|
|
if (!in_tend)
|
|
t1->tn = valid_thru;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
skip_failed:
|
|
REVERT;
|
|
DEFERRED_EXIT_HANDLING_CHECK; /* now that all crits are released, check if deferred signal/exit handling needs to be done */
|
|
/* Must be done after REVERT since we are no longer in crit */
|
|
if (cdb_sc_normal == status)
|
|
{ /* keep this out of the loop above so crits of all regions are released without delay */
|
|
/* Take this moment of non-critness to check if we had an unhandled IO timer pop. */
|
|
if (unhandled_stale_timer_pop)
|
|
process_deferred_stale();
|
|
for (si = first_tp_si_by_ftok; (NULL != si); si = si->next_tp_si_by_ftok)
|
|
{
|
|
csa = si->tp_csa;
|
|
cnl = csa->nl;
|
|
INCR_GVSTATS_COUNTER(csa, cnl, n_tp_blkread, si->num_of_blks);
|
|
if (!si->update_trans)
|
|
{
|
|
INCR_GVSTATS_COUNTER(csa, cnl, n_tp_readonly, 1);
|
|
continue;
|
|
}
|
|
INCR_GVSTATS_COUNTER(csa, cnl, n_tp_readwrite, 1);
|
|
INCR_GVSTATS_COUNTER(csa, cnl, n_tp_blkwrite, si->cw_set_depth);
|
|
GVSTATS_SET_CSA_STATISTIC(csa, db_curr_tn, si->start_tn);
|
|
TP_TEND_CHANGE_REG(si);
|
|
wcs_timer_start(gv_cur_region, TRUE);
|
|
if (si->backup_block_saved)
|
|
backup_buffer_flush(gv_cur_region);
|
|
}
|
|
first_tp_si_by_ftok = NULL; /* Signal t_commit_cleanup/secshr_db_clnup that TP transaction is NOT underway */
|
|
return TRUE;
|
|
}
|
|
failed_skip_revert:
|
|
assert(cdb_sc_normal != status);
|
|
t_fail_hist[t_tries] = status;
|
|
SET_WC_BLOCKED_FINAL_RETRY_IF_NEEDED(csa, cnl, status);
|
|
TP_RETRY_ACCOUNTING(csa, cnl);
|
|
first_tp_si_by_ftok = NULL; /* Signal t_commit_cleanup/secshr_db_clnup that TP transaction is NOT underway */
|
|
return FALSE;
|
|
}
|
|
|
|
/* --------------------------------------------------------------------------------------------
|
|
* This code is very similar to the code in gvcst_put for the non-block-split case. Any changes
|
|
* in either place should be reflected in the other.
|
|
* --------------------------------------------------------------------------------------------
|
|
*/
|
|
enum cdb_sc recompute_upd_array(srch_blk_status *bh, cw_set_element *cse)
|
|
{
|
|
blk_segment *bs1, *bs_ptr;
|
|
boolean_t new_rec;
|
|
cache_rec_ptr_t cr;
|
|
char *va;
|
|
enum cdb_sc status;
|
|
gv_key *pKey = NULL;
|
|
int4 blk_size, blk_fill_size, cur_blk_size, blk_seg_cnt, delta ;
|
|
int4 n, new_rec_size, next_rec_shrink;
|
|
int4 rec_cmpc, target_key_size;
|
|
int tmp_cmpc;
|
|
uint4 segment_update_array_size;
|
|
key_cum_value *kv, *kvhead;
|
|
mstr value;
|
|
off_chain chain1;
|
|
rec_hdr_ptr_t curr_rec_hdr, next_rec_hdr, rp;
|
|
sm_uc_ptr_t cp1, buffaddr;
|
|
unsigned short rec_size;
|
|
sgmnt_addrs *csa;
|
|
blk_hdr_ptr_t old_block;
|
|
gv_namehead *gvt;
|
|
srch_blk_status *t1;
|
|
|
|
csa = cs_addrs;
|
|
BG_TRACE_PRO_ANY(csa, recompute_upd_array_calls);
|
|
assert(csa->now_crit && dollar_tlevel && sgm_info_ptr);
|
|
assert(!cse->level && cse->blk_target && !cse->first_off && !cse->write_type);
|
|
blk_size = cs_data->blk_size;
|
|
blk_fill_size = (blk_size * gv_fillfactor) / 100 - cs_data->reserved_bytes;
|
|
cse->first_copy = TRUE;
|
|
if (dba_bg == csa->hdr->acc_meth)
|
|
{ /* For BG method, modify history with uptodate cache-record, buffer and cycle information.
|
|
* Also modify cse->old_block and cse->ondsk_blkver to reflect the updated buffer.
|
|
* This is necessary in case history contains an older twin cr or a cr which has since been recycled
|
|
*/
|
|
cr = db_csh_get(bh->blk_num);
|
|
assert(CR_NOTVALID != (sm_long_t)cr);
|
|
if (NULL == cr || CR_NOTVALID == (sm_long_t)cr || 0 <= cr->read_in_progress)
|
|
{
|
|
BG_TRACE_PRO_ANY(csa, recompute_upd_array_rip);
|
|
assert(CDB_STAGNATE > t_tries);
|
|
return cdb_sc_lostcr;
|
|
}
|
|
if (cr->in_tend)
|
|
{ /* Possible if this cache-record is being modified concurrently by another process in bg_update_phase2.
|
|
* In this case, we cannot determine if recomputation is possible. Have to restart.
|
|
*/
|
|
assert(CDB_STAGNATE > t_tries);
|
|
BG_TRACE_PRO_ANY(csa, recompute_upd_array_in_tend);
|
|
return cdb_sc_blkmod;
|
|
}
|
|
bh->cr = cr;
|
|
bh->cycle = cr->cycle;
|
|
cse->old_block = (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
|
|
cse->ondsk_blkver = cr->ondsk_blkver;
|
|
/* old_block needs to be repointed to the NEW buffer but the fact that this block was free does not change in this
|
|
* entire function. So cse->blk_prior_state's free_status can stay as it is.
|
|
*/
|
|
bh->buffaddr = (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
|
|
}
|
|
buffaddr = bh->buffaddr;
|
|
assert(NULL != cse->recompute_list_head);
|
|
for (kvhead = kv = cse->recompute_list_head; (NULL != kv); kv = kv->next)
|
|
{
|
|
pKey = &kv->key;
|
|
value = kv->value;
|
|
target_key_size = pKey->end + 1;
|
|
if (kvhead != kv)
|
|
{
|
|
assert(FALSE == cse->done);
|
|
assert(0 == cse->reference_cnt);
|
|
assert(0 == cse->ins_off); /* because leaf-level block */
|
|
assert(0 == cse->level);
|
|
assert(0 == cse->index);
|
|
assert(FALSE == cse->forward_process); /* because no kills should have taken place in this block */
|
|
gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0);
|
|
bh->buffaddr = buffaddr = cse->new_buff;
|
|
}
|
|
if (cdb_sc_normal != (status = gvcst_search_blk(pKey, bh)))
|
|
{
|
|
BG_TRACE_PRO_ANY(csa, recompute_upd_array_search_blk);
|
|
return status;
|
|
}
|
|
cur_blk_size = ((blk_hdr_ptr_t)buffaddr)->bsiz;
|
|
new_rec = (target_key_size != bh->curr_rec.match);
|
|
rp = (rec_hdr_ptr_t)(buffaddr + bh->curr_rec.offset);
|
|
if (bh->curr_rec.offset == cur_blk_size)
|
|
{
|
|
if (FALSE == new_rec)
|
|
{
|
|
assert(CDB_STAGNATE > t_tries);
|
|
BG_TRACE_PRO_ANY(csa, recompute_upd_array_new_rec);
|
|
return cdb_sc_mkblk;
|
|
}
|
|
rec_cmpc = 0;
|
|
rec_size = 0;
|
|
} else
|
|
{
|
|
GET_USHORT(rec_size, &rp->rsiz);
|
|
rec_cmpc = EVAL_CMPC(rp);
|
|
if ((sm_uc_ptr_t)rp + rec_size > (sm_uc_ptr_t)buffaddr + cur_blk_size)
|
|
{
|
|
assert(CDB_STAGNATE > t_tries);
|
|
BG_TRACE_PRO_ANY(csa, recompute_upd_array_rec_size);
|
|
return cdb_sc_mkblk;
|
|
}
|
|
}
|
|
if (new_rec)
|
|
{
|
|
new_rec_size = SIZEOF(rec_hdr) + target_key_size - bh->prev_rec.match + value.len;
|
|
if (cur_blk_size <= (int)bh->curr_rec.offset)
|
|
next_rec_shrink = 0;
|
|
else
|
|
next_rec_shrink = bh->curr_rec.match - rec_cmpc;
|
|
delta = new_rec_size - next_rec_shrink;
|
|
} else
|
|
{
|
|
if (rec_cmpc != bh->prev_rec.match)
|
|
{
|
|
assert(CDB_STAGNATE > t_tries);
|
|
BG_TRACE_PRO_ANY(csa, recompute_upd_array_rec_cmpc);
|
|
return cdb_sc_mkblk;
|
|
}
|
|
new_rec_size = SIZEOF(rec_hdr) + (target_key_size - rec_cmpc) + value.len;
|
|
delta = new_rec_size - rec_size;
|
|
next_rec_shrink = 0;
|
|
}
|
|
chain1 = *(off_chain *)&bh->blk_num;
|
|
assert(0 == chain1.flag);
|
|
if (cur_blk_size + delta <= blk_fill_size)
|
|
{
|
|
segment_update_array_size = UA_NON_BM_SIZE(cs_data);
|
|
ENSURE_UPDATE_ARRAY_SPACE(segment_update_array_size);
|
|
BLK_INIT(bs_ptr, bs1);
|
|
if (0 != rc_set_fragment)
|
|
GTMASSERT;
|
|
BLK_SEG(bs_ptr, buffaddr + SIZEOF(blk_hdr), bh->curr_rec.offset - SIZEOF(blk_hdr));
|
|
BLK_ADDR(curr_rec_hdr, SIZEOF(rec_hdr), rec_hdr);
|
|
curr_rec_hdr->rsiz = new_rec_size;
|
|
SET_CMPC(curr_rec_hdr, bh->prev_rec.match);
|
|
BLK_SEG(bs_ptr, (sm_uc_ptr_t)curr_rec_hdr, SIZEOF(rec_hdr));
|
|
BLK_ADDR(cp1, target_key_size - bh->prev_rec.match, unsigned char);
|
|
memcpy(cp1, pKey->base + bh->prev_rec.match, target_key_size - bh->prev_rec.match);
|
|
BLK_SEG(bs_ptr, cp1, target_key_size - bh->prev_rec.match);
|
|
if (0 != value.len)
|
|
{
|
|
BLK_ADDR(va, value.len, char);
|
|
memcpy(va, value.addr, value.len);
|
|
BLK_SEG(bs_ptr, (unsigned char *)va, value.len);
|
|
}
|
|
if (!new_rec)
|
|
rp = (rec_hdr_ptr_t)((sm_uc_ptr_t)rp + rec_size);
|
|
n = (int)(cur_blk_size - ((sm_uc_ptr_t)rp - buffaddr));
|
|
if (n > 0)
|
|
{
|
|
if (new_rec)
|
|
{
|
|
BLK_ADDR(next_rec_hdr, SIZEOF(rec_hdr), rec_hdr);
|
|
next_rec_hdr->rsiz = rec_size - next_rec_shrink;
|
|
SET_CMPC(next_rec_hdr, bh->curr_rec.match);
|
|
BLK_SEG(bs_ptr, (sm_uc_ptr_t)next_rec_hdr, SIZEOF(rec_hdr));
|
|
next_rec_shrink += SIZEOF(rec_hdr);
|
|
}
|
|
BLK_SEG(bs_ptr, (sm_uc_ptr_t)rp + next_rec_shrink, n - next_rec_shrink);
|
|
}
|
|
if (0 == BLK_FINI(bs_ptr, bs1))
|
|
{
|
|
assert(CDB_STAGNATE > t_tries);
|
|
BG_TRACE_PRO_ANY(csa, recompute_upd_array_blk_fini);
|
|
return cdb_sc_mkblk;
|
|
}
|
|
cse->upd_addr = (unsigned char *)bs1;
|
|
cse->done = FALSE;
|
|
} else
|
|
{
|
|
BG_TRACE_PRO_ANY(csa, recompute_upd_array_blk_split);
|
|
return cdb_sc_blksplit;
|
|
}
|
|
}
|
|
/* Update bh->tn to reflect the fact that it is uptodate as of the current database transaction.
|
|
* Not doing so could actually cause unnecessary restarts.
|
|
*/
|
|
bh->tn = csa->hdr->trans_hist.curr_tn;
|
|
/* If block in this history element is the same as gv_target's leaf block and it has a non-zero clue, update it */
|
|
gvt = bh->blk_target;
|
|
assert(!bh->level); /* this is why it is safe to access 0th array index in the next line */
|
|
t1 = gvt->hist.h;
|
|
if (gvt->clue.end && (t1->blk_num == bh->blk_num))
|
|
{
|
|
*t1 = *bh;
|
|
/* Update clue to reflect last key in recompute list. No need to update gvt->first_rec and gvt->last_rec
|
|
* as they are guaranteed to be the same as what it was when the clue was filled in by gvcst_search (if
|
|
* they are different, an index block would have changed which means we would restart this transaction
|
|
* anyways and the clue would be reset to 0).
|
|
*/
|
|
assert(NULL != pKey);
|
|
COPY_CURRKEY_TO_GVTARGET_CLUE(gvt, pKey);
|
|
if (new_rec)
|
|
t1->curr_rec.match = gvt->clue.end + 1; /* Keep srch_hist and clue in sync for NEXT gvcst_search */
|
|
/* Now that the clue is known to be non-zero, we have the potential for the first_rec part of it to be
|
|
* unreliable. Reset it to be safe. See comment in similar section in tp_hist for details on why.
|
|
*/
|
|
GVT_CLUE_INVALIDATE_FIRST_REC(gvt);
|
|
}
|
|
/* At this point, cse->new_buff could be non-NULL either because the same variable was being updated multiple times
|
|
* inside of the TP transaction or because cse->recompute_list_head contained more than one variable (in which case
|
|
* cse->new_buff will be set by the invocation of gvcst_blk_build (above) for the second element in the list. In
|
|
* either case, the final update-array contents rely on the shared memory buffer (in case of BG access method) and
|
|
* not on cse->new_buff. Therefore we need to PIN the corresponding cache-record in tp_tend. So reset cse->new_buff.
|
|
*/
|
|
cse->new_buff = NULL;
|
|
if (!WAS_FREE(cse) && (NULL != cse->old_block) && JNL_ENABLED(csa) && csa->jnl_before_image)
|
|
{
|
|
old_block = (blk_hdr_ptr_t)cse->old_block;
|
|
assert(old_block->bsiz <= csa->hdr->blk_size);
|
|
if (old_block->tn < csa->jnl->jnl_buff->epoch_tn)
|
|
cse->blk_checksum = jnl_get_checksum((uint4 *)old_block, csa, old_block->bsiz);
|
|
else
|
|
cse->blk_checksum = 0;
|
|
}
|
|
return cdb_sc_normal;
|
|
}
|
|
|
|
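/* This function does not update "bml_cse->tn" (to reflect that the reallocation is valid as of the current database tn).
 * See similar comment before the function definition of "recompute_upd_array". For the same reasons, it is considered
 * ok to do the reallocation since frozen regions are considered relatively rare.
 * NOTE(review): the comment preceding "recompute_upd_array" in this file does not explain why skipping the tn update is
 * acceptable for frozen regions, so the cross-reference above looks stale -- confirm and restate the rationale here.
 */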
|
|
/* Re-assigns, against the current contents of the local bitmap held in "bml_cse->old_block", every block that the
 * transaction described by "si" acquired out of the local bitmap "bml_cse->blk".
 *
 * For each cw-set element of mode gds_t_acquired whose block belongs to this bitmap, a free bit is looked up with
 * bm_find_blk (searching forward from just past the previously assigned bit), and then
 *	- cse->blk and its recycled/not-recycled state are reset to match the newly chosen bit,
 *	- cse->old_block, cse->cr, cse->cycle and cse->blk_checksum are re-pointed/recomputed if a before-image is needed,
 *	- the chosen bit is stored into the bitmap's update array (bml_cse->upd_addr).
 * Once all such elements are reassigned, the before-image checksum and "ondsk_blkver" of the bitmap cse are refreshed.
 *
 * Input Parameters
 *	si	: supplies the cw-set list ("first_cw_set" up to, but not including, "first_cw_bitmap") that is scanned.
 *	bml_cse	: cw-set element of the local bitmap whose allocations are redone.
 *
 * Returns TRUE if every acquired block was reassigned. Returns FALSE as soon as anything needed is unavailable
 * (no free bit, before-image buffer not usable, end of map reached); the caller is then expected to restart.
 * Note that cw-set elements processed before a FALSE return have already been modified.
 *
 * The region operated on is the one pointed to by the global "cs_addrs".
 */
boolean_t reallocate_bitmap(sgm_info *si, cw_set_element *bml_cse)
{
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	jnl_buffer_ptr_t	jbp;			/* non-NULL only if before-image journaling is on */
	cache_rec_ptr_t		blk_cr;			/* cache-record currently holding the reallocated block (BG only) */
	sm_uc_ptr_t		cr_buff;		/* absolute address of the buffer that "blk_cr" points to */
	cw_set_element		*cse, *bitmap_start_cse;
	blk_hdr_ptr_t		bi_hdr;			/* header of the before-image being examined */
	block_id_ptr_t		upd_slot;		/* next entry of the bitmap update array to overwrite */
	block_id		bml, new_bit;
	int4			hint;			/* where bm_find_blk should start searching */
	uint4			total_blks, lmap_blks;
	unsigned int		bi_size;
	boolean_t		is_mm;
	boolean_t		bit_was_used;		/* set by bm_find_blk : TRUE if the bit found is a recycled block */
	boolean_t		read_before_image;	/* TRUE if before-image journaling or online backup in progress */
	boolean_t		before_image_needed;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	csa = cs_addrs;
	csd = csa->hdr;
	is_mm = (dba_mm == csd->acc_meth);
	/* This optimization should only be used if blocks are being allocated (not if freed) in this bitmap. */
	assert(0 <= bml_cse->reference_cnt);
	bml = bml_cse->blk;
	if (!is_mm && bml_cse->cr->in_tend)
	{	/* The cache-record no longer holds the bitmap block we think it does, so give up and let the caller restart.
		 * Crit is held here and all bitmap updates (both phase1 and phase2) happen inside of crit, so whatever block
		 * now occupies this cache-record cannot be a bitmap block.
		 */
		assert(csa->now_crit && (bml != bml_cse->cr->blk) && (bml_cse->cr->blk % csd->bplmap));
		return FALSE;
	}
	assert(is_mm || (FALSE == bml_cse->cr->in_tend));
	assert(is_mm || (FALSE == bml_cse->cr->data_invalid));
	if (is_mm)
		total_blks = csa->total_blks;
	else
		total_blks = csa->ti->total_blks;
	/* The last local bitmap of the database may cover fewer than BLKS_PER_LMAP blocks */
	lmap_blks = (ROUND_DOWN2(total_blks, BLKS_PER_LMAP) == bml) ? (total_blks - bml) : BLKS_PER_LMAP;
	assert(bml >= 0 && bml < total_blks);
	upd_slot = (block_id_ptr_t)bml_cse->upd_addr;
	hint = 0;
	bitmap_start_cse = si->first_cw_bitmap;	/* kept in a local so the loop below does not de-reference "si" each time */
	jbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
	read_before_image = ((NULL != jbp) || csa->backup_in_prog || SNAPSHOTS_IN_PROG(csa));
	for (cse = si->first_cw_set; cse != bitmap_start_cse; cse = cse->next_cw_set)
	{
		TRAVERSE_TO_LATEST_CSE(cse);
		/* Only blocks acquired out of this particular local bitmap are of interest */
		if (gds_t_acquired != cse->mode)
			continue;
		if (ROUND_DOWN2(cse->blk, BLKS_PER_LMAP) != bml)
			continue;
		assert(*upd_slot == (cse->blk - bml));
		new_bit = bm_find_blk(hint, (sm_uc_ptr_t)bml_cse->old_block + SIZEOF(blk_hdr), lmap_blks, &bit_was_used);
		if ((NO_FREE_SPACE == new_bit) || (MAP_RD_FAIL == new_bit))
			return FALSE;
		cse->blk = bml + new_bit;
		assert(cse->blk < total_blks);
		if (bit_was_used)
		{
			SET_RECYCLED(cse);
		} else
		{
			SET_NRECYCLED(cse);
		}
		/* Re-point the before-image (cse->old_block) at the newly chosen block if one is needed.
		 * If it is needed but cannot be obtained, return FALSE so the caller restarts.
		 */
		BEFORE_IMAGE_NEEDED(read_before_image, cse, csa, csd, cse->blk, before_image_needed);
		if (before_image_needed && is_mm)
		{	/* In MM, although mm_update does not use cse->old_block, tp_tend uses it to write before-images.
			 * So make it point to the reallocated block's buffer address.
			 */
			cse->old_block = t_qread(cse->blk, (sm_int_ptr_t)&cse->cycle, &cse->cr);
			assert(GDSVCURR == cse->ondsk_blkver);	/* should have been already initialized in t_write_map */
			bi_hdr = (blk_hdr_ptr_t)cse->old_block;
			if (NULL == bi_hdr)
				return FALSE;
			assert(NULL == jbp);	/* which is why no JNL_GET_CHECKSUM_ACQUIRED_BLK logic is needed here */
		} else if (before_image_needed)
		{	/* BG : the before-image has to already be sitting in a usable cache-record */
			blk_cr = db_csh_get(cse->blk);
			assert(CR_NOTVALID != (sm_long_t)blk_cr);
			if ((NULL == blk_cr) || (CR_NOTVALID == (sm_long_t)blk_cr) || (0 <= blk_cr->read_in_progress))
				return FALSE;	/* if one block was freed a long time ago, most probably were; so just give up */
			cr_buff = (sm_uc_ptr_t)GDS_REL2ABS(blk_cr->buffaddr);
			assert((NULL == cse->old_block) || (cse->cr != blk_cr) || (cse->old_block == cr_buff));
			/* cse->cr, cycle, old_block and checksum need resetting if no before-image had been read previously
			 * (because cse->blk was not a reused block then) OR if the old cse->cr/cse->cycle do not match the
			 * current cache-record OR if the buffer's tn is not older than cse->tn.
			 */
			if ((NULL == cse->old_block) || (cse->cr != blk_cr) || (cse->cycle != blk_cr->cycle)
				|| (cse->tn <= ((blk_hdr_ptr_t)cr_buff)->tn))
			{	/* Bitmap reallocation has resulted in a situation where checksums etc. have to be recomputed */
				cse->cr = blk_cr;
				cse->cycle = blk_cr->cycle;
				cse->old_block = cr_buff;
				bi_hdr = (blk_hdr_ptr_t)cr_buff;
				if ((NULL != jbp) && !WAS_FREE(cse))
				{
					assert(bi_hdr->bsiz <= csd->blk_size);
					if (bi_hdr->tn >= jbp->epoch_tn)
						cse->blk_checksum = 0;
					else
					{
						bi_size = bi_hdr->bsiz;
						JNL_GET_CHECKSUM_ACQUIRED_BLK(cse, csd, csa, bi_hdr, bi_size);
					}
				}
			}
			assert(gds_t_acquired == cse->mode);
			assert(GDSVCURR == cse->ondsk_blkver);
		} else
		{	/* No before-image is needed for this block */
			cse->old_block = NULL;
			cse->blk_checksum = 0;
		}
		/* Record the new bit in the bitmap update array and continue the search just past it */
		*upd_slot = new_bit;
		upd_slot++;
		hint = new_bit + 1;
		if (hint >= lmap_blks)
		{	/* bm_find_blk assumes a hint (its first argument) is less than the map size and gives invalid results
			 * (like values >= map size) otherwise. Rather than changing bm_find_blk, the check is done here and
			 * bm_find_blk asserts that the hint is less than the map size.
			 */
			assert(hint == lmap_blks);
			return FALSE;
		}
	}
	if (cse != bitmap_start_cse)
		return FALSE;
	assert(0 == *upd_slot);	/* the slot following the last reassigned entry is expected to hold 0 */
	/* Since the bitmap block got modified, copy the latest "ondsk_blkver" status from the cache-record to bml_cse */
	assert((NULL != bml_cse->cr) || is_mm);
	bi_hdr = (blk_hdr_ptr_t)bml_cse->old_block;
	assert(!WAS_FREE(bml_cse));	/* Bitmap blocks are never of type gds_t_acquired or gds_t_create */
	if (NULL != jbp)
	{	/* Recompute CHECKSUM for the modified bitmap block before-image */
		bml_cse->blk_checksum = (bi_hdr->tn < jbp->epoch_tn)
						? jnl_get_checksum((uint4 *)bi_hdr, csa, bi_hdr->bsiz) : 0;
	}
	if (!is_mm)
		bml_cse->ondsk_blkver = bml_cse->cr->ondsk_blkver;
	return TRUE;
}
|