/**************************************************************** * * * Copyright 2001, 2011 Fidelity Information Services, Inc * * * * This source code contains the intellectual property * * of its copyright holder(s), and is made available * * under a license. If you do not know the terms of * * the license, please stop and do not read further. * * * ****************************************************************/ #include "mdef.h" #include #include /* for VSIG_ATOMIC_T type */ #include "gtm_time.h" #include "gtm_inet.h" #ifdef VMS #include /* Required for gtmsource.h */ #endif #include "cdb_sc.h" #include "gdsroot.h" #include "gtm_facility.h" #include "gdskill.h" #include "fileinfo.h" #include "gdsbt.h" #include "gdsblk.h" #include "gdsfhead.h" #include "filestruct.h" #include "gdscc.h" #include "gdsbml.h" #include "ccp.h" #include "error.h" #include "iosp.h" #include "interlock.h" #include "jnl.h" #include "buddy_list.h" /* needed for tp.h */ #include "hashtab_int4.h" /* needed for tp.h and cws_insert.h */ #include "tp.h" #include "gdsbgtr.h" #include "repl_msg.h" #include "gtmsource.h" #include "mupipbckup.h" #include "cache.h" #include "gt_timer.h" #include "longset.h" /* needed for cws_insert.h */ #include "cws_insert.h" #include "min_max.h" #include "gtmimagename.h" #ifdef UNIX #include "gtmrecv.h" #include "deferred_signal_handler.h" #endif /* Include prototypes */ #include "t_qread.h" #include "t_retry.h" #include "t_commit_cleanup.h" #include "send_msg.h" #include "bm_getfree.h" #include "rc_cpt_ops.h" #include "rel_quant.h" #include "wcs_flu.h" #include "mm_update.h" #include "bg_update.h" #include "wcs_get_space.h" #include "wcs_timer_start.h" #include "process_deferred_stale.h" #include "t_end.h" #include "add_inter.h" #include "jnl_write_pblk.h" #include "jnl_write_aimg_rec.h" #include "memcoherency.h" #include "jnl_get_checksum.h" #include "wbox_test_init.h" #include "have_crit.h" #ifdef GTM_SNAPSHOT #include "db_snapshot.h" #endif #include "shmpool.h" #include "bml_status_check.h" #include "is_proc_alive.h" GBLREF bool rc_locked; GBLREF unsigned char t_fail_hist[CDB_MAX_TRIES]; GBLREF cache_rec_ptr_t cr_array[((MAX_BT_DEPTH * 2) - 1) * 2]; /* Maximum number of blocks that can be in transaction */ GBLREF unsigned int cr_array_index; GBLREF boolean_t block_saved; GBLREF uint4 update_trans; GBLREF cw_set_element cw_set[]; /* create write set. */ GBLREF gd_region *gv_cur_region; GBLREF gv_namehead *gv_target; GBLREF sgmnt_addrs *cs_addrs; GBLREF sgmnt_data_ptr_t cs_data; GBLREF uint4 dollar_tlevel; GBLREF trans_num start_tn; GBLREF unsigned int t_tries; GBLREF uint4 t_err, process_id; GBLREF unsigned char cw_set_depth, cw_map_depth; GBLREF unsigned char rdfail_detail; GBLREF jnlpool_addrs jnlpool; GBLREF jnlpool_ctl_ptr_t jnlpool_ctl, temp_jnlpool_ctl; GBLREF boolean_t is_updproc; GBLREF seq_num seq_num_one; GBLREF boolean_t mu_reorg_process; GBLREF boolean_t unhandled_stale_timer_pop; GBLREF jnl_format_buffer *non_tp_jfb_ptr; GBLREF sgmnt_addrs *kip_csa; GBLREF boolean_t need_kip_incr; GBLREF boolean_t write_after_image; GBLREF boolean_t is_replicator; GBLREF seq_num seq_num_zero; GBLREF jnl_gbls_t jgbl; GBLREF jnl_fence_control jnl_fence_ctl; GBLREF boolean_t gvdupsetnoop; /* if TRUE, duplicate SETs update journal but not database (except for curr_tn++) */ GBLREF boolean_t is_dollar_incr; /* valid only if gvcst_put is in the call-stack. 
						 * is a copy of "in_gvcst_incr" just before it got reset to FALSE */
GBLREF	boolean_t	mu_reorg_upgrd_dwngrd_in_prog;	/* TRUE if MUPIP REORG UPGRADE/DOWNGRADE is in progress */
GBLREF	boolean_t	mu_reorg_nosafejnl;		/* TRUE if NOSAFEJNL explicitly specified */
GBLREF	trans_num	mu_reorg_upgrd_dwngrd_blktn;	/* tn in blkhdr of current block processed by REORG UP/DOWNGRADE */
GBLREF	inctn_opcode_t	inctn_opcode;
GBLREF	inctn_detail_t	inctn_detail;			/* holds detail to fill in to inctn jnl record */
GBLREF	boolean_t	block_is_free;
#ifdef GTM_TRIGGER
GBLREF	boolean_t	skip_dbtriggers;	/* see gbldefs.c for description of this global */
#endif
#ifdef UNIX
GBLREF	recvpool_addrs	recvpool;
#endif

error_def(ERR_GVKILLFAIL);
error_def(ERR_GVPUTFAIL);
error_def(ERR_JNLFILOPN);
error_def(ERR_JNLFLUSH);
error_def(ERR_NOTREPLICATED);
error_def(ERR_TEXT);

#define BLOCK_FLUSHING(x) (csa->hdr->clustered && x->flushing && !CCP_SEGMENT_STATE(cs_addrs->nl,CCST_MASK_HAVE_DIRTY_BUFFERS))

#define RESTORE_CURRTN_IF_NEEDED(csa, write_inctn, decremented_currtn)					\
{													\
	if (write_inctn && decremented_currtn)								\
	{	/* decremented curr_tn above; need to restore to original state due to the restart */	\
		assert(csa->now_crit);									\
		if (csa->now_crit)									\
		{	/* need crit to update curr_tn and early_tn */					\
			csa->ti->curr_tn++;								\
			csa->ti->early_tn++;								\
		}											\
		decremented_currtn = FALSE;								\
	}												\
}

/* This macro isn't enclosed in parentheses to allow for optimizations */
#define VALIDATE_CYCLE(is_mm, history)						\
if (history)									\
{										\
	for (t1 = history->h; t1->blk_num; t1++)				\
	{									\
		if (!is_mm && (t1->cr->cycle != t1->cycle))			\
		{	/* cache slot has been stolen */			\
			assert(!csa->now_crit || csa->hold_onto_crit);		\
			status = cdb_sc_cyclefail;				\
			goto failed_skip_revert;				\
		}								\
		n_blks_validated++;						\
	}									\
}

trans_num t_end(srch_hist *hist1, srch_hist *hist2, trans_num ctn)
{
	srch_hist		*hist;
	bt_rec_ptr_t		bt;
	boolean_t		blk_used;
	cache_rec		cr_save;
	cache_rec_ptr_t		cr;
	cw_set_element		*cs, *cs_top, *cs1;
	enum cdb_sc		status;
	int			int_depth, tmpi;
	uint4			jnl_status;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp, jbbp;	/* jbp is non-NULL if journaling, jbbp is non-NULL only if before-image journaling */
	sgmnt_addrs		*csa, *repl_csa;
	sgmnt_data_ptr_t	csd;
	node_local_ptr_t	cnl;
	sgm_info		*dummysi = NULL;	/* needed as a dummy parameter for {mm,bg}_update */
	srch_blk_status		*t1;
	trans_num		valid_thru, tnque_earliest_tn, dbtn, blktn, temp_tn, epoch_tn, old_block_tn;
	unsigned char		cw_depth, cw_bmp_depth;
	jnldata_hdr_ptr_t	jnl_header;
	uint4			total_jnl_rec_size, tmp_cumul_jnl_rec_len, tmp_cw_set_depth, prev_cw_set_depth;
	DEBUG_ONLY(unsigned int	tot_jrec_size;)
	jnlpool_ctl_ptr_t	jpl, tjpl;
	boolean_t		replication = FALSE;
	boolean_t		is_mm, release_crit = FALSE;
	boolean_t		read_before_image;	/* TRUE if before-image journaling or online backup in progress
							 * This is used to read before-images of blocks whose cs->mode is gds_t_create */
	boolean_t		write_inctn = FALSE;	/* set to TRUE in case writing an inctn record is necessary */
	boolean_t		decremented_currtn, retvalue, busy2free_seen, recompute_cksum, cksum_needed;
	blk_hdr_ptr_t		old_block;
	unsigned int		bsiz, crindex;
	jnl_tm_t		save_gbl_jrec_time;
	enum gds_t_mode		mode;
	uint4			prev_cr_array_index;
#	ifdef DEBUG
	boolean_t		ready2signal_gvundef_lcl;
	GTMCRYPT_ONLY( blk_hdr_ptr_t save_old_block; )
#	endif
	int			n_blks_validated;
	boolean_t		before_image_needed, lcl_ss_in_prog = FALSE, reorg_ss_in_prog = FALSE;
	boolean_t		ss_need_to_restart, new_bkup_started;
#	ifdef GTM_TRIGGER
	uint4			cycle;
#	endif
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(hist1 != 
hist2); DEBUG_ONLY( /* Store global variable ready2signal_gvundef in a local variable and reset the global right away to ensure that * the global value does not incorrectly get carried over to the next call of "t_end". */ ready2signal_gvundef_lcl = TREF(ready2signal_gvundef); TREF(ready2signal_gvundef) = FALSE; ) csa = cs_addrs; csd = csa->hdr; cnl = csa->nl; is_mm = (dba_mm == csd->acc_meth); status = cdb_sc_normal; assert(cs_data == csd); assert((t_tries < CDB_STAGNATE) || csa->now_crit); assert(!dollar_tlevel); /* whenever cw_set_depth is non-zero, ensure update_trans is also non-zero */ assert(!cw_set_depth || (UPDTRNS_DB_UPDATED_MASK == update_trans)); assert(cw_set_depth || !update_trans || gvdupsetnoop); /* whenever cw_set_depth is zero, ensure that update_trans * is FALSE except when set noop optimization is enabled */ assert(0 == cr_array_index); cr_array_index = 0; /* be safe and reset it in PRO even if it is not zero */ if (csd->wc_blocked || (is_mm && (csa->total_blks != csa->ti->total_blks))) { /* If blocked, or we have MM and file has been extended, force repair */ status = cdb_sc_helpedout; /* force retry with special status so philanthropy isn't punished */ goto failed_skip_revert; } else { if (!update_trans && (start_tn == csa->ti->early_tn)) { /* read with no change to the transaction history */ n_blks_validated = 0; VALIDATE_CYCLE(is_mm, hist1); /* updates n_blks_validated */ VALIDATE_CYCLE(is_mm, hist2); /* updates n_blks_validated */ /* Assert that if gtm_gvundef_fatal is non-zero, then we better not be about to signal a GVUNDEF */ assert(!TREF(gtm_gvundef_fatal) || !ready2signal_gvundef_lcl); if (csa->now_crit && !csa->hold_onto_crit) rel_crit(gv_cur_region); if (unhandled_stale_timer_pop) process_deferred_stale(); CWS_RESET; t_tries = 0; /* commit was successful so reset t_tries */ INCR_GVSTATS_COUNTER(csa, cnl, n_nontp_readonly, 1); INCR_GVSTATS_COUNTER(csa, cnl, n_nontp_blkread, n_blks_validated); return csa->ti->curr_tn; } } busy2free_seen = FALSE; if ((0 != cw_set_depth) && ((gds_t_writemap == cw_set[0].mode) || ((gds_t_busy2free == cw_set[0].mode) && (gds_t_writemap == cw_set[1].mode)))) { /* freeing a block from gvcst_kill or reorg, or upgrading/downgrading a block by reorg */ cw_depth = 0; if (gds_t_writemap == cw_set[0].mode) cw_bmp_depth = 0; else { busy2free_seen = TRUE; cw_bmp_depth = 1; } } else { cw_depth = cw_set_depth; cw_bmp_depth = cw_depth; } # ifdef GTM_SNAPSHOT if (update_trans && SNAPSHOTS_IN_PROG(cnl)) { /* If snapshot context is not already created, then create one now to be used by this transaction. If context * creation failed (for instance, on snapshot file open fail), then SS_INIT_IF_NEEDED sets csa->snapshot_in_prog * to FALSE. */ SS_INIT_IF_NEEDED(csa, cnl); } else csa->snapshot_in_prog = FALSE; lcl_ss_in_prog = SNAPSHOTS_IN_PROG(csa); /* store in local variable to avoid pointer access */ reorg_ss_in_prog = (mu_reorg_process && lcl_ss_in_prog); /* store in local variable if both snapshots and MUPIP REORG * are in progress */ # endif if (0 != cw_depth) { /* Caution : since csa->backup_in_prog and read_before_image are initialized below * only if (cw_depth), these variables should be used below only within an if (cw_depth). */ assert(SIZEOF(bsiz) == SIZEOF(old_block->bsiz)); assert(update_trans); csa->backup_in_prog = (BACKUP_NOT_IN_PROGRESS != cnl->nbb); jbbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? 
csa->jnl->jnl_buff : NULL; read_before_image = ((NULL != jbbp) || csa->backup_in_prog || lcl_ss_in_prog); for (cs = cw_set, cs_top = cs + cw_depth; cs < cs_top; cs++) { assert(0 == cs->jnl_freeaddr); /* ensure haven't missed out resetting jnl_freeaddr for any cse in * t_write/t_create/t_write_map/t_write_root/mu_write_map [D9B11-001991] */ if (gds_t_create == cs->mode) { assert(0 == cs->blk_checksum); int_depth = (int)cw_set_depth; if (0 > (cs->blk = bm_getfree(cs->blk, &blk_used, cw_depth, cw_set, &int_depth))) { if (FILE_EXTENDED == cs->blk) { status = cdb_sc_helpedout; assert(is_mm); } else { GET_CDB_SC_CODE(cs->blk, status); /* code is set in status */ } goto failed_skip_revert; } cs->was_free = !blk_used; BEFORE_IMAGE_NEEDED(read_before_image, cs, csa, csd, cs->blk, before_image_needed); if (!before_image_needed) cs->old_block = NULL; else { block_is_free = cs->was_free; cs->old_block = t_qread(cs->blk, (sm_int_ptr_t)&cs->cycle, &cs->cr); old_block = (blk_hdr_ptr_t)cs->old_block; if (NULL == old_block) { status = (enum cdb_sc)rdfail_detail; goto failed_skip_revert; } if (!cs->was_free && (NULL != jbbp) && (old_block->tn < jbbp->epoch_tn)) { /* Compute CHECKSUM for writing PBLK record before getting crit. * It is possible that we are reading a block that is actually marked free in * the bitmap (due to concurrency issues at this point). Therefore we might be * actually reading uninitialized block headers and in turn a bad value of * "old_block->bsiz". Restart if we ever access a buffer whose size is greater * than the db block size. */ bsiz = old_block->bsiz; if (bsiz > csd->blk_size) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_lostbmlcr; goto failed_skip_revert; } JNL_GET_CHECKSUM_ACQUIRED_BLK(cs, csd, csa, old_block, bsiz); } } /* assert that the block that we got from bm_getfree is less than the total blocks. * if we do not have crit in this region, then it is possible that bm_getfree can return * a cs->blk that is >= csa->ti->total_blks (i.e. if the bitmap buffer gets recycled). * adjust assert accordingly. * note that checking for crit is equivalent to checking if we are in the final retry. */ assert((CDB_STAGNATE > t_tries) || (cs->blk < csa->ti->total_blks)); cs->mode = gds_t_acquired; assert(GDSVCURR == cs->ondsk_blkver); } else if (reorg_ss_in_prog && cs->was_free) { assert((gds_t_acquired == cs->mode) && (NULL == cs->old_block)); /* If snapshots are in progress, we might want to read the before images of the FREE blocks also. * Since mu_swap_blk mimics a small part of t_end, it sets cse->mode to gds_t_acquired and hence * will not read the before images of the FREE blocks in t_end. To workaround this, set * cse->was_free to TRUE so that in t_end, this condition can be used to read the before images of * the FREE blocks if needed. */ BEFORE_IMAGE_NEEDED(read_before_image, cs, csa, csd, cs->blk, before_image_needed); if (before_image_needed) { block_is_free = TRUE; /* To tell t_qread that the block it's trying to read is * actually a FREE block */ cs->old_block = t_qread(cs->blk, (sm_int_ptr_t)&cs->cycle, &cs->cr); if (NULL == cs->old_block) { status = (enum cdb_sc)rdfail_detail; goto failed_skip_revert; } } } } } if (update_trans && JNL_ENABLED(csa)) { /* compute the total journal record size requirements before grab_crit. * there is code later that will check for state changes from now to then and if so do a recomputation */ assert(!cw_map_depth || cw_set_depth < cw_map_depth); tmp_cw_set_depth = cw_map_depth ? 
cw_map_depth : cw_set_depth; TOTAL_NONTPJNL_REC_SIZE(total_jnl_rec_size, non_tp_jfb_ptr, csa, tmp_cw_set_depth); /* For a non-tp update maximum journal space we may need is total size of * 1) space for maximum CDB_CW_SET_SIZE PBLKs, that is, MAX_JNL_REC_SIZE * CDB_CW_SET_SIZE * 2) space for a logical record itself, that is, MAX_LOGI_JNL_REC_SIZE and * 3) overhead records (MIN_TOTAL_NONTPJNL_REC_SIZE + JNL_FILE_TAIL_PRESERVE) * This requirement is less than the minimum autoswitchlimit size (JNL_AUTOSWITCHLIMIT_MIN) as asserted below. * Therefore we do not need any check to issue JNLTRANS2BIG error like is being done in tp_tend.c */ assert((CDB_CW_SET_SIZE * MAX_JNL_REC_SIZE + MAX_LOGI_JNL_REC_SIZE + MIN_TOTAL_NONTPJNL_REC_SIZE + JNL_FILE_TAIL_PRESERVE) <= (JNL_AUTOSWITCHLIMIT_MIN * DISK_BLOCK_SIZE)); DEBUG_ONLY(tot_jrec_size = MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size)); assert(tot_jrec_size <= csd->autoswitchlimit); /* The SET_GBL_JREC_TIME done below should be done before any journal writing activity * on this region's journal file. This is because all the jnl record writing routines assume * jgbl.gbl_jrec_time is initialized appropriately. */ assert(!jgbl.forw_phase_recovery || jgbl.dont_reset_gbl_jrec_time); if (!jgbl.dont_reset_gbl_jrec_time) SET_GBL_JREC_TIME; /* initializes jgbl.gbl_jrec_time */ assert(jgbl.gbl_jrec_time); } block_saved = FALSE; ESTABLISH_RET(t_ch, 0); assert(!csa->hold_onto_crit || csa->now_crit); if (!csa->now_crit) { if (update_trans) { /* Get more space if needed. This is done outside crit so that * any necessary IO has a chance of occurring outside crit. * The available space must be double-checked inside crit. */ if (!is_mm && (cnl->wc_in_free < (int4)(cw_set_depth + 1)) && !wcs_get_space(gv_cur_region, cw_set_depth + 1, NULL)) assert(FALSE); /* wcs_get_space should have returned TRUE unconditionally in this case */ for (;;) { grab_crit(gv_cur_region); if (FALSE == csd->freeze) break; rel_crit(gv_cur_region); /* We are about to wait for freeze. Assert that we are not in phase2 of a bitmap free operation * (part of an M-kill or REORG operation). The freeze must have waited for the phase2 to complete. */ assert((inctn_bmp_mark_free_gtm != inctn_opcode) && (inctn_bmp_mark_free_mu_reorg != inctn_opcode)); while (csd->freeze) hiber_start(1000); } } else grab_crit(gv_cur_region); } /* We should never proceed to update a frozen database. Only exception is DSE where we want the ability to freeze a * database for every process that wants to update the database except for DSE. */ assert(!update_trans || !csd->freeze || IS_DSE_IMAGE); if (is_mm && ((csa->hdr != csd) || (csa->total_blks != csa->ti->total_blks))) { /* If MM, check if wcs_mm_recover was invoked as part of the grab_crit done above OR if * the file has been extended. If so, restart. */ status = cdb_sc_helpedout; /* force retry with special status so philanthropy isn't punished */ if ((CDB_STAGNATE - 1) == t_tries) release_crit = TRUE; goto failed; } /* Any retry transition where the destination state is the 3rd retry, we don't want to release crit, * i.e. for 2nd to 3rd retry transition or 3rd to 3rd retry transition. * Therefore we need to release crit only if (CDB_STAGNATE - 1) > t_tries * But 2nd to 3rd retry transition doesn't occur if in 2nd retry we get jnlstatemod, jnlclose, bkupss_statemod, * helpedout, or cacheprob code. * Hence the variable release_crit to track the above. 
	 */
	release_crit = (CDB_STAGNATE - 1) > t_tries;
	assert(!cw_depth || update_trans);
#	ifdef GTM_TRIGGER
	if (!skip_dbtriggers)
	{
		cycle = csd->db_trigger_cycle;
		if (csa->db_trigger_cycle != cycle)
		{	/* the process' view of the triggers could be potentially stale. restart to be safe. */
			/* On an originating instance, in addition to the run-time, utilities can collide with
			 * concurrent trigger definition updates. The following asserts verify that:
			 * (1) Activities on a replicating instance don't see concurrent trigger changes as update
			 *     process is the only updater in the replicating instance.
			 * (2) Journal recovery operates in standalone mode. So, it should NOT see any concurrent
			 *     trigger changes as well.
			 */
			assert(!is_updproc);
			assert(!jgbl.forw_phase_recovery);
			assert(cycle > csa->db_trigger_cycle);
			/* csa->db_trigger_cycle will be set to csd->db_trigger_cycle in t_retry */
			status = cdb_sc_triggermod;
			if ((CDB_STAGNATE - 1) == t_tries)
				release_crit = TRUE;
			goto failed;
		}
	}
#	endif
	/* If inctn_opcode has a valid value, then we better be doing an update. The only exception to this rule is if we are
	 * in MUPIP REORG UPGRADE/DOWNGRADE (mu_reorg_upgrd_dwngrd.c) where update_trans is explicitly set to 0 in some cases.
	 */
	assert((inctn_invalid_op == inctn_opcode) || mu_reorg_upgrd_dwngrd_in_prog || update_trans);
	if (update_trans)
	{
		if (JNL_ALLOWED(csa))
		{
			if ((csa->jnl_state != csd->jnl_state) || (csa->jnl_before_image != csd->jnl_before_image))
			{	/* csd->jnl_state or csd->jnl_before_image changed since last time
				 * csa->jnl_before_image and csa->jnl_state got set
				 */
				csa->jnl_before_image = csd->jnl_before_image;
				csa->jnl_state = csd->jnl_state;
				/* jnl_file_lost causes a jnl_state transition from jnl_open to jnl_closed
				 * and additionally causes a repl_state transition from repl_open to repl_closed
				 * all without standalone access. This means that csa->repl_state might be repl_open
				 * while csd->repl_state might be repl_closed. Update csa->repl_state in this case
				 * as otherwise the rest of the code might look at csa->repl_state and incorrectly
				 * conclude replication is on and generate sequence numbers when actually no journal
				 * records are being generated. [C9D01-002219]
				 */
				csa->repl_state = csd->repl_state;
				status = cdb_sc_jnlstatemod;
				if ((CDB_STAGNATE - 1) == t_tries)
					release_crit = TRUE;
				goto failed;
			}
		}
		/* Flag retry, if other mupip activities like BACKUP, INTEG or FREEZE are in progress.
		 * If in final retry, go ahead with kill. BACKUP/INTEG/FREEZE will wait for us to be done.
		 */
		if (need_kip_incr && (0 < cnl->inhibit_kills) && (CDB_STAGNATE > t_tries))
		{
			status = cdb_sc_inhibitkills;
			goto failed;
		}
		ss_need_to_restart = new_bkup_started = FALSE;
		GTM_SNAPSHOT_ONLY(
			if (update_trans)
				CHK_AND_UPDATE_SNAPSHOT_STATE_IF_NEEDED(csa, cnl, ss_need_to_restart);
		)
		if (cw_depth)
		{
			assert(update_trans);
			CHK_AND_UPDATE_BKUP_STATE_IF_NEEDED(cnl, csa, new_bkup_started);
			/* recalculate based on the new values of snapshot_in_prog and backup_in_prog. Since read_before_image is
			 * used only in the context of acquired blocks, recalculation should happen only for non-zero cw_depth
			 */
			read_before_image = ((JNL_ENABLED(csa) && csa->jnl_before_image) || csa->backup_in_prog
						|| SNAPSHOTS_IN_PROG(csa));
		}
		if ((cw_depth && new_bkup_started) || (update_trans && ss_need_to_restart))
		{
			if (ss_need_to_restart || (new_bkup_started && !(JNL_ENABLED(csa) && csa->jnl_before_image)))
			{	/* If online backup is in progress now and before-image journaling is not enabled,
				 * we would not have read before-images for created blocks. 
Although it is possible * that this transaction might not have blocks with gds_t_create at all, we expect * this backup_in_prog state change to be so rare that it is ok to restart. */ status = cdb_sc_bkupss_statemod; if ((CDB_STAGNATE - 1) == t_tries) release_crit = TRUE; goto failed; } } /* in crit, ensure cache-space is available. the out-of-crit check done above might not have been enough */ if (!is_mm && (cnl->wc_in_free < (int4)(cw_set_depth + 1)) && !wcs_get_space(gv_cur_region, cw_set_depth + 1, NULL)) { assert(csd->wc_blocked); /* only reason we currently know why wcs_get_space could fail */ assert(gtm_white_box_test_case_enabled); SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_end_hist); status = cdb_sc_cacheprob; if ((CDB_STAGNATE - 1) == t_tries) release_crit = TRUE; goto failed; } if (inctn_invalid_op != inctn_opcode) { assert(cw_set_depth || mu_reorg_process); write_inctn = TRUE; /* mupip reorg or gvcst_bmp_mark_free or extra block split in gvcstput */ decremented_currtn = FALSE; if (jgbl.forw_phase_recovery && !JNL_ENABLED(csa)) { /* forward recovery (deduced above from the fact that journaling is not enabled) is supposed * to accurately simulate GT.M runtime activity for every transaction number. The way it does * this is by incrementing transaction numbers for all inctn records that GT.M wrote and not * incrementing transaction number for any inctn activity that forward recovery internally needs * to do. all inctn activity done outside of t_end has already been protected against incrementing * transaction number in case of forward recovery. t_end is a little bit tricky since in this case * a few database blocks get modified with the current transaction number and not incrementing the * transaction number might result in the database transaction number being lesser than the block * transaction number. we work around this problem by decrementing the database transaction number * just before the commit so the database block updates for the inctn transaction get the * transaction number of the previous transaction effectively merging the inctn transaction with * the previous transaction. */ /* cw_set_depth is 1 for all INCTN operations except for block free operations when it can be 2 */ assert((inctn_gvcstput_extra_blk_split == inctn_opcode) || (inctn_bmp_mark_free_gtm == inctn_opcode) || (inctn_bmp_mark_free_mu_reorg == inctn_opcode) || (1 == cw_set_depth)); csa->ti->curr_tn--; csa->ti->early_tn--; decremented_currtn = TRUE; } } } assert(csd == csa->hdr); valid_thru = dbtn = csa->ti->curr_tn; if (!is_mm) tnque_earliest_tn = ((th_rec_ptr_t)((sm_uc_ptr_t)csa->th_base + csa->th_base->tnque.fl))->tn; if (update_trans) valid_thru++; n_blks_validated = 0; for (hist = hist1; (NULL != hist); hist = (hist == hist1) ? 
hist2 : NULL) { for (t1 = hist->h; t1->blk_num; t1++) { if (is_mm) { if (t1->tn <= ((blk_hdr_ptr_t)(t1->buffaddr))->tn) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_blkmod; goto failed; } t1->cse = NULL; /* reset for next transaction */ } else { bt = bt_get(t1->blk_num); if (NULL == bt) { if (t1->tn <= tnque_earliest_tn) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_losthist; goto failed; } cr = db_csh_get(t1->blk_num); } else { if (BLOCK_FLUSHING(bt)) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_blockflush; goto failed; } if (CR_NOTVALID == bt->cache_index) cr = db_csh_get(t1->blk_num); else { cr = (cache_rec_ptr_t)GDS_REL2ABS(bt->cache_index); if (cr->blk != bt->blk) { assert(FALSE); SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_end_crbtmismatch1); status = cdb_sc_crbtmismatch; goto failed; } } assert(bt->killtn <= bt->tn); if (t1->tn <= bt->tn) { assert((CDB_STAGNATE > t_tries) || (cdb_sc_helpedout == t_fail_hist[t_tries])); assert(!IS_DOLLAR_INCREMENT || !write_inctn); /* If the current operation is a $INCREMENT, then try to optimize by recomputing * the update array. Do this only as long as ALL the following conditions are met. * a) the current history's block is a leaf level block we intend to modify * b) update is restricted to the data block * c) no block splits are involved. * d) this does not end up creating a new global variable tree * e) this is not a case of $INCR about to signal a GVUNDEF * cw_set_depth is 0 in that case * Conveniently enough, a simple check of (1 != cw_set_depth) is enough * to categorize conditions (c) to (e) * * Future optimization : It is possible that M-SETs (not just $INCR) * that update only one data block can benefit from this optimization. * But that has to be carefully thought out. */ if (!IS_DOLLAR_INCREMENT || t1->level || (1 != cw_set_depth) || (t1->blk_num != cw_set[0].blk)) { status = cdb_sc_blkmod; goto failed; } else { status = gvincr_recompute_upd_array(t1, cw_set, cr); if (cdb_sc_normal != status) { status = cdb_sc_blkmod; goto failed; } } } } if ((cache_rec_ptr_t)CR_NOTVALID == cr) { SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_end_hist); status = cdb_sc_cacheprob; goto failed; } if ((NULL == cr) || (cr->cycle != t1->cycle) || ((sm_long_t)GDS_REL2ABS(cr->buffaddr) != (sm_long_t)t1->buffaddr)) { if ((NULL != cr) && (NULL != bt) && (cr->blk != bt->blk)) { assert(FALSE); SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_end_crbtmismatch2); status = cdb_sc_crbtmismatch; goto failed; } assert(CDB_STAGNATE > t_tries); status = cdb_sc_lostcr; goto failed; } assert(0 == cr->in_tend); /* Pin those buffers that we are planning on updating. Those are easily identified as the ones * where the history has a non-zero cw-set-element. */ cs = t1->cse; if (cs) { if (n_gds_t_op > cs->mode) { assert(update_trans); assert(gds_t_busy2free > gds_t_committed); assert(gds_t_busy2free < n_gds_t_op); assert(gds_t_write_root > gds_t_committed); assert(gds_t_write_root < n_gds_t_op); assert((gds_t_committed > cs->mode) || busy2free_seen && (gds_t_busy2free == cs->mode)); PIN_CACHE_RECORD(cr, cr_array, cr_array_index); /* If cs->mode is gds_t_busy2free, then the corresponding cache-record needs * to be pinned to write the before-image right away but this cse is not going * to go through bg_update. 
						 * So remember to unpin the cache-record before phase2
						 * as otherwise the pre-phase2 check (that we have pinned only those cache-records
						 * that we are planning to update) will fail. But to do that, we rely on the fact
						 * that the cache-record corresponding to the gds_t_busy2free cse is always the
						 * first one in the cr_array.
						 */
						assert((gds_t_busy2free != cs->mode) || (1 == cr_array_index));
					}
					t1->cse = NULL;	/* reset for next transaction */
				}
			}
			t1->tn = valid_thru;
			n_blks_validated++;
		}
		assert((hist != hist2) || (t1 != hist->h));
	}
#	ifdef DEBUG
	/* If clue is non-zero, validate it (BEFORE this could be used in a future transaction). The only exception is reorg
	 * where we could have an invalid clue (e.g. last_rec < first_rec etc.). This is because reorg shuffles records around
	 * heavily and therefore it is hard to maintain an up-to-date clue. reorg therefore handles this situation by actually
	 * resetting the clue just before doing the next gvcst_search. The mu_reorg* routines already take care of this reset
	 * (in fact, this is asserted in gvcst_search too). So we can allow invalid clues here in that special case.
	 */
	if (!mu_reorg_process && (NULL != gv_target) && gv_target->clue.end)	/* gv_target can be NULL in case of DSE MAPS etc. */
		DEBUG_GVT_CLUE_VALIDATE(gv_target);	/* Validate that gvt has valid first_rec, clue & last_rec fields */
#	endif
	/* Assert that if gtm_gvundef_fatal is non-zero, then we better not be about to signal a GVUNDEF */
	assert(!TREF(gtm_gvundef_fatal) || !ready2signal_gvundef_lcl);
	/* check bit maps for usage */
	if (0 != cw_map_depth)
	{	/* Bit maps on end from mu_reorg (from a call to mu_swap_blk) or mu_reorg_upgrd_dwngrd */
		prev_cw_set_depth = cw_set_depth;
		prev_cr_array_index = cr_array_index;	/* note down current depth of pinned cache-records */
		cw_set_depth = cw_map_depth;
	}
	for (cs = &cw_set[cw_bmp_depth], cs_top = &cw_set[cw_set_depth]; cs < cs_top; cs++)
	{
		assert(0 == cs->jnl_freeaddr);	/* ensure haven't missed out resetting jnl_freeaddr for any cse in
						 * t_write/t_create/{t,mu}_write_map/t_write_root [D9B11-001991]
						 */
		/* A bitmap block update will cause us to restart with "cdb_sc_bmlmod". TP transactions on the other hand
		 * try reallocating blocks using the function "reallocate_bitmap". That is not presently used here because
		 * there are cases like MUPIP REORG or MUPIP REORG UPGRADE etc. where we do not want this functionality.
		 * Also, non-TP restarts due to bitmap collisions are currently assumed to be negligible. Hence no
		 * reallocation done in the non-TP case. Reconsider if this assumption is invalidated. 
*/ if (is_mm) { if (cs->tn <= ((blk_hdr_ptr_t)(cs->old_block))->tn) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_bmlmod; goto failed; } } else { bt = bt_get(cs->blk); if (NULL == bt) { if (cs->tn <= tnque_earliest_tn) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_lostbmlhist; goto failed; } cr = db_csh_get(cs->blk); } else { if (BLOCK_FLUSHING(bt)) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_blockflush; goto failed; } if (cs->tn <= bt->tn) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_bmlmod; goto failed; } if (CR_NOTVALID == bt->cache_index) cr = db_csh_get(cs->blk); else { cr = (cache_rec_ptr_t)GDS_REL2ABS(bt->cache_index); if (cr->blk != bt->blk) { assert(FALSE); SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_end_crbtmismatch3); status = cdb_sc_crbtmismatch; goto failed; } } } if ((cache_rec_ptr_t)CR_NOTVALID == cr) { SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_end_bitmap_nullbt); status = cdb_sc_cacheprob; goto failed; } if ((NULL == cr) || (cr->cycle != cs->cycle) || ((sm_long_t)GDS_REL2ABS(cr->buffaddr) != (sm_long_t)cs->old_block)) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_lostbmlcr; goto failed; } PIN_CACHE_RECORD(cr, cr_array, cr_array_index); } } if ((0 != cw_map_depth) && mu_reorg_upgrd_dwngrd_in_prog) { /* Bit maps on end from mu_reorg_upgrd_dwngrd. Bitmap history has been validated. * But we do not want bitmap cse to be considered for bg_update. Reset cw_set_depth accordingly. */ cw_set_depth = prev_cw_set_depth; assert(1 >= cw_set_depth); assert(2 >= cw_map_depth); /* UNPIN the bitmap cache record we no longer need */ assert(prev_cr_array_index <= cr_array_index); if (prev_cr_array_index < cr_array_index) { cr_array_index--; assert(prev_cr_array_index == cr_array_index); assert(process_id == cr_array[cr_array_index]->in_cw_set); UNPIN_CACHE_RECORD(cr_array[cr_array_index]); } } assert(csd == csa->hdr); assert(!need_kip_incr || update_trans); if (update_trans) { if (cw_depth && read_before_image && !is_mm) { assert(!(JNL_ENABLED(csa) && csa->jnl_before_image) || jbbp == csa->jnl->jnl_buff); assert((JNL_ENABLED(csa) && csa->jnl_before_image) || (NULL == jbbp)); for (cs = cw_set, cs_top = cs + cw_set_depth; cs < cs_top; ++cs) { /* have already read old block for creates before we got crit, make sure * cache record still has correct block. if not, reset "cse" fields to * point to correct cache-record. this is ok to do since we only need the * prior content of the block (for online backup or before-image journaling) * and did not rely on it for constructing the transaction. Restart if * block is not present in cache now or is being read in currently. */ if ((gds_t_acquired == cs->mode) && (NULL != cs->old_block)) { assert(read_before_image == ((JNL_ENABLED(csa) && csa->jnl_before_image) || csa->backup_in_prog || SNAPSHOTS_IN_PROG(csa))); cr = db_csh_get(cs->blk); if ((cache_rec_ptr_t)CR_NOTVALID == cr) { SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_end_jnl_cwset); status = cdb_sc_cacheprob; goto failed; } /* It is possible that cr->in_cw_set is non-zero in case a concurrent MUPIP REORG * UPGRADE/DOWNGRADE is in PHASE2 touching this very same block. In that case, * we cannot reuse this block so we restart. We could try finding a different block * to acquire instead and avoid a restart (tracked as part of C9E11-002651). * Note that in_cw_set is set to 0 ahead of in_tend in bg_update_phase2. 
Therefore * it is possible that we see in_cw_set 0 but in_tend is still non-zero. In that case, * we cannot proceed with pinning this cache-record as the cr is still locked by * the other process. We can choose to wait here but instead decide to restart. */ if ((NULL == cr) || (0 <= cr->read_in_progress) || (0 != cr->in_cw_set) || (0 != cr->in_tend)) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_lostbefor; goto failed; } PIN_CACHE_RECORD(cr, cr_array, cr_array_index); cs->ondsk_blkver = cr->ondsk_blkver; old_block = (blk_hdr_ptr_t)GDS_REL2ABS(cr->buffaddr); assert((cs->cr != cr) || (cs->old_block == (sm_uc_ptr_t)old_block)); old_block_tn = old_block->tn; /* Need checksums if before imaging and if a PBLK record is going to be written. However, * while doing the bm_getfree, if we got a free block, then no need to compute checksum * as we would NOT be writing before images of free blocks to journal files */ cksum_needed = (!cs->was_free && (NULL != jbbp) && (old_block_tn < jbbp->epoch_tn)); if ((cs->cr != cr) || (cs->cycle != cr->cycle)) { /* Block has relocated in the cache. Adjust pointers to new location. */ cs->cr = cr; cs->cycle = cr->cycle; cs->old_block = (sm_uc_ptr_t)old_block; /* PBLK checksum was computed outside-of-crit when block was read but * block has relocated in the cache since then so recompute the checksum * if this block needs a checksum in the first place (cksum_needed is TRUE). */ recompute_cksum = cksum_needed; } else if (cksum_needed) { /* We have determined that a checksum is needed for this block. If we have not * previously computed one outside crit OR if the block contents have changed * since the checksum was previously computed, we need to recompute it. * Otherwise, the out-of-crit computed value can be safely used. * Note that cs->tn is valid only if a checksum was computed outside of crit. * So make sure it is used only if checksum is non-zero. There is a rare chance * that the computed checksum could be zero in which case we will recompute * unnecessarily. Since that is expected to be very rare, it is considered ok. */ recompute_cksum = (!cs->blk_checksum || (cs->tn <= old_block_tn)); } if (!cksum_needed) cs->blk_checksum = 0; /* zero any out-of-crit computed checksum */ else if (recompute_cksum) { /* We hold crit at this point so we are guaranteed valid bsiz field. * Hence we do not need to take MIN(bsiz, csd->blk_size) like we did * in the earlier call to jnl_get_checksum. */ assert(NULL != jbbp); assert(SIZEOF(bsiz) == SIZEOF(old_block->bsiz)); bsiz = old_block->bsiz; assert(bsiz <= csd->blk_size); cs->blk_checksum = jnl_get_checksum((uint4*)old_block, csa, bsiz); } DEBUG_ONLY( else assert(cs->blk_checksum == jnl_get_checksum((uint4 *)old_block, csa, old_block->bsiz)); ) assert(cs->cr->blk == cs->blk); } } } /* if we are not writing an INCTN record, we better have a non-zero cw_depth. * the only two exceptions are that * a) if we were being called from gvcst_put for a duplicate SET * b) if we were called from DSE MAPS * in case (a), we want to write logical SET journal records and replicate them. * in case (b), we do not want to replicate them. we want to assert that is_replicator is FALSE in this case. * the following assert achieves that purpose. 
*/ assert((inctn_invalid_op != inctn_opcode) || cw_depth || !is_replicator /* exception case (b) */ || (ERR_GVPUTFAIL == t_err) && gvdupsetnoop); /* exception case (a) */ if (REPL_ALLOWED(csa) && is_replicator && (inctn_invalid_op == inctn_opcode)) { replication = TRUE; jpl = jnlpool_ctl; tjpl = temp_jnlpool_ctl; repl_csa = &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs; if (!repl_csa->hold_onto_crit) grab_lock(jnlpool.jnlpool_dummy_reg); assert(repl_csa->now_crit); QWASSIGN(tjpl->write_addr, jpl->write_addr); QWASSIGN(tjpl->write, jpl->write); QWASSIGN(tjpl->jnl_seqno, jpl->jnl_seqno); INT8_ONLY(assert(tjpl->write == tjpl->write_addr % tjpl->jnlpool_size)); assert(jgbl.cumul_jnl_rec_len); tmp_cumul_jnl_rec_len = (uint4)(jgbl.cumul_jnl_rec_len + SIZEOF(jnldata_hdr_struct)); tjpl->write += SIZEOF(jnldata_hdr_struct); if (tjpl->write >= tjpl->jnlpool_size) { assert(tjpl->write == tjpl->jnlpool_size); tjpl->write = 0; } assert(QWEQ(jpl->early_write_addr, jpl->write_addr)); QWADDDW(jpl->early_write_addr, jpl->write_addr, tmp_cumul_jnl_rec_len); /* Source server does not read in crit. It relies on early_write_addr, the transaction * data, lastwrite_len, write_addr being updated in that order. To ensure this order, * we have to force out early_write_addr to its coherency point now. If not, the source * server may read data that is overwritten (or stale). This is true only on * architectures and OSes that allow unordered memory access */ SHM_WRITE_MEMORY_BARRIER; } assert(cw_set_depth < CDB_CW_SET_SIZE); ASSERT_CURR_TN_EQUALS_EARLY_TN(csa, dbtn); CHECK_TN(csa, csd, dbtn); /* can issue rts_error TNTOOLARGE */ if (JNL_ENABLED(csa)) { /* Since we got the system time (jgbl.gbl_jrec_time) outside of crit, it is possible that * journal records were written concurrently to this file with a timestamp that is future * relative to what we recorded. In that case, adjust our recorded time to match this. * This is necessary to ensure that timestamps of successive journal records for each * database file are in non-decreasing order. A side-effect of this is that our recorded * time might not accurately reflect the current system time but that is considered not * an issue since we dont expect to be off by more than a second or two if at all. * Another side effect is that even if the system time went back, we will never write * out-of-order timestamped journal records in the lifetime of this database shared memory. */ jpc = csa->jnl; jbp = jpc->jnl_buff; /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order of jnl * records. This needs to be done BEFORE the jnl_ensure_open as that could write journal records * (if it decides to switch to a new journal file) */ ADJUST_GBL_JREC_TIME(jgbl, jbp); /* Note that jnl_ensure_open can call cre_jnl_file which * in turn assumes jgbl.gbl_jrec_time is set. Also jnl_file_extend can call * jnl_write_epoch_rec which in turn assumes jgbl.gbl_jrec_time is set. * In case of forw-phase-recovery, mur_output_record would have already set this. */ assert(jgbl.gbl_jrec_time); jnl_status = jnl_ensure_open(); GTM_WHITE_BOX_TEST(WBTEST_T_END_JNLFILOPN, jnl_status, ERR_JNLFILOPN); if (jnl_status == 0) { /* tmp_cw_set_depth was used to do TOTAL_NONTPJNL_REC_SIZE calculation earlier in this function. * It is now though that the actual jnl record write occurs. Ensure that the current value of * cw_set_depth does not entail any change in journal record size than was calculated. * Same case with csa->jnl_before_images & jbp->before_images. 
* The only exception is that in case of mu_reorg_upgrd_dwngrd_in_prog cw_set_depth will be * LESSER than tmp_cw_set_depth (this is still fine as there is more size allocated than used). */ assert(cw_set_depth == tmp_cw_set_depth || mu_reorg_upgrd_dwngrd_in_prog && cw_map_depth && cw_set_depth < tmp_cw_set_depth); assert(jbp->before_images == csa->jnl_before_image); assert((csa->jnl_state == csd->jnl_state) && (csa->jnl_before_image == csd->jnl_before_image)); if (DISK_BLOCKS_SUM(jbp->freeaddr, total_jnl_rec_size) > jbp->filesize) { /* Moved as part of change to prevent journal records splitting * across multiple generation journal files. */ if (SS_NORMAL != (jnl_status = jnl_flush(jpc->region))) { send_msg(VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush during t_end"), jnl_status); assert((!JNL_ENABLED(csd)) && (JNL_ENABLED(csa))); status = cdb_sc_jnlclose; if ((CDB_STAGNATE - 1) == t_tries) release_crit = TRUE; goto failed; } else if (EXIT_ERR == jnl_file_extend(jpc, total_jnl_rec_size)) { assert(csd == csa->hdr); /* jnl_file_extend() shouldn't reset csd in MM */ assert((!JNL_ENABLED(csd)) && (JNL_ENABLED(csa))); status = cdb_sc_jnlclose; if ((CDB_STAGNATE - 1) == t_tries) release_crit = TRUE; goto failed; } assert(csd == csa->hdr); /* If MM, csd shouldn't have been reset */ } assert(jgbl.gbl_jrec_time >= jbp->prev_jrec_time); if (0 == jpc->pini_addr) jnl_put_jrt_pini(csa); if (JNL_HAS_EPOCH(jbp)) { if ((jbp->next_epoch_time <= jgbl.gbl_jrec_time) UNCONDITIONAL_EPOCH_ONLY(|| TRUE)) { /* Flush the cache. Since we are in crit, defer syncing epoch */ if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_IN_COMMIT)) { SET_WCS_FLU_FAIL_STATUS(status, csd); SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_end_jnl_wcsflu); goto failed; } assert(csd == csa->hdr); VMS_ONLY( if (csd->clustered && !CCP_SEGMENT_STATE(cnl, CCST_MASK_HAVE_DIRTY_BUFFERS)) { CCP_FID_MSG(gv_cur_region, CCTR_FLUSHLK); ccp_userwait(gv_cur_region, CCST_MASK_HAVE_DIRTY_BUFFERS, 0, cnl->ccp_cycle); } ) } } } else { if (SS_NORMAL != jpc->status) rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region), jpc->status); else rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region)); } } assert(!TREF(donot_commit)); /* We should never commit a transaction that was determined restartable */ assert(TN_NOT_SPECIFIED > MAX_TN_V5); /* Ensure TN_NOT_SPECIFIED isn't a valid TN number */ blktn = (TN_NOT_SPECIFIED == ctn) ? dbtn : ctn; csa->ti->early_tn = dbtn + 1; if (JNL_ENABLED(csa)) { DEBUG_ONLY(save_gbl_jrec_time = jgbl.gbl_jrec_time;) if (jbp->before_images) { /* do not write PBLKs if MUPIP REORG UPGRADE/DOWNGRADE with -NOSAFEJNL */ if (!mu_reorg_upgrd_dwngrd_in_prog || !mu_reorg_nosafejnl) { epoch_tn = jbp->epoch_tn; /* store in a local as it is used in a loop below */ for (cs = cw_set, cs_top = cs + cw_set_depth; cs < cs_top; ++cs) { /* PBLK computations for FREE blocks are not needed */ if (cs->was_free) continue; /* write out before-update journal image records */ mode = cs->mode; if (gds_t_committed < mode) { /* There are two possibilities at this point. * a) gds_t_write_root : In this case no need to write PBLK. * b) gds_t_busy2free : This is set by gvcst_bmp_mark_free to indicate * that a block has to be freed right away instead of taking it * through the RECYCLED state. This should be done only if * csd->db_got_to_v5_once has not yet become TRUE. 
Once it is * TRUE, block frees will write PBLK only when the block is reused. */ assert((gds_t_write_root == mode) || (gds_t_busy2free == mode)); if ((gds_t_write_root == mode) || (gds_t_busy2free == mode) && csd->db_got_to_v5_once) continue; } old_block = (blk_hdr_ptr_t)cs->old_block; ASSERT_IS_WITHIN_SHM_BOUNDS((sm_uc_ptr_t)old_block, csa); DBG_ENSURE_OLD_BLOCK_IS_VALID(cs, is_mm, csa, csd); if ((NULL != old_block) && (old_block->tn < epoch_tn)) { bsiz = old_block->bsiz; assert((bsiz <= csd->blk_size) || IS_DSE_IMAGE); assert(bsiz >= SIZEOF(blk_hdr) || IS_DSE_IMAGE); /* For acquired or gds_t_busy2free blocks, we should have computed * checksum already. The only exception is if we found no need to * compute checksum outside of crit but before we got crit, an * EPOCH got written concurrently so we have to write a PBLK (and * hence compute the checksum as well) when earlier we thought none * was necessary. An easy way to check this is that an EPOCH was * written AFTER we started this transaction. */ assert((gds_t_acquired != cs->mode) || (gds_t_busy2free != cs->mode) || cs->blk_checksum || (epoch_tn >= start_tn)); /* It is possible that the block has a bad block-size. * Before computing checksum ensure bsiz passed is safe. * The checks done here for "bsiz" assignment are * similar to those done in jnl_write_pblk/jnl_write_aimg. */ if (IS_DSE_IMAGE) bsiz = MIN(bsiz, csd->blk_size); assert(!cs->blk_checksum || (cs->blk_checksum == jnl_get_checksum((uint4 *)old_block, csa, bsiz))); if (!cs->blk_checksum) cs->blk_checksum = jnl_get_checksum((uint4 *)old_block, csa, bsiz); # ifdef GTM_CRYPT if (csd->is_encrypted) { DBG_ENSURE_PTR_IS_VALID_GLOBUFF(csa, csd, (sm_uc_ptr_t)old_block); DEBUG_ONLY(save_old_block = old_block;) old_block = (blk_hdr_ptr_t)GDS_ANY_ENCRYPTGLOBUF(old_block, csa); /* Ensure that the unencrypted block and it's twin counterpart are * in sync. */ assert(save_old_block->tn == old_block->tn); assert(save_old_block->bsiz == old_block->bsiz); assert(save_old_block->levl == old_block->levl); DBG_ENSURE_PTR_IS_VALID_ENCTWINGLOBUFF(csa, csd, (sm_uc_ptr_t)old_block); } # endif jnl_write_pblk(csa, cs, old_block); cs->jnl_freeaddr = jbp->freeaddr; } DEBUG_ONLY( else assert(0 == cs->jnl_freeaddr); ) } } } if (write_after_image) { /* either DSE or MUPIP RECOVER playing an AIMG record */ assert(1 == cw_set_depth); /* only one block at a time */ assert(!replication); cs = cw_set; jnl_write_aimg_rec(csa, cs); } else if (write_inctn) { assert(!replication); if ((inctn_blkupgrd == inctn_opcode) || (inctn_blkdwngrd == inctn_opcode)) { assert(1 == cw_set_depth); /* upgrade/downgrade one block at a time */ cs = cw_set; assert(inctn_detail.blknum_struct.blknum == cs->blk); assert(mu_reorg_upgrd_dwngrd_blktn < dbtn); if (mu_reorg_nosafejnl) { blktn = mu_reorg_upgrd_dwngrd_blktn; /* if NOSAFEJNL and there is going to be a block format change * as a result of this update, note it down in the inctn opcode * (for recovery) as there is no PBLK record for it to rely on. */ if (cs->ondsk_blkver != csd->desired_db_format) inctn_opcode = (inctn_opcode == inctn_blkupgrd) ? 
inctn_blkupgrd_fmtchng : inctn_blkdwngrd_fmtchng; } } jnl_write_inctn_rec(csa); } else if (0 == jnl_fence_ctl.level) { assert(!replication || !jgbl.forw_phase_recovery); if (replication) QWASSIGN(jnl_fence_ctl.token, tjpl->jnl_seqno); else if (!jgbl.forw_phase_recovery) QWASSIGN(jnl_fence_ctl.token, seq_num_zero); /* In case of forw-phase of recovery, token would have been set by mur_output_record */ jnl_write_logical(csa, non_tp_jfb_ptr); } else jnl_write_ztp_logical(csa, non_tp_jfb_ptr); /* Ensure jgbl.gbl_jrec_time did not get reset by any of the jnl writing functions */ assert(save_gbl_jrec_time == jgbl.gbl_jrec_time); } else if (replication) { /* Case where JNL_ENABLED(csa) is FALSE but REPL_WAS_ENABLED(csa) is TRUE and therefore we need to * write logical jnl records in the journal pool (no need to write in journal buffer or journal file). */ assert(!JNL_ENABLED(csa) && REPL_WAS_ENABLED(csa)); if (0 == jnl_fence_ctl.level) { QWASSIGN(jnl_fence_ctl.token, tjpl->jnl_seqno); jnl_write_logical(csa, non_tp_jfb_ptr); } else jnl_write_ztp_logical(csa, non_tp_jfb_ptr); } if (replication) { QWINCRBY(tjpl->jnl_seqno, seq_num_one); QWASSIGN(csa->hdr->reg_seqno, tjpl->jnl_seqno); if (is_updproc) { VMS_ONLY( QWINCRBY(jgbl.max_resync_seqno, seq_num_one); QWASSIGN(csa->hdr->resync_seqno, jgbl.max_resync_seqno); ) UNIX_ONLY( assert(REPL_PROTO_VER_UNINITIALIZED != recvpool.gtmrecv_local->last_valid_remote_proto_ver); if (REPL_PROTO_VER_DUALSITE == recvpool.gtmrecv_local->last_valid_remote_proto_ver) { QWINCRBY(jgbl.max_dualsite_resync_seqno, seq_num_one); QWASSIGN(csa->hdr->dualsite_resync_seqno, jgbl.max_dualsite_resync_seqno); } ) } } csa->prev_free_blks = csa->ti->free_blocks; csa->t_commit_crit = T_COMMIT_CRIT_PHASE1; if (cw_set_depth) { if (!is_mm) /* increment counter of # of processes that are actively doing two-phase commit */ INCR_WCS_PHASE2_COMMIT_PIDCNT(csa, cnl); # ifdef DEBUG /* Assert that cs->old_mode if uninitialized, never contains a negative value (relied by secshr_db_clnup) */ for (cs = cw_set, cs_top = cs + cw_set_depth; cs < cs_top; ++cs) assert(0 <= cs->old_mode); # endif for (cs = cw_set, cs_top = cs + cw_set_depth; cs < cs_top; ++cs) { mode = cs->mode; assert((gds_t_write_root != mode) || ((cs - cw_set) + 1 == cw_depth)); cs->old_mode = (int4)mode; /* note down before being reset to gds_t_committed */ assert(gds_t_committed < gds_t_write_root); assert(gds_t_committed < gds_t_busy2free); assert((n_gds_t_op != mode) && (gds_t_committed != mode)); assert((kill_t_write != mode) && (kill_t_create != mode)); if (gds_t_committed > mode) { DEBUG_ONLY( /* Check bitmap status of block we are about to modify. * Two exceptions are * a) DSE which can modify bitmaps at will. * b) MUPIP RECOVER writing an AIMG. In this case it is playing * forward a DSE action so is effectively like DSE doing it. */ if (!IS_DSE_IMAGE && !write_after_image) bml_status_check(cs); ) if (is_mm) status = mm_update(cs, dbtn, blktn, dummysi); else { if (csd->dsid) { if (ERR_GVKILLFAIL == t_err) { if (cs == cw_set) { if ((gds_t_acquired == mode) || ((cw_set_depth > 1) && (0 == cw_set[1].level))) rc_cpt_inval(); else rc_cpt_entry(cs->blk); } } else if (0 == cs->level) rc_cpt_entry(cs->blk); } /* Do phase1 of bg_update while holding crit on the database. * This will lock the buffers that need to be changed. * Once crit is released, invoke phase2 which will update those locked buffers. * The exception is if it is a bitmap block. 
In that case we also do phase2 * while holding crit so the next process to use this bitmap will see a * consistent copy of this bitmap when it gets crit for commit. This avoids * the reallocate_bitmap routine from restarting or having to wait for a * concurrent phase2 construction to finish. When the change request C9E11-002651 * (to reduce restarts due to bitmap collisions) is addressed, we can reexamine * whether it makes sense to move bitmap block builds back to phase2. */ status = bg_update_phase1(cs, dbtn, dummysi); if ((cdb_sc_normal == status) && (gds_t_writemap == mode)) { status = bg_update_phase2(cs, dbtn, blktn, dummysi); if (cdb_sc_normal == status) cs->mode = gds_t_committed; } } if (cdb_sc_normal != status) { /* the database is probably in trouble */ INVOKE_T_COMMIT_CLEANUP(status, csa); assert(cdb_sc_normal == status); /* At this time "cr_array_index" could be non-zero and a few cache-records might * have their "in_cw_set" field set to TRUE. We should not reset "in_cw_set" as we * don't hold crit at this point and also because we might still need those buffers * pinned until their before-images are backed up in wcs_recover (in case an * online backup was running while secshr_db_clnup did its job). Reset the * local variable "cr_array_index" though so we do not accidentally reset the * "in_cw_set" fields ourselves before the wcs_recover. */ cr_array_index = 0; goto skip_cr_array; /* hence skip until past "cr_array_index" processing */ } } } } /* signal secshr_db_clnup/t_commit_cleanup, roll-back is no longer possible */ update_trans |= UPDTRNS_TCOMMIT_STARTED_MASK; assert(cdb_sc_normal == status); assert(!csd->freeze || IS_DSE_IMAGE); /* should never increment curr_tn on a frozen database except if DSE */ INCREMENT_CURR_TN(csd); csa->t_commit_crit = T_COMMIT_CRIT_PHASE2; /* set this BEFORE releasing crit but AFTER incrementing curr_tn */ /* If db is journaled, then db header is flushed periodically when writing the EPOCH record, * otherwise do it here every HEADER_UPDATE_COUNT transactions. */ assert(!JNL_ENABLED(csa) || (jbp == csa->jnl->jnl_buff)); if ((!JNL_ENABLED(csa) || !JNL_HAS_EPOCH(jbp)) && !(csd->trans_hist.curr_tn & (HEADER_UPDATE_COUNT - 1))) fileheader_sync(gv_cur_region); if (need_kip_incr) /* increment kill_in_prog */ { INCR_KIP(csd, csa, kip_csa); need_kip_incr = FALSE; } start_tn = dbtn; /* start_tn temporarily used to store currtn (for bg_update_phase2) before releasing crit */ } if (!is_mm && busy2free_seen) { assert((gds_t_busy2free == cw_set[0].mode) && cr_array_index && (cw_set[0].blk == cr_array[0]->blk)); assert(process_id == cr_array[0]->in_cw_set); UNPIN_CACHE_RECORD(cr_array[0]); /* need to do this BEFORE releasing crit as we have no other lock on this buffer */ } if (!csa->hold_onto_crit) rel_crit(gv_cur_region); assert(!replication || update_trans); if (replication) { assert(QWGT(jpl->early_write_addr, jpl->write_addr)); assert(tmp_cumul_jnl_rec_len == (tjpl->write - jpl->write + (tjpl->write > jpl->write ? 0 : jpl->jnlpool_size))); /* the following statements should be atomic */ jnl_header = (jnldata_hdr_ptr_t)(jnlpool.jnldata_base + jpl->write); jnl_header->jnldata_len = tmp_cumul_jnl_rec_len; jnl_header->prev_jnldata_len = jpl->lastwrite_len; /* The following assert should be an == rather than a >= (as in tp_tend) because, we have * either one or no update. If no update, we would have no cw_depth and we wouldn't enter * this path. If there is an update, then both the indices should be 1. 
*/ INT8_ONLY(assert(jgbl.cumul_index == jgbl.cu_jnl_index)); jpl->lastwrite_len = jnl_header->jnldata_len; /* For systems with UNORDERED memory access (example, ALPHA, POWER4, PA-RISC 2.0), on a * multi processor system, it is possible that the source server notices the change in * write_addr before seeing the change to jnlheader->jnldata_len, leading it to read an * invalid transaction length. To avoid such conditions, we should commit the order of * shared memory updates before we update write_addr. This ensures that the source server * sees all shared memory updates related to a transaction before the change in write_addr */ SHM_WRITE_MEMORY_BARRIER; jpl->write = tjpl->write; /* jpl->write_addr should be updated before updating jpl->jnl_seqno as secshr_db_clnup relies on this */ QWINCRBYDW(jpl->write_addr, jnl_header->jnldata_len); assert(QWEQ(jpl->early_write_addr, jpl->write_addr)); jpl->jnl_seqno = tjpl->jnl_seqno; if (!repl_csa->hold_onto_crit) rel_lock(jnlpool.jnlpool_dummy_reg); } /* If BG, check that we have not pinned any more buffers than we are updating */ DBG_CHECK_PINNED_CR_ARRAY_CONTENTS(is_mm, cr_array, cr_array_index, csd->bplmap); if (cw_set_depth) { /* Finish 2nd phase of commit for BG (updating the buffers in phase1) now that CRIT has been released. * For MM, only thing needed is to set cs->mode to gds_t_committed. */ for (cs = cw_set, cs_top = cs + cw_set_depth; cs < cs_top; ++cs) { mode = cs->mode; assert((gds_t_write_root != mode) || ((cs - cw_set) + 1 == cw_depth)); assert((kill_t_write != mode) && (kill_t_create != mode)); if (gds_t_committed > mode) { if (!is_mm) { /* Validate old_mode noted down in first phase is the same as the current mode. * Note that cs->old_mode is negated by bg_update_phase1 (to help secshr_db_clnup). */ assert(-cs->old_mode == mode); status = bg_update_phase2(cs, dbtn, blktn, dummysi); if (cdb_sc_normal != status) { /* the database is probably in trouble */ INVOKE_T_COMMIT_CLEANUP(status, csa); assert(cdb_sc_normal == status); /* At this time "cr_array_index" could be non-zero and a few cache-records might * have their "in_cw_set" field set to TRUE. We should not reset "in_cw_set" as we * don't hold crit at this point and also because we might still need those buffers * pinned until their before-images are backed up in wcs_recover (in case an * online backup was running while secshr_db_clnup did its job). Reset the * local variable "cr_array_index" though so we do not accidentally reset the * "in_cw_set" fields ourselves before the wcs_recover. */ cr_array_index = 0; /* Note that seshr_db_clnup (invoked by t_commit_cleanup above) would have * done a lot of cleanup for us including decrementing the wcs_phase2_commit_pidcnt * so it is ok to skip all that processing below and go directly to skip_cr_array. */ goto skip_cr_array; /* hence skip until past "cr_array_index" processing */ } } } cs->mode = gds_t_committed; } if (!is_mm) /* now that two-phase commit is done, decrement counter */ DECR_WCS_PHASE2_COMMIT_PIDCNT(csa, cnl); } UNPIN_CR_ARRAY_ON_COMMIT(cr_array, cr_array_index); assert(!cr_array_index); csa->t_commit_crit = FALSE; /* Phase 2 commits are completed. See if we had done a snapshot init (csa->snapshot_in_prog == TRUE). If so, * try releasing the resources obtained while snapshot init. 
*/ if (SNAPSHOTS_IN_PROG(csa)) { assert(update_trans); SS_RELEASE_IF_NEEDED(csa, cnl); } skip_cr_array: assert(!csa->now_crit || csa->hold_onto_crit); assert(cdb_sc_normal == status); REVERT; /* no need for t_ch to be invoked if any errors occur after this point */ DEFERRED_EXIT_HANDLING_CHECK; /* now that all crits are released, check if deferred signal/exit handling needs to be done */ if (block_saved) backup_buffer_flush(gv_cur_region); if (unhandled_stale_timer_pop) process_deferred_stale(); if (update_trans) { wcs_timer_start(gv_cur_region, TRUE); if (REPL_ALLOWED(csa) && IS_DSE_IMAGE) { temp_tn = dbtn + 1; send_msg(VARLSTCNT(6) ERR_NOTREPLICATED, 4, &temp_tn, LEN_AND_LIT("DSE"), process_id); } INCR_GVSTATS_COUNTER(csa, cnl, n_nontp_readwrite, 1); INCR_GVSTATS_COUNTER(csa, cnl, n_nontp_blkread, n_blks_validated); INCR_GVSTATS_COUNTER(csa, cnl, n_nontp_blkwrite, cw_set_depth); GVSTATS_SET_CSA_STATISTIC(csa, db_curr_tn, dbtn); } else { INCR_GVSTATS_COUNTER(csa, cnl, n_nontp_readonly, 1); INCR_GVSTATS_COUNTER(csa, cnl, n_nontp_blkread, n_blks_validated); } /* "secshr_db_clnup/t_commit_cleanup" assume an active non-TP transaction if cw_set_depth is non-zero * or if update_trans is set to T_COMMIT_STARTED. Now that the transaction is complete, reset these fields. */ cw_set_depth = 0; update_trans = 0; CWS_RESET; t_tries = 0; /* commit was successful so reset t_tries */ assert(0 == cr_array_index); return dbtn; failed: assert(cdb_sc_normal != status); REVERT; failed_skip_revert: RESTORE_CURRTN_IF_NEEDED(csa, write_inctn, decremented_currtn); retvalue = t_commit_cleanup(status, 0); /* we expect to get a return value indicating update was NOT underway */ assert(!retvalue); /* if it was, then we would have done a "goto skip_cr_array:" instead */ if (NULL != gv_target) /* gv_target can be NULL in case of DSE MAPS command etc. */ gv_target->clue.end = 0; if (release_crit && csa->now_crit && !csa->hold_onto_crit) rel_crit(gv_cur_region); DEFERRED_EXIT_HANDLING_CHECK; /* now that all crits are released, check if deferred signal/exit handling needs to be done */ t_retry(status); /* in the retry case, we do not do a CWS_RESET as cw_stagnate is used only in the * final retry in which case t_end will succeed and do a CWS_RESET */ cw_map_depth = 0; assert(0 == cr_array_index); return 0; }
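/* Illustrative sketch (hedged; not part of the original logic of this file): based on the retry handling above,
 * non-TP callers build their search histories and cw_set (via t_write/t_create and friends) and then loop on
 * t_end. A zero return means the transaction restarted and t_end has already invoked t_retry(status); a
 * non-zero return is the committed transaction number. The helper name "build_non_tp_update" is hypothetical,
 * and the t_begin call reflects an assumption about the caller's setup elsewhere in the codebase, not something
 * this file establishes.
 *
 *	trans_num	tn;
 *
 *	t_begin(ERR_GVPUTFAIL, UPDTRNS_DB_UPDATED_MASK);	// assumed caller-side transaction setup
 *	for (;;)
 *	{
 *		build_non_tp_update();	// hypothetical: gvcst_search + t_write/t_create to fill cw_set
 *		tn = t_end(&hist1, &hist2, TN_NOT_SPECIFIED);	// TN_NOT_SPECIFIED: stamp blocks with the db tn
 *		if ((trans_num)0 != tn)
 *			break;		// committed at transaction number "tn"
 *		// zero return: restart; histories and cw_set must be rebuilt before retrying
 *	}
 */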