fis-gtm/sr_port/tp_clean_up.c

/****************************************************************
 *								*
 *	Copyright 2001, 2011 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include <signal.h>             /* for VSIG_ATOMIC_T type */

#include "gdsroot.h"
#include "gdskill.h"
#include "gdsbt.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsfhead.h"
#include "gdscc.h"
#include "filestruct.h"
#include "copy.h"
#include "jnl.h"
#include "buddy_list.h"		/* needed for tp.h */
#include "hashtab_int4.h"	/* needed for tp.h and cws_insert.h */
#include "tp.h"
#include "tp_change_reg.h"
#include "cws_insert.h"		/* for cw_stagnate_reinitialized */
#include "gdsblkops.h"		/* for RESET_UPDATE_ARRAY macro */
#include "error.h"
#include "have_crit.h"
#include "min_max.h"
#ifdef GTM_TRIGGER
#include "rtnhdr.h"
#include "gv_trigger.h"		/* for INVALIDATE_TRIGGER_CYCLES_IF_NEEDED macro */
#endif

GBLREF	jnl_fence_control	jnl_fence_ctl;
GBLREF	sgm_info		*sgm_info_ptr, *first_sgm_info;
GBLREF	sgm_info		*first_tp_si_by_ftok; /* List of participating regions in the TP transaction sorted on ftok order */
GBLREF	ua_list			*curr_ua, *first_ua;
GBLREF	char			*update_array, *update_array_ptr;
GBLREF	block_id		tp_allocation_clue;
GBLREF	uint4			update_array_size, cumul_update_array_size;
GBLREF	gd_region		*gv_cur_region;
GBLREF	sgmnt_addrs		*cs_addrs;
GBLREF	gv_namehead		*gv_target_list, *gvt_tp_list;
GBLREF	trans_num		local_tn;
GBLREF	sgmnt_data_ptr_t	cs_data;
GBLREF	buddy_list		*global_tlvl_info_list;
GBLREF	global_tlvl_info	*global_tlvl_info_head;
GBLREF	jnl_gbls_t		jgbl;
GBLREF	int			process_exiting;
GBLREF	block_id		gtm_tp_allocation_clue;	/* block# hint to start allocation for created blocks in TP */
#ifdef VMS
GBLREF	boolean_t		tp_has_kill_t_cse; /* cse->mode of kill_t_write or kill_t_create got created in this transaction */
#endif
#ifdef DEBUG
GBLREF	unsigned int		t_tries;
#endif

error_def(ERR_MEMORY);
error_def(ERR_VMSMEMORY);

void	tp_clean_up(boolean_t rollback_flag)
{
	gv_namehead	*gvnh, *blk_target;
	sgm_info	*si, *next_si;
	kill_set	*ks;
	cw_set_element	*cse, *cse1;
	int		level;
	int4		depth;
	uint4		tmp_update_array_size;
	off_chain	chain1;
	ua_list		*next_ua, *tmp_ua;
	srch_blk_status	*t1;
	boolean_t       is_mm;
	sgmnt_addrs	*csa;
	block_id	cseblk, histblk;
	cache_rec_ptr_t	cr;
	int4		upd_trans;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	/* We are about to clean up structures. Defer MUPIP STOP/signal handling until function end. */
	DEFER_INTERRUPTS(INTRPT_IN_TP_CLEAN_UP);

	assert((NULL != first_sgm_info) || (0 == cw_stagnate.size) || cw_stagnate_reinitialized);
		/* if no database activity, cw_stagnate should be uninitialized or reinitialized */
	DEBUG_ONLY(
		if (rollback_flag)
			TREF(donot_commit) = FALSE;
		assert(!TREF(donot_commit));
	)
	if (NULL != first_sgm_info)
	{	/* It is possible that first_ua is NULL at this point due to a prior call to tp_clean_up() that failed in
		 * malloc() of tmp_ua->update_array. This is possible because we might have originally had two chunks of
		 * update_arrays each x-bytes in size and we freed them up and requested 2x-bytes of contiguous storage
		 * and we might error out on that malloc attempt (though this is very improbable).
		 */
		if ((NULL != first_ua) && (NULL != first_ua->next_ua)
		    && !process_exiting && (UNIX_ONLY(ERR_MEMORY) VMS_ONLY(ERR_VMSMEMORY) != error_condition))
		{	/* if the original update array was too small, make a new larger one */
			/* tmp_update_array_size is used below instead of the global variables (update_array_size,
			 * first_ua->update_array_size or cumul_update_array_size) to handle error returns from	malloc()
			 * The global variables are reset to represent a NULL update_array before the malloc. If the malloc
			 * succeeds, they will be assigned the value of tmp_update_array_size and otherwise (if malloc fails
			 * due to memory exhausted situation) they stay NULL which is the right thing to do.
			 */
			update_array_size = 0;
			for (curr_ua = first_ua, tmp_update_array_size = 0; curr_ua != NULL; curr_ua = next_ua)
			{
				next_ua = curr_ua->next_ua;
				/* curr_ua->update_array can be NULL in case we got an error in the ENSURE_UPDATE_ARRAY_SPACE
				 * macro while trying to do the malloc of the update array. Since tp_clean_up() is called in
				 * most exit handling code, it has to be very careful, hence the checks for non-NULLness below.
				 */
				if (NULL != curr_ua->update_array)
				{
					free(curr_ua->update_array);
					tmp_update_array_size += curr_ua->update_array_size;
						/* add up only those update arrays that have been successfully malloced */
				}
				if (curr_ua != first_ua)
					free(curr_ua);
			}
			assert(tmp_update_array_size == cumul_update_array_size);
			tmp_ua = first_ua;
			curr_ua = first_ua = NULL;	/* reset to indicate no update-array temporarily */
			if (NULL != tmp_ua)
			{
				tmp_ua->next_ua = NULL;
				tmp_ua->update_array = update_array = update_array_ptr = NULL;
				tmp_ua->update_array_size = cumul_update_array_size = 0;
				if (BIG_UA < tmp_update_array_size)
					tmp_update_array_size = BIG_UA;
				tmp_ua->update_array = (char *)malloc(tmp_update_array_size);
				/* assign global variables only after malloc() succeeds */
				update_array = tmp_ua->update_array;
				cumul_update_array_size = update_array_size = tmp_ua->update_array_size = tmp_update_array_size;
				curr_ua = first_ua = tmp_ua; /* set first_ua to non-NULL value once all mallocs are successful */
			}
		}
		RESET_UPDATE_ARRAY; /* do not use CHECK_AND_RESET_UPDATE_ARRAY since we are in TP and will fail the check there */
		if (rollback_flag) 		/* Rollback invalidates clues in all targets used by this transaction */
		{
			for (gvnh = gvt_tp_list; NULL != gvnh; gvnh = gvnh->next_tp_gvnh)
			{
				assert(gvnh->read_local_tn == local_tn);
				gvnh->clue.end = 0;
				chain1 = *(off_chain *)&gvnh->root;
				if (chain1.flag)
				{
					DEBUG_ONLY(csa = gvnh->gd_csa;)
					assert(csa->dir_tree != gvnh);
					gvnh->root = 0;
				}
				/* Cleanup any block-split info (of created block #) in gvtarget histories */
				TP_CLEANUP_GVNH_SPLIT_IF_NEEDED(gvnh, 0);
			}
			GTMTRIG_ONLY(INVALIDATE_TRIGGER_CYCLES_IF_NEEDED(FALSE, FALSE));
#			ifdef DEBUG
			if (!process_exiting)
			{	/* Ensure that we did not miss out on resetting clue for any gvtarget.
				 * Dont do this if the process is cleaning up the TP transaction as part of exit handling
				 * as the tp_clean_up invocation could be due to an interrupt (MUPIP STOP etc.) and we cannot
				 * be sure what state the mainline code was when it was interrupted. Thankfully, the clue
				 * will be used only as part of the next transaction. Since the process is in the process of
				 * exiting, the clue will never be used so it is ok for it to be non-zero in that case.
				 */
				for (gvnh = gv_target_list; NULL != gvnh; gvnh = gvnh->next_gvnh)
				{
					assert((gvnh->read_local_tn != local_tn) || (0 == gvnh->clue.end));
					chain1 = *(off_chain *)&gvnh->root;
					assert(!chain1.flag);	/* Also assert that all gvts in this process have valid root blk */
				}
			}
#			endif
			local_tn++;	/* to effectively invalidate first_tp_srch_status of all gv_targets */
		} else
		{
			GTMTRIG_ONLY(INVALIDATE_TRIGGER_CYCLES_IF_NEEDED(FALSE, TRUE));
		}
		GTMTRIG_ONLY(ASSERT_ZTRIGGER_CYCLE_RESET;) /* for all regions, we better have csa->db_dztrigger_cycle = 0*/
		for (si = first_sgm_info;  si != NULL;  si = next_si)
		{
			TP_TEND_CHANGE_REG(si);
			upd_trans = si->update_trans;	/* copy in local for debugging purposes in case later asserts fail */
			if (upd_trans)
			{
				if (NULL != (ks = si->kill_set_head))
				{
					FREE_KILL_SET(si, ks);
					si->kill_set_tail = NULL;
					si->kill_set_head = NULL;
				}
				if (NULL != si->jnl_head)
				{
					REINITIALIZE_LIST(si->format_buff_list);
					REINITIALIZE_LIST(si->jnl_list);		/* reinitialize the jnl buddy_list */
					si->jnl_tail = &si->jnl_head;
					si->jnl_head = NULL;
				}
				/* Note that cs_addrs->next_fenced could be non-NULL not just for those regions with a non-NULL
				 * value of si->jnl_head but also for those regions where an INCTN record (with opcode
				 * inctn_tp_upd_no_logical_rec) was written. So reset cs_addrs->next_fenced unconditionally.
				 */
				cs_addrs->next_fenced = NULL;
				if (FALSE == rollback_flag)
				{	/* Non-rollback case (op_tcommit) validates clues in the targets we are updating */
					sgm_info_ptr = si;	/* for tp_get_cw to work */
					is_mm = (dba_mm == gv_cur_region->dyn.addr->acc_meth);
					for (cse = si->first_cw_set; cse != si->first_cw_bitmap; cse = cse->next_cw_set)
					{
						assert(0 < cse->old_mode); /* assert that phase2 is complete on this block */
						if (n_gds_t_op < cse->old_mode)
						{	/* cse's block no longer exists in db so no clue can/should point to it */
							assert((kill_t_create == cse->old_mode) || (kill_t_write == cse->old_mode));
							continue;
						}
						TRAVERSE_TO_LATEST_CSE(cse);
						assert(NULL == cse->new_buff || NULL != cse->blk_target);
						if (NULL == (blk_target = cse->blk_target))
							continue;
						if (blk_target->split_cleanup_needed)
						{
							for (level = 0; level < ARRAYSIZE(blk_target->last_split_blk_num); level++)
							{
								chain1 = *(off_chain *)&blk_target->last_split_blk_num[level];
								if (chain1.flag)
								{
									if (chain1.cw_index < si->cw_set_depth)
									{
										tp_get_cw(si->first_cw_set,
												(int)chain1.cw_index, &cse1);
										assert(NULL != cse1);
										histblk = cse1->blk;
									} else
									{	/* out of design situation. fix & proceed in pro */
										assert(FALSE);
										histblk = 0;
									}
									blk_target->last_split_blk_num[level] = histblk;
								}
							}
							blk_target->split_cleanup_needed = FALSE;
						}
						if (0 == blk_target->clue.end)
						{
							chain1 = *(off_chain *)&blk_target->root;
							if (chain1.flag)
							{
								assert(blk_target != cs_addrs->dir_tree);
								tp_get_cw(si->first_cw_set, (int)chain1.cw_index, &cse1);
								assert(NULL != cse1);
								blk_target->root = cse1->blk;
							}
							continue;
						}
						depth = blk_target->hist.depth;
						level = (int)cse->level;
						if (level > depth)
							continue;
						t1 = &blk_target->hist.h[level];
						cseblk = cse->blk;
						histblk = t1->blk_num;
						if (cseblk == histblk)
						{
							assert(!((off_chain *)&histblk)->flag);
							if (!is_mm)
							{
								cr = cse->cr;
								assert(NULL != cr);
								UNIX_ONLY(assert((NULL == t1->cr) || (t1->cr == cr)));
								if (cr != t1->cr)
								{
									t1->cr = cr;
									t1->cycle = cse->cycle;
									t1->buffaddr = GDS_REL2ABS(cr->buffaddr);
								} else
								{
									assert(t1->cr == cr);
									assert(t1->cycle == cse->cycle);
									assert(t1->buffaddr == GDS_REL2ABS(cr->buffaddr));
								}
							} else
							{
								t1->buffaddr = cs_addrs->acc_meth.mm.base_addr
											+ (sm_off_t)cs_data->blk_size * cseblk;
								assert(NULL == t1->cr);
							}
							t1->cse = NULL;
						} else
						{
							chain1 = *(off_chain *)&histblk;
							if (chain1.flag)
							{
								tp_get_cw(si->first_cw_set, (int)chain1.cw_index, &cse1);
								if (cse == cse1)
								{
									if (blk_target->root == histblk)
										blk_target->root = cseblk;
									t1->blk_num = cseblk;
									if (is_mm)
										t1->buffaddr =
											cs_addrs->acc_meth.mm.base_addr
											+ (sm_off_t)cs_data->blk_size * cseblk;
									else
									{
										cr = cse->cr;
										assert(NULL != cr);
										t1->cr = cr;
										t1->cycle = cse->cycle;
										t1->buffaddr = GDS_REL2ABS(cr->buffaddr);
									}
									t1->cse = NULL;
								}
							}
						}
					}
				}
				si->total_jnl_rec_size = cs_addrs->min_total_tpjnl_rec_size; /* Reinitialize total_jnl_rec_size */
				REINITIALIZE_LIST(si->recompute_list);
				REINITIALIZE_LIST(si->cw_set_list);	/* reinitialize the cw_set buddy_list */
				REINITIALIZE_LIST(si->new_buff_list);	/* reinitialize the new_buff buddy_list */
				REINITIALIZE_LIST(si->tlvl_cw_set_list);	/* reinitialize the tlvl_cw_set buddy_list */
				REINITIALIZE_LIST(si->tlvl_info_list);		/* reinitialize the tlvl_info buddy_list */
				si->first_cw_set = si->last_cw_set = si->first_cw_bitmap = NULL;
				si->cw_set_depth = 0;
				si->update_trans = 0;
			} else if (rollback_flag)
				REINITIALIZE_LIST(si->tlvl_info_list);		/* reinitialize the tlvl_info buddy_list */
#			ifdef DEBUG
			/* Verify that all fields that were reset in the if code above are already at the reset value.
			 * There are NO exceptions to this rule. If this transaction had si->update_trans non-zero at some
			 * point but later did rollbacks which caused it to become FALSE, the incremental rollback would
			 * have taken care to reset these fields explicitly.
			 */
			assert(si->tp_csa == cs_addrs);
			DBG_CHECK_SI_BUDDY_LIST_IS_REINITIALIZED(si);
			VERIFY_LIST_IS_REINITIALIZED(si->tlvl_info_list);
#			endif
			if (si->num_of_blks)
			{	/* Check that it is the same as the # of used entries in the hashtable.
				 * The only exception is if we got interrupted by a signal right after updating one
				 * but before updating the other which triggered exit handling for this process.
				 */
				assert(si->num_of_blks == si->blks_in_use->count
					|| process_exiting && (si->num_of_blks == (si->blks_in_use->count - 1)));
				reinitialize_hashtab_int4(si->blks_in_use);
				si->num_of_blks = 0;
			}
			si->cr_array_index = 0;			/* reinitialize si->cr_array */
			si->last_tp_hist = si->first_tp_hist;		/* reinitialize the tp history */
			si->fresh_start = TRUE;
			si->tlvl_info_head = NULL;
			next_si = si->next_sgm_info;
			si->next_sgm_info = NULL;
		}	/* for (all segments in the transaction) */
		jnl_fence_ctl.fence_list = JNL_FENCE_LIST_END;
		/* No need to clean up jnl_fence_ctl.inctn_fence_list as it is used only by tp_tend and op_tcommit (after
		 * tp_tend is invoked) and is initialized to JNL_FENCE_LIST_END before both those usages. If any more
		 * usages of jnl_fence_ctl.inctn_fence_list occur, then this comment needs to be revisited.
		 */
#		ifdef DEBUG
		if (!process_exiting)
		{	/* Ensure that we did not miss out on clearing any gv_target->root which had chain.flag set.
			 * Dont do this if the process is cleaning up the TP transaction as part of exit handling
			 * Also use this opportunity to check that non-zero clues for BG contain non-null cr in histories.
			 * In addition, check that the list of multi-level block numbers (involved in the most recent split
			 * operations) stored in the gv_target are valid block #s.
			 */
			for (gvnh = gv_target_list; NULL != gvnh; gvnh = gvnh->next_gvnh)
			{
				chain1 = *(off_chain *)&gvnh->root;
				assert(!chain1.flag);
				for (level = 0; level < ARRAYSIZE(gvnh->last_split_blk_num); level++)
				{
					chain1 = *(off_chain *)&gvnh->last_split_blk_num[level];
					assert(!chain1.flag);
				}
				/* If there was a gvnh->write_local_tn, we could assert that if ever that field was updated
				 * in this transaction, then gvnh->root better be non-zero. Otherwise gvnh could have been
				 * used only for reads in this TP and in that case it is ok for the root to be 0.
				 */
				if (gvnh->root)
				{	/* check that gv_target->root falls within total blocks range */
					csa = gvnh->gd_csa;
					assert(NULL != csa);
					assert(gvnh->root < csa->ti->total_blks);
					assert(!IS_BITMAP_BLK(gvnh->root));
				}
				if (gvnh->clue.end)
				{
					is_mm = (dba_mm == gvnh->gd_csa->hdr->acc_meth);
					for (t1 = gvnh->hist.h; t1->blk_num; t1++)
					{
						assert(is_mm || (NULL != t1->cr));
						assert(NULL == t1->cse);
					}
					/* Now that we know the clue is non-zero, validate first_rec, clue & last_rec fields
					 * (BEFORE this clue could be used in a future transaction).
					 */
					DEBUG_GVT_CLUE_VALIDATE(gvnh);
				}
			}
		}
#		endif
		jgbl.cumul_jnl_rec_len = 0;
		jgbl.tp_ztp_jnl_upd_num = 0;
		GTMTRIG_ONLY(
			/* reset jgbl.prev_ztworm_ptr as we are now ready to start a new transaction
			 * and thus need to write new ztwormhole records if needed
			 */
			jgbl.prev_ztworm_ptr = NULL;
		)
		DEBUG_ONLY(jgbl.cumul_index = jgbl.cu_jnl_index = 0;)
		global_tlvl_info_head = NULL;
		REINITIALIZE_LIST(global_tlvl_info_list);		/* reinitialize the global_tlvl_info buddy_list */
		gvt_tp_list = NULL;
		CWS_RESET; /* reinitialize the hashtable before restarting/committing the TP transaction */
	}	/* if (any database work in the transaction) */
	VMS_ONLY(tp_has_kill_t_cse = FALSE;)
	tp_allocation_clue = gtm_tp_allocation_clue + 1;
	sgm_info_ptr = NULL;
	first_sgm_info = NULL;
	/* ensure that we don't have crit on any region at the end of a TP transaction (be it GT.M or MUPIP) */
	assert((CDB_STAGNATE == t_tries) || (0 == have_crit(CRIT_HAVE_ANY_REG)));
	/* Now that this transaction try is done (need to start a fresh try in case of a restart; in case of commit the entire
	 * transaction is done) ensure first_tp_si_by_ftok is NULL at end of tp_clean_up as this field is relied upon by
	 * secshr_db_clnup and t_commit_cleanup to determine if we have an ongoing transaction. In case of a successfully
	 * committing transaction (rollback_flag == FALSE), this should be guaranteed already. So we might need to do the reset
	 * only in case rollback_flag == TRUE but since that is an if condition which involves a pipeline break we avoid it by
	 * doing the set to NULL unconditionally.
	 */
	assert(rollback_flag || (NULL == first_tp_si_by_ftok));
	first_tp_si_by_ftok = NULL;
	ENABLE_INTERRUPTS(INTRPT_IN_TP_CLEAN_UP);	/* check if any MUPIP STOP/signals were deferred while in this function */
}