fis-gtm/sr_port/gvcst_init.c

/****************************************************************
 *								*
 *	Copyright 2001, 2013 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include <stddef.h>		/* for offsetof macro */

#include "gtm_string.h"
#include "gtm_time.h"

#include "cdb_sc.h"
#include "gdsroot.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "gdsblk.h"
#include "gdskill.h"
#include "gdscc.h"
#include "min_max.h"		/* needed for gdsblkops.h */
#include "gdsblkops.h"
#include "filestruct.h"
#include "iosp.h"
#include "jnl.h"
#include "hashtab_int4.h"	/* needed for tp.h */
#include "buddy_list.h"		/* needed for tp.h */
#include "tp.h"
#include "gtm_stdlib.h"		/* for ATOI */
#include "cryptdef.h"
#include "mlkdef.h"
#include "error.h"
#include "gt_timer.h"
#include "gtmimagename.h"
#include "trans_log_name.h"
#include "gtm_logicals.h"
#include "dbfilop.h"
#include "set_num_additional_processors.h"
#include "have_crit.h"
#include "t_retry.h"
#include "dpgbldir.h"
#include "longset.h"		/* needed for cws_insert.h */
#include "cws_insert.h"		/* for CWS_INIT macro */
#include "gvcst_protos.h"	/* for gvcst_init,gvcst_init_sysops,gvcst_tp_init prototype */
#include "compswap.h"
#include "send_msg.h"
#include "targ_alloc.h"		/* for "targ_free" prototype */
#include "hashtab_mname.h"
#include "process_gvt_pending_list.h"
#include "gtmmsg.h"
#ifdef UNIX
#include "heartbeat_timer.h"
#include "anticipatory_freeze.h"
#include "wbox_test_init.h"

#define MAX_DBINIT_RETRY	4
#endif

#ifdef	GTM_FD_TRACE
#include "gtm_dbjnl_dupfd_check.h"
#endif

GBLREF	gd_region		*gv_cur_region, *db_init_region;
GBLREF	sgmnt_data_ptr_t	cs_data;
GBLREF	sgmnt_addrs		*cs_addrs;
GBLREF	sgmnt_addrs		*cs_addrs_list;
GBLREF	boolean_t		gtcm_connection;
GBLREF	bool			licensed;
GBLREF	int4			lkid;
GBLREF	char			*update_array, *update_array_ptr;
GBLREF	uint4			update_array_size, cumul_update_array_size;
GBLREF	ua_list			*first_ua, *curr_ua;
GBLREF	short			crash_count;
GBLREF	uint4			dollar_tlevel;
GBLREF	jnl_format_buffer	*non_tp_jfb_ptr;
GBLREF	boolean_t		mupip_jnl_recover;
GBLREF	buddy_list		*global_tlvl_info_list;
GBLREF	tp_region		*tp_reg_free_list;	/* Ptr to list of tp_regions that are unused */
GBLREF	tp_region		*tp_reg_list;		/* Ptr to list of tp_regions for this transaction */
GBLREF	unsigned int		t_tries;
GBLREF	struct_jrec_tcom	tcom_record;
GBLREF	boolean_t		tp_in_use;
GBLREF	uint4			region_open_count;
GBLREF	sm_uc_ptr_t		reformat_buffer;
GBLREF	int			reformat_buffer_len;
GBLREF	volatile int		reformat_buffer_in_use;	/* used only in DEBUG mode */
GBLREF	volatile int4		fast_lock_count;
GBLREF	gvt_container		*gvt_pending_list;
GBLREF	boolean_t		dse_running;
GBLREF	jnl_gbls_t		jgbl;
#ifdef UNIX
GBLREF	boolean_t		pool_init;
GBLREF	boolean_t		jnlpool_init_needed;
GBLREF	jnlpool_addrs		jnlpool;
#endif

LITREF char			gtm_release_name[];
LITREF int4			gtm_release_name_len;

error_def(ERR_BADDBVER);
error_def(ERR_DBCREINCOMP);
error_def(ERR_DBFLCORRP);
error_def(ERR_DBNOTGDS);
error_def(ERR_DBVERPERFWARN1);
error_def(ERR_DBVERPERFWARN2);
error_def(ERR_MMNODYNUPGRD);
error_def(ERR_REGOPENFAIL);

void	assert_jrec_member_offsets(void)
{
	assert(REAL_JNL_HDR_LEN % DISK_BLOCK_SIZE == 0);
	assert(JNL_HDR_LEN % DISK_BLOCK_SIZE == 0);
	/* We currently assume that the journal file header size is aligned relative to the filesystem block size.
	 * which is currently assumed to be a 2-power (e.g. 512 bytes, 1K, 2K, 4K etc.) but never more than 64K
	 * (MAX_IO_BLOCK_SIZE). Given this, we keep the journal file header size at 64K for Unix and 512-byte aligned
	 * for VMS. This way any process updating the file header will hold crit and do aligned writes. Any process
	 * writing the journal file data (journal records) on disk will hold the qio lock and can safely do so without
	 * ever touching the journal file header area. If ever MAX_IO_BLOCK_SIZE changes (say because some filesystem
	 * block size changes to 128K) such that JNL_HDR_LEN is no longer aligned to that, we want to know hence this assert.
	 */
	assert(JNL_HDR_LEN % MAX_IO_BLOCK_SIZE == 0);
	assert(REAL_JNL_HDR_LEN == SIZEOF(jnl_file_header));
	UNIX_ONLY(assert(REAL_JNL_HDR_LEN <= JNL_HDR_LEN);)
	VMS_ONLY(assert(REAL_JNL_HDR_LEN == JNL_HDR_LEN);)
	assert(JNL_HDR_LEN == JNL_FILE_FIRST_RECORD);
	assert(DISK_BLOCK_SIZE >= PINI_RECLEN + EPOCH_RECLEN + PFIN_RECLEN + EOF_RECLEN);
	assert((JNL_ALLOC_MIN * DISK_BLOCK_SIZE) > JNL_HDR_LEN);
	/* Following assert is for JNL_FILE_TAIL_PRESERVE macro in tp.h */
	assert(PINI_RECLEN >= EPOCH_RECLEN && PINI_RECLEN >= PFIN_RECLEN && PINI_RECLEN >= EOF_RECLEN);
	/* jnl_string structure has a 8-bit nodeflags field and a 24-bit length field. In some cases, this is
	 * used as a 32-bit length field (e.g. in the value part of the SET record or ZTWORMHOLE record). These
	 * usages treat the 32-bits as a jnl_str_len_t type and access it directly. Hence the requirement that
	 * jnl_str_len_t be the same size as 32-bits and also the same as the offset to the "text" member.
	 * If this assert fails, all places that reference jnl_str_len_t need to be revisited.
	 */
	assert(SIZEOF(jnl_str_len_t) == SIZEOF(uint4));
	assert(SIZEOF(jnl_str_len_t) == offsetof(jnl_string, text[0]));
	/* since time in jnl record is a uint4, and since JNL_SHORT_TIME expects time_t, we better ensure they are same.
	 * A change in the size of time_t would mean a redesign of the fields.  */

	assert(SIZEOF(time_t) == GTM64_ONLY(SIZEOF(gtm_int8)) NON_GTM64_ONLY(SIZEOF(int4)));

	/* Make sure all jnl_seqno fields start at same offset. mur_output_record and others rely on this. */
	assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_upd, token_seq.jnl_seqno));
	assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_epoch, jnl_seqno));
	assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_eof, jnl_seqno));
	assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_tcom, token_seq.jnl_seqno));
	assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_ztworm, token_seq.jnl_seqno));

	/* Make sure all strm_seqno fields start at same offset. Lot of modules rely on this */
	assert(offsetof(struct_jrec_null, strm_seqno) == offsetof(struct_jrec_upd, strm_seqno));
	assert(offsetof(struct_jrec_null, strm_seqno) == offsetof(struct_jrec_tcom, strm_seqno));
	assert(offsetof(struct_jrec_null, strm_seqno) == offsetof(struct_jrec_ztworm, strm_seqno));
	/* EOF and EPOCH are not included in the above asserts because they have not ONE but 16 strm_seqno values each */

	assert(offsetof(struct_jrec_ztcom, token) == offsetof(struct_jrec_upd, token_seq));
	/* Make sure all jnl_seqno and token fields start at 8-byte boundary */
	assert(offsetof(struct_jrec_upd, token_seq.jnl_seqno) ==
		(ROUND_UP(offsetof(struct_jrec_upd, token_seq.jnl_seqno), SIZEOF(seq_num))));
	assert(offsetof(struct_jrec_tcom, token_seq.jnl_seqno) ==
		(ROUND_UP(offsetof(struct_jrec_tcom, token_seq.jnl_seqno), SIZEOF(seq_num))));
	assert(offsetof(struct_jrec_null, jnl_seqno) ==
		(ROUND_UP(offsetof(struct_jrec_null, jnl_seqno), SIZEOF(seq_num))));
	assert(offsetof(struct_jrec_epoch, jnl_seqno) ==
		(ROUND_UP(offsetof(struct_jrec_epoch, jnl_seqno), SIZEOF(seq_num))));
	assert(offsetof(struct_jrec_eof, jnl_seqno) ==
		(ROUND_UP(offsetof(struct_jrec_eof, jnl_seqno), SIZEOF(seq_num))));
	/* All fixed size records must be multiple of 8-byte */
	assert(TCOM_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_tcom), JNL_REC_START_BNDRY)));
	assert(ZTCOM_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_ztcom), JNL_REC_START_BNDRY)));
	assert(INCTN_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_inctn), JNL_REC_START_BNDRY)));
	assert(PINI_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_pini), JNL_REC_START_BNDRY)));
	assert(PFIN_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_pfin), JNL_REC_START_BNDRY)));
	assert(NULL_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_null), JNL_REC_START_BNDRY)));
	assert(EPOCH_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_epoch), JNL_REC_START_BNDRY)));
	assert(EOF_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_eof), JNL_REC_START_BNDRY)));
	/* Assert following comment which is relied upon in JNL_FILE_TAIL_PRESERVE macro.
	 * 	"We know PINI_RECLEN is maximum of EPOCH_RECLEN, PFIN_RECLEN, EOF_RECLEN"
	 */
	assert(PINI_RECLEN > EPOCH_RECLEN);
	assert(PINI_RECLEN > PFIN_RECLEN);
	assert(PINI_RECLEN > EOF_RECLEN);
	/* Assumption about the structures in code */
	assert(0 == MIN_ALIGN_RECLEN % JNL_REC_START_BNDRY);
	assert(SIZEOF(uint4) == SIZEOF(jrec_suffix));
	assert((SIZEOF(jnl_record) + MAX_LOGI_JNL_REC_SIZE + SIZEOF(jrec_suffix)) < MAX_JNL_REC_SIZE);
	assert((DISK_BLOCK_SIZE * JNL_DEF_ALIGNSIZE) >= MAX_JNL_REC_SIZE);/* default alignsize supports max jnl record length */
	assert(MAX_MAX_NONTP_JNL_REC_SIZE <= MAX_JNL_REC_SIZE);
	assert(MAX_DB_BLK_SIZE < MAX_MAX_NONTP_JNL_REC_SIZE);	/* Ensure a PBLK record can accommodate a full GDS block */
	assert(MAX_JNL_REC_SIZE <= (1 << 24));
		/* Ensure that the 24-bit length field in the journal record can accommodate the maximum journal record size */
	assert(tcom_record.prefix.forwptr == tcom_record.suffix.backptr);
	assert(TCOM_RECLEN == tcom_record.suffix.backptr);
	assert(SIZEOF(token_split_t) == SIZEOF(token_build));   /* Required for TOKEN_SET macro */
}

void gvcst_init(gd_region *greg)
{
	sgmnt_addrs		*csa, *prevcsa, *regcsa;
	sgmnt_data_ptr_t	csd;
#	ifdef VMS
	char			cs_data_buff[ROUND_UP(SGMNT_HDR_LEN, DISK_BLOCK_SIZE)];
	sgmnt_data_ptr_t	temp_cs_data;
#	endif
	uint4			segment_update_array_size;
	int4			bsize;
	boolean_t		realloc_alt_buff;
	file_control		*fc;
	gd_region		*prev_reg, *reg_top;
#	ifdef DEBUG
	cache_rec_ptr_t		cr;
	bt_rec_ptr_t		bt;
	blk_ident		tmp_blk;
#	endif
	mstr			log_nam, trans_log_nam;
	char			trans_buff[MAX_FN_LEN + 1];
	unique_file_id		*greg_fid, *reg_fid;
	gd_addr			*addr_ptr;
	tp_region		*tr;
	ua_list			*tmp_ua;
	time_t			curr_time;
	uint4			curr_time_uint4, next_warn_uint4;
	unsigned int            minus1 = (unsigned)-1;
	enum db_acc_method	greg_acc_meth;
	ht_ent_mname		*tabent, *topent, *stayent;
	gv_namehead		*gvt, *gvt_stay;
	gvnh_reg_t		*gvnh_reg;
	hash_table_mname	*table;
	boolean_t		added, first_wasopen, onln_rlbk_cycle_mismatch = FALSE;
	intrpt_state_t		save_intrpt_ok_state;
#	ifdef UNIX
	replpool_identifier	replpool_id;
	unsigned int		full_len;
	int4			db_init_retry;
#	endif
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	UNSUPPORTED_PLATFORM_CHECK;
	assert(!jgbl.forw_phase_recovery);
	CWS_INIT;	/* initialize the cw_stagnate hash-table */
	/* check the header design assumptions */
	assert(SIZEOF(th_rec) == (SIZEOF(bt_rec) - SIZEOF(bt->blkque)));
	assert(SIZEOF(cache_rec) == (SIZEOF(cache_state_rec) + SIZEOF(cr->blkque)));
	DEBUG_ONLY(assert_jrec_member_offsets();)
	assert(MAX_DB_BLK_SIZE < (1 << NEXT_OFF_MAX_BITS));	/* Ensure a off_chain record's next_off member
								 * can work with all possible block sizes */
        set_num_additional_processors();
	DEBUG_ONLY(
		/* Note that the "block" member in the blk_ident structure in gdskill.h has 30 bits.
		 * Currently, the maximum number of blocks is 2**30. If ever this increases, something
		 * has to be correspondingly done to the "block" member to increase its capacity.
		 * The following assert checks that we always have space in the "block" member
		 * to represent a GDS block number.
		 */
		tmp_blk.block = minus1;
		assert(MAXTOTALBLKS_MAX - 1 <= tmp_blk.block);
	)
	/* TH_BLOCK is currently a hardcoded constant as basing it on the offsetof macro does not work with the VMS compiler.
	 * Therefore assert that TH_BLOCK points to the 512-byte block where the "trans_hist" member lies in the fileheader.
	 */
	assert(DIVIDE_ROUND_UP(offsetof(sgmnt_data, trans_hist), DISK_BLOCK_SIZE) == TH_BLOCK);
	if ((prev_reg = dbfilopn(greg)) != greg)
	{
		if (NULL == prev_reg || (gd_region *)-1L == prev_reg) /* (gd_region *)-1 == prev_reg => cm region open attempted */
			return;
		/* Found same database already open - prev_reg contains addr of originally openned region */
		greg->dyn.addr->file_cntl = prev_reg->dyn.addr->file_cntl;
		memcpy(greg->dyn.addr->fname, prev_reg->dyn.addr->fname, prev_reg->dyn.addr->fname_len);
		greg->dyn.addr->fname_len = prev_reg->dyn.addr->fname_len;
		csa = (sgmnt_addrs *)&FILE_INFO(greg)->s_addrs;
		PROCESS_GVT_PENDING_LIST(greg, csa, gvt_pending_list);
		csd = csa->hdr;
		if (NULL == csa->gvt_hashtab)
		{	/* Already have another region that points to the same physical database file as this one.
			 * Since two regions point to the same physical file, start maintaining a list of all global variable
			 * names whose gv_targets have already been allocated on behalf of the current database file.
			 * Future targ_allocs will check this list before they allocate (to avoid duplicate allocations).
			 */
			csa->gvt_hashtab = (hash_table_mname *)malloc(SIZEOF(hash_table_mname));
			init_hashtab_mname(csa->gvt_hashtab, 0, HASHTAB_NO_COMPACT, HASHTAB_NO_SPARE_TABLE);
			assert(1 == csa->regcnt);
			first_wasopen = TRUE;
		} else
			first_wasopen = FALSE;
		for (addr_ptr = get_next_gdr(NULL); addr_ptr; addr_ptr = get_next_gdr(addr_ptr))
		{
			table = addr_ptr->tab_ptr;
			for (tabent = table->base, topent = tabent + table->size; tabent < topent; tabent++)
			{
				if (HTENT_VALID_MNAME(tabent, gvnh_reg_t, gvnh_reg))
				{	/* Check if the gvt's region is the current region.
					 * If so add gvt's variable name into the csa hashtable.
					 */
					gvt = gvnh_reg->gvt;
					assert((gvnh_reg->gd_reg != greg) || (csa == gvt->gd_csa));
					/* If this is the first time a was_open region is happening for this csa, then
					 * we want to merge gv_targets from both the regions into csa->gvt_hashtab. For
					 * all future was_open cases, we want only to add gv_targets from the was_open region.
					 */
					if (first_wasopen && (csa == gvt->gd_csa) || !first_wasopen && (gvnh_reg->gd_reg == greg))
					{	/* Add gv_target into csa->gvt_hashtab */
						added = add_hashtab_mname(csa->gvt_hashtab, &gvt->gvname, gvt, &stayent);
						assert(!added || (1 <= gvt->regcnt));
						if (!added)
						{	/* Entry already present. Increment gvt->regcnt.
							 * If NOISOLATION status differs between the two,
							 * choose the more pessimistic one.
							 */
							gvt_stay = (gv_namehead *)stayent->value;
							assert(gvt_stay != gvt);
							if (FALSE == gvt->noisolation)
								gvt_stay->noisolation = FALSE;
							assert(1 <= gvt_stay->regcnt);
							/* Now make gvnh_reg->gvt point to gvt_stay (instead of gvt) */
							gvt_stay->regcnt++;
							gvt->regcnt--;
							gvnh_reg->gvt = gvt_stay;
							targ_free(gvt);
						}
					}
				}
			}
		}
		greg->max_rec_size = csd->max_rec_size;
		greg->max_key_size = csd->max_key_size;
	 	greg->null_subs = csd->null_subs;
		greg->std_null_coll = csd->std_null_coll;
		greg->jnl_state = csd->jnl_state;
		greg->jnl_file_len = csd->jnl_file_len;		/* journal file name length */
		memcpy(greg->jnl_file_name, csd->jnl_file_name, greg->jnl_file_len);	/* journal file name */
		greg->jnl_alq = csd->jnl_alq;
		greg->jnl_deq = csd->jnl_deq;
		greg->jnl_buffer_size = csd->jnl_buffer_size;
		greg->jnl_before_image = csd->jnl_before_image;
		SET_REGION_OPEN_TRUE(greg, WAS_OPEN_TRUE);
		assert(1 <= csa->regcnt);
		csa->regcnt++;	/* Increment # of regions that point to this csa */
		return;
	}
	GTM_FD_TRACE_ONLY(gtm_dbjnl_dupfd_check();)	/* check if any of db or jnl fds collide (D9I11-002714) */
	greg->was_open = FALSE;
	/* We shouldn't have crit on any region unless we are in TP and in the final retry or we are in mupip_set_journal trying to
	 * switch journals across all regions. WBTEST_HOLD_CRIT_ENABLED is an exception because it exercises a deadlock situation so
	 * it needs to hold multiple crits at the same time. Currently, there is no fine-granular checking for mupip_set_journal,
	 * hence a coarse MUPIP_IMAGE check for image_type.
	 */
	assert(dollar_tlevel && (CDB_STAGNATE <= t_tries) || IS_MUPIP_IMAGE || (0 == have_crit(CRIT_HAVE_ANY_REG))
	       || WBTEST_ENABLED(WBTEST_HOLD_CRIT_ENABLED));
	if (dollar_tlevel && (0 != have_crit(CRIT_HAVE_ANY_REG)))
	{	/* To avoid deadlocks with currently holding crits and the DLM lock request to be done in db_init(),
		 * we should insert this region in the tp_reg_list and tp_restart should do the gvcst_init after
		 * having released crit on all regions. Note that this check should be done AFTER checking if the
		 * region has already been opened (i.e. greg->was_open = TRUE logic above) since in that case we dont
		 * do any heavyweight processing (like db_init which involves crit/DLM locks) and so dont need to restart.
		 */
		insert_region(greg, &tp_reg_list, &tp_reg_free_list, SIZEOF(tp_region));
		t_retry(cdb_sc_needcrit);
		assert(FALSE);	/* we should never reach here since t_retry should have unwound the M-stack and restarted the TP */
	}
	csa = (sgmnt_addrs *)&FILE_INFO(greg)->s_addrs;

#ifdef	NOLICENSE
	licensed = TRUE;
#else
	CRYPT_CHKSYSTEM;
#endif
	db_init_region = greg;	/* initialized for dbinit_ch */
	csa->hdr = NULL;
        csa->nl = NULL;
        csa->jnl = NULL;
	csa->persistent_freeze = FALSE;	/* want secshr_db_clnup() to clear an incomplete freeze/unfreeze codepath */
	csa->regcnt = 1;	/* At this point, only one region points to this csa */
	csa->db_addrs[0] = csa->db_addrs[1] = NULL;
	csa->lock_addrs[0] = csa->lock_addrs[1] = NULL;
#	ifdef VMS
	greg_acc_meth = greg->dyn.addr->acc_meth;
	assert(dba_cm != greg_acc_meth);
	temp_cs_data = (sgmnt_data_ptr_t)cs_data_buff;
	fc = greg->dyn.addr->file_cntl;
	fc->file_type = greg_acc_meth;
	fc->op = FC_READ;
	fc->op_buff = (sm_uc_ptr_t)temp_cs_data;
	fc->op_len = SIZEOF(*temp_cs_data);
	fc->op_pos = 1;
	dbfilop(fc);
	DO_BADDBVER_CHK(greg, temp_cs_data);
	DO_DB_HDR_CHECK(greg, temp_cs_data); /* Basic sanity check on the file header fields */
	if (greg_acc_meth != temp_cs_data->acc_meth)
	{
		greg_acc_meth = temp_cs_data->acc_meth;
		greg->dyn.addr->acc_meth = greg_acc_meth;
	}
#	endif
	/* Here's the shared memory layout:
	 *
	 * low address
	 *
	 * both
	 *	segment_data
	 *	(file_header)
	 *	MM_BLOCK
	 *	(master_map)
	 *	TH_BLOCK
	 * BG
	 *	bt_header
	 *	(bt_buckets * bt_rec)
	 *	th_base (SIZEOF(que_ent) into an odd bt_rec)
	 *	bt_base
	 *	(n_bts * bt_rec)
	 *	LOCK_BLOCK (lock_space)
	 *	(lock_space_size)
	 *	cs_addrs->acc_meth.bg.cache_state
	 *	(cache_que_heads)
	 *	(bt_buckets * cache_rec)
	 *	(n_bts * cache_rec)
	 *	critical
	 *	(mutex_struct)
	 *	nl
	 *	(node_local)
	 *	[jnl_name
	 *	jnl_buffer]
	 * MM
	 *	file contents
	 *	LOCK_BLOCK (lock_space)
	 *	(lock_space_size)
	 *	cs_addrs->acc_meth.mm.mmblk_state
	 *	(mmblk_que_heads)
	 *	(bt_buckets * mmblk_rec)
	 *	(n_bts * mmblk_rec)
	 *	critical
	 *	(mutex_struct)
	 *	nl
	 *	(node_local)
	 *	[jnl_name
	 *	jnl_buffer]
	 * high address
	 */
 	/* Ensure first 3 members (upto now_running) of node_local are at the same offset for any version.
	 *
	 * Structure ----> node_local <----    size 59392 [0xe800]
	 *
	 *	offset = 0000 [0x0000]      size = 0012 [0x000c]    ----> node_local.label
	 *	offset = 0012 [0x000c]      size = 0256 [0x0100]    ----> node_local.fname
	 *	offset = 0268 [0x010c]      size = 0036 [0x0024]    ----> node_local.now_running
	 *
	 * This is so that the VERMISMATCH error can be successfully detected in db_init/mu_rndwn_file
	 *	and so that the db-file-name can be successfully obtained from orphaned shm by mu_rndwn_all.
	 */
 	assert(0 == OFFSETOF(node_local, label[0]));
 	assert(12 == SIZEOF(((node_local *)NULL)->label));
 	assert(12 == GDS_LABEL_SZ);
	assert(12 == OFFSETOF(node_local, fname[0]));
 	assert(256 == SIZEOF(((node_local *)NULL)->fname));
 	assert(256 == (MAX_FN_LEN + 1));
	assert(268 == OFFSETOF(node_local, now_running[0]));
 	assert(36 == SIZEOF(((node_local *)NULL)->now_running));
	assert(36 == MAX_REL_NAME);
#	ifdef UNIX
	START_HEARTBEAT_IF_NEEDED;
	if (!pool_init && jnlpool_init_needed && ANTICIPATORY_FREEZE_AVAILABLE && REPL_INST_AVAILABLE)
		jnlpool_init(GTMRELAXED, (boolean_t)FALSE, (boolean_t *)NULL);
	/* Any LSEEKWRITEs hence forth will wait if the instance is frozen. To aid in printing the region information before
	 * and after the wait, csa->region is referenced. Since it is NULL at this point, set it to greg. This is a safe
	 * thing to do since csa->region is anyways set in db_common_init (few lines below).
	 */
	csa->region = greg;
#	endif
	/* Protect the db_init and the code below until we set greg->open to TRUE. This is needed as otherwise,
	 * if a MUPIP STOP is issued to this process at a time-window when db_init is completed but greg->open
	 * is NOT set to TRUE, will cause gds_rundown NOT to clean up the shared memory created by db_init and
	 * thus would be left over in the system.
	 */
	DEFER_INTERRUPTS(INTRPT_IN_GVCST_INIT);
	VMS_ONLY(db_init(greg, temp_cs_data));
#	ifdef UNIX
	db_init_retry = 0;
	GTM_WHITE_BOX_TEST(WBTEST_HOLD_FTOK_UNTIL_BYPASS, db_init_retry, 3);
	for (; db_init_retry < MAX_DBINIT_RETRY; db_init_retry++)
	{
		if (0 == db_init(greg))
			break;
		db_init_err_cleanup(MAX_DBINIT_RETRY >  (db_init_retry + 1));
	}
	if (MAX_DBINIT_RETRY == db_init_retry) /* We retried enough. Error out. */
	{
		assert(IS_LKE_IMAGE || IS_DSE_IMAGE);
		rts_error_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_REGOPENFAIL, 4, REG_LEN_STR(greg), DB_LEN_STR(greg));
	}

#	endif
	/* At this point, we have initialized the database, but haven't yet set reg->open to TRUE. If any rts_errors happen in
	 * the meantime, there are no condition handlers established to handle the rts_error. More importantly, it is non-trivial
	 * to add logic to such a condition handler to undo the effects of db_init. Also, in some cases, the rts_error can can
	 * confuse future calls of db_init. By invoking DBG_MARK_RTS_ERROR_UNUSABLE, we can catch any rts_errors in future and
	 * eliminate it on a case by case basis.
	 */
	UNIX_ONLY(DBG_MARK_RTS_ERROR_UNUSABLE);
	crash_count = csa->critical->crashcnt;
	csa->regnum = ++region_open_count;
	csd = csa->hdr;
#	ifdef GTM_TRIGGER
	/* Take copy of db trigger cycle into csa at db startup. Any concurrent changes to the
	 * db trigger cycle (by MUPIP TRIGGER) will be detected at tcommit (t_end/tp_tend) time.
	 */
	csa->db_trigger_cycle = csd->db_trigger_cycle;
#	endif
	/* set csd and fill in selected fields */
	assert(greg->dyn.addr->acc_meth == csd->acc_meth); /* db_init should have made sure this assert holds good */
	greg_acc_meth = csd->acc_meth;
	/* It is necessary that we do the pending gv_target list reallocation BEFORE db_common_init as the latter resets
	 * greg->max_key_size to be equal to the csd->max_key_size and hence process_gvt_pending_list might wrongly conclude
	 * that NO reallocation (since it checks greg->max_key_size with csd->max_key_size) is needed when in fact a
	 * reallocation might be necessary (if the user changed max_key_size AFTER database creation)
	 */
	PROCESS_GVT_PENDING_LIST(greg, csa, gvt_pending_list);
	db_common_init(greg, csa, csd);	/* do initialization common to db_init() and mu_rndwn_file() */

	/* If we are not fully upgraded, see if we need to send a warning to the operator console about
	   performance. Compatibility mode is a known performance drain. Actually, we can send one of two
	   messages. If the desired_db_format is for an earlier release than the current release, we send
	   a performance warning that this mode degrades performance. However, if the desired_db_format is
	   for the current version but there are blocks to convert still, we send a gengle reminder that
	   running mupip reorg upgrade would be a good idea to get the full performance benefit of V5.
	*/
	time(&curr_time);
	assert(MAXUINT4 > curr_time);
	curr_time_uint4 = (uint4)curr_time;
	next_warn_uint4 = csd->next_upgrd_warn.cas_time;
	if (!csd->fully_upgraded && curr_time_uint4 > next_warn_uint4
	    && COMPSWAP_LOCK(&csd->next_upgrd_warn.time_latch, next_warn_uint4, 0, (curr_time_uint4 + UPGRD_WARN_INTERVAL), 0))
	{	/* The msg is due and we have successfully updated the next time interval */
		if (GDSVCURR != csd->desired_db_format)
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBVERPERFWARN1, 2, DB_LEN_STR(greg));
		else
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBVERPERFWARN2, 2, DB_LEN_STR(greg));
	}

	/* Compute the maximum journal space requirements for a PBLK (including possible ALIGN record).
	 * Use this variable in the TOTAL_TPJNL_REC_SIZE and TOTAL_NONTP_JNL_REC_SIZE macros instead of recomputing.
	 */
	csa->pblk_align_jrecsize = (int4)MIN_PBLK_RECLEN + csd->blk_size + (int4)MIN_ALIGN_RECLEN;
	segment_update_array_size = UA_SIZE(csd);

	if (first_ua == NULL)
	{	/* first open of first database - establish an update array system */
		assert(update_array == NULL);
		assert(update_array_ptr == NULL);
		assert(update_array_size == 0);
		tmp_ua = (ua_list *)malloc(SIZEOF(ua_list));
		memset(tmp_ua, 0, SIZEOF(ua_list));	/* initialize tmp_ua->update_array and tmp_ua->next_ua to NULL */
		tmp_ua->update_array = (char *)malloc(segment_update_array_size);
		tmp_ua->update_array_size = segment_update_array_size;
		/* assign global variables only after malloc() succeeds */
		update_array_size = cumul_update_array_size = segment_update_array_size;
		update_array = update_array_ptr = tmp_ua->update_array;
		first_ua = curr_ua = tmp_ua;
	} else
	{	/* there's already an update_array system in place */
		assert(update_array != NULL);
		assert(update_array_size != 0);
		if (!dollar_tlevel && segment_update_array_size > first_ua->update_array_size)
		{
			/* no transaction in progress and the current array is too small - replace it */
			assert(first_ua->update_array == update_array);
			assert(first_ua->update_array_size == update_array_size);
			assert(first_ua->next_ua == NULL);
			tmp_ua = first_ua;
			first_ua = curr_ua = NULL;
			free(update_array);
			tmp_ua->update_array = update_array = update_array_ptr = NULL;
			tmp_ua->update_array = (char *)malloc(segment_update_array_size);
			tmp_ua->update_array_size = segment_update_array_size;
			/* assign global variables only after malloc() succeeds */
			update_array_size = cumul_update_array_size = segment_update_array_size;
			update_array = update_array_ptr = tmp_ua->update_array;
			first_ua = curr_ua = tmp_ua;
		}
	}
	assert(global_tlvl_info_list || !csa->sgm_info_ptr);
	if (JNL_ALLOWED(csa))
	{
		bsize = csd->blk_size;
		realloc_alt_buff = FALSE;
		if (NULL == non_tp_jfb_ptr)
		{
			non_tp_jfb_ptr = (jnl_format_buffer *)malloc(SIZEOF(jnl_format_buffer));
			non_tp_jfb_ptr->hi_water_bsize = bsize;
			non_tp_jfb_ptr->buff = (char *)malloc(MAX_NONTP_JNL_REC_SIZE(bsize));
			non_tp_jfb_ptr->record_size = 0;	/* initialize it to 0 since TOTAL_NONTPJNL_REC_SIZE macro uses it */
			GTMCRYPT_ONLY(non_tp_jfb_ptr->alt_buff = NULL);
		} else if (bsize > non_tp_jfb_ptr->hi_water_bsize)
		{	/* Need a larger buffer to accommodate larger non-TP journal records */
			non_tp_jfb_ptr->hi_water_bsize = bsize;
			free(non_tp_jfb_ptr->buff);
			non_tp_jfb_ptr->buff = (char *)malloc(MAX_NONTP_JNL_REC_SIZE(bsize));
#			ifdef GTM_CRYPT
			if (NULL != non_tp_jfb_ptr->alt_buff)
			{
				free(non_tp_jfb_ptr->alt_buff);
				realloc_alt_buff = TRUE;
			}
#			endif
		}
		/* If the journal records need to be encrypted in the journal file and if replication is in use,
		 * we will need access to both the encrypted (for the journal file) and unencrypted (for the
		 * journal pool) journal record contents. Allocate an alternative buffer if any open journaled region
		 * is encrypted.
		 */
#		ifdef GTM_CRYPT
		if (realloc_alt_buff || (csd->is_encrypted && (NULL == non_tp_jfb_ptr->alt_buff)))
			non_tp_jfb_ptr->alt_buff = (char *)malloc(MAX_NONTP_JNL_REC_SIZE(non_tp_jfb_ptr->hi_water_bsize));
#		endif
		/* csa->min_total_tpjnl_rec_size represents the minimum journal buffer space needed for a TP transaction.
		 * It is a conservative estimate assuming that one ALIGN record and one PINI record will be written for
		 * one set of fixed size jnl records written.
		 * si->total_jnl_rec_size is initialized/reinitialized  to this value here and in tp_clean_up().
		 * The purpose of this field is to avoid recomputation of the variable in tp_clean_up().
		 * In addition to this, space requirements for whatever journal records get formatted as part of
		 * jnl_format() need to be taken into account.
		 * This is done in jnl_format() where si->total_jnl_rec_size is appropriately incremented.
		 */
		csa->min_total_tpjnl_rec_size = PINI_RECLEN + TCOM_RECLEN + MIN_ALIGN_RECLEN;
		/* Similarly csa->min_total_nontpjnl_rec_size represents the minimum journal buffer space needed
		 * for a non-TP transaction.
		 * It is a conservative estimate assuming that one ALIGN record and one PINI record will be written for
		 * one set of fixed size jnl records written.
		 */
		csa->min_total_nontpjnl_rec_size = PINI_RECLEN + MIN_ALIGN_RECLEN;
	}
	if (tp_in_use || !IS_GTM_IMAGE)
		gvcst_tp_init(greg);	/* Initialize TP structures, else postpone till TP is used (only if GTM) */
	if (!global_tlvl_info_list)
	{
		global_tlvl_info_list = (buddy_list *)malloc(SIZEOF(buddy_list));
		initialize_list(global_tlvl_info_list, SIZEOF(global_tlvl_info), GBL_TLVL_INFO_LIST_INIT_ALLOC);
	}
	assert(!greg->was_open);
	SET_REGION_OPEN_TRUE(greg, WAS_OPEN_FALSE);
	csa = (sgmnt_addrs*)&FILE_INFO(greg)->s_addrs;
	if (NULL != csa->dir_tree)
	{	/* It is possible that dir_tree has already been targ_alloc'ed. This is because GT.CM or VMS DAL
		 * calls can run down regions without the process halting out. We don't want to double malloc.
		 */
		csa->dir_tree->clue.end = 0;
	}
	SET_CSA_DIR_TREE(csa, greg->max_key_size, greg);
	/* Now that reg->open is set to TRUE and directory tree is initialized, go ahead and set rts_error back to being usable */
	UNIX_ONLY(DBG_MARK_RTS_ERROR_USABLE);
	/* gds_rundown if invoked from now on will take care of cleaning up the shared memory segment */
	ENABLE_INTERRUPTS(INTRPT_IN_GVCST_INIT);
	if (dba_bg == greg_acc_meth)
	{	/* Check if (a) this region has non-upgraded blocks and if so, (b) the reformat buffer exists and
		 * (c) if it is big enough to deal with this region. If the region does not have any non-upgraded
		 * block (blks_to_upgrd is 0) we will not allocate the buffer at this time. Note that this opens up
		 * a small window for errors. If this buffer is not allocated and someone turns on compatibility
		 * mode and before the process can discover this and allocate the buffer, it runs out of memory,
		 * errors out and finds it is responsible for running down the database, it could fail on a recursive
		 * memory error when it tries to allocate the block. This is (to me) an acceptable risk as it is
		 * very low and compares favorably to the cost of every process allocating a database block sized
		 * chunk of private storage that will be seldom if ever used (SE 3/2005).
		 */
		if (0 != csd->blks_to_upgrd && csd->blk_size > reformat_buffer_len)
		{	/* Buffer not big enough (or does not exist) .. get a new one releasing old if it exists */
			assert(0 == fast_lock_count);	/* this is mainline (non-interrupt) code */
			++fast_lock_count;		/* No interrupts across this use of reformat_buffer */
			/* reformat_buffer_in_use should always be incremented only AFTER incrementing fast_lock_count
			 * as it is the latter that prevents interrupts from using the reformat buffer. Similarly
			 * the decrement of fast_lock_count should be done AFTER decrementing reformat_buffer_in_use.
			 */
			assert(0 == reformat_buffer_in_use);
			DEBUG_ONLY(reformat_buffer_in_use++;)
			if (reformat_buffer)
				free(reformat_buffer);	/* Different blksized databases in use .. keep only largest one */
			reformat_buffer = malloc(csd->blk_size);
			reformat_buffer_len = csd->blk_size;
			DEBUG_ONLY(reformat_buffer_in_use--;)
			assert(0 == reformat_buffer_in_use);
			--fast_lock_count;
		}

	}
	if ((dba_bg == greg_acc_meth) || (dba_mm == greg_acc_meth))
	{
		/* Determine fid_index of current region's file_id across sorted file_ids of all regions open until now.
		 * All regions which have a file_id lesser than that of current region will have no change to their fid_index
		 * All regions which have a file_id greater than that of current region will have their fid_index incremented by 1
		 * The fid_index determination algorithm below has an optimization in that if the current region's file_id is
		 * determined to be greater than that of a particular region, then all regions whose fid_index is lesser
		 * than that particular region's fid_index are guaranteed to have a lesser file_id than the current region
		 * so we do not compare those against the current region's file_id.
		 * Note that the sorting is done only on DB/MM regions. GT.CM/DDP regions should not be part of TP transactions,
		 * hence they will not be sorted.
		 */
		prevcsa = NULL;
		greg_fid = &(csa->nl->unique_id);
		for (regcsa = cs_addrs_list; NULL != regcsa; regcsa = regcsa->next_csa)
		{
			UNIX_ONLY(onln_rlbk_cycle_mismatch |= (regcsa->db_onln_rlbkd_cycle != regcsa->nl->db_onln_rlbkd_cycle));
			if ((NULL != prevcsa) && (regcsa->fid_index < prevcsa->fid_index))
				continue;
			reg_fid = &((regcsa)->nl->unique_id);
			VMS_ONLY(if (0 < memcmp(&(greg_fid->file_id), (char *)&(reg_fid->file_id), SIZEOF(gd_id))))
			UNIX_ONLY(if (0 < gdid_cmp(&(greg_fid->uid), &(reg_fid->uid))))
			{
				if ((NULL == prevcsa) || (regcsa->fid_index > prevcsa->fid_index))
					prevcsa = regcsa;
			} else
				regcsa->fid_index++;
		}
		if (NULL == prevcsa)
			csa->fid_index = 1;
		else
			csa->fid_index = prevcsa->fid_index + 1;
		UNIX_ONLY(
			if (onln_rlbk_cycle_mismatch)
			{
				csa->root_search_cycle--;
				csa->onln_rlbk_cycle--;
				csa->db_onln_rlbkd_cycle--;
			}
		)
		/* Add current csa into list of open csas */
		csa->next_csa = cs_addrs_list;
		cs_addrs_list = csa;
		/* Also update tp_reg_list fid_index's as insert_region relies on it */
		for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr)
			tr->file.fid_index = (&FILE_INFO(tr->reg)->s_addrs)->fid_index;
		DBG_CHECK_TP_REG_LIST_SORTING(tp_reg_list);
	}
#	ifdef UNIX
	if (pool_init && REPL_ALLOWED(csd) && jnlpool_init_needed)
	{
		/* Last parameter to VALIDATE_INITIALIZED_JNLPOOL is TRUE if the process does logical updates and FALSE otherwise.
		 * This parameter governs whether the macro can do SCNDDBNOUPD check or not. All the utilities that sets
		 * jnlpool_init_needed global variable don't do logical updates (REORG, EXTEND, etc.). But, for GT.M,
		 * jnlpool_init_needed is set to TRUE unconditionally. Even though GT.M can do logical updates, we pass FALSE
		 * unconditionally to the macro (indicating no logical updates). This is because, at this point, there is no way to
		 * tell if this process wants to open the database for read or write operation. If it is for a read operation, we
		 * don't want the below macro to issue SCNDDBNOUPD error. If it is for write operation, we will skip the
		 * SCNDDBNOUPD error message here. But, eventually when this process goes to gvcst_{put,kill} or op_ztrigger,
		 * SCNDDBNOUPD is issued.
		 */
		VALIDATE_INITIALIZED_JNLPOOL(csa, csa->nl, greg, GTMRELAXED, SCNDDBNOUPD_CHECK_FALSE);
	}
#	endif
	return;
}