/****************************************************************
 *								*
 *	Copyright 2001, 2011 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include <stddef.h>		/* for offsetof macro */

#include "gtm_string.h"
#include "gtm_time.h"

/* NOTE: the order of the project headers below is significant; several are listed
 * only because a later header needs them (see the per-line remarks).
 */
#include "cdb_sc.h"
#include "gdsroot.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "gdsblk.h"
#include "gdskill.h"
#include "gdscc.h"
#include "min_max.h"		/* needed for gdsblkops.h */
#include "gdsblkops.h"
#include "filestruct.h"
#include "iosp.h"
#include "jnl.h"
#include "hashtab_int4.h"	/* needed for tp.h */
#include "buddy_list.h"		/* needed for tp.h */
#include "tp.h"
#include "gtm_stdlib.h"		/* for ATOI */
#include "cryptdef.h"
#include "mlkdef.h"
#include "error.h"
#include "gt_timer.h"
#include "gtmimagename.h"
#include "trans_log_name.h"
#include "gtm_logicals.h"
#include "dbfilop.h"
#include "set_num_additional_processors.h"
#include "have_crit.h"
#include "t_retry.h"
#include "dpgbldir.h"
#include "longset.h"		/* needed for cws_insert.h */
#include "cws_insert.h"		/* for CWS_INIT macro */
#include "gvcst_protos.h"	/* for gvcst_init,gvcst_init_sysops,gvcst_tp_init prototype */
#include "compswap.h"
#include "send_msg.h"
#include "targ_alloc.h"		/* for "targ_free" prototype */
#include "hashtab_mname.h"
#include "process_gvt_pending_list.h"
#include "gtmmsg.h"
#ifdef UNIX
#include "heartbeat_timer.h"
#endif

#ifdef GTM_FD_TRACE
#include "gtm_dbjnl_dupfd_check.h"
#endif

GBLREF gd_region *gv_cur_region, *db_init_region;
|
|
GBLREF sgmnt_data_ptr_t cs_data;
|
|
GBLREF sgmnt_addrs *cs_addrs;
|
|
GBLREF sgmnt_addrs *cs_addrs_list;
|
|
GBLREF boolean_t gtcm_connection;
|
|
GBLREF bool licensed;
|
|
GBLREF int4 lkid;
|
|
GBLREF char *update_array, *update_array_ptr;
|
|
GBLREF uint4 update_array_size, cumul_update_array_size;
|
|
GBLREF ua_list *first_ua, *curr_ua;
|
|
GBLREF short crash_count;
|
|
GBLREF uint4 dollar_tlevel;
|
|
GBLREF jnl_format_buffer *non_tp_jfb_ptr;
|
|
GBLREF unsigned char *non_tp_jfb_buff_ptr;
|
|
GBLREF boolean_t mupip_jnl_recover;
|
|
GBLREF buddy_list *global_tlvl_info_list;
|
|
GBLREF tp_region *tp_reg_free_list; /* Ptr to list of tp_regions that are unused */
|
|
GBLREF tp_region *tp_reg_list; /* Ptr to list of tp_regions for this transaction */
|
|
GBLREF unsigned int t_tries;
|
|
GBLREF struct_jrec_tcom tcom_record;
|
|
GBLREF boolean_t tp_in_use;
|
|
GBLREF uint4 region_open_count;
|
|
GBLREF sm_uc_ptr_t reformat_buffer;
|
|
GBLREF int reformat_buffer_len;
|
|
GBLREF volatile int reformat_buffer_in_use; /* used only in DEBUG mode */
|
|
GBLREF volatile int4 fast_lock_count;
|
|
GBLREF gvt_container *gvt_pending_list;
|
|
GBLREF boolean_t dse_running;
|
|
GBLREF jnl_gbls_t jgbl;
|
|
|
|
LITREF char gtm_release_name[];
|
|
LITREF int4 gtm_release_name_len;
|
|
|
|
error_def(ERR_BADDBVER);
|
|
error_def(ERR_DBCREINCOMP);
|
|
error_def(ERR_DBFLCORRP);
|
|
error_def(ERR_DBNOTGDS);
|
|
error_def(ERR_DBVERPERFWARN1);
|
|
error_def(ERR_DBVERPERFWARN2);
|
|
error_def(ERR_MMNODYNUPGRD);
|
|
|
|
void assert_jrec_member_offsets(void)
|
|
{
|
|
assert(REAL_JNL_HDR_LEN % DISK_BLOCK_SIZE == 0);
|
|
assert(JNL_HDR_LEN % DISK_BLOCK_SIZE == 0);
|
|
/* We currently assume that the journal file header size is aligned relative to the filesystem block size.
|
|
* which is currently assumed to be a 2-power (e.g. 512 bytes, 1K, 2K, 4K etc.) but never more than 64K
|
|
* (MAX_IO_BLOCK_SIZE). Given this, we keep the journal file header size at 64K for Unix and 512-byte aligned
|
|
* for VMS. This way any process updating the file header will hold crit and do aligned writes. Any process
|
|
* writing the journal file data (journal records) on disk will hold the qio lock and can safely do so without
|
|
* ever touching the journal file header area. If ever MAX_IO_BLOCK_SIZE changes (say because some filesystem
|
|
* block size changes to 128K) such that JNL_HDR_LEN is no longer aligned to that, we want to know hence this assert.
|
|
*/
|
|
assert(JNL_HDR_LEN % MAX_IO_BLOCK_SIZE == 0);
|
|
assert(REAL_JNL_HDR_LEN == SIZEOF(jnl_file_header));
|
|
UNIX_ONLY(assert(REAL_JNL_HDR_LEN <= JNL_HDR_LEN);)
|
|
VMS_ONLY(assert(REAL_JNL_HDR_LEN == JNL_HDR_LEN);)
|
|
assert(JNL_HDR_LEN == JNL_FILE_FIRST_RECORD);
|
|
assert(DISK_BLOCK_SIZE >= PINI_RECLEN + EPOCH_RECLEN + PFIN_RECLEN + EOF_RECLEN);
|
|
assert((JNL_ALLOC_MIN * DISK_BLOCK_SIZE) > JNL_HDR_LEN);
|
|
/* Following assert is for JNL_FILE_TAIL_PRESERVE macro in tp.h */
|
|
assert(PINI_RECLEN >= EPOCH_RECLEN && PINI_RECLEN >= PFIN_RECLEN && PINI_RECLEN >= EOF_RECLEN);
|
|
/* jnl_string structure has a 8-bit nodeflags field and a 24-bit length field. In some cases, this is
|
|
* used as a 32-bit length field (e.g. in the value part of the SET record or ZTWORMHOLE record). These
|
|
* usages treat the 32-bits as a jnl_str_len_t type and access it directly. Hence the requirement that
|
|
* jnl_str_len_t be the same size as 32-bits and also the same as the offset to the "text" member.
|
|
* If this assert fails, all places that reference jnl_str_len_t need to be revisited.
|
|
*/
|
|
assert(SIZEOF(jnl_str_len_t) == SIZEOF(uint4));
|
|
assert(SIZEOF(jnl_str_len_t) == offsetof(jnl_string, text[0]));
|
|
/* since time in jnl record is a uint4, and since JNL_SHORT_TIME expects time_t, we better ensure they are same.
|
|
* A change in the size of time_t would mean a redesign of the fields. */
|
|
|
|
assert(SIZEOF(time_t) == GTM64_ONLY(SIZEOF(gtm_int8)) NON_GTM64_ONLY(SIZEOF(int4)));
|
|
|
|
/* Make sure all jnl_seqno fields start at same offset. mur_output_record and others rely on this. */
|
|
assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_upd, token_seq.jnl_seqno));
|
|
assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_epoch, jnl_seqno));
|
|
assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_eof, jnl_seqno));
|
|
assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_tcom, token_seq.jnl_seqno));
|
|
assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_ztworm, token_seq.jnl_seqno));
|
|
|
|
/* Make sure all strm_seqno fields start at same offset. Lot of modules rely on this */
|
|
assert(offsetof(struct_jrec_null, strm_seqno) == offsetof(struct_jrec_upd, strm_seqno));
|
|
assert(offsetof(struct_jrec_null, strm_seqno) == offsetof(struct_jrec_tcom, strm_seqno));
|
|
assert(offsetof(struct_jrec_null, strm_seqno) == offsetof(struct_jrec_ztworm, strm_seqno));
|
|
/* EOF and EPOCH are not included in the above asserts because they have not ONE but 16 strm_seqno values each */
|
|
|
|
assert(offsetof(struct_jrec_ztcom, token) == offsetof(struct_jrec_upd, token_seq));
|
|
/* Make sure all jnl_seqno and token fields start at 8-byte boundary */
|
|
assert(offsetof(struct_jrec_upd, token_seq.jnl_seqno) ==
|
|
(ROUND_UP(offsetof(struct_jrec_upd, token_seq.jnl_seqno), SIZEOF(seq_num))));
|
|
assert(offsetof(struct_jrec_tcom, token_seq.jnl_seqno) ==
|
|
(ROUND_UP(offsetof(struct_jrec_tcom, token_seq.jnl_seqno), SIZEOF(seq_num))));
|
|
assert(offsetof(struct_jrec_null, jnl_seqno) ==
|
|
(ROUND_UP(offsetof(struct_jrec_null, jnl_seqno), SIZEOF(seq_num))));
|
|
assert(offsetof(struct_jrec_epoch, jnl_seqno) ==
|
|
(ROUND_UP(offsetof(struct_jrec_epoch, jnl_seqno), SIZEOF(seq_num))));
|
|
assert(offsetof(struct_jrec_eof, jnl_seqno) ==
|
|
(ROUND_UP(offsetof(struct_jrec_eof, jnl_seqno), SIZEOF(seq_num))));
|
|
/* All fixed size records must be multiple of 8-byte */
|
|
assert(TCOM_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_tcom), JNL_REC_START_BNDRY)));
|
|
assert(ZTCOM_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_ztcom), JNL_REC_START_BNDRY)));
|
|
assert(INCTN_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_inctn), JNL_REC_START_BNDRY)));
|
|
assert(PINI_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_pini), JNL_REC_START_BNDRY)));
|
|
assert(PFIN_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_pfin), JNL_REC_START_BNDRY)));
|
|
assert(NULL_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_null), JNL_REC_START_BNDRY)));
|
|
assert(EPOCH_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_epoch), JNL_REC_START_BNDRY)));
|
|
assert(EOF_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_eof), JNL_REC_START_BNDRY)));
|
|
/* Assumption about the structures in code */
|
|
assert(0 == MIN_ALIGN_RECLEN % JNL_REC_START_BNDRY);
|
|
assert(SIZEOF(uint4) == SIZEOF(jrec_suffix));
|
|
assert((MAX_JNL_REC_SIZE - MAX_LOGI_JNL_REC_SIZE) > MIN_PBLK_RECLEN);
|
|
assert((DISK_BLOCK_SIZE * JNL_DEF_ALIGNSIZE) >= MAX_JNL_REC_SIZE);/* default alignsize supports max jnl record length */
|
|
assert(MAX_DB_BLK_SIZE < MAX_JNL_REC_SIZE); /* Ensure a PBLK record can accommodate a full GDS block */
|
|
assert(MAX_JNL_REC_SIZE <= (1 << 24));
|
|
/* Ensure that the 24-bit length field in the journal record can accommodate the maximum journal record size */
|
|
assert(tcom_record.prefix.forwptr == tcom_record.suffix.backptr);
|
|
assert(TCOM_RECLEN == tcom_record.suffix.backptr);
|
|
assert(SIZEOF(token_split_t) == SIZEOF(token_build)); /* Required for TOKEN_SET macro */
|
|
}
|
|
|
|
/* gvcst_init
 *
 * Opens the database file that backs region "greg" and initializes this process' per-file
 * (sgmnt_addrs) and per-process bookkeeping for it.
 *
 * Parameter:
 *	greg	- region to open. On a successful return greg->open is TRUE and greg->opening is FALSE.
 *
 * Outline of what the code below does:
 *   1) Runs a set of design-assumption asserts, then calls dbfilopn(greg).
 *   2) If dbfilopn() hands back a different region:
 *	- NULL or (gd_region *)-1 : return right away without marking the region open.
 *	- otherwise the file is already open through "prev_reg": share its file_cntl, copy its file name,
 *	  merge the relevant gv_targets into csa->gvt_hashtab, copy selected file-header fields into greg,
 *	  set greg->was_open, bump csa->regcnt and return.
 *   3) Otherwise (first open of this file by this process): if inside TP while holding crit on some
 *	region, queue the region on tp_reg_list and restart via t_retry(cdb_sc_needcrit). Else run db_init()
 *	and the rest of the setup with interrupts deferred, until greg->open is set.
 *   4) With interrupts deferred: access-method specific setup, pending gv_target processing,
 *	db_common_init(), the upgrade performance warning, update-array sizing, non-TP journal format
 *	buffers, TP structures and global_tlvl_info_list.
 *   5) After interrupts are re-enabled: (re)allocate the reformat buffer if this BG region needs one,
 *	and compute csa->fid_index, keeping cs_addrs_list and tp_reg_list consistent with it.
 *
 * No value is returned; what happens on errors inside the called routines is not visible here.
 */
void gvcst_init(gd_region *greg)
{
	sgmnt_addrs		*csa, *prevcsa, *regcsa;
	sgmnt_data_ptr_t	csd;
#	ifdef VMS
	char			cs_data_buff[ROUND_UP(SGMNT_HDR_LEN, DISK_BLOCK_SIZE)];
	sgmnt_data_ptr_t	temp_cs_data;
#	endif
	uint4			segment_update_array_size;
	file_control		*fc;
	gd_region		*prev_reg, *reg_top;
#	ifdef DEBUG
	cache_rec_ptr_t		cr;
	bt_rec_ptr_t		bt;
	blk_ident		tmp_blk;
#	endif
	mstr			log_nam, trans_log_nam;
	char			trans_buff[MAX_FN_LEN + 1];
	unique_file_id		*greg_fid, *reg_fid;
	gd_addr			*addr_ptr;
	tp_region		*tr;
	ua_list			*tmp_ua;
	time_t			curr_time;
	uint4			curr_time_uint4, next_warn_uint4;
	unsigned int		minus1 = (unsigned)-1;
	enum db_acc_method	greg_acc_meth;
	ht_ent_mname		*tabent, *topent, *stayent;
	gv_namehead		*gvt, *gvt_stay;
	gvnh_reg_t		*gvnh_reg;
	hash_table_mname	*table;
	boolean_t		added, first_wasopen;
	intrpt_state_t		save_intrpt_ok_state;

	UNSUPPORTED_PLATFORM_CHECK;
	assert(!jgbl.forw_phase_recovery);
	CWS_INIT;	/* initialize the cw_stagnate hash-table */
	/* check the header design assumptions */
	assert(SIZEOF(th_rec) == (SIZEOF(bt_rec) - SIZEOF(bt->blkque)));
	assert(SIZEOF(cache_rec) == (SIZEOF(cache_state_rec) + SIZEOF(cr->blkque)));
	DEBUG_ONLY(assert_jrec_member_offsets();)
	set_num_additional_processors();

	DEBUG_ONLY(
		/* Note that the "block" member in the blk_ident structure in gdskill.h has 28 bits.
		 * Currently, the maximum number of blocks is 2**28. If ever this increases, something
		 * has to be correspondingly done to the "block" member to increase its capacity.
		 * The following assert checks that we always have space in the "block" member
		 * to represent a GDS block number.
		 */
		tmp_blk.block = minus1;
		assert(MAXTOTALBLKS_MAX - 1 <= tmp_blk.block);
	)
	/* TH_BLOCK is currently a hardcoded constant as basing it on the offsetof macro does not work with the VMS compiler.
	 * Therefore assert that TH_BLOCK points to the 512-byte block where the "trans_hist" member lies in the fileheader.
	 */
	assert(DIVIDE_ROUND_UP(offsetof(sgmnt_data, trans_hist), DISK_BLOCK_SIZE) == TH_BLOCK);
	if ((prev_reg = dbfilopn(greg)) != greg)
	{
		if (NULL == prev_reg || (gd_region *)-1L == prev_reg) /* (gd_region *)-1 == prev_reg => cm region open attempted */
			return;
		/* Found same database already open - prev_reg contains addr of originally opened region */
		greg->dyn.addr->file_cntl = prev_reg->dyn.addr->file_cntl;
		memcpy(greg->dyn.addr->fname, prev_reg->dyn.addr->fname, prev_reg->dyn.addr->fname_len);
		greg->dyn.addr->fname_len = prev_reg->dyn.addr->fname_len;
		csa = (sgmnt_addrs *)&FILE_INFO(greg)->s_addrs;
		PROCESS_GVT_PENDING_LIST(greg, csa, gvt_pending_list);
		csd = csa->hdr;
		if (NULL == csa->gvt_hashtab)
		{	/* Already have another region that points to the same physical database file as this one.
			 * Since two regions point to the same physical file, start maintaining a list of all global variable
			 * names whose gv_targets have already been allocated on behalf of the current database file.
			 * Future targ_allocs will check this list before they allocate (to avoid duplicate allocations).
			 */
			csa->gvt_hashtab = (hash_table_mname *)malloc(SIZEOF(hash_table_mname));
			init_hashtab_mname(csa->gvt_hashtab, 0, HASHTAB_NO_COMPACT, HASHTAB_NO_SPARE_TABLE);
			assert(1 == csa->regcnt);
			first_wasopen = TRUE;
		} else
			first_wasopen = FALSE;
		for (addr_ptr = get_next_gdr(NULL); addr_ptr; addr_ptr = get_next_gdr(addr_ptr))
		{
			table = addr_ptr->tab_ptr;
			for (tabent = table->base, topent = tabent + table->size; tabent < topent; tabent++)
			{
				if (HTENT_VALID_MNAME(tabent, gvnh_reg_t, gvnh_reg))
				{	/* Check if the gvt's region is the current region.
					 * If so add gvt's variable name into the csa hashtable.
					 */
					gvt = gvnh_reg->gvt;
					assert((gvnh_reg->gd_reg != greg) || (csa == gvt->gd_csa));
					/* If this is the first time a was_open region is happening for this csa, then
					 * we want to merge gv_targets from both the regions into csa->gvt_hashtab. For
					 * all future was_open cases, we want only to add gv_targets from the was_open region.
					 */
					if ((first_wasopen && (csa == gvt->gd_csa))
						|| (!first_wasopen && (gvnh_reg->gd_reg == greg)))
					{	/* Add gv_target into csa->gvt_hashtab */
						added = add_hashtab_mname(csa->gvt_hashtab, &gvt->gvname, gvt, &stayent);
						assert(!added || (1 <= gvt->regcnt));
						if (!added)
						{	/* Entry already present. Increment gvt->regcnt.
							 * If NOISOLATION status differs between the two,
							 * choose the more pessimistic one.
							 */
							gvt_stay = (gv_namehead *)stayent->value;
							assert(gvt_stay != gvt);
							if (FALSE == gvt->noisolation)
								gvt_stay->noisolation = FALSE;
							assert(1 <= gvt_stay->regcnt);
							/* Now make gvnh_reg->gvt point to gvt_stay (instead of gvt) */
							gvt_stay->regcnt++;
							gvt->regcnt--;
							gvnh_reg->gvt = gvt_stay;
							targ_free(gvt);
						}
					}
				}
			}
		}
		/* Mirror selected file-header fields into the region structure */
		greg->max_rec_size = csd->max_rec_size;
		greg->max_key_size = csd->max_key_size;
		greg->null_subs = csd->null_subs;
		greg->std_null_coll = csd->std_null_coll;
		greg->jnl_state = csd->jnl_state;
		greg->jnl_file_len = csd->jnl_file_len;	/* journal file name length */
		memcpy(greg->jnl_file_name, csd->jnl_file_name, greg->jnl_file_len);	/* journal file name */
		greg->jnl_alq = csd->jnl_alq;
		greg->jnl_deq = csd->jnl_deq;
		greg->jnl_buffer_size = csd->jnl_buffer_size;
		greg->jnl_before_image = csd->jnl_before_image;
		greg->open = TRUE;
		greg->opening = FALSE;
		greg->was_open = TRUE;
		assert(1 <= csa->regcnt);
		csa->regcnt++;	/* Increment # of regions that point to this csa */
		return;
	}
	GTM_FD_TRACE_ONLY(gtm_dbjnl_dupfd_check();)	/* check if any of db or jnl fds collide (D9I11-002714) */
	greg->was_open = FALSE;
	/* we shouldn't have crit on any region unless we are in TP and in the final retry or we are in
	 * mupip_set_journal trying to switch journals across all regions. Currently, there is no fine-granular
	 * checking for mupip_set_journal, hence a coarse MUPIP_IMAGE check for image_type
	 */
	assert((dollar_tlevel && (CDB_STAGNATE <= t_tries)) || IS_MUPIP_IMAGE || (0 == have_crit(CRIT_HAVE_ANY_REG)));
	if (dollar_tlevel && (0 != have_crit(CRIT_HAVE_ANY_REG)))
	{	/* To avoid deadlocks with currently holding crits and the DLM lock request to be done in db_init(),
		 * we should insert this region in the tp_reg_list and tp_restart should do the gvcst_init after
		 * having released crit on all regions. Note that this check should be done AFTER checking if the
		 * region has already been opened (i.e. greg->was_open = TRUE logic above) since in that case we don't
		 * do any heavyweight processing (like db_init which involves crit/DLM locks) and so don't need to restart.
		 */
		insert_region(greg, &tp_reg_list, &tp_reg_free_list, SIZEOF(tp_region));
		t_retry(cdb_sc_needcrit);
		assert(FALSE);	/* we should never reach here since t_retry should have unwound the M-stack and restarted the TP */
	}
	csa = (sgmnt_addrs *)&FILE_INFO(greg)->s_addrs;

#ifdef	NOLICENSE
	licensed = TRUE;
#else
	CRYPT_CHKSYSTEM;
#endif
	db_init_region = greg;	/* initialized for dbinit_ch */
	csa->hdr = NULL;
	csa->nl = NULL;
	csa->jnl = NULL;
	csa->persistent_freeze = FALSE;	/* want secshr_db_clnup() to clear an incomplete freeze/unfreeze codepath */
	csa->regcnt = 1;	/* At this point, only one region points to this csa */
#	ifdef VMS
	csa->db_addrs[0] = csa->db_addrs[1] = NULL;
	csa->lock_addrs[0] = csa->lock_addrs[1] = NULL;
	greg_acc_meth = greg->dyn.addr->acc_meth;
	assert(dba_cm != greg_acc_meth);
	/* Read the file header into a local buffer so it can be sanity checked before db_init() */
	temp_cs_data = (sgmnt_data_ptr_t)cs_data_buff;
	fc = greg->dyn.addr->file_cntl;
	fc->file_type = greg_acc_meth;
	fc->op = FC_READ;
	fc->op_buff = (sm_uc_ptr_t)temp_cs_data;
	fc->op_len = SIZEOF(*temp_cs_data);
	fc->op_pos = 1;
	dbfilop(fc);
	DO_BADDBVER_CHK(greg, temp_cs_data);
	DO_DB_HDR_CHECK(greg, temp_cs_data);	/* Basic sanity check on the file header fields */
	if (greg_acc_meth != temp_cs_data->acc_meth)
	{	/* The access method recorded in the file header overrides the one in the region */
		greg_acc_meth = temp_cs_data->acc_meth;
		greg->dyn.addr->acc_meth = greg_acc_meth;
	}
#	endif
	/* Here's the shared memory layout:
	 *
	 * low address
	 *
	 * both
	 *	segment_data
	 *	(file_header)
	 *	MM_BLOCK
	 *	(master_map)
	 *	TH_BLOCK
	 * BG
	 *	bt_header
	 *	(bt_buckets * bt_rec)
	 *	th_base (SIZEOF(que_ent) into an odd bt_rec)
	 *	bt_base
	 *	(n_bts * bt_rec)
	 *	LOCK_BLOCK (lock_space)
	 *	(lock_space_size)
	 *	cs_addrs->acc_meth.bg.cache_state
	 *	(cache_que_heads)
	 *	(bt_buckets * cache_rec)
	 *	(n_bts * cache_rec)
	 *	critical
	 *	(mutex_struct)
	 *	nl
	 *	(node_local)
	 *	[jnl_name
	 *	jnl_buffer]
	 * MM
	 *	file contents
	 *	LOCK_BLOCK (lock_space)
	 *	(lock_space_size)
	 *	cs_addrs->acc_meth.mm.mmblk_state
	 *	(mmblk_que_heads)
	 *	(bt_buckets * mmblk_rec)
	 *	(n_bts * mmblk_rec)
	 *	critical
	 *	(mutex_struct)
	 *	nl
	 *	(node_local)
	 *	[jnl_name
	 *	jnl_buffer]
	 * high address
	 */
	/* Ensure first 3 members (upto now_running) of node_local are at the same offset for any version.
	 *
	 *	Structure ----> node_local <----    size 59392 [0xe800]
	 *
	 *	offset = 0000 [0x0000]      size = 0012 [0x000c]    ----> node_local.label
	 *	offset = 0012 [0x000c]      size = 0256 [0x0100]    ----> node_local.fname
	 *	offset = 0268 [0x010c]      size = 0036 [0x0024]    ----> node_local.now_running
	 *
	 * This is so that the VERMISMATCH error can be successfully detected in db_init/mu_rndwn_file
	 * and so that the db-file-name can be successfully obtained from orphaned shm by mu_rndwn_all.
	 */
	assert(0 == OFFSETOF(node_local, label[0]));
	assert(12 == SIZEOF(((node_local *)NULL)->label));
	assert(12 == GDS_LABEL_SZ);
	assert(12 == OFFSETOF(node_local, fname[0]));
	assert(256 == SIZEOF(((node_local *)NULL)->fname));
	assert(256 == (MAX_FN_LEN + 1));
	assert(268 == OFFSETOF(node_local, now_running[0]));
	assert(36 == SIZEOF(((node_local *)NULL)->now_running));
	assert(36 == MAX_REL_NAME);
	UNIX_ONLY(START_HEARTBEAT_IF_NEEDED;)
	/* Protect the db_init and the code below until we set greg->open to TRUE. This is needed as otherwise,
	 * if a MUPIP STOP is issued to this process at a time-window when db_init is completed but greg->open
	 * is NOT set to TRUE, will cause gds_rundown NOT to clean up the shared memory created by db_init and
	 * thus would be left over in the system.
	 */
	DEFER_INTERRUPTS(INTRPT_IN_GVCST_INIT);
	VMS_ONLY(db_init(greg, temp_cs_data);)
	UNIX_ONLY(db_init(greg);)
	crash_count = csa->critical->crashcnt;
	csa->regnum = ++region_open_count;
	csd = csa->hdr;
#	ifdef GTM_TRIGGER
	/* Take copy of db trigger cycle into csa at db startup. Any concurrent changes to the
	 * db trigger cycle (by MUPIP TRIGGER) will be detected at tcommit (t_end/tp_tend) time.
	 */
	csa->db_trigger_cycle = csd->db_trigger_cycle;
#	endif
	/* set csd and fill in selected fields */
	assert(greg->dyn.addr->acc_meth == csd->acc_meth); /* db_init should have made sure this assert holds good */
	greg_acc_meth = csd->acc_meth;
	switch (greg_acc_meth)
	{
		case dba_mm:
			csa->acc_meth.mm.base_addr = (sm_uc_ptr_t)((sm_ulong_t)csd + (int)(csd->start_vbn - 1) * DISK_BLOCK_SIZE);
			break;
		case dba_bg:
			db_csh_ini(csa);
			break;
		default:
			GTMASSERT;
	}
	/* It is necessary that we do the pending gv_target list reallocation BEFORE db_common_init as the latter resets
	 * greg->max_key_size to be equal to the csd->max_key_size and hence process_gvt_pending_list might wrongly conclude
	 * that NO reallocation (since it checks greg->max_key_size with csd->max_key_size) is needed when in fact a
	 * reallocation might be necessary (if the user changed max_key_size AFTER database creation)
	 */
	PROCESS_GVT_PENDING_LIST(greg, csa, gvt_pending_list);
	db_common_init(greg, csa, csd);	/* do initialization common to db_init() and mu_rndwn_file() */

	/* If we are not fully upgraded, see if we need to send a warning to the operator console about
	 * performance. Compatibility mode is a known performance drain. Actually, we can send one of two
	 * messages. If the desired_db_format is for an earlier release than the current release, we send
	 * a performance warning that this mode degrades performance. However, if the desired_db_format is
	 * for the current version but there are blocks to convert still, we send a gentle reminder that
	 * running mupip reorg upgrade would be a good idea to get the full performance benefit of V5.
	 */
	time(&curr_time);
	assert(MAXUINT4 > curr_time);
	curr_time_uint4 = (uint4)curr_time;
	next_warn_uint4 = csd->next_upgrd_warn.cas_time;
	if (!csd->fully_upgraded && curr_time_uint4 > next_warn_uint4
		&& COMPSWAP_LOCK(&csd->next_upgrd_warn.time_latch, next_warn_uint4, 0, (curr_time_uint4 + UPGRD_WARN_INTERVAL), 0))
	{	/* The msg is due and we have successfully updated the next time interval */
		if (GDSVCURR != csd->desired_db_format)
			send_msg(VARLSTCNT(4) ERR_DBVERPERFWARN1, 2, DB_LEN_STR(greg));
		else
			send_msg(VARLSTCNT(4) ERR_DBVERPERFWARN2, 2, DB_LEN_STR(greg));
	}

	/* Compute the maximum journal space requirements for a PBLK (including possible ALIGN record).
	 * Use this variable in the TOTAL_TPJNL_REC_SIZE and TOTAL_NONTP_JNL_REC_SIZE macros instead of recomputing.
	 */
	csa->pblk_align_jrecsize = (int4)MIN_PBLK_RECLEN + csd->blk_size + (int4)MIN_ALIGN_RECLEN;
	segment_update_array_size = UA_SIZE(csd);

	if (first_ua == NULL)
	{	/* first open of first database - establish an update array system */
		assert(update_array == NULL);
		assert(update_array_ptr == NULL);
		assert(update_array_size == 0);
		tmp_ua = (ua_list *)malloc(SIZEOF(ua_list));
		memset(tmp_ua, 0, SIZEOF(ua_list));	/* initialize tmp_ua->update_array and tmp_ua->next_ua to NULL */
		tmp_ua->update_array = (char *)malloc(segment_update_array_size);
		tmp_ua->update_array_size = segment_update_array_size;
		/* assign global variables only after malloc() succeeds */
		update_array_size = cumul_update_array_size = segment_update_array_size;
		update_array = update_array_ptr = tmp_ua->update_array;
		first_ua = curr_ua = tmp_ua;
	} else
	{	/* there's already an update_array system in place */
		assert(update_array != NULL);
		assert(update_array_size != 0);
		if (!dollar_tlevel && segment_update_array_size > first_ua->update_array_size)
		{
			/* no transaction in progress and the current array is too small - replace it */
			assert(first_ua->update_array == update_array);
			assert(first_ua->update_array_size == update_array_size);
			assert(first_ua->next_ua == NULL);
			tmp_ua = first_ua;
			first_ua = curr_ua = NULL;
			free(update_array);
			tmp_ua->update_array = update_array = update_array_ptr = NULL;
			tmp_ua->update_array = (char *)malloc(segment_update_array_size);
			tmp_ua->update_array_size = segment_update_array_size;
			/* assign global variables only after malloc() succeeds */
			update_array_size = cumul_update_array_size = segment_update_array_size;
			update_array = update_array_ptr = tmp_ua->update_array;
			first_ua = curr_ua = tmp_ua;
		}
	}
	assert(global_tlvl_info_list || !csa->sgm_info_ptr);
	if (JNL_ALLOWED(csa))
	{
		if (NULL == non_tp_jfb_ptr)
		{
			non_tp_jfb_ptr = (jnl_format_buffer *)malloc(SIZEOF(jnl_format_buffer));
			non_tp_jfb_buff_ptr = (unsigned char *)malloc(MAX_JNL_REC_SIZE);
			non_tp_jfb_ptr->buff = (char *)non_tp_jfb_buff_ptr;
			/* If the journal records need to be encrypted in the journal file and if replication is in use,
			 * we will need access to both the encrypted (for the journal file) and unencrypted (for the
			 * journal pool) journal record contents. Since this code is executed only once (for the first
			 * journaled database opened) by this process, we will have to allocate an alternate buffer
			 * for this purpose (to hold the unencrypted data) as long as this GT.M version supports encryption.
			 */
			GTMCRYPT_ONLY(
				non_tp_jfb_ptr->alt_buff = (char *)malloc(MAX_JNL_REC_SIZE);
			)
			non_tp_jfb_ptr->record_size = 0;	/* initialize it to 0 since TOTAL_NONTPJNL_REC_SIZE macro uses it */
		}
		/* csa->min_total_tpjnl_rec_size represents the minimum journal buffer space needed for a TP transaction.
		 * It is a conservative estimate assuming that one ALIGN record and one PINI record will be written for
		 * one set of fixed size jnl records written.
		 * si->total_jnl_rec_size is initialized/reinitialized to this value here and in tp_clean_up().
		 * The purpose of this field is to avoid recomputation of the variable in tp_clean_up().
		 * In addition to this, space requirements for whatever journal records get formatted as part of
		 * jnl_format() need to be taken into account.
		 * This is done in jnl_format() where si->total_jnl_rec_size is appropriately incremented.
		 */
		csa->min_total_tpjnl_rec_size = PINI_RECLEN + TCOM_RECLEN + MIN_ALIGN_RECLEN;
		/* Similarly csa->min_total_nontpjnl_rec_size represents the minimum journal buffer space needed
		 * for a non-TP transaction.
		 * It is a conservative estimate assuming that one ALIGN record and one PINI record will be written for
		 * one set of fixed size jnl records written.
		 */
		csa->min_total_nontpjnl_rec_size = PINI_RECLEN + MIN_ALIGN_RECLEN;
	}
	if (tp_in_use || !IS_GTM_IMAGE)
		gvcst_tp_init(greg);	/* Initialize TP structures, else postpone till TP is used (only if GTM) */
	if (!global_tlvl_info_list)
	{
		global_tlvl_info_list = (buddy_list *)malloc(SIZEOF(buddy_list));
		initialize_list(global_tlvl_info_list, SIZEOF(global_tlvl_info), GBL_TLVL_INFO_LIST_INIT_ALLOC);
	}
	greg->open = TRUE;
	greg->opening = FALSE;
	/* gds_rundown if invoked from now on will take care of cleaning up the shared memory segment */
	ENABLE_INTERRUPTS(INTRPT_IN_GVCST_INIT);
	if (dba_bg == greg_acc_meth)
	{	/* Check if (a) this region has non-upgraded blocks and if so, (b) the reformat buffer exists and
		 * (c) if it is big enough to deal with this region. If the region does not have any non-upgraded
		 * block (blks_to_upgrd is 0) we will not allocate the buffer at this time. Note that this opens up
		 * a small window for errors. If this buffer is not allocated and someone turns on compatibility
		 * mode and before the process can discover this and allocate the buffer, it runs out of memory,
		 * errors out and finds it is responsible for running down the database, it could fail on a recursive
		 * memory error when it tries to allocate the block. This is (to me) an acceptable risk as it is
		 * very low and compares favorably to the cost of every process allocating a database block sized
		 * chunk of private storage that will be seldom if ever used (SE 3/2005).
		 */
		if (0 != csd->blks_to_upgrd && csd->blk_size > reformat_buffer_len)
		{	/* Buffer not big enough (or does not exist) .. get a new one releasing old if it exists */
			assert(0 == fast_lock_count);	/* this is mainline (non-interrupt) code */
			++fast_lock_count;		/* No interrupts across this use of reformat_buffer */
			/* reformat_buffer_in_use should always be incremented only AFTER incrementing fast_lock_count
			 * as it is the latter that prevents interrupts from using the reformat buffer. Similarly
			 * the decrement of fast_lock_count should be done AFTER decrementing reformat_buffer_in_use.
			 */
			assert(0 == reformat_buffer_in_use);
			DEBUG_ONLY(reformat_buffer_in_use++;)
			if (reformat_buffer)
			{	/* Different blksized databases in use .. keep only largest one.
				 * Reset the globals right after the free so they never describe released memory
				 * should the malloc() below not complete (same discipline as the update_array
				 * replacement above, which assigns its globals only after malloc() succeeds).
				 */
				free(reformat_buffer);
				reformat_buffer = NULL;
				reformat_buffer_len = 0;
			}
			reformat_buffer = malloc(csd->blk_size);
			reformat_buffer_len = csd->blk_size;
			DEBUG_ONLY(reformat_buffer_in_use--;)
			assert(0 == reformat_buffer_in_use);
			--fast_lock_count;
		}

	}
	if ((dba_bg == greg_acc_meth) || (dba_mm == greg_acc_meth))
	{
		/* Determine fid_index of current region's file_id across sorted file_ids of all regions open until now.
		 * All regions which have a file_id lesser than that of current region will have no change to their fid_index
		 * All regions which have a file_id greater than that of current region will have their fid_index incremented by 1
		 * The fid_index determination algorithm below has an optimization in that if the current region's file_id is
		 * determined to be greater than that of a particular region, then all regions whose fid_index is lesser
		 * than that particular region's fid_index are guaranteed to have a lesser file_id than the current region
		 * so we do not compare those against the current region's file_id.
		 * Note that the sorting is done only on DB/MM regions. GT.CM/DDP regions should not be part of TP transactions,
		 * hence they will not be sorted.
		 */
		prevcsa = NULL;
		greg_fid = &(csa->nl->unique_id);
		for (regcsa = cs_addrs_list; NULL != regcsa; regcsa = regcsa->next_csa)
		{
			if ((NULL != prevcsa) && (regcsa->fid_index < prevcsa->fid_index))
				continue;
			reg_fid = &((regcsa)->nl->unique_id);
			VMS_ONLY(if (0 < memcmp(&(greg_fid->file_id), (char *)&(reg_fid->file_id), SIZEOF(gd_id))))
			UNIX_ONLY(if (0 < gdid_cmp(&(greg_fid->uid), &(reg_fid->uid))))
			{
				if ((NULL == prevcsa) || (regcsa->fid_index > prevcsa->fid_index))
					prevcsa = regcsa;
			} else
				regcsa->fid_index++;
		}
		if (NULL == prevcsa)
			csa->fid_index = 1;
		else
			csa->fid_index = prevcsa->fid_index + 1;
		/* Add current csa into list of open csas */
		csa->next_csa = cs_addrs_list;
		cs_addrs_list = csa;
		/* Also update tp_reg_list fid_index's as insert_region relies on it */
		for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr)
			tr->file.fid_index = (&FILE_INFO(tr->reg)->s_addrs)->fid_index;
		DBG_CHECK_TP_REG_LIST_SORTING(tp_reg_list);
	}
	return;
}