/****************************************************************
 *								*
 *	Copyright 2001, 2012 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include <stddef.h>		/* for offsetof macro */

#include "gtm_string.h"
#include "gtm_time.h"

#include "cdb_sc.h"
#include "gdsroot.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "gdsblk.h"
#include "gdskill.h"
#include "gdscc.h"
#include "min_max.h"		/* needed for gdsblkops.h */
#include "gdsblkops.h"
#include "filestruct.h"
#include "iosp.h"
#include "jnl.h"
#include "hashtab_int4.h"	/* needed for tp.h */
#include "buddy_list.h"		/* needed for tp.h */
#include "tp.h"
#include "gtm_stdlib.h"		/* for ATOI */
#include "cryptdef.h"
#include "mlkdef.h"
#include "error.h"
#include "gt_timer.h"
#include "gtmimagename.h"
#include "trans_log_name.h"
#include "gtm_logicals.h"
#include "dbfilop.h"
#include "set_num_additional_processors.h"
#include "have_crit.h"
#include "t_retry.h"
#include "dpgbldir.h"
#include "longset.h"		/* needed for cws_insert.h */
#include "cws_insert.h"		/* for CWS_INIT macro */
#include "gvcst_protos.h"	/* for gvcst_init,gvcst_init_sysops,gvcst_tp_init prototype */
#include "compswap.h"
#include "send_msg.h"
#include "targ_alloc.h"		/* for "targ_free" prototype */
#include "hashtab_mname.h"
#include "process_gvt_pending_list.h"
#include "gtmmsg.h"
#ifdef UNIX
#include "heartbeat_timer.h"
#include "anticipatory_freeze.h"
#endif
#ifdef GTM_FD_TRACE
#include "gtm_dbjnl_dupfd_check.h"
#endif

GBLREF	gd_region		*gv_cur_region, *db_init_region;
GBLREF	sgmnt_data_ptr_t	cs_data;
GBLREF	sgmnt_addrs		*cs_addrs;
GBLREF	sgmnt_addrs		*cs_addrs_list;
GBLREF	boolean_t		gtcm_connection;
GBLREF	bool			licensed;
GBLREF	int4			lkid;
GBLREF	char			*update_array, *update_array_ptr;
GBLREF	uint4			update_array_size, cumul_update_array_size;
GBLREF	ua_list			*first_ua, *curr_ua;
GBLREF	short			crash_count;
GBLREF	uint4			dollar_tlevel;
GBLREF	jnl_format_buffer	*non_tp_jfb_ptr;
GBLREF	unsigned char		*non_tp_jfb_buff_ptr;
GBLREF	boolean_t		mupip_jnl_recover;
GBLREF	buddy_list		*global_tlvl_info_list;
GBLREF	tp_region		*tp_reg_free_list;	/* Ptr to list of tp_regions that are unused */
GBLREF	tp_region		*tp_reg_list;		/* Ptr to list of tp_regions for this transaction */
GBLREF	unsigned int		t_tries;
GBLREF	struct_jrec_tcom	tcom_record;
GBLREF	boolean_t		tp_in_use;
GBLREF	uint4			region_open_count;
GBLREF	sm_uc_ptr_t		reformat_buffer;
GBLREF	int			reformat_buffer_len;
GBLREF	volatile int		reformat_buffer_in_use;	/* used only in DEBUG mode */
GBLREF	volatile int4		fast_lock_count;
GBLREF	gvt_container		*gvt_pending_list;
GBLREF	boolean_t		dse_running;
GBLREF	jnl_gbls_t		jgbl;
#ifdef UNIX
GBLREF	boolean_t		pool_init;
GBLREF	boolean_t		jnlpool_init_needed;
GBLREF	jnlpool_addrs		jnlpool;
#endif

LITREF	char			gtm_release_name[];
LITREF	int4			gtm_release_name_len;

error_def(ERR_BADDBVER);
error_def(ERR_DBCREINCOMP);
error_def(ERR_DBFLCORRP);
error_def(ERR_DBNOTGDS);
error_def(ERR_DBVERPERFWARN1);
error_def(ERR_DBVERPERFWARN2);
error_def(ERR_MMNODYNUPGRD);

/* assert_jrec_member_offsets
 *
 * Collection of sanity checks on the layout and size assumptions of the journal file header and the journal
 * record structures. The body consists solely of assert() invocations; it takes no parameters and returns nothing.
 * In this file it is invoked from gvcst_init() wrapped in DEBUG_ONLY.
 */
void	assert_jrec_member_offsets(void)
{
	assert(REAL_JNL_HDR_LEN % DISK_BLOCK_SIZE == 0);
	assert(JNL_HDR_LEN % DISK_BLOCK_SIZE == 0);
	/* We currently assume that the journal file header size is aligned relative to the filesystem block size.
	 * which is currently assumed to be a 2-power (e.g. 512 bytes, 1K, 2K, 4K etc.) but never more than 64K
	 * (MAX_IO_BLOCK_SIZE). Given this, we keep the journal file header size at 64K for Unix and 512-byte aligned
	 * for VMS. This way any process updating the file header will hold crit and do aligned writes. Any process
	 * writing the journal file data (journal records) on disk will hold the qio lock and can safely do so without
	 * ever touching the journal file header area. If ever MAX_IO_BLOCK_SIZE changes (say because some filesystem
	 * block size changes to 128K) such that JNL_HDR_LEN is no longer aligned to that, we want to know hence this assert.
	 */
	assert(JNL_HDR_LEN % MAX_IO_BLOCK_SIZE == 0);
	assert(REAL_JNL_HDR_LEN == SIZEOF(jnl_file_header));
	UNIX_ONLY(assert(REAL_JNL_HDR_LEN <= JNL_HDR_LEN);)
	VMS_ONLY(assert(REAL_JNL_HDR_LEN == JNL_HDR_LEN);)
	assert(JNL_HDR_LEN == JNL_FILE_FIRST_RECORD);
	assert(DISK_BLOCK_SIZE >= PINI_RECLEN + EPOCH_RECLEN + PFIN_RECLEN + EOF_RECLEN);
	assert((JNL_ALLOC_MIN * DISK_BLOCK_SIZE) > JNL_HDR_LEN);
	/* Following assert is for JNL_FILE_TAIL_PRESERVE macro in tp.h */
	assert(PINI_RECLEN >= EPOCH_RECLEN && PINI_RECLEN >= PFIN_RECLEN && PINI_RECLEN >= EOF_RECLEN);
	/* jnl_string structure has a 8-bit nodeflags field and a 24-bit length field. In some cases, this is
	 * used as a 32-bit length field (e.g. in the value part of the SET record or ZTWORMHOLE record). These
	 * usages treat the 32-bits as a jnl_str_len_t type and access it directly. Hence the requirement that
	 * jnl_str_len_t be the same size as 32-bits and also the same as the offset to the "text" member.
	 * If this assert fails, all places that reference jnl_str_len_t need to be revisited.
	 */
	assert(SIZEOF(jnl_str_len_t) == SIZEOF(uint4));
	assert(SIZEOF(jnl_str_len_t) == offsetof(jnl_string, text[0]));
	/* since time in jnl record is a uint4, and since JNL_SHORT_TIME expects time_t, we better ensure they are same.
	 * A change in the size of time_t would mean a redesign of the fields.
	 */
	assert(SIZEOF(time_t) == GTM64_ONLY(SIZEOF(gtm_int8)) NON_GTM64_ONLY(SIZEOF(int4)));
	/* Make sure all jnl_seqno fields start at same offset. mur_output_record and others rely on this. */
	assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_upd, token_seq.jnl_seqno));
	assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_epoch, jnl_seqno));
	assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_eof, jnl_seqno));
	assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_tcom, token_seq.jnl_seqno));
	assert(offsetof(struct_jrec_null, jnl_seqno) == offsetof(struct_jrec_ztworm, token_seq.jnl_seqno));
	/* Make sure all strm_seqno fields start at same offset. Lot of modules rely on this */
	assert(offsetof(struct_jrec_null, strm_seqno) == offsetof(struct_jrec_upd, strm_seqno));
	assert(offsetof(struct_jrec_null, strm_seqno) == offsetof(struct_jrec_tcom, strm_seqno));
	assert(offsetof(struct_jrec_null, strm_seqno) == offsetof(struct_jrec_ztworm, strm_seqno));
	/* EOF and EPOCH are not included in the above asserts because they have not ONE but 16 strm_seqno values each */
	assert(offsetof(struct_jrec_ztcom, token) == offsetof(struct_jrec_upd, token_seq));
	/* Make sure all jnl_seqno and token fields start at 8-byte boundary */
	assert(offsetof(struct_jrec_upd, token_seq.jnl_seqno) ==
		(ROUND_UP(offsetof(struct_jrec_upd, token_seq.jnl_seqno), SIZEOF(seq_num))));
	assert(offsetof(struct_jrec_tcom, token_seq.jnl_seqno) ==
		(ROUND_UP(offsetof(struct_jrec_tcom, token_seq.jnl_seqno), SIZEOF(seq_num))));
	assert(offsetof(struct_jrec_null, jnl_seqno) ==
		(ROUND_UP(offsetof(struct_jrec_null, jnl_seqno), SIZEOF(seq_num))));
	assert(offsetof(struct_jrec_epoch, jnl_seqno) ==
		(ROUND_UP(offsetof(struct_jrec_epoch, jnl_seqno), SIZEOF(seq_num))));
	assert(offsetof(struct_jrec_eof, jnl_seqno) ==
		(ROUND_UP(offsetof(struct_jrec_eof, jnl_seqno), SIZEOF(seq_num))));
	/* All fixed size records must be multiple of 8-byte */
	assert(TCOM_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_tcom), JNL_REC_START_BNDRY)));
	assert(ZTCOM_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_ztcom), JNL_REC_START_BNDRY)));
	assert(INCTN_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_inctn), JNL_REC_START_BNDRY)));
	assert(PINI_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_pini), JNL_REC_START_BNDRY)));
	assert(PFIN_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_pfin), JNL_REC_START_BNDRY)));
	assert(NULL_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_null), JNL_REC_START_BNDRY)));
	assert(EPOCH_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_epoch), JNL_REC_START_BNDRY)));
	assert(EOF_RECLEN == (ROUND_UP(SIZEOF(struct_jrec_eof), JNL_REC_START_BNDRY)));
	/* Assert following comment which is relied upon in JNL_FILE_TAIL_PRESERVE macro.
	 * "We know PINI_RECLEN is maximum of EPOCH_RECLEN, PFIN_RECLEN, EOF_RECLEN"
	 */
	assert(PINI_RECLEN > EPOCH_RECLEN);
	assert(PINI_RECLEN > PFIN_RECLEN);
	assert(PINI_RECLEN > EOF_RECLEN);
	/* Assumption about the structures in code */
	assert(0 == MIN_ALIGN_RECLEN % JNL_REC_START_BNDRY);
	assert(SIZEOF(uint4) == SIZEOF(jrec_suffix));
	assert((MAX_JNL_REC_SIZE - MAX_LOGI_JNL_REC_SIZE) > MIN_PBLK_RECLEN);
	assert((DISK_BLOCK_SIZE * JNL_DEF_ALIGNSIZE) >= MAX_JNL_REC_SIZE);/* default alignsize supports max jnl record length */
	assert(MAX_DB_BLK_SIZE < MAX_JNL_REC_SIZE);	/* Ensure a PBLK record can accommodate a full GDS block */
	assert(MAX_JNL_REC_SIZE <= (1 << 24));
		/* Ensure that the 24-bit length field in the journal record can accommodate the maximum journal record size */
	assert(tcom_record.prefix.forwptr == tcom_record.suffix.backptr);
	assert(TCOM_RECLEN == tcom_record.suffix.backptr);
	assert(SIZEOF(token_split_t) == SIZEOF(token_build));	/* Required for TOKEN_SET macro */
}

/* gvcst_init
 *
 * Opens the database file that backs region "greg" and sets up the per-process structures associated with it.
 *
 * Parameter:
 *	greg	- the region to be opened.
 *
 * Outline of the flow visible in this routine:
 *   1) dbfilopn(greg) is invoked. If it returns something other than greg :
 *	a) NULL or (gd_region *)-1 : return right away.
 *	b) any other region : the same database file is already open through that region. greg is made to share
 *	   that region's file_cntl and file name, gv_targets are merged into csa->gvt_hashtab, selected file header
 *	   fields are copied into greg, greg is marked open (WAS_OPEN_TRUE), csa->regcnt is bumped and we return.
 *   2) Otherwise this is a fresh open. If we are inside TP and hold crit on some region, the region is inserted
 *	into tp_reg_list and t_retry(cdb_sc_needcrit) is invoked (not expected to return).
 *   3) With interrupts deferred (INTRPT_IN_GVCST_INIT), db_init() is invoked, access-method specific setup is done,
 *	the pending gv_target list is processed, db_common_init() is invoked, the update array / non-TP journal
 *	format buffer / global tlvl-info list are allocated if needed and greg is marked open (WAS_OPEN_FALSE).
 *   4) After interrupts are re-enabled, the reformat buffer is (re)allocated if needed (BG only), the fid_index of
 *	this csa relative to all csas in cs_addrs_list is determined, the csa is added to cs_addrs_list and
 *	(Unix only) the journal pool is validated if applicable.
 *
 * The order of the statements between DEFER_INTERRUPTS and ENABLE_INTERRUPTS matters (see comments in-line).
 */
void gvcst_init(gd_region *greg)
{
	sgmnt_addrs		*csa, *prevcsa, *regcsa;
	sgmnt_data_ptr_t	csd;
#	ifdef VMS
	char			cs_data_buff[ROUND_UP(SGMNT_HDR_LEN, DISK_BLOCK_SIZE)];
	sgmnt_data_ptr_t	temp_cs_data;
#	endif
	uint4			segment_update_array_size;
	file_control		*fc;
	gd_region		*prev_reg, *reg_top;
#	ifdef DEBUG
	cache_rec_ptr_t		cr;
	bt_rec_ptr_t		bt;
	blk_ident		tmp_blk;
#	endif
	mstr			log_nam, trans_log_nam;
	char			trans_buff[MAX_FN_LEN + 1];
	unique_file_id		*greg_fid, *reg_fid;
	gd_addr			*addr_ptr;
	tp_region		*tr;
	ua_list			*tmp_ua;
	time_t			curr_time;
	uint4			curr_time_uint4, next_warn_uint4;
	unsigned int		minus1 = (unsigned)-1;
	enum db_acc_method	greg_acc_meth;
	ht_ent_mname		*tabent, *topent, *stayent;
	gv_namehead		*gvt, *gvt_stay;
	gvnh_reg_t		*gvnh_reg;
	hash_table_mname	*table;
	boolean_t		added, first_wasopen, onln_rlbk_cycle_mismatch = FALSE;
	intrpt_state_t		save_intrpt_ok_state;
#	ifdef UNIX
	replpool_identifier	replpool_id;
	unsigned int		full_len;
#	endif
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	UNSUPPORTED_PLATFORM_CHECK;
	assert(!jgbl.forw_phase_recovery);
	CWS_INIT;	/* initialize the cw_stagnate hash-table */
	/* check the header design assumptions */
	assert(SIZEOF(th_rec) == (SIZEOF(bt_rec) - SIZEOF(bt->blkque)));
	assert(SIZEOF(cache_rec) == (SIZEOF(cache_state_rec) + SIZEOF(cr->blkque)));
	DEBUG_ONLY(assert_jrec_member_offsets();)
	set_num_additional_processors();
	DEBUG_ONLY(
		/* Note that the "block" member in the blk_ident structure in gdskill.h has 30 bits.
		 * Currently, the maximum number of blocks is 2**30. If ever this increases, something
		 * has to be correspondingly done to the "block" member to increase its capacity.
		 * The following assert checks that we always have space in the "block" member
		 * to represent a GDS block number.
		 */
		tmp_blk.block = minus1;
		assert(MAXTOTALBLKS_MAX - 1 <= tmp_blk.block);
	)
	/* TH_BLOCK is currently a hardcoded constant as basing it on the offsetof macro does not work with the VMS compiler.
	 * Therefore assert that TH_BLOCK points to the 512-byte block where the "trans_hist" member lies in the fileheader.
	 */
	assert(DIVIDE_ROUND_UP(offsetof(sgmnt_data, trans_hist), DISK_BLOCK_SIZE) == TH_BLOCK);
	if ((prev_reg = dbfilopn(greg)) != greg)
	{
		if ((NULL == prev_reg) || ((gd_region *)-1L == prev_reg))
			/* (gd_region *)-1 == prev_reg => cm region open attempted */
			return;
		/* Found same database already open - prev_reg contains addr of originally opened region */
		greg->dyn.addr->file_cntl = prev_reg->dyn.addr->file_cntl;
		memcpy(greg->dyn.addr->fname, prev_reg->dyn.addr->fname, prev_reg->dyn.addr->fname_len);
		greg->dyn.addr->fname_len = prev_reg->dyn.addr->fname_len;
		csa = (sgmnt_addrs *)&FILE_INFO(greg)->s_addrs;
		PROCESS_GVT_PENDING_LIST(greg, csa, gvt_pending_list);
		csd = csa->hdr;
		if (NULL == csa->gvt_hashtab)
		{	/* Already have another region that points to the same physical database file as this one.
			 * Since two regions point to the same physical file, start maintaining a list of all global variable
			 * names whose gv_targets have already been allocated on behalf of the current database file.
			 * Future targ_allocs will check this list before they allocate (to avoid duplicate allocations).
			 */
			csa->gvt_hashtab = (hash_table_mname *)malloc(SIZEOF(hash_table_mname));
			init_hashtab_mname(csa->gvt_hashtab, 0, HASHTAB_NO_COMPACT, HASHTAB_NO_SPARE_TABLE);
			assert(1 == csa->regcnt);
			first_wasopen = TRUE;
		} else
			first_wasopen = FALSE;
		for (addr_ptr = get_next_gdr(NULL); addr_ptr; addr_ptr = get_next_gdr(addr_ptr))
		{
			table = addr_ptr->tab_ptr;
			for (tabent = table->base, topent = tabent + table->size; tabent < topent; tabent++)
			{
				if (HTENT_VALID_MNAME(tabent, gvnh_reg_t, gvnh_reg))
				{	/* Check if the gvt's region is the current region.
					 * If so add gvt's variable name into the csa hashtable.
					 */
					gvt = gvnh_reg->gvt;
					assert((gvnh_reg->gd_reg != greg) || (csa == gvt->gd_csa));
					/* If this is the first time a was_open region is happening for this csa, then
					 * we want to merge gv_targets from both the regions into csa->gvt_hashtab. For
					 * all future was_open cases, we want only to add gv_targets from the was_open region.
					 */
					if ((first_wasopen && (csa == gvt->gd_csa))
						|| (!first_wasopen && (gvnh_reg->gd_reg == greg)))
					{	/* Add gv_target into csa->gvt_hashtab */
						added = add_hashtab_mname(csa->gvt_hashtab, &gvt->gvname, gvt, &stayent);
						assert(!added || (1 <= gvt->regcnt));
						if (!added)
						{	/* Entry already present. Increment gvt->regcnt.
							 * If NOISOLATION status differs between the two,
							 * choose the more pessimistic one.
							 */
							gvt_stay = (gv_namehead *)stayent->value;
							assert(gvt_stay != gvt);
							if (FALSE == gvt->noisolation)
								gvt_stay->noisolation = FALSE;
							assert(1 <= gvt_stay->regcnt);
							/* Now make gvnh_reg->gvt point to gvt_stay (instead of gvt) */
							gvt_stay->regcnt++;
							gvt->regcnt--;
							gvnh_reg->gvt = gvt_stay;
							targ_free(gvt);
						}
					}
				}
			}
		}
		greg->max_rec_size = csd->max_rec_size;
		greg->max_key_size = csd->max_key_size;
		greg->null_subs = csd->null_subs;
		greg->std_null_coll = csd->std_null_coll;
		greg->jnl_state = csd->jnl_state;
		greg->jnl_file_len = csd->jnl_file_len;		/* journal file name length */
		memcpy(greg->jnl_file_name, csd->jnl_file_name, greg->jnl_file_len);	/* journal file name */
		greg->jnl_alq = csd->jnl_alq;
		greg->jnl_deq = csd->jnl_deq;
		greg->jnl_buffer_size = csd->jnl_buffer_size;
		greg->jnl_before_image = csd->jnl_before_image;
		SET_REGION_OPEN_TRUE(greg, WAS_OPEN_TRUE);
		assert(1 <= csa->regcnt);
		csa->regcnt++;	/* Increment # of regions that point to this csa */
		return;
	}
	GTM_FD_TRACE_ONLY(gtm_dbjnl_dupfd_check();)	/* check if any of db or jnl fds collide (D9I11-002714) */
	greg->was_open = FALSE;
	/* we shouldn't have crit on any region unless we are in TP and in the final retry or we are in
	 * mupip_set_journal trying to switch journals across all regions. Currently, there is no fine-granular
	 * checking for mupip_set_journal, hence a coarse MUPIP_IMAGE check for image_type
	 */
	assert((dollar_tlevel && (CDB_STAGNATE <= t_tries)) || IS_MUPIP_IMAGE || (0 == have_crit(CRIT_HAVE_ANY_REG)));
	if (dollar_tlevel && (0 != have_crit(CRIT_HAVE_ANY_REG)))
	{	/* To avoid deadlocks with currently holding crits and the DLM lock request to be done in db_init(),
		 * we should insert this region in the tp_reg_list and tp_restart should do the gvcst_init after
		 * having released crit on all regions. Note that this check should be done AFTER checking if the
		 * region has already been opened (i.e. greg->was_open = TRUE logic above) since in that case we don't
		 * do any heavyweight processing (like db_init which involves crit/DLM locks) and so don't need to restart.
		 */
		insert_region(greg, &tp_reg_list, &tp_reg_free_list, SIZEOF(tp_region));
		t_retry(cdb_sc_needcrit);
		assert(FALSE);	/* we should never reach here since t_retry should have unwound the M-stack and restarted the TP */
	}
	csa = (sgmnt_addrs *)&FILE_INFO(greg)->s_addrs;
#ifdef	NOLICENSE
	licensed = TRUE;
#else
	CRYPT_CHKSYSTEM;
#endif
	db_init_region = greg;	/* initialized for dbinit_ch */
	csa->hdr = NULL;
	csa->nl = NULL;
	csa->jnl = NULL;
	csa->persistent_freeze = FALSE;	/* want secshr_db_clnup() to clear an incomplete freeze/unfreeze codepath */
	csa->regcnt = 1;	/* At this point, only one region points to this csa */
#	ifdef VMS
	csa->db_addrs[0] = csa->db_addrs[1] = NULL;
	csa->lock_addrs[0] = csa->lock_addrs[1] = NULL;
	greg_acc_meth = greg->dyn.addr->acc_meth;
	assert(dba_cm != greg_acc_meth);
	temp_cs_data = (sgmnt_data_ptr_t)cs_data_buff;
	fc = greg->dyn.addr->file_cntl;
	fc->file_type = greg_acc_meth;
	fc->op = FC_READ;
	fc->op_buff = (sm_uc_ptr_t)temp_cs_data;
	fc->op_len = SIZEOF(*temp_cs_data);
	fc->op_pos = 1;
	dbfilop(fc);
	DO_BADDBVER_CHK(greg, temp_cs_data);
	DO_DB_HDR_CHECK(greg, temp_cs_data);	/* Basic sanity check on the file header fields */
	if (greg_acc_meth != temp_cs_data->acc_meth)
	{
		greg_acc_meth = temp_cs_data->acc_meth;
		greg->dyn.addr->acc_meth = greg_acc_meth;
	}
#	endif
	/* Here's the shared memory layout:
	 *
	 * low address
	 *
	 * both
	 *	segment_data
	 *	(file_header)
	 *	MM_BLOCK
	 *	(master_map)
	 *	TH_BLOCK
	 * BG
	 *	bt_header
	 *	(bt_buckets * bt_rec)
	 *	th_base (SIZEOF(que_ent) into an odd bt_rec)
	 *	bt_base
	 *	(n_bts * bt_rec)
	 *	LOCK_BLOCK (lock_space)
	 *	(lock_space_size)
	 *	cs_addrs->acc_meth.bg.cache_state
	 *	(cache_que_heads)
	 *	(bt_buckets * cache_rec)
	 *	(n_bts * cache_rec)
	 *	critical
	 *	(mutex_struct)
	 *	nl
	 *	(node_local)
	 *	[jnl_name
	 *	jnl_buffer]
	 * MM
	 *	file contents
	 *	LOCK_BLOCK (lock_space)
	 *	(lock_space_size)
	 *	cs_addrs->acc_meth.mm.mmblk_state
	 *	(mmblk_que_heads)
	 *	(bt_buckets * mmblk_rec)
	 *	(n_bts * mmblk_rec)
	 *	critical
	 *	(mutex_struct)
	 *	nl
	 *	(node_local)
	 *	[jnl_name
	 *	jnl_buffer]
	 * high address
	 */
	/* Ensure first 3 members (upto now_running) of node_local are at the same offset for any version.
	 *
	 * Structure ----> node_local <---- size 59392 [0xe800]
	 *
	 *	offset = 0000 [0x0000]      size = 0012 [0x000c]    ----> node_local.label
	 *	offset = 0012 [0x000c]      size = 0256 [0x0100]    ----> node_local.fname
	 *	offset = 0268 [0x010c]      size = 0036 [0x0024]    ----> node_local.now_running
	 *
	 * This is so that the VERMISMATCH error can be successfully detected in db_init/mu_rndwn_file
	 * and so that the db-file-name can be successfully obtained from orphaned shm by mu_rndwn_all.
	 */
	assert(0 == OFFSETOF(node_local, label[0]));
	assert(12 == SIZEOF(((node_local *)NULL)->label));
	assert(12 == GDS_LABEL_SZ);
	assert(12 == OFFSETOF(node_local, fname[0]));
	assert(256 == SIZEOF(((node_local *)NULL)->fname));
	assert(256 == (MAX_FN_LEN + 1));
	assert(268 == OFFSETOF(node_local, now_running[0]));
	assert(36 == SIZEOF(((node_local *)NULL)->now_running));
	assert(36 == MAX_REL_NAME);
#	ifdef UNIX
	START_HEARTBEAT_IF_NEEDED;
	if (!pool_init && jnlpool_init_needed && ANTICIPATORY_FREEZE_AVAILABLE && REPL_INST_AVAILABLE)
	{
		jnlpool_init(GTMRELAXED, (boolean_t)FALSE, (boolean_t *)NULL);
		/* Any LSEEKWRITEs hence forth will wait if the instance is frozen. To aid in printing the region information before
		 * and after the wait, csa->region is referenced. Since it is NULL at this point, set it to greg. This is a safe
		 * thing to do since csa->region is anyways set in db_common_init (few lines below).
		 */
		csa->region = greg;
	}
#	endif
	/* Protect the db_init and the code below until we set greg->open to TRUE. This is needed as otherwise,
	 * if a MUPIP STOP is issued to this process at a time-window when db_init is completed but greg->open
	 * is NOT set to TRUE, will cause gds_rundown NOT to clean up the shared memory created by db_init and
	 * thus would be left over in the system.
	 */
	DEFER_INTERRUPTS(INTRPT_IN_GVCST_INIT);
	VMS_ONLY(db_init(greg, temp_cs_data);)
	UNIX_ONLY(db_init(greg);)
	/* At this point, we have initialized the database, but haven't yet set reg->open to TRUE. If any rts_errors happen in
	 * the meantime, there are no condition handlers established to handle the rts_error. More importantly, it is non-trivial
	 * to add logic to such a condition handler to undo the effects of db_init. Also, in some cases, the rts_error can
	 * confuse future calls of db_init. By invoking DBG_MARK_RTS_ERROR_UNUSABLE, we can catch any rts_errors in future and
	 * eliminate it on a case by case basis.
	 */
	UNIX_ONLY(DBG_MARK_RTS_ERROR_UNUSABLE);
	crash_count = csa->critical->crashcnt;
	csa->regnum = ++region_open_count;
	csd = csa->hdr;
#	ifdef GTM_TRIGGER
	/* Take copy of db trigger cycle into csa at db startup. Any concurrent changes to the
	 * db trigger cycle (by MUPIP TRIGGER) will be detected at tcommit (t_end/tp_tend) time.
	 */
	csa->db_trigger_cycle = csd->db_trigger_cycle;
#	endif
	/* set csd and fill in selected fields */
	assert(greg->dyn.addr->acc_meth == csd->acc_meth);	/* db_init should have made sure this assert holds good */
	greg_acc_meth = csd->acc_meth;
	switch (greg_acc_meth)
	{
		case dba_mm:
			csa->acc_meth.mm.base_addr = (sm_uc_ptr_t)((sm_ulong_t)csd + (int)(csd->start_vbn - 1) * DISK_BLOCK_SIZE);
			break;
		case dba_bg:
			db_csh_ini(csa);
			break;
		default:
			GTMASSERT;
	}
	/* It is necessary that we do the pending gv_target list reallocation BEFORE db_common_init as the latter resets
	 * greg->max_key_size to be equal to the csd->max_key_size and hence process_gvt_pending_list might wrongly conclude
	 * that NO reallocation (since it checks greg->max_key_size with csd->max_key_size) is needed when in fact a
	 * reallocation might be necessary (if the user changed max_key_size AFTER database creation)
	 */
	PROCESS_GVT_PENDING_LIST(greg, csa, gvt_pending_list);
	db_common_init(greg, csa, csd);	/* do initialization common to db_init() and mu_rndwn_file() */
	/* If we are not fully upgraded, see if we need to send a warning to the operator console about
	 * performance. Compatibility mode is a known performance drain. Actually, we can send one of two
	 * messages. If the desired_db_format is for an earlier release than the current release, we send
	 * a performance warning that this mode degrades performance. However, if the desired_db_format is
	 * for the current version but there are blocks to convert still, we send a gentle reminder that
	 * running mupip reorg upgrade would be a good idea to get the full performance benefit of V5.
	 */
	time(&curr_time);
	assert(MAXUINT4 > curr_time);
	curr_time_uint4 = (uint4)curr_time;
	next_warn_uint4 = csd->next_upgrd_warn.cas_time;
	if (!csd->fully_upgraded && curr_time_uint4 > next_warn_uint4
		&& COMPSWAP_LOCK(&csd->next_upgrd_warn.time_latch, next_warn_uint4, 0, (curr_time_uint4 + UPGRD_WARN_INTERVAL), 0))
	{	/* The msg is due and we have successfully updated the next time interval */
		if (GDSVCURR != csd->desired_db_format)
			send_msg(VARLSTCNT(4) ERR_DBVERPERFWARN1, 2, DB_LEN_STR(greg));
		else
			send_msg(VARLSTCNT(4) ERR_DBVERPERFWARN2, 2, DB_LEN_STR(greg));
	}
	/* Compute the maximum journal space requirements for a PBLK (including possible ALIGN record).
	 * Use this variable in the TOTAL_TPJNL_REC_SIZE and TOTAL_NONTP_JNL_REC_SIZE macros instead of recomputing.
	 */
	csa->pblk_align_jrecsize = (int4)MIN_PBLK_RECLEN + csd->blk_size + (int4)MIN_ALIGN_RECLEN;
	segment_update_array_size = UA_SIZE(csd);
	if (first_ua == NULL)
	{	/* first open of first database - establish an update array system */
		assert(update_array == NULL);
		assert(update_array_ptr == NULL);
		assert(update_array_size == 0);
		tmp_ua = (ua_list *)malloc(SIZEOF(ua_list));
		memset(tmp_ua, 0, SIZEOF(ua_list));	/* initialize tmp_ua->update_array and tmp_ua->next_ua to NULL */
		tmp_ua->update_array = (char *)malloc(segment_update_array_size);
		tmp_ua->update_array_size = segment_update_array_size;
		/* assign global variables only after malloc() succeeds */
		update_array_size = cumul_update_array_size = segment_update_array_size;
		update_array = update_array_ptr = tmp_ua->update_array;
		first_ua = curr_ua = tmp_ua;
	} else
	{	/* there's already an update_array system in place */
		assert(update_array != NULL);
		assert(update_array_size != 0);
		if (!dollar_tlevel && segment_update_array_size > first_ua->update_array_size)
		{	/* no transaction in progress and the current array is too small - replace it */
			assert(first_ua->update_array == update_array);
			assert(first_ua->update_array_size == update_array_size);
			assert(first_ua->next_ua == NULL);
			tmp_ua = first_ua;
			first_ua = curr_ua = NULL;
			free(update_array);
			tmp_ua->update_array = update_array = update_array_ptr = NULL;
			tmp_ua->update_array = (char *)malloc(segment_update_array_size);
			tmp_ua->update_array_size = segment_update_array_size;
			/* assign global variables only after malloc() succeeds */
			update_array_size = cumul_update_array_size = segment_update_array_size;
			update_array = update_array_ptr = tmp_ua->update_array;
			first_ua = curr_ua = tmp_ua;
		}
	}
	assert(global_tlvl_info_list || !csa->sgm_info_ptr);
	if (JNL_ALLOWED(csa))
	{
		if (NULL == non_tp_jfb_ptr)
		{
			non_tp_jfb_ptr = (jnl_format_buffer *)malloc(SIZEOF(jnl_format_buffer));
			non_tp_jfb_buff_ptr = (unsigned char *)malloc(MAX_JNL_REC_SIZE);
			non_tp_jfb_ptr->buff = (char *)non_tp_jfb_buff_ptr;
			/* If the journal records need to be encrypted in the journal file and if replication is in use,
			 * we will need access to both the encrypted (for the journal file) and unencrypted (for the
			 * journal pool) journal record contents. Since this code is executed only once (for the first
			 * journaled database opened) by this process, we will have to allocate an alternate buffer
			 * for this purpose (to hold the unencrypted data) as long as this GT.M version supports encryption.
			 */
			GTMCRYPT_ONLY(
				non_tp_jfb_ptr->alt_buff = (char *)malloc(MAX_JNL_REC_SIZE);
			)
			non_tp_jfb_ptr->record_size = 0;	/* initialize it to 0 since TOTAL_NONTPJNL_REC_SIZE macro uses it */
		}
		/* csa->min_total_tpjnl_rec_size represents the minimum journal buffer space needed for a TP transaction.
		 * It is a conservative estimate assuming that one ALIGN record and one PINI record will be written for
		 * one set of fixed size jnl records written.
		 * si->total_jnl_rec_size is initialized/reinitialized to this value here and in tp_clean_up().
		 * The purpose of this field is to avoid recomputation of the variable in tp_clean_up().
		 * In addition to this, space requirements for whatever journal records get formatted as part of
		 * jnl_format() need to be taken into account.
		 * This is done in jnl_format() where si->total_jnl_rec_size is appropriately incremented.
		 */
		csa->min_total_tpjnl_rec_size = PINI_RECLEN + TCOM_RECLEN + MIN_ALIGN_RECLEN;
		/* Similarly csa->min_total_nontpjnl_rec_size represents the minimum journal buffer space needed
		 * for a non-TP transaction.
		 * It is a conservative estimate assuming that one ALIGN record and one PINI record will be written for
		 * one set of fixed size jnl records written.
		 */
		csa->min_total_nontpjnl_rec_size = PINI_RECLEN + MIN_ALIGN_RECLEN;
	}
	if (tp_in_use || !IS_GTM_IMAGE)
		gvcst_tp_init(greg);	/* Initialize TP structures, else postpone till TP is used (only if GTM) */
	if (!global_tlvl_info_list)
	{
		global_tlvl_info_list = (buddy_list *)malloc(SIZEOF(buddy_list));
		initialize_list(global_tlvl_info_list, SIZEOF(global_tlvl_info), GBL_TLVL_INFO_LIST_INIT_ALLOC);
	}
	assert(!greg->was_open);
	SET_REGION_OPEN_TRUE(greg, WAS_OPEN_FALSE);
	csa = (sgmnt_addrs*)&FILE_INFO(greg)->s_addrs;
	if (NULL != csa->dir_tree)
	{	/* It is possible that dir_tree has already been targ_alloc'ed. This is because GT.CM or VMS DAL
		 * calls can run down regions without the process halting out. We don't want to double malloc.
		 */
		csa->dir_tree->clue.end = 0;
	}
	SET_CSA_DIR_TREE(csa, greg->max_key_size, greg);
	/* Now that reg->open is set to TRUE and directory tree is initialized, go ahead and set rts_error back to being usable */
	UNIX_ONLY(DBG_MARK_RTS_ERROR_USABLE);
	/* gds_rundown if invoked from now on will take care of cleaning up the shared memory segment */
	ENABLE_INTERRUPTS(INTRPT_IN_GVCST_INIT);
	if (dba_bg == greg_acc_meth)
	{	/* Check if (a) this region has non-upgraded blocks and if so, (b) the reformat buffer exists and
		 * (c) if it is big enough to deal with this region. If the region does not have any non-upgraded
		 * block (blks_to_upgrd is 0) we will not allocate the buffer at this time. Note that this opens up
		 * a small window for errors. If this buffer is not allocated and someone turns on compatibility
		 * mode and before the process can discover this and allocate the buffer, it runs out of memory,
		 * errors out and finds it is responsible for running down the database, it could fail on a recursive
		 * memory error when it tries to allocate the block. This is (to me) an acceptable risk as it is
		 * very low and compares favorably to the cost of every process allocating a database block sized
		 * chunk of private storage that will be seldom if ever used (SE 3/2005).
		 */
		if (0 != csd->blks_to_upgrd && csd->blk_size > reformat_buffer_len)
		{	/* Buffer not big enough (or does not exist) .. get a new one releasing old if it exists */
			assert(0 == fast_lock_count);	/* this is mainline (non-interrupt) code */
			++fast_lock_count;		/* No interrupts across this use of reformat_buffer */
			/* reformat_buffer_in_use should always be incremented only AFTER incrementing fast_lock_count
			 * as it is the latter that prevents interrupts from using the reformat buffer. Similarly
			 * the decrement of fast_lock_count should be done AFTER decrementing reformat_buffer_in_use.
			 */
			assert(0 == reformat_buffer_in_use);
			DEBUG_ONLY(reformat_buffer_in_use++;)
			if (reformat_buffer)
				free(reformat_buffer);	/* Different blksized databases in use .. keep only largest one */
			reformat_buffer = malloc(csd->blk_size);
			reformat_buffer_len = csd->blk_size;
			DEBUG_ONLY(reformat_buffer_in_use--;)
			assert(0 == reformat_buffer_in_use);
			--fast_lock_count;
		}
	}
	if ((dba_bg == greg_acc_meth) || (dba_mm == greg_acc_meth))
	{	/* Determine fid_index of current region's file_id across sorted file_ids of all regions open until now.
		 * All regions which have a file_id lesser than that of current region will have no change to their fid_index
		 * All regions which have a file_id greater than that of current region will have their fid_index incremented by 1
		 * The fid_index determination algorithm below has an optimization in that if the current region's file_id is
		 * determined to be greater than that of a particular region, then all regions whose fid_index is lesser
		 * than that particular region's fid_index are guaranteed to have a lesser file_id than the current region
		 * so we do not compare those against the current region's file_id.
		 * Note that the sorting is done only on DB/MM regions. GT.CM/DDP regions should not be part of TP transactions,
		 * hence they will not be sorted.
		 */
		prevcsa = NULL;
		greg_fid = &(csa->nl->unique_id);
		for (regcsa = cs_addrs_list; NULL != regcsa; regcsa = regcsa->next_csa)
		{
			UNIX_ONLY(onln_rlbk_cycle_mismatch |= (regcsa->db_onln_rlbkd_cycle != regcsa->nl->db_onln_rlbkd_cycle));
			if ((NULL != prevcsa) && (regcsa->fid_index < prevcsa->fid_index))
				continue;
			reg_fid = &((regcsa)->nl->unique_id);
			VMS_ONLY(if (0 < memcmp(&(greg_fid->file_id), (char *)&(reg_fid->file_id), SIZEOF(gd_id))))
			UNIX_ONLY(if (0 < gdid_cmp(&(greg_fid->uid), &(reg_fid->uid))))
			{
				if ((NULL == prevcsa) || (regcsa->fid_index > prevcsa->fid_index))
					prevcsa = regcsa;
			} else
				regcsa->fid_index++;
		}
		if (NULL == prevcsa)
			csa->fid_index = 1;
		else
			csa->fid_index = prevcsa->fid_index + 1;
		UNIX_ONLY(
			if (onln_rlbk_cycle_mismatch)
			{
				csa->root_search_cycle--;
				csa->onln_rlbk_cycle--;
				csa->db_onln_rlbkd_cycle--;
			}
		)
		/* Add current csa into list of open csas */
		csa->next_csa = cs_addrs_list;
		cs_addrs_list = csa;
		/* Also update tp_reg_list fid_index's as insert_region relies on it */
		for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr)
			tr->file.fid_index = (&FILE_INFO(tr->reg)->s_addrs)->fid_index;
		DBG_CHECK_TP_REG_LIST_SORTING(tp_reg_list);
	}
#	ifdef UNIX
	if (pool_init && REPL_ALLOWED(csd) && jnlpool_init_needed)
	{
		/* Last parameter to VALIDATE_INITIALIZED_JNLPOOL is TRUE if the process does logical updates and FALSE otherwise.
		 * This parameter governs whether the macro can do SCNDDBNOUPD check or not. All the utilities that sets
		 * jnlpool_init_needed global variable don't do logical updates (REORG, EXTEND, etc.). But, for GT.M,
		 * jnlpool_init_needed is set to TRUE unconditionally. Even though GT.M can do logical updates, we pass FALSE
		 * unconditionally to the macro (indicating no logical updates). This is because, at this point, there is no way to
		 * tell if this process wants to open the database for read or write operation. If it is for a read operation, we
		 * don't want the below macro to issue SCNDDBNOUPD error. If it is for write operation, we will skip the
		 * SCNDDBNOUPD error message here. But, eventually when this process goes to gvcst_{put,kill} or op_ztrigger,
		 * SCNDDBNOUPD is issued.
		 */
		VALIDATE_INITIALIZED_JNLPOOL(csa, csa->nl, greg, GTMRELAXED, SCNDDBNOUPD_CHECK_FALSE);
	}
#	endif
	return;
}