/**************************************************************** * * * Copyright 2001, 2012 Fidelity Information Services, Inc * * * * This source code contains the intellectual property * * of its copyright holder(s), and is made available * * under a license. If you do not know the terms of * * the license, please stop and do not read further. * * * ****************************************************************/ #include "mdef.h" #include #ifndef __MVS__ #include #endif #include #include #include #include #include #include "gtm_ipc.h" #include "gtm_socket.h" #include "gtm_fcntl.h" #include "gtm_unistd.h" #include "gtm_stdio.h" #include "gtm_string.h" #include "gtm_sem.h" #include "gtm_statvfs.h" #include "gt_timer.h" #include "gdsroot.h" #include "gtm_facility.h" #include "fileinfo.h" #include "gdsbt.h" #include "gdsfhead.h" #include "gdsblk.h" #include "gdscc.h" #include "min_max.h" #include "gdsblkops.h" #include "filestruct.h" #include "parse_file.h" #include "jnl.h" #include "interlock.h" #include "io.h" #include "iosp.h" #include "error.h" #include "mutex.h" #include "gtmio.h" #include "mupipbckup.h" #include "gtmimagename.h" #include "mmseg.h" #include "gtmsecshr.h" #include "secshr_client.h" #include "ftok_sems.h" #include "repl_msg.h" #include "gtmsource.h" #include "anticipatory_freeze.h" /* Include prototypes */ #include "mlk_shr_init.h" #include "gtm_c_stack_trace.h" #include "eintr_wrappers.h" #include "eintr_wrapper_semop.h" #include "is_file_identical.h" #include "repl_instance.h" #include "heartbeat_timer.h" #include "util.h" #include "dbfilop.h" #include "gvcst_protos.h" #include "is_raw_dev.h" #include "gv_match.h" #include "do_semop.h" #include "gvcmy_open.h" #include "wcs_sleep.h" #include "do_shmat.h" #include "send_msg.h" #include "gtmmsg.h" #include "shmpool.h" #include "gtm_permissions.h" #include "wbox_test_init.h" #include "wcs_clean_dbsync.h" /* for setting wcs_clean_dbsync pointer */ #ifdef GTM_CRYPT #include "gtmcrypt.h" #endif 
#include "have_crit.h"
#ifdef __MVS__
#include "gtm_zos_io.h"
#endif
#include "db_snapshot.h"
#include "lockconst.h"	/* for LOCK_AVAILABLE */
#ifdef GTM_TRUNCATE
#include "recover_truncate.h"
#endif

#ifndef GTM_SNAPSHOT
# error "Snapshot facility not supported in this platform"
#endif

#define REQRUNDOWN_TEXT		"semid is invalid but shmid is valid or at least one of sem_ctime or shm_ctime are non-zero"
#define MAX_ACCESS_SEM_RETRIES	2

#define SS_INFO_INIT(CSA)												\
{															\
	shm_snapshot_ptr_t	ss_shm_ptr;										\
	node_local_ptr_t	lcl_cnl;										\
															\
	lcl_cnl = CSA->nl;												\
	lcl_cnl->ss_shmid = INVALID_SHMID;										\
	lcl_cnl->ss_shmcycle = 0;											\
	CLEAR_SNAPSHOTS_IN_PROG(lcl_cnl);										\
	lcl_cnl->num_snapshots_in_effect = 0;										\
	SET_LATCH_GLOBAL(&lcl_cnl->snapshot_crit_latch, LOCK_AVAILABLE);						\
	assert(1 == MAX_SNAPSHOTS); /* To ensure that we revisit this whenever multiple snapshots is implemented */	\
	ss_shm_ptr = (shm_snapshot_ptr_t)(SS_GETSTARTPTR(CSA));								\
	SS_DEFAULT_INIT_POOL(ss_shm_ptr);										\
}

#define GTM_ATTACH_CHECK_ERROR												\
{															\
	if (-1 == status_l)												\
	{														\
		rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),						\
			  ERR_TEXT, 2, LEN_AND_LIT("Error attaching to database shared memory"), errno);		\
	}														\
}

#define GTM_ATTACH_SHM													\
{															\
	status_l = (sm_long_t)(csa->db_addrs[0] = (sm_uc_ptr_t)do_shmat(udi->shmid, 0, SHM_RND));			\
	GTM_ATTACH_CHECK_ERROR;												\
	csa->nl = (node_local_ptr_t)csa->db_addrs[0];									\
}

#define GTM_ATTACH_SHM_AND_CHECK_VERS(VERMISMATCH, SHM_SETUP_OK)							\
{															\
	GTM_ATTACH_SHM;													\
	/* The following checks for GDS_LABEL_GENERIC and gtm_release_name ensure that the shared memory under		\
	 * consideration is valid. If shared memory is already initialized, do VERMISMATCH check BEFORE referencing	\
	 * any other fields in shared memory.										\
	 */														\
	VERMISMATCH = FALSE;												\
	SHM_SETUP_OK = FALSE;												\
	if (!MEMCMP_LIT(csa->nl->label, GDS_LABEL_GENERIC))								\
	{														\
		if (memcmp(csa->nl->now_running, gtm_release_name, gtm_release_name_len + 1))				\
		{	/* Copy csa->nl->now_running into a local variable before passing to rts_error() due to the	\
			 * following issue:										\
			 * In VMS, a call to rts_error() copies only the error message and its arguments (as pointers)	\
			 * and transfers control to the topmost condition handler which is dbinit_ch() in this case.	\
			 * dbinit_ch() does a PRN_ERROR only for SUCCESS/INFO (VERMISMATCH is neither of them) and in	\
			 * addition nullifies csa->nl as part of its condition handling. It then transfers control to	\
			 * the next level condition handler which does a PRN_ERROR but at that point in time, the	\
			 * parameter csa->nl->now_running is no longer accessible and hence no parameter substitution	\
			 * occurs (i.e. the error message gets displayed with plain !ADs).				\
			 * In UNIX, this is not an issue since the first call to rts_error() does the error message	\
			 * construction before handing control to the topmost condition handler. But it does not hurt	\
			 * to do the copy.										\
			 */												\
			assert(strlen(csa->nl->now_running) < SIZEOF(now_running));					\
			memcpy(now_running, csa->nl->now_running, SIZEOF(now_running));					\
			now_running[SIZEOF(now_running) - 1] = '\0'; /* protection against bad csa->nl->now_running values */	\
			VERMISMATCH = TRUE;										\
		} else													\
			SHM_SETUP_OK = TRUE;										\
	}														\
}

#define GTM_VERMISMATCH_ERROR												\
{															\
	if (!vermismatch_already_printed)										\
	{														\
		vermismatch_already_printed = TRUE;									\
		/* for DSE, change VERMISMATCH to be INFO (instead of the more appropriate WARNING)			\
		 * as we want the condition handler (dbinit_ch) to do a CONTINUE (which it does				\
		 * only for severity levels SUCCESS or INFO) and resume processing in gvcst_init.c			\
		 * instead of detaching from shared memory.								\
		 */													\
		rts_error(VARLSTCNT(8) MAKE_MSG_TYPE(ERR_VERMISMATCH, (!IS_DSE_IMAGE ? ERROR : INFO)), 6,		\
			  DB_LEN_STR(reg), gtm_release_name_len, gtm_release_name, LEN_AND_STR(now_running));		\
	}														\
}

#ifdef GTM_CRYPT
#define INIT_DB_ENCRYPTION_IF_NEEDED(DO_CRYPT_INIT, INIT_STATUS, REG, CSA, TSD)						\
{															\
	if (DO_CRYPT_INIT)												\
	{	/* Do database specific encryption initialization. For all utilities other than GT.M, defer the error	\
		 * until encryption invocation is actually necessary. This way, MUPIP/DSE can continue as long as the	\
		 * operation does not involve encryption (for instance MUPIP JOURNAL -EXTRACT -SHOW=HEADER). For GT.M,	\
		 * issue error right away										\
		 */													\
		if (0 == INIT_STATUS)											\
			INIT_DB_ENCRYPTION(REG->dyn.addr->fname, CSA, TSD, INIT_STATUS);				\
		if ((0 != INIT_STATUS) && IS_GTM_IMAGE)									\
			GC_RTS_ERROR(INIT_STATUS, REG->dyn.addr->fname);						\
		CSA->encrypt_init_status = INIT_STATUS; /* defer error reporting */					\
	}														\
}
#define INIT_PROC_ENCRYPTION_IF_NEEDED(DO_CRYPT_INIT, INIT_STATUS)							\
{															\
	if (DO_CRYPT_INIT)												\
		INIT_PROC_ENCRYPTION(INIT_STATUS);									\
}
#else
#define INIT_DB_ENCRYPTION_IF_NEEDED(IS_ENCRYPTED, INIT_STATUS, REG, CSA, TSD)
#define INIT_PROC_ENCRYPTION_IF_NEEDED(IS_ENCRYPTED, INIT_STATUS)
#endif

#define READ_DB_FILE_HEADER(REG, TSD)											\
{															\
	file_control	*fc;												\
															\
	fc = REG->dyn.addr->file_cntl;											\
	fc->file_type = REG->dyn.addr->acc_meth;									\
	fc->op = FC_READ;												\
	fc->op_buff = (sm_uc_ptr_t)TSD;											\
	fc->op_pos = 1;													\
	fc->op_len = SIZEOF(sgmnt_data);										\
	dbfilop(fc);													\
}

#define READ_DB_FILE_MASTERMAP(REG, CSD)										\
{															\
	file_control	*fc;												\
															\
	assert(dba_bg == CSD->acc_meth);										\
	fc = REG->dyn.addr->file_cntl;											\
	fc->file_type = dba_bg;												\
	fc->op = FC_READ;												\
	fc->op_buff = MM_ADDR(CSD);											\
	fc->op_len = MASTER_MAP_SIZE(CSD);										\
	fc->op_pos = MM_BLOCK;												\
	dbfilop(fc);													\
}

/* Depending on whether journaling and/or replication was enabled at the time of the crash,
 * print REQRUNDOWN, REQRECOV, or REQROLLBACK error message.
 */
#define PRINT_CRASH_MESSAGE(CNT, ARG, ...)										\
{															\
	if (JNL_ENABLED(tsd))												\
	{														\
		if (REPL_ENABLED(tsd) && tsd->jnl_before_image)								\
			rts_error(VARLSTCNT(10 + CNT) ERR_REQROLLBACK, 4, DB_LEN_STR(reg),				\
				  LEN_AND_STR((ARG)->machine_name), __VA_ARGS__);					\
		else													\
			rts_error(VARLSTCNT(10 + CNT) ERR_REQRECOV, 4, DB_LEN_STR(reg),					\
				  LEN_AND_STR((ARG)->machine_name), __VA_ARGS__);					\
	} else														\
		rts_error(VARLSTCNT(10 + CNT) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg),					\
			  LEN_AND_STR((ARG)->machine_name), __VA_ARGS__);						\
}

GBLREF	boolean_t		gtm_fullblockwrites;	/* Do full (not partial) database block writes T/F */
GBLREF	boolean_t		is_src_server;
GBLREF	boolean_t		mupip_jnl_recover;
GBLREF	gd_region		*gv_cur_region;
GBLREF	ipcs_mesg		db_ipcs;
GBLREF	jnlpool_addrs		jnlpool;
GBLREF	node_local_ptr_t	locknl;
GBLREF	uint4			heartbeat_counter;
GBLREF	uint4			mutex_per_process_init_pid;
GBLREF	uint4			process_id;
GBLREF	void			(*wcs_clean_dbsync_fptr)();
GBLREF	jnl_gbls_t		jgbl;
GTMCRYPT_ONLY(
GBLREF	gtmcrypt_key_t		mu_int_encrypt_key_handle;
)
#ifndef MUTEX_MSEM_WAKE
GBLREF	int			mutex_sock_fd;
#endif

LITREF	char			gtm_release_name[];
LITREF	int4			gtm_release_name_len;

OS_PAGE_SIZE_DECLARE

error_def(ERR_BADDBVER);
ZOS_ONLY(error_def(ERR_BADTAG);)
error_def(ERR_CLSTCONFLICT);
error_def(ERR_CRITSEMFAIL);
error_def(ERR_DBCREINCOMP);
error_def(ERR_DBFILERR);
error_def(ERR_DBFLCORRP);
error_def(ERR_DBIDMISMATCH);
error_def(ERR_DBNAMEMISMATCH);
error_def(ERR_DBNOTGDS);
error_def(ERR_DBSHMNAMEDIFF);
error_def(ERR_JNLBUFFREGUPD);
error_def(ERR_NLMISMATCHCALC);
error_def(ERR_MMNODYNUPGRD);
error_def(ERR_PERMGENFAIL);
error_def(ERR_REQROLLBACK);
error_def(ERR_REQRECOV);
error_def(ERR_REQRUNDOWN);
error_def(ERR_SYSCALL);
error_def(ERR_TEXT);
error_def(ERR_VERMISMATCH);

/* Release the file_cntl structure (and the file_info hanging off it) of "seg" and clear the pointer. */
static void dbfilopn_free_file_cntl(gd_segment *seg)
{
	free(seg->file_cntl->file_info);
	free(seg->file_cntl);
	seg->file_cntl = NULL;
}

/* Open the database file backing region "reg".
 *
 * The segment's file name is run through parse_file() (with DEF_DBEXT, or DEF_NODBEXT for a raw device, as the
 * default) and the parsed name is stored back into seg->fname/fname_len. The file is then opened read-write, falling
 * back to read-only if that fails; reg->read_only and udi->s_addrs.read_write are set accordingly.
 *
 * Returns:
 *	(gd_region *)-1L	if the name carries a node specification (gvcmy_open() is invoked for the region)
 *	the matching region	if gv_match() reports a match; the descriptor just opened here is closed and this
 *				region's file_cntl is released
 *	reg			otherwise
 *
 * A parse, open or stat failure is reported through rts_error(ERR_DBFILERR).
 */
gd_region *dbfilopn (gd_region *reg)
{
	unix_db_info	*udi;
	parse_blk	pblk;
	mstr		file;
	char		*fnptr, fbuff[MAX_FBUFF + 1];
	struct stat	buf;
	gd_region	*prev_reg;
	gd_segment	*seg;
	int		status;
	bool		raw;
	int		stat_res, rc, save_errno;
	ZOS_ONLY(int	realfiletag;)

	seg = reg->dyn.addr;
	assert(seg->acc_meth == dba_bg || seg->acc_meth == dba_mm);
	FILE_CNTL_INIT_IF_NULL(seg);
	file.addr = (char *)seg->fname;
	file.len = seg->fname_len;
	memset(&pblk, 0, SIZEOF(pblk));
	pblk.buffer = fbuff;
	pblk.buff_size = MAX_FBUFF;
	pblk.fop = (F_SYNTAXO | F_PARNODE);
	/* is_raw_dev() needs a null-terminated name, so work on a local copy */
	memcpy(fbuff, file.addr, file.len);
	*(fbuff + file.len) = '\0';
	if (is_raw_dev(fbuff))
	{
		raw = TRUE;
		pblk.def1_buf = DEF_NODBEXT;
		pblk.def1_size = SIZEOF(DEF_NODBEXT) - 1;
	} else
	{
		raw = FALSE;
		pblk.def1_buf = DEF_DBEXT;
		pblk.def1_size = SIZEOF(DEF_DBEXT) - 1;
	}
	status = parse_file(&file, &pblk);
	if (!(status & 1))
	{
		if (!IS_GTCM_GNP_SERVER_IMAGE)
			dbfilopn_free_file_cntl(seg);
		rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), status);
	}
	assert(((int)pblk.b_esl + 1) <= SIZEOF(seg->fname));
	memcpy(seg->fname, pblk.buffer, pblk.b_esl);
	pblk.buffer[pblk.b_esl] = 0;
	seg->fname[pblk.b_esl] = 0;
	seg->fname_len = pblk.b_esl;
	if (pblk.fnb & F_HAS_NODE)
	{	/* Remote node specification given */
		assert(pblk.b_node && pblk.l_node[pblk.b_node - 1] == ':');
		gvcmy_open(reg, &pblk);
		return (gd_region *)-1L;
	}
	fnptr = (char *)seg->fname + pblk.b_node;
	udi = FILE_INFO(reg);
	udi->raw = raw;
	udi->fn = (char *)fnptr;
	OPENFILE(fnptr, O_RDWR, udi->fd);
	if (!udi->grabbed_access_sem)
	{	/* If the process already has standalone access, these fields are initialized in mu_rndwn_file */
		udi->ftok_semid = INVALID_SEMID;
		udi->semid = INVALID_SEMID;
		udi->shmid = INVALID_SHMID;
		udi->gt_sem_ctime = 0;
		udi->gt_shm_ctime = 0;
	}
	reg->read_only = FALSE;			/* maintain csa->read_write simultaneously */
	udi->s_addrs.read_write = TRUE;		/* maintain reg->read_only simultaneously */
	if (FD_INVALID == udi->fd)
	{	/* Read-write open failed; retry read-only before giving up */
		OPENFILE(fnptr, O_RDONLY, udi->fd);
		if (FD_INVALID == udi->fd)
		{
			save_errno = errno;	/* capture before free() has a chance to disturb errno */
			if (!IS_GTCM_GNP_SERVER_IMAGE)
				dbfilopn_free_file_cntl(seg);
			rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), save_errno);
		}
		reg->read_only = TRUE;			/* maintain csa->read_write simultaneously */
		udi->s_addrs.read_write = FALSE;	/* maintain reg->read_only simultaneously */
	}
#	ifdef __MVS__
	if (-1 == gtm_zos_tag_to_policy(udi->fd, TAG_BINARY, &realfiletag))
		TAG_POLICY_SEND_MSG(fnptr, errno, realfiletag, TAG_BINARY);
#	endif
	/* Stat the descriptor we just opened rather than the path name: the path could be renamed or replaced between
	 * the open and the stat, in which case the file id recorded below would describe a different file than the
	 * one this process actually has open.
	 */
	FSTAT_FILE(udi->fd, &buf, stat_res);
	if (-1 == stat_res)
	{
		save_errno = errno;
		rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), save_errno);
	}
	set_gdid_from_stat(&udi->fileid, &buf);
	prev_reg = gv_match(reg);
	if (NULL != prev_reg)
	{	/* gv_match() found a region for this file; drop what was set up here and hand back that region */
		CLOSEFILE_RESET(udi->fd, rc);	/* resets "udi->fd" to FD_INVALID */
		dbfilopn_free_file_cntl(seg);
		return prev_reg;
	}
	return reg;
}

/* Compute the size of the shared memory section for region "reg" from the file header "csd" and store it in
 * "*sec_size". The size is the sum of the sections appropriate to the region's access method, rounded up to
 * OS_PAGE_SIZE. Any access method other than dba_mm or dba_bg triggers GTMASSERT.
 */
void dbsecspc(gd_region *reg, sgmnt_data_ptr_t csd, gtm_uint64_t *sec_size)
{
	/* Ensure that all the various sections that the shared memory contains are actually
	 * aligned at the OS_PAGE_SIZE boundary
	 */
	assert(0 == NODE_LOCAL_SPACE % OS_PAGE_SIZE);
	assert(0 == LOCK_SPACE_SIZE(csd) % OS_PAGE_SIZE);
	assert(0 == JNL_SHARE_SIZE(csd) % OS_PAGE_SIZE);
	assert(0 == SHMPOOL_SECTION_SIZE % OS_PAGE_SIZE);
	switch(reg->dyn.addr->acc_meth)
	{
		case dba_mm:
			assert(0 == MMBLK_CONTROL_SIZE(csd) % OS_PAGE_SIZE);
			*sec_size = ROUND_UP(NODE_LOCAL_SPACE + LOCK_SPACE_SIZE(csd) + MMBLK_CONTROL_SIZE(csd)
					     + JNL_SHARE_SIZE(csd) + SHMPOOL_SECTION_SIZE, OS_PAGE_SIZE);
			break;
		case dba_bg:
			assert(0 == CACHE_CONTROL_SIZE(csd) % OS_PAGE_SIZE);
			*sec_size = ROUND_UP(NODE_LOCAL_SPACE + (LOCK_BLOCK(csd) * DISK_BLOCK_SIZE) + LOCK_SPACE_SIZE(csd)
					     + CACHE_CONTROL_SIZE(csd) + JNL_SHARE_SIZE(csd) + SHMPOOL_SECTION_SIZE,
					     OS_PAGE_SIZE);
			break;
		default:
			GTMASSERT;
	}
	return;
}

void db_init(gd_region *reg)
{
	boolean_t		is_bg, read_only, sem_created = FALSE, need_stacktrace, have_standalone_access;
	boolean_t		shm_setup_ok = FALSE, vermismatch = FALSE, vermismatch_already_printed = FALSE;
	boolean_t		new_shm_ipc, do_crypt_init = FALSE, replinst_mismatch;
	char			machine_name[MAX_MCNAMELEN];
	int			gethostname_res, stat_res, mm_prot, group_id, perm, save_udi_semid;
	int4			status, semval,
dblksize, fbwsize, save_errno, wait_time, loopcnt, sem_pid; sm_long_t status_l; sgmnt_addrs *csa; sgmnt_data tsdbuff; sgmnt_data_ptr_t csd, tsd; struct sembuf sop[3]; struct stat stat_buf; union semun semarg; struct semid_ds semstat; struct shmid_ds shmstat; struct statvfs dbvfs; uint4 sopcnt, start_hrtbt_cntr; unix_db_info *udi; char now_running[MAX_REL_NAME]; int init_status; gtm_uint64_t sec_size; semwait_status_t retstat; struct perm_diag_data pdd; boolean_t bypassed_ftok = FALSE, bypassed_access = FALSE; int jnl_buffer_size; char s[JNLBUFFUPDAPNDX_SIZE]; /* JNLBUFFUPDAPNDX_SIZE is defined in jnl.h */ DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; ESTABLISH(dbinit_ch); assert(INTRPT_IN_GVCST_INIT == intrpt_ok_state); /* we better be called from gvcst_init */ wcs_clean_dbsync_fptr = &wcs_clean_dbsync; tsd = &tsdbuff; read_only = reg->read_only; TREF(new_dbinit_ipc) = 0; /* we did not create a new ipc resource */ udi = FILE_INFO(reg); memset(machine_name, 0, SIZEOF(machine_name)); if (GETHOSTNAME(machine_name, MAX_MCNAMELEN, gethostname_res)) rts_error(VARLSTCNT(5) ERR_TEXT, 2, LEN_AND_LIT("Unable to get the hostname"), errno); assert(strlen(machine_name) < MAX_MCNAMELEN); csa = &udi->s_addrs; csa->db_addrs[0] = csa->db_addrs[1] = csa->lock_addrs[0] = NULL; /* to help in dbinit_ch and gds_rundown */ reg->opening = TRUE; assert(0 <= udi->fd); /* database file must have been already opened by dbfilopn() done from gvcst_init() */ FSTAT_FILE(udi->fd, &stat_buf, stat_res); /* get the stats for the database file */ if (-1 == stat_res) rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), errno); /* Setup new group and permissions if indicated by the security rules. 
*/ if (gtm_set_group_and_perm(&stat_buf, &group_id, &perm, PERM_IPC, &pdd) < 0) { send_msg(VARLSTCNT(6 + PERMGENDIAG_ARG_COUNT) ERR_PERMGENFAIL, 4, RTS_ERROR_STRING("ipc resources"), RTS_ERROR_STRING(udi->fn), PERMGENDIAG_ARGS(pdd)); rts_error(VARLSTCNT(6 + PERMGENDIAG_ARG_COUNT) ERR_PERMGENFAIL, 4, RTS_ERROR_STRING("ipc resources"), RTS_ERROR_STRING(udi->fn), PERMGENDIAG_ARGS(pdd)); } /* if the process has standalone access, it will have udi->grabbed_access_sem set to TRUE at this point. Note that down * in a local variable as the udi->grabbed_access_sem will be set to TRUE even for non-standalone access below and hence * we can't rely on that later to determine if the process had standalone access or not when it entered this function. */ have_standalone_access = udi->grabbed_access_sem; if (!have_standalone_access) { do_crypt_init = (reg->dyn.addr->is_encrypted && !IS_LKE_IMAGE); INIT_PROC_ENCRYPTION_IF_NEEDED(do_crypt_init, init_status); /* heavy-weight so needs to be done before ftok */ start_hrtbt_cntr = heartbeat_counter; if (!ftok_sem_get2(reg, start_hrtbt_cntr, &retstat, &bypassed_ftok)) ISSUE_SEMWAIT_ERROR((&retstat), reg, udi, "ftok"); if (bypassed_ftok) send_msg(VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("FTOK bypassed at database initialization")); /* At this point we have ftok_semid semaphore based on ftok key. Any ftok conflicted region will block at this * point. For example, if a.dat and b.dat both have same ftok and process A tries to open or close a.dat and * process B tries to open or close b.dat, even though the database accesses don't conflict, the first one to * control the ftok semaphore blocks (makes wait) the other(s). 
*/ READ_DB_FILE_HEADER(reg, tsd); /* file already opened by dbfilopn() done from gvcst_init() */ DO_BADDBVER_CHK(reg, tsd); /* need to do BADDBVER check before de-referencing shmid and semid from file header * as they could be at different offsets if the database is V4-format */ if (reg->dyn.addr->is_encrypted != tsd->is_encrypted) { /* Encryption setting different between global directory and database file header */ reg->dyn.addr->is_encrypted = tsd->is_encrypted; /* override with the value in file header */ do_crypt_init = (tsd->is_encrypted && !IS_LKE_IMAGE); if (do_crypt_init) { /* Encryption is turned on in the file header. Need to do encryption initialization. Release ftok * as initialization is heavy-weight. */ if (!ftok_sem_release(reg, TRUE, FALSE)) /* decrement counter so later increment is correct */ rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); INIT_PROC_ENCRYPTION_IF_NEEDED(do_crypt_init, init_status); /* redo initialization */ start_hrtbt_cntr = heartbeat_counter; /* update to reflect time lost in encryption initialization */ if (!ftok_sem_get2(reg, start_hrtbt_cntr, &retstat, &bypassed_ftok)) ISSUE_SEMWAIT_ERROR((&retstat), reg, udi, "ftok"); if (bypassed_ftok) send_msg(VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("bypassed at database encryption initialization")); } /* else encryption is turned off in the file header. Continue as-is. Any encryption initialization done * before is discarded */ } INIT_DB_ENCRYPTION_IF_NEEDED(do_crypt_init, init_status, reg, csa, tsd); # ifdef DEBUG if (gtm_white_box_test_case_enabled && (WBTEST_HOLD_ONTO_FTOKSEM_IN_DBINIT == gtm_white_box_test_case_number)) { DBGFPF((stderr, "Holding the ftok semaphore.. Sleeping for 30 seconds\n")); LONG_SLEEP(30); DBGFPF((stderr, "30 second sleep exhausted.. 
continuing with rest of db_init..\n")); } # endif for (loopcnt = 0; MAX_ACCESS_SEM_RETRIES > loopcnt; loopcnt++) { CSD2UDI(tsd, udi); /* sets udi->semid/shmid/sem_ctime/shm_ctime from file header */ TREF(new_dbinit_ipc) = 0; sem_created = FALSE; if (INVALID_SEMID == udi->semid) { /* access control semaphore does not exist. Create one */ if (0 != udi->gt_sem_ctime || INVALID_SHMID != udi->shmid || 0 != udi->gt_shm_ctime) { /* We must have somthing wrong in protocol or, code, if this happens. */ assert(FALSE); PRINT_CRASH_MESSAGE(0, tsd, ERR_TEXT, 2, LEN_AND_STR(REQRUNDOWN_TEXT)); } /* Create new semaphore using IPC_PRIVATE. System guarantees a unique id. */ if (-1 == (udi->semid = semget(IPC_PRIVATE, FTOK_SEM_PER_ID, RWDALL | IPC_CREAT))) { udi->semid = INVALID_SEMID; rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database control semget"), errno); } udi->shmid = INVALID_SHMID; /* reset shmid so dbinit_ch does not get confused in case we go there */ TREF(new_dbinit_ipc) |= (NEW_DBINIT_SEM_IPC_MASK | NEW_DBINIT_SHM_IPC_MASK); sem_created = TRUE; /* change group and permissions */ semarg.buf = &semstat; if (-1 == semctl(udi->semid, FTOK_SEM_PER_ID - 1, IPC_STAT, semarg)) rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database control semctl IPC_STAT1"), errno); if ((-1 != group_id) && (group_id != semstat.sem_perm.gid)) semstat.sem_perm.gid = group_id; semstat.sem_perm.mode = perm; if (-1 == semctl(udi->semid, FTOK_SEM_PER_ID - 1, IPC_SET, semarg)) rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database control semctl IPC_SET"), errno); SET_GTM_ID_SEM(udi->semid, status); if (-1 == status) rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database control semctl SETVAL"), errno); /* WARNING: Because SETVAL changes sem_ctime, we must NOT do any SETVAL after this one; code here * and 
elsewhere uses IPC_STAT to get sem_ctime and relies on sem_ctime as the creation time of the * semaphore. */ semarg.buf = &semstat; if (-1 == semctl(udi->semid, FTOK_SEM_PER_ID - 1, IPC_STAT, semarg)) rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database control semctl IPC_STAT2"), errno); tsd->gt_sem_ctime.ctime = udi->gt_sem_ctime = semarg.buf->sem_ctime; } else { /* "semid" already exists. Need to lock it. Before that do sanity check on "semid" and "shmid" */ if (INVALID_SHMID != udi->shmid) { if (-1 == shmctl(udi->shmid, IPC_STAT, &shmstat)) { PRINT_CRASH_MESSAGE(1, tsd, ERR_TEXT, 2, LEN_AND_LIT("Error with database control shmctl"), errno); } else if (shmstat.shm_ctime != tsd->gt_shm_ctime.ctime) { GTM_ATTACH_SHM_AND_CHECK_VERS(vermismatch, shm_setup_ok); if (vermismatch) { GTM_VERMISMATCH_ERROR; } else { PRINT_CRASH_MESSAGE(0, tsd, ERR_TEXT, 2, LEN_AND_LIT("IPC creation time indicates a probable prior crash")); } } semarg.buf = &semstat; if (-1 == semctl(udi->semid, 0, IPC_STAT, semarg)) { /* file header has valid semid but semaphore does not exist */ PRINT_CRASH_MESSAGE(1, tsd, ERR_TEXT, 2, LEN_AND_LIT("Error with database control semaphore (IPC_STAT)"), errno); } else if (semarg.buf->sem_ctime != tsd->gt_sem_ctime.ctime) { GTM_ATTACH_SHM_AND_CHECK_VERS(vermismatch, shm_setup_ok); if (vermismatch) { GTM_VERMISMATCH_ERROR; } else { PRINT_CRASH_MESSAGE(0, tsd, ERR_TEXT, 2, LEN_AND_LIT("IPC creation time indicates a probable prior crash")); } } } else { /* else "shmid" is NOT valid. This is possible if - * (a) Another process is holding the access control semaphore for a longer duration of time * but does NOT have the shared memory setup (MUPIP INTEG -FILE or MUPIP RESTORE). * * (b) If a process (like in (a)) were kill -15ed or -9ed and hence did not get a chance to * do db_ipcs_reset which resets "semid"/"shmid" field in the file header to INVALID. * * In either case, try grabbing the semaphore. 
If not, wait (depending on the user specified * wait time). Eventually, we will either get hold of the semaphore OR will error out. */ TREF(new_dbinit_ipc) |= NEW_DBINIT_SHM_IPC_MASK; /* Need to create shared memory */ } } /* We already have ftok semaphore of this region, so all we need is the access control semaphore */ SET_GTM_SOP_ARRAY(sop, sopcnt, !read_only, (SEM_UNDO | IPC_NOWAIT)); SEMOP(udi->semid, sop, sopcnt, status, NO_WAIT); if (-1 != status) break; else { assert(!sem_created); /* if we created the semaphore, we should be able to do the semop */ save_errno = errno; if (EAGAIN == save_errno) { if (NO_SEMWAIT_ON_EAGAIN == TREF(dbinit_max_hrtbt_delta)) { sem_pid = semctl(udi->semid, 0, GETPID); if (-1 != sem_pid) { rts_error(VARLSTCNT(13) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_SEMWT2LONG, 7, process_id, 0, LEN_AND_LIT("access control"), DB_LEN_STR(reg), sem_pid); } else { save_errno = errno; if (!SEM_REMOVED(save_errno)) { rts_error(VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_LITERAL("semop()"), CALLFROM, save_errno); } /* else semaphore was removed. Fall-through */ } } else if (!do_blocking_semop(udi->semid, gtm_access_sem, start_hrtbt_cntr, &retstat, reg, &bypassed_access)) { if (!SEM_REMOVED(retstat.save_errno)) ISSUE_SEMWAIT_ERROR((&retstat), reg, udi, "access control"); save_errno = retstat.save_errno; } else { if (bypassed_access) send_msg(VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("Access control bypassed at init")); save_errno = status = SS_NORMAL; break; } } else if (!SEM_REMOVED(save_errno)) { rts_error(VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, \ RTS_ERROR_LITERAL("semop()"), CALLFROM, save_errno); } /* this is possible if a concurrent gds_rundown removed the access control semaphore (if * it was the last writer). Another possibility is if the user did an ipcrm which removed * the access control semaphore from the system. 
Instead of issuing an error right-away, * retry by reading the file header again. Note, it is not possible for another gds_rundown * removing the access control semaphore because any other process has to first get the * ftok lock at startup and since we hold it, they will wait for us to release the ftok. */ assert(SEM_REMOVED(save_errno)); if (1 == loopcnt) { rts_error(VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, \ RTS_ERROR_LITERAL("semop()"), CALLFROM, save_errno); } READ_DB_FILE_HEADER(reg, tsd); } } assert(-1 != status || bypassed_access); if (!bypassed_access) udi->grabbed_access_sem = TRUE; /* Now that we have the access control semaphore, re-read the file header so we have the uptodate information * in case some of the fields (like access method) were modified concurrently by MUPIP SET -FILE */ READ_DB_FILE_HEADER(reg, tsd); UDI2CSD(udi, tsd); /* Since we read the file header again, tsd->semid/shmid and corresponding ctime fields * will not be uptodate. Refresh it with the udi copies as they are the ones used above */ } else { /* for have_standalone_access we were already in "mu_rndwn_file" and got "semid" semaphore. Since mu_rndwn_file * would have gotten "ftok" semaphore before acquiring the access control semaphore, no need to get the "ftok" * semaphore as well. */ READ_DB_FILE_HEADER(reg, tsd); /* file already opened by dbfilopn() done from gvcst_init() */ do_crypt_init = (tsd->is_encrypted && !IS_LKE_IMAGE); INIT_PROC_ENCRYPTION_IF_NEEDED(do_crypt_init, init_status); INIT_DB_ENCRYPTION_IF_NEEDED(do_crypt_init, init_status, reg, csa, tsd); CSD2UDI(tsd, udi); /* Make sure "mu_rndwn_file" has created semaphore for standalone access */ if (INVALID_SEMID == udi->semid || 0 == udi->gt_sem_ctime) GTMASSERT; /* Make sure "mu_rndwn_file" has reset shared memory. In pro, just clear it and proceed. 
*/ assert((INVALID_SHMID == udi->shmid) && (0 == udi->gt_shm_ctime)); /* In pro, just clear it and proceed */ udi->shmid = INVALID_SHMID; /* reset shmid so dbinit_ch does not get confused in case we go there */ TREF(new_dbinit_ipc) |= (NEW_DBINIT_SEM_IPC_MASK | NEW_DBINIT_SHM_IPC_MASK); } assert(udi->grabbed_access_sem || bypassed_access); DO_DB_HDR_CHECK(reg, tsd); /* Basic sanity check on the file header fields */ # ifdef DEBUG if (gtm_white_box_test_case_enabled && (WBTEST_HOLD_ONTO_ACCSEM_IN_DBINIT == gtm_white_box_test_case_number)) { DBGFPF((stderr, "Holding the access control semaphore.. Sleeping for 30 seconds\n")); LONG_SLEEP(30); DBGFPF((stderr, "30 second sleep exhausted.. continuing with rest of db_init..\n")); } # endif /* Now that the access control lock is obtained and file header passed all sanity checks, update the acc_meth of the * region from the one in the file header (in case they are different). This way, any later code that relies on the * acc_meth dereferenced from the region will work correctly. Instead of checking if they are different, do the assignment * unconditionally */ reg->dyn.addr->acc_meth = tsd->acc_meth; new_shm_ipc = (TREF(new_dbinit_ipc) & NEW_DBINIT_SHM_IPC_MASK); if (new_shm_ipc) { /* Bypassers are not allowed to create shared memory so we don't end up with conflicting shared memories */ if (bypassed_ftok || bypassed_access) PRINT_CRASH_MESSAGE(0, tsd, ERR_TEXT, 2, LEN_AND_LIT("DSE/LKE database initialization attempt tried a startup short cut that " "failed due to a conflict with a database shutdown - please retry")); /* Since we are about to allocate new shared memory, if necessary, adjust the journal buffer size right now. * Note that if the process setting up shared memory is a read-only process, then we might not flush updated * jnl_buffer_size to the file header, which is fine because the value in shared memory is what all processes * are looking at. 
If necessary, the next process to initialize shared memory will repeat the process of * adjusting the jnl_buffer_size value. */ jnl_buffer_size = tsd->jnl_buffer_size; if ((0 != jnl_buffer_size) && (jnl_buffer_size < JNL_BUFFER_MIN)) { ROUND_UP_MIN_JNL_BUFF_SIZE(tsd->jnl_buffer_size, tsd); SNPRINTF(s, JNLBUFFUPDAPNDX_SIZE, JNLBUFFUPDAPNDX, JNL_BUFF_PORT_MIN(tsd), JNL_BUFFER_MAX); send_msg(VARLSTCNT(10) ERR_JNLBUFFREGUPD, 4, REG_LEN_STR(reg), jnl_buffer_size, tsd->jnl_buffer_size, ERR_TEXT, 2, LEN_AND_STR(s)); } dbsecspc(reg, tsd, &sec_size); /* Find db segment size */ /* Create new shared memory using IPC_PRIVATE. System guarantees a unique id */ GTM_WHITE_BOX_TEST(WBTEST_FAIL_ON_SHMGET, sec_size, GTM_UINT64_MAX); if (-1 == (status_l = udi->shmid = shmget(IPC_PRIVATE, sec_size, RWDALL | IPC_CREAT))) { udi->shmid = (int)INVALID_SHMID; status_l = INVALID_SHMID; rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database shmget"), errno); } tsd->shmid = udi->shmid; if (-1 == shmctl(udi->shmid, IPC_STAT, &shmstat)) rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database control shmctl IPC_STAT1"), errno); /* change group and permissions */ if ((-1 != group_id) && (group_id != shmstat.shm_perm.gid)) shmstat.shm_perm.gid = group_id; shmstat.shm_perm.mode = perm; if (-1 == shmctl(udi->shmid, IPC_SET, &shmstat)) rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database control shmctl IPC_SET"), errno); /* Warning: We must read the shm_ctime using IPC_STAT after IPC_SET, which changes it. * We must NOT do any more IPC_SET or SETVAL after this. Our design is to use * shm_ctime as creation time of shared memory and store it in file header. 
*/ if (-1 == shmctl(udi->shmid, IPC_STAT, &shmstat)) rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database control shmctl IPC_STAT2"), errno); tsd->gt_shm_ctime.ctime = udi->gt_shm_ctime = shmstat.shm_ctime; GTM_ATTACH_SHM; shm_setup_ok = TRUE; } else { GTM_ATTACH_SHM_AND_CHECK_VERS(vermismatch, shm_setup_ok); if (vermismatch) { GTM_VERMISMATCH_ERROR; } else if (!shm_setup_ok) { PRINT_CRASH_MESSAGE(0, tsd, ERR_TEXT, 2, LEN_AND_LIT("shared memory is invalid")); } } csa->critical = (mutex_struct_ptr_t)(csa->db_addrs[0] + NODE_LOCAL_SIZE); assert(((INTPTR_T)csa->critical & 0xf) == 0); /* critical should be 16-byte aligned */ # ifdef CACHELINE_SIZE assert(0 == ((INTPTR_T)csa->critical & (CACHELINE_SIZE - 1))); # endif /* Note: Here we check jnl_state from database file; its value cannot change without stand-alone access. * The jnl_buff should be initialized irrespective of read/write process */ JNL_INIT(csa, reg, tsd); csa->shmpool_buffer = (shmpool_buff_hdr_ptr_t)(csa->db_addrs[0] + NODE_LOCAL_SPACE + JNL_SHARE_SIZE(tsd)); /* Initialize memory for snapshot context */ \ csa->ss_ctx = malloc(SIZEOF(snapshot_context_t)); DEFAULT_INIT_SS_CTX((SS_CTX_CAST(csa->ss_ctx))); csa->lock_addrs[0] = (sm_uc_ptr_t)csa->shmpool_buffer + SHMPOOL_SECTION_SIZE; csa->lock_addrs[1] = csa->lock_addrs[0] + LOCK_SPACE_SIZE(tsd) - 1; csa->total_blks = tsd->trans_hist.total_blks; /* For test to see if file has extended */ if (new_shm_ipc) { memset(csa->nl, 0, SIZEOF(*csa->nl)); /* We allocated shared storage -- we have to init it */ csa->nl->sec_size = sec_size; /* Set the shared memory size */ if (JNL_ALLOWED(csa)) { /* initialize jb->cycle to a value different from initial value of jpc->cycle (0). although this is not * necessary right now, in the future, the plan is to change jnl_ensure_open() to only do a cycle mismatch * check in order to determine whether to call jnl_file_open() or not. this is in preparation for that. 
*/ csa->jnl->jnl_buff->cycle = 1; } } is_bg = (dba_bg == tsd->acc_meth); if (is_bg) csd = csa->hdr = (sgmnt_data_ptr_t)(csa->lock_addrs[1] + 1 + CACHE_CONTROL_SIZE(tsd)); else { csa->acc_meth.mm.mmblk_state = (mmblk_que_heads_ptr_t)(csa->lock_addrs[1] + 1); FSTAT_FILE(udi->fd, &stat_buf, stat_res); if (-1 == stat_res) rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), errno); mm_prot = read_only ? PROT_READ : (PROT_READ | PROT_WRITE); if (-1 == (sm_long_t)(csa->db_addrs[0] = (sm_uc_ptr_t)mmap((caddr_t)NULL, (size_t)stat_buf.st_size, mm_prot, GTM_MM_FLAGS, udi->fd, (off_t)0))) rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), errno); csa->db_addrs[1] = csa->db_addrs[0] + stat_buf.st_size - 1; csd = csa->hdr = (sgmnt_data_ptr_t)csa->db_addrs[0]; } /* At this point, shm_setup_ok is TRUE so we are guaranteed that vermismatch is FALSE. Therefore, we can safely * dereference csa->nl->glob_sec_init without worrying about whether or not it could be at a different offset than * the current version. The only exception is DSE which can continue even after the VERMISMATCH error and hence * can have shm_setup_ok set to FALSE at this point. 
*/ if (shm_setup_ok && !csa->nl->glob_sec_init && !(bypassed_ftok || bypassed_access)) { assert(new_shm_ipc); assert(!vermismatch); csa->dbinit_shm_created = TRUE; if (is_bg) { memcpy(csd, tsd, SIZEOF(sgmnt_data)); READ_DB_FILE_MASTERMAP(reg, csd); } if (csd->machine_name[0]) /* crash occurred */ { if (0 != STRNCMP_STR(csd->machine_name, machine_name, MAX_MCNAMELEN)) /* crashed on some other node */ rts_error(VARLSTCNT(6) ERR_CLSTCONFLICT, 4, DB_LEN_STR(reg), LEN_AND_STR(csd->machine_name)); else { PRINT_CRASH_MESSAGE(0, csd, ERR_TEXT, 2, LEN_AND_LIT("machine name in file header is non-null implying possible crash")); } } if (is_bg) { bt_malloc(csa); csa->nl->cache_off = -CACHE_CONTROL_SIZE(tsd); db_csh_ini(csa); } db_csh_ref(csa, TRUE); shmpool_buff_init(reg); SS_INFO_INIT(csa); STRNCPY_STR(csa->nl->machine_name, machine_name, MAX_MCNAMELEN); /* machine name */ assert(MAX_REL_NAME > gtm_release_name_len); memcpy(csa->nl->now_running, gtm_release_name, gtm_release_name_len + 1); /* GT.M release name */ memcpy(csa->nl->label, GDS_LABEL, GDS_LABEL_SZ - 1); /* GDS label */ memcpy(csa->nl->fname, reg->dyn.addr->fname, reg->dyn.addr->fname_len); /* database filename */ csa->nl->creation_date_time4 = csd->creation_time4; csa->nl->highest_lbm_blk_changed = -1; csa->nl->wcs_timers = -1; csa->nl->nbb = BACKUP_NOT_IN_PROGRESS; csa->nl->unique_id.uid = FILE_INFO(reg)->fileid; /* save what file we initialized this storage for */ /* save pointers in csa to access shared memory */ csa->nl->critical = (sm_off_t)((sm_uc_ptr_t)csa->critical - (sm_uc_ptr_t)csa->nl); if (JNL_ALLOWED(csa)) csa->nl->jnl_buff = (sm_off_t)((sm_uc_ptr_t)csa->jnl->jnl_buff - (sm_uc_ptr_t)csa->nl); csa->nl->shmpool_buffer = (sm_off_t)((sm_uc_ptr_t)csa->shmpool_buffer - (sm_uc_ptr_t)csa->nl); if (is_bg) /* Field is sm_off_t (4 bytes) so only in BG mode is this assurred to be 4 byte capable */ csa->nl->hdr = (sm_off_t)((sm_uc_ptr_t)csd - (sm_uc_ptr_t)csa->nl); csa->nl->lock_addrs = 
(sm_off_t)((sm_uc_ptr_t)csa->lock_addrs[0] - (sm_uc_ptr_t)csa->nl); if (!read_only || is_bg) { csd->trans_hist.early_tn = csd->trans_hist.curr_tn; csd->max_update_array_size = csd->max_non_bm_update_array_size = (int4)(ROUND_UP2(MAX_NON_BITMAP_UPDATE_ARRAY_SIZE(csd), UPDATE_ARRAY_ALIGN_SIZE)); csd->max_update_array_size += (int4)(ROUND_UP2(MAX_BITMAP_UPDATE_ARRAY_SIZE, UPDATE_ARRAY_ALIGN_SIZE)); /* add current db_csh counters into the cumulative counters and reset the current counters */ # define TAB_DB_CSH_ACCT_REC(COUNTER, DUMMY1, DUMMY2) \ csd->COUNTER.cumul_count += csd->COUNTER.curr_count; \ csd->COUNTER.curr_count = 0; # include "tab_db_csh_acct_rec.h" # undef TAB_DB_CSH_ACCT_REC } csa->nl->wc_blocked = FALSE; /* Since we are creating shared memory, reset wc_blocked to FALSE */ gvstats_rec_csd2cnl(csa); /* should be called before "db_auto_upgrade" */ reg->dyn.addr->ext_blk_count = csd->extension_size; mlk_shr_init(csa->lock_addrs[0], csd->lock_space_size, csa, (FALSE == read_only)); DEBUG_ONLY(locknl = csa->nl;) /* for DEBUG_ONLY LOCK_HIST macro */ gtm_mutex_init(reg, NUM_CRIT_ENTRY, FALSE); DEBUG_ONLY(locknl = NULL;) /* restore "locknl" to default value */ if (read_only) csa->nl->remove_shm = TRUE; /* gds_rundown can remove shmem if first process has read-only access */ db_auto_upgrade(reg); if (FALSE == csd->multi_site_open) { /* first time database is opened after upgrading to a GTM version that supports multi-site replication */ csd->zqgblmod_seqno = 0; csd->zqgblmod_tn = 0; if (csd->pre_multisite_resync_seqno > csd->reg_seqno) csd->pre_multisite_resync_seqno = csd->reg_seqno; csd->multi_site_open = TRUE; } csa->nl->glob_sec_init = TRUE; STAT_FILE((char *)csa->nl->fname, &stat_buf, stat_res); if (-1 == stat_res) { save_errno = errno; rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), save_errno); } set_gdid_from_stat(&csa->nl->unique_id.uid, &stat_buf); # ifdef RELEASE_LATCH_GLOBAL /* On HP-UX, it is possible that mucregini/cs_data is not aligned at 
the same address * boundary as csd would be in shared memory. This may lead to the initialization and * usage of different elements of hp_latch_space. This may lead to the latch being * "in-use" permanently. To resolve this, shm-initialer re-initializes the global latch * to the "available" state. * Although Solaris doesn't have the same issue of alignment, we'll cover the case of * a corrupt latch (say in case of abnormal process termination). */ RELEASE_LATCH_GLOBAL(&csd->next_upgrd_warn.time_latch); # endif GTM_TRUNCATE_ONLY(recover_truncate(csa, csd, reg);) csa->nl->jnlpool_shmid = INVALID_SHMID; } else { if (STRNCMP_STR(csa->nl->machine_name, machine_name, MAX_MCNAMELEN)) /* machine names do not match */ { if (csa->nl->machine_name[0]) rts_error(VARLSTCNT(6) ERR_CLSTCONFLICT, 4, DB_LEN_STR(reg), LEN_AND_STR(csa->nl->machine_name)); else { PRINT_CRASH_MESSAGE(0, csd, ERR_TEXT, 2, LEN_AND_LIT("machine name in shared memory is non-null implying possible crash")); } } /* Since nl is memset to 0 initially and then fname is copied over from gv_cur_region and since "fname" is * guaranteed to not exceed MAX_FN_LEN, we should have a terminating '\0' atleast at csa->nl->fname[MAX_FN_LEN] */ assert(csa->nl->fname[MAX_FN_LEN] == '\0'); /* Note: the first '\0' in csa->nl->fname can be much earlier */ /* Check whether csa->nl->fname exists. If not, then it is a serious condition. Error out. */ STAT_FILE((char *)csa->nl->fname, &stat_buf, stat_res); if (-1 == stat_res) { save_errno = errno; send_msg(VARLSTCNT(13) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg), LEN_AND_STR(csa->nl->machine_name), ERR_DBNAMEMISMATCH, 4, DB_LEN_STR(reg), udi->shmid, csa->nl->fname, save_errno); PRINT_CRASH_MESSAGE(3, csa->nl, ERR_DBNAMEMISMATCH, 4, DB_LEN_STR(reg), udi->shmid, csa->nl->fname, save_errno); } /* Check whether csa->nl->fname and csa->nl->unique_id.uid are in sync. If not error out. 
*/ if (FALSE == is_gdid_stat_identical(&csa->nl->unique_id.uid, &stat_buf)) { send_msg(VARLSTCNT(12) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg), LEN_AND_STR(csa->nl->machine_name), ERR_DBIDMISMATCH, 4, csa->nl->fname, DB_LEN_STR(reg), udi->shmid); PRINT_CRASH_MESSAGE(2, csa->nl, ERR_DBIDMISMATCH, 4, csa->nl->fname, DB_LEN_STR(reg), udi->shmid); } /* Previously, we used to check for csa->nl->creation_date_time4 vs csd->creation_time4 and treat it as * an id mismatch situation as well. But later it was determined that as long as the filename and the fileid * match between the database file header and the copy in shared memory, there is no more matching that needs * to be done. It is not possible for the user to create a situation where the filename/fileid matches but * the creation time does not. The only way for this to happen is shared memory corruption in which case we * have a much bigger problem to deal with -- 2011/03/30 --- nars. */ if (FALSE == is_gdid_gdid_identical(&FILE_INFO(reg)->fileid, &csa->nl->unique_id.uid)) { send_msg(VARLSTCNT(12) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg), LEN_AND_STR(csa->nl->machine_name), ERR_DBSHMNAMEDIFF, 4, DB_LEN_STR(reg), udi->shmid, csa->nl->fname); PRINT_CRASH_MESSAGE(2, csa->nl, ERR_DBSHMNAMEDIFF, 4, DB_LEN_STR(reg), udi->shmid, csa->nl->fname); } /* If a regular Recover/Rollback created the shared memory and died (because of a user error or runtime error), * any process that comes up after that should NOT touch the shared memory or database. The user should reissue * Rollback/Recover command that will fix the state of the shared memory and bring the database back to a consistent * state. Note that the reissue of a regular Rollback/Recover command will NOT hit this condition because it invokes * mu_rndwn_file (STANDALONE) that removes the shared memory. 
The only case in which mu_rndwn_file does NOT remove * shared memory is if it was invoked by an Online Rollback in which case the below check should be bypassed */ if (csa->nl->donotflush_dbjnl && !jgbl.onlnrlbk) { assert(FALSE); PRINT_CRASH_MESSAGE(0, csa->nl, ERR_TEXT, 2, LEN_AND_LIT("mupip recover/rollback created shared memory. Needs MUPIP RUNDOWN")); } /* verify pointers from our calculation vs. the copy in shared memory */ if (csa->nl->critical != (sm_off_t)((sm_uc_ptr_t)csa->critical - (sm_uc_ptr_t)csa->nl)) { PRINT_CRASH_MESSAGE(2, csa->nl, ERR_NLMISMATCHCALC, 4, LEN_AND_LIT("critical"), (uint4)((sm_uc_ptr_t)csa->critical - (sm_uc_ptr_t)csa->nl), (uint4)csa->nl->critical); } if ((JNL_ALLOWED(csa)) && (csa->nl->jnl_buff != (sm_off_t)((sm_uc_ptr_t)csa->jnl->jnl_buff - (sm_uc_ptr_t)csa->nl))) { PRINT_CRASH_MESSAGE(2, csa->nl, ERR_NLMISMATCHCALC, 4, LEN_AND_LIT("journal buffer"), (uint4)((sm_uc_ptr_t)csa->jnl->jnl_buff - (sm_uc_ptr_t)csa->nl), (uint4)csa->nl->jnl_buff); } if (csa->nl->shmpool_buffer != (sm_off_t)((sm_uc_ptr_t)csa->shmpool_buffer - (sm_uc_ptr_t)csa->nl)) { PRINT_CRASH_MESSAGE(2, csa->nl, ERR_NLMISMATCHCALC, 4, LEN_AND_LIT("backup buffer"), (uint4)((sm_uc_ptr_t)csa->shmpool_buffer - (sm_uc_ptr_t)csa->nl), (uint4)csa->nl->shmpool_buffer); } if ((is_bg) && (csa->nl->hdr != (sm_off_t)((sm_uc_ptr_t)csd - (sm_uc_ptr_t)csa->nl))) { PRINT_CRASH_MESSAGE(2, csa->nl, ERR_NLMISMATCHCALC, 4, LEN_AND_LIT("file header"), (uint4)((sm_uc_ptr_t)csd - (sm_uc_ptr_t)csa->nl), (uint4)csa->nl->hdr); } if (csa->nl->lock_addrs != (sm_off_t)((sm_uc_ptr_t)csa->lock_addrs[0] - (sm_uc_ptr_t)csa->nl)) { PRINT_CRASH_MESSAGE(2, csa->nl, ERR_NLMISMATCHCALC, 4, LEN_AND_LIT("lock address"), (uint4)((sm_uc_ptr_t)csa->lock_addrs[0] - (sm_uc_ptr_t)csa->nl), (uint4)csa->nl->lock_addrs); } csa->dbinit_shm_created = FALSE; } if (REPL_ALLOWED(csd) && is_src_server) { /* Bind this database to the journal pool shmid & instance file name that the source server started with. 
* Assert that jnlpool_init has already been done by the source server before it does db_init. */ assert(NULL != jnlpool.repl_inst_filehdr); /* Note: csa->nl->replinstfilename is changed under control of the init/rundown semaphore only. */ assert('\0' != jnlpool.jnlpool_ctl->jnlpool_id.instfilename[0]); replinst_mismatch = FALSE; if ('\0' == csa->nl->replinstfilename[0]) STRCPY(csa->nl->replinstfilename, jnlpool.jnlpool_ctl->jnlpool_id.instfilename); else if (STRCMP(csa->nl->replinstfilename, jnlpool.jnlpool_ctl->jnlpool_id.instfilename)) replinst_mismatch = TRUE; /* Note: csa->nl->jnlpool_shmid is changed under control of the init/rundown semaphore only. */ assert(INVALID_SHMID != jnlpool.repl_inst_filehdr->jnlpool_shmid); if (INVALID_SHMID == csa->nl->jnlpool_shmid) csa->nl->jnlpool_shmid = jnlpool.repl_inst_filehdr->jnlpool_shmid; else if (csa->nl->jnlpool_shmid != jnlpool.repl_inst_filehdr->jnlpool_shmid) { /* shmid mismatch. Check if the shmid noted down in db filehdr is out-of-date. * Possible if the jnlpool has since been deleted. If so, note the new one down. * If not, then issue an error. */ if (-1 == shmctl(csa->nl->jnlpool_shmid, IPC_STAT, &shmstat)) { save_errno = errno; if ((EINVAL == save_errno) || (EIDRM == save_errno)) /* EIDRM is only on Linux */ { replinst_mismatch = FALSE; csa->nl->jnlpool_shmid = jnlpool.repl_inst_filehdr->jnlpool_shmid; } else replinst_mismatch = TRUE; } else replinst_mismatch = TRUE; } /* Replication instance file or jnlpool id mismatch. Issue error. 
*/ if (replinst_mismatch) rts_error(VARLSTCNT(10) ERR_REPLINSTMISMTCH, 8, LEN_AND_STR(jnlpool.jnlpool_ctl->jnlpool_id.instfilename), jnlpool.repl_inst_filehdr->jnlpool_shmid, DB_LEN_STR(reg), LEN_AND_STR(csa->nl->replinstfilename), csa->nl->jnlpool_shmid); } csa->root_search_cycle = csa->nl->root_search_cycle; csa->onln_rlbk_cycle = csa->nl->onln_rlbk_cycle; /* take local copy of the current Online Rollback cycle */ csa->db_onln_rlbkd_cycle = csa->nl->db_onln_rlbkd_cycle; /* take local copy of the current Online Rollback mod cycle */ /* Record ftok information as soon as shared memory set up is done */ if (!have_standalone_access && !bypassed_ftok) FTOK_TRACE(csa, csd->trans_hist.curr_tn, ftok_ops_lock, process_id); if (-1 == (semval = semctl(udi->semid, 1, GETVAL))) /* semval = number of process attached */ { save_errno = errno; rts_error(VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, \ RTS_ERROR_LITERAL("semctl()"), CALLFROM, save_errno); } if (!read_only && (1 == semval) && !bypassed_ftok && !bypassed_access) { /* For read-write process flush file header to write machine_name, * semaphore, shared memory id and semaphore creation time to disk. */ csa->nl->remove_shm = FALSE; STRNCPY_STR(csd->machine_name, machine_name, MAX_MCNAMELEN); if (!is_bg) { csd->shmid = tsd->shmid; csd->semid = tsd->semid; csd->gt_sem_ctime = tsd->gt_sem_ctime; csd->gt_shm_ctime = tsd->gt_shm_ctime; } DB_LSEEKWRITE(csa, udi->fn, udi->fd, (off_t)0, (sm_uc_ptr_t)csd, SIZEOF(sgmnt_data), save_errno); if (0 != save_errno) { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database header flush"), save_errno); } } else if (read_only && new_shm_ipc) { /* For read-only process if shared memory and semaphore created for first time, * semaphore and shared memory id, and semaphore creation time are written to disk. 
*/ db_ipcs.semid = tsd->semid; /* use tsd instead of csd in order for MM to work too */ db_ipcs.shmid = tsd->shmid; db_ipcs.gt_sem_ctime = tsd->gt_sem_ctime.ctime; db_ipcs.gt_shm_ctime = tsd->gt_shm_ctime.ctime; db_ipcs.fn_len = reg->dyn.addr->fname_len; memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len); db_ipcs.fn[reg->dyn.addr->fname_len] = 0; WAIT_FOR_REPL_INST_UNFREEZE_SAFE(csa); if (0 != send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0)) rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("gtmsecshr failed to update database file header")); } if (gtm_fullblockwrites) { /* We have been asked to do FULL BLOCK WRITES for this database. On *NIX, attempt to get the filesystem * blocksize from statvfs. This allows a full write of a blockwithout the OS having to fetch the old * block for a read/update operation. We will round the IOs to the next filesystem blocksize if the * following criteria are met: * * 1) Database blocksize must be a whole multiple of the filesystem blocksize for the above * mentioned reason. * * 2) Filesystem blocksize must be a factor of the location of the first data block * given by the start_vbn. * * The saved length (if the feature is enabled) will be the filesystem blocksize and will be the * length that a database IO is rounded up to prior to initiation of the IO. 
*/ FSTATVFS_FILE(udi->fd, &dbvfs, status); if (-1 != status) { dblksize = csd->blk_size; fbwsize = (int4)dbvfs.f_bsize; if (0 != fbwsize && (0 == dblksize % fbwsize) && (0 == ((csd->start_vbn - 1) * DISK_BLOCK_SIZE) % fbwsize)) csa->do_fullblockwrites = TRUE; /* This region is fullblockwrite enabled */ /* Report this length in DSE even if not enabled */ csa->fullblockwrite_len = fbwsize; /* Length for rounding fullblockwrite */ } else { save_errno = errno; send_msg(VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_LIT("fstatvfs"), CALLFROM, save_errno); } } ++csa->nl->ref_cnt; /* This value is changed under control of the init/rundown semaphore only */ assert(!csa->ref_cnt); /* Increment shared ref_cnt before private ref_cnt increment. */ csa->ref_cnt++; /* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */ # ifdef DEBUG if (!IS_GTM_IMAGE && gtm_white_box_test_case_enabled && (WBTEST_HOLD_SEM_BYPASS == gtm_white_box_test_case_number)) { if (0 == csa->nl->wbox_test_seq_num) { csa->nl->wbox_test_seq_num = 1; DBGFPF((stderr, "Holding semaphores...\n")); while (1 == csa->nl->wbox_test_seq_num) LONG_SLEEP(1); } } # endif if (!have_standalone_access && !jgbl.onlnrlbk && !bypassed_access) { /* Release control lockout now that it is init'd */ if (0 != (save_errno = do_semop(udi->semid, 0, -1, SEM_UNDO))) { save_errno = errno; rts_error(VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, \ RTS_ERROR_LITERAL("semop()"), CALLFROM, save_errno); } udi->grabbed_access_sem = FALSE; } # ifdef DEBUG if (gtm_white_box_test_case_enabled && (WBTEST_SEMTOOLONG_STACK_TRACE == gtm_white_box_test_case_number) \ && (1 == csa->nl->wbox_test_seq_num)) { csa->nl->wbox_test_seq_num = 2; /* Wait till the other process has got some stack traces */ while (csa->nl->wbox_test_seq_num != 3) LONG_SLEEP(10); } # endif if (!have_standalone_access && !bypassed_ftok) { /* Release ftok semaphore lock so that any other ftok conflicted database can continue 
now */ if (!ftok_sem_release(reg, FALSE, FALSE)) rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); FTOK_TRACE(csa, csd->trans_hist.curr_tn, ftok_ops_release, process_id); } /* Do the per process initialization of mutex stuff */ assert(!mutex_per_process_init_pid || mutex_per_process_init_pid == process_id); if (!mutex_per_process_init_pid) mutex_per_process_init(); REVERT; return; }