/**************************************************************** * * * Copyright 2009, 2012 Fidelity Information Services, Inc * * * * This source code contains the intellectual property * * of its copyright holder(s), and is made available * * under a license. If you do not know the terms of * * the license, please stop and do not read further. * * * ****************************************************************/ #include "mdef.h" #include "gtm_fcntl.h" #include "gtm_stat.h" #include "gtm_unistd.h" #include "gtm_permissions.h" #include "gtm_string.h" #include "gtm_stdio.h" #include "gtm_stdlib.h" #include "gtm_tempnam.h" #include "gtm_time.h" #include "gdsroot.h" #include "gdskill.h" #include "gtm_facility.h" #include "fileinfo.h" #include "gdsbt.h" #include "gdsblk.h" #include "gdsfhead.h" #include "filestruct.h" #include "iosp.h" #include "error.h" #include "cli.h" #include "eintr_wrappers.h" #include "gtmio.h" #include "repl_sp.h" #include "gtm_file_stat.h" #include "util.h" #include "gtm_caseconv.h" #include "gt_timer.h" #include "is_proc_alive.h" #include "wcs_flu.h" #include "jnl.h" #include "wcs_sleep.h" #include "interlock.h" #include "add_inter.h" #include "sleep_cnt.h" #include "trans_log_name.h" #include "mupint.h" #include "memcoherency.h" #include "gtm_logicals.h" #ifdef __MVS__ #include "gtm_zos_io.h" #endif #include "shmpool.h" #include "db_snapshot.h" #include "gtm_c_stack_trace.h" #include "sys/shm.h" #include "do_shmat.h" #include "have_crit.h" #include "ss_lock_facility.h" #include "gtmimagename.h" GBLREF uint4 process_id; GBLREF uint4 mu_int_errknt; GBLREF boolean_t ointeg_this_reg; GBLREF boolean_t online_specified; GBLREF gd_region *gv_cur_region; GBLREF void (*call_on_signal)(); GBLREF int process_exiting; error_def(ERR_BUFFLUFAILED); error_def(ERR_DBROLLEDBACK); error_def(ERR_FILEPARSE); error_def(ERR_MAXSSREACHED); error_def(ERR_PERMGENFAIL); error_def(ERR_SSFILOPERR); error_def(ERR_SSTMPCREATE); error_def(ERR_SSTMPDIRSTAT); error_def(ERR_SSV4NOALLOW); error_def(ERR_SYSCALL); ZOS_ONLY(error_def(ERR_BADTAG);) ZOS_ONLY(error_def(ERR_TEXT);) #define SNAPSHOT_TMP_PREFIX "gtm_snapshot_" #define ISSUE_WRITE_ERROR_AND_EXIT(reg, RC, csa, tempfilename) \ { \ gtm_putmsg(VARLSTCNT(7) ERR_SSFILOPERR, 4, LEN_AND_LIT("write"), LEN_AND_STR(tempfilename), RC); \ if (csa->now_crit) \ rel_crit(csa->region); \ UNFREEZE_REGION_IF_NEEDED(csa->hdr, reg); \ return FALSE; \ } #define TOT_BYTES_REQUIRED(BLKS) DIVIDE_ROUND_UP(BLKS, 8) /* One byte can represent 8 blocks' before image state */ #define EOF_MARKER_SIZE DISK_BLOCK_SIZE #define MAX_TRY_FOR_TOT_BLKS 3 /* This value is the # of tries we do outside of crit in the hope that no * concurrent db extensions occur. This is chosen to be the same as * CDB_STAGNATE but is kept as a different name because there is no reason * for it to be the same. */ #define FILL_SS_FILHDR_INFO(ss_shm_ptr, ss_filhdr_ptr) \ { \ MEMCPY_LIT(ss_filhdr_ptr->label, SNAPSHOT_HDR_LABEL); \ ss_filhdr_ptr->ss_info.ss_pid = ss_shm_ptr->ss_info.ss_pid; \ ss_filhdr_ptr->ss_info.snapshot_tn = ss_shm_ptr->ss_info.snapshot_tn; \ ss_filhdr_ptr->ss_info.db_blk_size = ss_shm_ptr->ss_info.db_blk_size; \ ss_filhdr_ptr->ss_info.free_blks = ss_shm_ptr->ss_info.free_blks; \ ss_filhdr_ptr->ss_info.total_blks = ss_shm_ptr->ss_info.total_blks; \ STRCPY(ss_filhdr_ptr->ss_info.shadow_file, ss_shm_ptr->ss_info.shadow_file); \ ss_filhdr_ptr->shadow_file_len = STRLEN((ss_shm_ptr->ss_info.shadow_file)); \ ss_filhdr_ptr->ss_info.shadow_vbn = ss_shm_ptr->ss_info.shadow_vbn; \ ss_filhdr_ptr->ss_info.ss_shmsize = ss_shm_ptr->ss_info.ss_shmsize; \ } #define GET_CRIT_AND_DECR_INHIBIT_KILLS(REG, CNL) \ { \ grab_crit(REG); \ DECR_INHIBIT_KILLS(CNL); \ rel_crit(REG); \ } #define SS_INIT_SHM(SS_SHMSIZE, SS_SHMADDR, SS_SHMID, RESIZE_NEEDED, RC) \ { \ RC = 0; \ \ if (RESIZE_NEEDED) \ { \ assert(INVALID_SHMID != SS_SHMID); \ assert(NULL != SS_SHMADDR); \ assert(0 == ((long)SS_SHMADDR % OS_PAGE_SIZE)); \ if (0 != SHMDT(SS_SHMADDR)) \ { \ RC = errno; \ assert(FALSE); \ gtm_putmsg(VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_LIT("Error with shmdt"), \ CALLFROM, RC); \ } \ if (0 != shmctl(SS_SHMID, IPC_RMID, 0)) \ { \ RC = errno; \ assert(FALSE); \ gtm_putmsg(VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_LIT("Error with shmctl"), \ CALLFROM, RC); \ } \ } \ SS_SHMID = shmget(IPC_PRIVATE, SS_SHMSIZE, RWDALL | IPC_CREAT); \ if (-1 == SS_SHMID) \ { \ RC = errno; \ assert(FALSE); \ gtm_putmsg(VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_LIT("Error with shmget"), CALLFROM, \ RC); \ } \ if (-1 == (sm_long_t)(SS_SHMADDR = do_shmat(SS_SHMID, 0, 0))) \ { \ RC = errno; \ assert(FALSE); \ gtm_putmsg(VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_LIT("Error with shmat"), CALLFROM, \ RC); \ } \ } /* In case of an error, un-freeze the region before returning if we had done the region_freeze * in mu_int_reg for a read_only process */ #define UNFREEZE_REGION_IF_NEEDED(CSD, REG) \ { \ if (process_id == CSD->freeze) \ { \ assert(reg->read_only); \ region_freeze(reg, FALSE, FALSE, FALSE); \ } \ } /* The below function is modelled around mupip_backup_call_on_signal. This is invoked for doing snapshot clean up if ss_initiate * gets interrupted by a MUPIP STOP or other interruptible signals. In such cases, information would not have been transferred to * database shared memory and hence gds_rundown cannot do the cleanup. */ void ss_initiate_call_on_signal(void) { sgmnt_addrs *csa; csa = &FILE_INFO(gv_cur_region)->s_addrs; call_on_signal = NULL; /* Do not recurse via call_on_signal if there is an error */ process_exiting = TRUE; /* Signal function "free" (in gtm_malloc_src.h) not to bother with frees as we are anyways exiting. * This avoids assert failures that would otherwise occur due to nested storage mgmt calls * just in case we came here because of an interrupt (e.g. SIGTERM) while a malloc was in progress. */ assert(NULL != csa->ss_ctx); ss_release(&csa->ss_ctx); return; } boolean_t ss_initiate(gd_region *reg, /* Region in which snapshot has to be started */ util_snapshot_ptr_t util_ss_ptr, /* Utility specific snapshot structure */ snapshot_context_ptr_t *ss_ctx, /* Snapshot context */ boolean_t preserve_snapshot, /* Should the snapshots be preserved ? */ char *calling_utility) /* Who called ss_initiate */ { sgmnt_addrs *csa; sgmnt_data_ptr_t csd; node_local_ptr_t cnl; shm_snapshot_ptr_t ss_shm_ptr; snapshot_context_ptr_t lcl_ss_ctx, ss_orphan_ctx; snapshot_filhdr_ptr_t ss_filhdr_ptr; int shdw_fd, tmpfd, fclose_res, fstat_res, status, perm, group_id, pwrite_res, dsk_addr = 0; int retries, idx, this_snapshot_idx, save_errno, ss_shmsize, ss_shm_vbn; ZOS_ONLY(int realfiletag;) long ss_shmid = INVALID_SHMID; char tempnamprefix[MAX_FN_LEN + 1], tempdir_full_buffer[GTM_PATH_MAX], *tempfilename; char tempdir_trans_buffer[GTM_PATH_MAX], eof_marker[EOF_MARKER_SIZE]; char *time_ptr, time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */ struct stat stat_buf; enum db_acc_method acc_meth; void *ss_shmaddr; gtm_int64_t db_file_size; uint4 tempnamprefix_len, crit_counter, tot_blks, prev_ss_shmsize, native_size, fstat_status; uint4 *kip_pids_arr_ptr; mstr tempdir_log, tempdir_full, tempdir_trans; boolean_t debug_mupip = FALSE, wait_for_zero_kip, final_retry; now_t now; struct perm_diag_data pdd; assert(IS_MUPIP_IMAGE); assert(NULL != calling_utility); csa = &FILE_INFO(reg)->s_addrs; csd = csa->hdr; cnl = csa->nl; acc_meth = csd->acc_meth; debug_mupip = (CLI_PRESENT == cli_present("DBG")); /* Create a context containing default information pertinent to this initiate invocation */ lcl_ss_ctx = malloc(SIZEOF(snapshot_context_t)); /* should be free'd by ss_release */ DEFAULT_INIT_SS_CTX(lcl_ss_ctx); call_on_signal = ss_initiate_call_on_signal; /* Snapshot context created. Any error below before the next cur_state assignment and a later call to ss_release will * have to free the malloc'ed snapshot context instance */ lcl_ss_ctx->cur_state = BEFORE_SHADOW_FIL_CREAT; *ss_ctx = lcl_ss_ctx; assert(!csa->now_crit); /* Right now mu_int_reg (which does not hold crit) is the only one that calls ss_initiate */ assert(!csa->hold_onto_crit); /* this ensures we can safely do unconditional grab_crit and rel_crit */ ss_get_lock(reg); /* Grab hold of the snapshot crit lock (low level latch) */ ss_shm_ptr = (shm_snapshot_ptr_t)SS_GETSTARTPTR(csa); if (MAX_SNAPSHOTS == cnl->num_snapshots_in_effect) { /* SS_MULTI: If multiple snapshots are supported, then we should run through each of the "possibly" running * snapshots */ assert(1 == MAX_SNAPSHOTS); /* Check if the existing snapshot is still alive. If not, go ahead and cleanup that for us to continue */ if ((0 != ss_shm_ptr->ss_info.ss_pid) && !is_proc_alive(ss_shm_ptr->ss_info.ss_pid, 0)) ss_release(NULL); else { gtm_putmsg(VARLSTCNT(5) ERR_MAXSSREACHED, 3, MAX_SNAPSHOTS, REG_LEN_STR(reg)); ss_release_lock(reg); UNFREEZE_REGION_IF_NEEDED(csd, reg); return FALSE; } } ss_shm_ptr->ss_info.ss_pid = process_id; cnl->num_snapshots_in_effect++; assert(ss_lock_held_by_us(reg)); ss_release_lock(reg); /* For a readonly database for the current process, we better have the region frozen */ assert(!reg->read_only || csd->freeze); /* ============================ STEP 1 : Shadow file name construction ============================== * * --> Directory is taken from GTM_SNAPTMPDIR, if available, else GTM_BAK_TEMPDIR_LOG_NAME_UC, if available, * else use current directory. * --> use the template - gtm_snapshot___XXXXXX * --> the last six characters will be replaced by MKSTEMP below * Note: MUPIP RUNDOWN will rely on the above template to do the cleanup if there were a * crash that led to the improper shutdown of the snapshot process. So, if the above template * changes make sure it's taken care of in MUPIP RUNDOWN * Note: most of the shadow file name construction and the creation of shadow file is been * borrowed from mupip_backup.c. This code should be modularized and should be used in both mupip_backup * as well as here. */ /* set up a prefix for the temporary file. */ tempnamprefix_len = 0; memset(tempnamprefix, 0, MAX_FN_LEN); memcpy(tempnamprefix, SNAPSHOT_TMP_PREFIX, STR_LIT_LEN(SNAPSHOT_TMP_PREFIX)); tempnamprefix_len += STR_LIT_LEN(SNAPSHOT_TMP_PREFIX); memcpy(tempnamprefix + tempnamprefix_len, reg->rname, reg->rname_len); tempnamprefix_len += reg->rname_len; SNPRINTF(&tempnamprefix[tempnamprefix_len], MAX_FN_LEN, "_%x", process_id); tempdir_log.addr = GTM_SNAPTMPDIR; tempdir_log.len = STR_LIT_LEN(GTM_SNAPTMPDIR); tempfilename = tempdir_full.addr = tempdir_full_buffer; /* Check if the environment variable is defined or not. * Side-effect: tempdir_trans.addr = tempdir_trans_buffer irrespective of whether TRANS_LOG_NAME * succeeded or not. */ status = TRANS_LOG_NAME(&tempdir_log, &tempdir_trans, tempdir_trans_buffer, SIZEOF(tempdir_trans_buffer), do_sendmsg_on_log2long); if (SS_NORMAL == status && (NULL != tempdir_trans.addr) && (0 != tempdir_trans.len)) *(tempdir_trans.addr + tempdir_trans.len) = 0; else { /* Not found - try GTM_BAK_TEMPDIR_LOG_NAME_UC instead */ tempdir_log.addr = GTM_BAK_TEMPDIR_LOG_NAME_UC; tempdir_log.len = STR_LIT_LEN(GTM_BAK_TEMPDIR_LOG_NAME_UC); status = TRANS_LOG_NAME(&tempdir_log, &tempdir_trans, tempdir_trans_buffer, SIZEOF(tempdir_trans_buffer), do_sendmsg_on_log2long); if (SS_NORMAL == status && (NULL != tempdir_trans.addr) && (0 != tempdir_trans.len)) *(tempdir_trans.addr + tempdir_trans.len) = 0; else { /* Not found - use the current directory via a relative filespec */ tempdir_trans_buffer[0] = '.'; tempdir_trans_buffer[1] = '\0'; tempdir_trans.len = 1; } } /* Verify if we can stat the temporary directory */ if (FILE_STAT_ERROR == (fstat_res = gtm_file_stat(&tempdir_trans, NULL, &tempdir_full, FALSE, &fstat_status))) { gtm_putmsg(VARLSTCNT(5) ERR_SSTMPDIRSTAT, 2, tempdir_trans.len, tempdir_trans.addr, fstat_status); UNFREEZE_REGION_IF_NEEDED(csd, reg); return FALSE; } SNPRINTF(tempfilename + tempdir_full.len, GTM_PATH_MAX, "/%s_XXXXXX", tempnamprefix); /* ========================== STEP 2 : Create the shadow file ======================== */ /* get a unique temporary file name. The file gets created on success */ DEFER_INTERRUPTS(INTRPT_IN_SS_INITIATE); /* Defer MUPIP STOP till the file is created */ MKSTEMP(tempfilename, tmpfd); STRCPY(lcl_ss_ctx->shadow_file, tempfilename); /* Shadow file created. Any error below before the next cur_state assignment and a later call to ss_release will have * to delete at least the shadow file apart from free'ing the malloc'ed snapshot context instance */ lcl_ss_ctx->cur_state = AFTER_SHADOW_FIL_CREAT; if (FD_INVALID == tmpfd) { status = errno; gtm_putmsg(VARLSTCNT(5) ERR_SSTMPCREATE, 2, tempdir_trans.len, tempdir_trans.addr, status); UNFREEZE_REGION_IF_NEEDED(csd, reg); return FALSE; } # ifdef __MVS__ if (-1 == gtm_zos_set_tag(tmpfd, TAG_BINARY, TAG_NOTTEXT, TAG_FORCE, &realfiletag)) TAG_POLICY_GTM_PUTMSG(tempfilename, realfiletag, TAG_BINARY, errno); # endif /* Temporary file for backup was created above using "mkstemp" which on AIX opens the file without * large file support enabled. Work around that by closing the file descriptor returned and reopening * the file with the "open" system call (which gets correctly translated to "open64"). We need to do * this because the temporary file can get > 2GB. Since it is not clear if mkstemp on other Unix platforms * will open the file for large file support, we use this solution for other Unix flavours as well. */ OPENFILE(tempfilename, O_RDWR, shdw_fd); if (FD_INVALID == shdw_fd) { status = errno; gtm_putmsg(VARLSTCNT(7) ERR_SSFILOPERR, 4, LEN_AND_LIT("open"), tempdir_full.len, tempdir_full.addr, status); UNFREEZE_REGION_IF_NEEDED(csd, reg); return FALSE; } # ifdef __MVS__ if (-1 == gtm_zos_set_tag(shdw_fd, TAG_BINARY, TAG_NOTTEXT, TAG_FORCE, &realfiletag)) TAG_POLICY_GTM_PUTMSG(tempfilename, realfiletag, TAG_BINARY, errno); # endif /* Now that the temporary file has been opened successfully, close the fd returned by mkstemp */ F_CLOSE(tmpfd, fclose_res); lcl_ss_ctx->shdw_fd = shdw_fd; ENABLE_INTERRUPTS(INTRPT_IN_SS_INITIATE); tempdir_full.len = STRLEN(tempdir_full.addr); /* update the length */ assert(GTM_PATH_MAX >= tempdir_full.len); /* give temporary files the group and permissions as other shared resources - like journal files */ FSTAT_FILE(((unix_db_info *)(reg->dyn.addr->file_cntl->file_info))->fd, &stat_buf, fstat_res); assert(-1 != fstat_res); if (-1 != fstat_res) if (gtm_set_group_and_perm(&stat_buf, &group_id, &perm, PERM_FILE, &pdd) < 0) { send_msg(VARLSTCNT(6+PERMGENDIAG_ARG_COUNT) ERR_PERMGENFAIL, 4, RTS_ERROR_STRING("snapshot file"), RTS_ERROR_STRING(((unix_db_info *)(reg->dyn.addr->file_cntl->file_info))->fn), PERMGENDIAG_ARGS(pdd)); gtm_putmsg(VARLSTCNT(6+PERMGENDIAG_ARG_COUNT) ERR_PERMGENFAIL, 4, RTS_ERROR_STRING("snapshot file"), RTS_ERROR_STRING(((unix_db_info *)(reg->dyn.addr->file_cntl->file_info))->fn), PERMGENDIAG_ARGS(pdd)); UNFREEZE_REGION_IF_NEEDED(csd, reg); return FALSE; } if ((-1 == fstat_res) || (-1 == FCHMOD(shdw_fd, perm)) || ((-1 != group_id) && (-1 == fchown(shdw_fd, -1, group_id)))) { status = errno; gtm_putmsg(VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_LIT("fchmod/fchown"), CALLFROM, status); UNFREEZE_REGION_IF_NEEDED(csd, reg); return FALSE; } if (debug_mupip) { util_out_print("!/MUPIP INFO: Successfully created the shadow file: !AD", TRUE, tempdir_full.len, tempdir_full.addr); } /* STEP 3: Wait for kills-in-progress and initialize snapshot shadow file * * Snapshot Shadow File Layout - * * ----------------------- * | RESERVE_SHM_SPACE | <-- Variable sized space reserved in case we would like the snapshots to be preserved * ----------------------- * | DB_FILE_HEADER | <-- Copy of the database file header taken at the snapshot time (in crit) (SGMNT_HDR_LEN) * ----------------------- * | DB_MASTER_MAP | <-- MASTER_MAP_SIZE_MAX * ----------------------- <-- offset of block zero * | BLOCK_ZERO_BIMG | <-- csd->blk_size * ----------------------- * | BLOCK_ONE_BIMG | <-- csd->blk_size * ----------------------- * . * . * ----------------------- * | EOF_MARKER | <-- 512 bytes (DISK_BLOCK_SIZE) * ----------------------- */ /* Below, master map and the file header will be copied and hence check if we have the right * pointers setup by the caller */ assert(NULL != util_ss_ptr); assert(NULL != util_ss_ptr->master_map); assert(NULL != util_ss_ptr->header); grab_crit(reg); INCR_INHIBIT_KILLS(cnl); kip_pids_arr_ptr = cnl->kip_pid_array; prev_ss_shmsize = 0; for (retries = 0; MAX_TRY_FOR_TOT_BLKS >= retries; ++retries) { final_retry = (MAX_TRY_FOR_TOT_BLKS == retries); /* On all but the 4th retry (inclusive of retries = -1), release crit */ if (!final_retry) rel_crit(reg); /* * Outside crit (on the final retry, we will be holding crit while calculating the below): * 1. total blocks * 2. native size (total number of blocks in terms of DISK_BLOCK_SIZE) * 3. write EOF an offset equal to the current database file size * 4. create shared memory segment to be used as bitmap for storing whether a given database block * was before imaged or not. */ DEFER_INTERRUPTS(INTRPT_IN_SS_INITIATE); /* Defer MUPIP STOP till the shared memory is created */ tot_blks = csd->trans_hist.total_blks; prev_ss_shmsize = ss_shmsize; ss_shmsize = (int)(ROUND_UP((SNAPSHOT_HDR_SIZE + TOT_BYTES_REQUIRED(tot_blks)), OS_PAGE_SIZE)); assert((0 == retries) || (0 != prev_ss_shmsize)); if (0 == retries) { /* If this is the first try, then create shared memory anyways. */ SS_INIT_SHM(ss_shmsize, ss_shmaddr, ss_shmid, FALSE, status); } else if ((prev_ss_shmsize - SNAPSHOT_HDR_SIZE) < TOT_BYTES_REQUIRED(tot_blks)) { /* Shared memory created with SS_INIT_SHM (in prior retries) is aligned at the OS page size boundary. * So, in case of an extension (in subsequent retry) we check if the newly added blocks would fit in * the existing shared memory (possible if the extra bytes allocated for OS page size alignment will * be enough to hold the newly added database blocks). If not, remove the original shared memory and * create a new one. */ if (debug_mupip) { util_out_print("!/MUPIP INFO: Existing shared memory !UL of size !UL not enough for total blocks \ !UL. Creating new shared memory", TRUE, ss_shmid, prev_ss_shmsize, tot_blks); } SS_INIT_SHM(ss_shmsize, ss_shmaddr, ss_shmid, TRUE, status); } if (status) { /* error while creating shared memory */ GET_CRIT_AND_DECR_INHIBIT_KILLS(reg, cnl); UNFREEZE_REGION_IF_NEEDED(csd, reg); return FALSE; } /* At this point, we are done with allocating shared memory (aligned with OS_PAGE_SIZE) enough to hold * the total number of blocks. Any error below before the next cur_state assignment and a later call * to ss_release will have to detach/rmid the shared memory identifier apart from deleting the shadow * file and free'ing the malloc'ed snapshot context instance. */ lcl_ss_ctx->attach_shmid = ss_shmid; lcl_ss_ctx->start_shmaddr = ss_shmaddr; lcl_ss_ctx->cur_state = AFTER_SHM_CREAT; ENABLE_INTERRUPTS(INTRPT_IN_SS_INITIATE); if (debug_mupip) { util_out_print("!/MUPIP INFO: Shared memory created. SHMID = !UL", TRUE, ss_shmid); } /* Write EOF block in the snapshot file */ native_size = gds_file_size(reg->dyn.addr->file_cntl); db_file_size = (gtm_int64_t)(native_size) * DISK_BLOCK_SIZE; /* Size of database file in bytes */ LSEEKWRITE(shdw_fd, ((off_t)db_file_size + ss_shmsize), eof_marker, EOF_MARKER_SIZE, status); if (0 != status) { /* error while writing EOF record to snapshot file */ GET_CRIT_AND_DECR_INHIBIT_KILLS(reg, cnl); ISSUE_WRITE_ERROR_AND_EXIT(reg, status, csa, tempfilename); } /* Wait for KIP to reset */ wait_for_zero_kip = csd->kill_in_prog; /* Wait for existing kills-in-progress to be reset. Since a database file extension is possible as * we don't hold crit while wcs_sleep below, we will retry if the total blocks have changed since * we last checked it. However, in the final retry, the shared memory and shadow file initialization * will happen in crit. Also, in the final retry, we won't be waiting for the kills in progress as * we would be holding crit and cannot afford to wcs_sleep. So, in such a case, MUKILLIP should be * issued by the caller of ss_initiate if csd->kill_in_prog is set to TRUE. But, such a case should * be rare. */ for (crit_counter = 1; wait_for_zero_kip && !final_retry; ) { /* Release crit before going into the wait loop */ rel_crit(reg); if (debug_mupip) { GET_CUR_TIME; util_out_print("!/MUPIP INFO: !AD : Start kill-in-prog wait for database !AD", TRUE, CTIME_BEFORE_NL, time_ptr, DB_LEN_STR(reg)); } while (csd->kill_in_prog && (MAX_CRIT_TRY > crit_counter++)) { GET_C_STACK_FOR_KIP(kip_pids_arr_ptr, crit_counter, MAX_CRIT_TRY, 1, MAX_KIP_PID_SLOTS); wcs_sleep(crit_counter); } if (debug_mupip) { GET_CUR_TIME; util_out_print("!/MUPIP INFO: !AD : Done with kill-in-prog wait on !AD", TRUE, CTIME_BEFORE_NL, time_ptr, DB_LEN_STR(reg)); } wait_for_zero_kip = (MAX_CRIT_TRY > crit_counter); /* if TRUE, we can wait for some more time on * this region */ grab_crit(reg); if (csd->kill_in_prog) { /* It is possible for this to happen in case a concurrent GT.M process is in its 4th retry. * In that case, it will not honor the inhibit_kills flag since it holds crit and therefore * could have set kill-in-prog to a non-zero value while we were outside of crit. * Since we have waited for 1 minute already, proceed with snapshot. The reasoning is that * once the GT.M process that is in the final retry finishes off the second part of the M-kill, * it will not start a new transaction in the first try which is outside of crit so will honor * the inhibit-kills flag and therefore not increment the kill_in_prog counter any more until * this crit is released. So we could be waiting for at most 1 kip increment per concurrent process * that is updating the database. We expect these kills to be complete within 1 minute. */ if (!wait_for_zero_kip) break; } else break; } grab_crit(reg); /* There are two reasons why we might go for another iteration of this loop * (a) After we have created the shared memory and before we grab crit, another process can add new blocks to the * database, in which case csd->trans_hist.total_blks is no longer the value as we noted down above. Check if this * is the case and if so, retry to obtain a consistent copy of the total number of blocks. * (b) Similarly, csd->kill_in_prog was FALSE before grab_crit, but non-zero after grab-crit due to concurrency * reasons. Check if this is the case and if so, retry to wait for KIP to reset. */ if ((tot_blks == csd->trans_hist.total_blks) && !csd->kill_in_prog) { /* We have a consistent copy of the total blocks and csd->kill_in_prog is FALSE inside crit. No need for * retry. */ assert(native_size == ((tot_blks * (csd->blk_size / DISK_BLOCK_SIZE)) + (csd->start_vbn))); break; } } /* At this point, we are MOST likely guaranteed that kill-in-prog is set to zero for this region and CERTAINLY * guaranteed that no more kills will be started for this region. Now, we are still holding crit, so any error * that occurs henceforth should do a DECR_INHIBIT_KILLS and rel_crit before exiting to let the other processes * proceed gracefully */ assert(csa->now_crit); assert(native_size == ((tot_blks * (csd->blk_size / DISK_BLOCK_SIZE)) + (csd->start_vbn))); assert(NULL != ss_shmaddr); assert(0 == ((long)ss_shmaddr % OS_PAGE_SIZE)); assert(0 == ss_shmsize % OS_PAGE_SIZE); assert(INVALID_SHMID != ss_shmid); assert(0 == (ss_shmsize % DISK_BLOCK_SIZE)); ss_shm_vbn = ss_shmsize / DISK_BLOCK_SIZE; /* # of DISK_BLOCK_SIZEs the shared memory spans across */ assert(ss_shmid == lcl_ss_ctx->attach_shmid); assert(AFTER_SHM_CREAT == lcl_ss_ctx->cur_state); if (debug_mupip) { util_out_print("!/MUPIP INFO: Successfully created shared memory. SHMID = !UL", TRUE, ss_shmid); } /* It is possible that we did saw csd->full_upgraded TRUE before invoking ss_initiate but MUPIP SET -VER=V4 was done after * ss_initiate was called. Handle appropriately. */ if (!csd->fully_upgraded) { /* If -ONLINE was specified explicitly, then it is an ERROR. Issue the error and return FALSE. */ if (online_specified) { gtm_putmsg(VARLSTCNT(4) ERR_SSV4NOALLOW, 2, DB_LEN_STR(reg)); util_out_print(NO_ONLINE_ERR_MSG, TRUE); mu_int_errknt++; GET_CRIT_AND_DECR_INHIBIT_KILLS(reg, cnl); UNFREEZE_REGION_IF_NEEDED(csd, reg); return FALSE; } else { /* If -ONLINE was assumed implicitly (default of INTEG -REG), then set ointeg_this_reg to FALSE, * relinquish the resources and continue as if it is -NOONLINE */ ointeg_this_reg = FALSE; assert(NULL != *ss_ctx); ss_release(ss_ctx); GET_CRIT_AND_DECR_INHIBIT_KILLS(reg, cnl); UNFREEZE_REGION_IF_NEEDED(csd, reg); return TRUE; } } /* ===================== STEP 5: Flush the pending updates in the global cache =================== */ /* For a readonly database for the current process, we cannot do wcs_flu. We would have waited for the active queues * to complete in mu_int_reg after doing a freeze. Now that we have crit, unfreeze the region */ if (reg->read_only) { region_freeze(reg, FALSE, FALSE, FALSE); } else if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_MSYNC_DB)) /* wcs_flu guarantees that all the pending * phase 2 commits are done with before returning */ { assert(process_id != csd->freeze); /* We would not have frozen the region if the database is read-write */ gtm_putmsg(VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_STR(calling_utility), DB_LEN_STR(reg)); GET_CRIT_AND_DECR_INHIBIT_KILLS(reg, cnl); return FALSE; } assert(0 < cnl->num_snapshots_in_effect || (!SNAPSHOTS_IN_PROG(cnl))); assert(csa->now_crit); /* ========== STEP 6: Copy the file header, master map and the native file size into a private structure =========== */ memcpy(util_ss_ptr->header, csd, SGMNT_HDR_LEN); memcpy(util_ss_ptr->master_map, MM_ADDR(csd), MASTER_MAP_SIZE_MAX); util_ss_ptr->native_size = native_size; /* We are about to copy the process private variables to shared memory. Although we have done grab_crit above, we take * snapshot crit lock to ensure that no other process attempts snapshot cleanup. */ DEFER_INTERRUPTS(INTRPT_IN_SS_INITIATE); /* Defer MUPIP STOP until we complete copying to shared memory */ /* == STEP 7: Populate snapshot context, database shared memory snapshot structure and snapshot file header structure == */ ss_shm_ptr = (shm_snapshot_ptr_t)SS_GETSTARTPTR(csa); DBG_ENSURE_PTR_WITHIN_SS_BOUNDS(csa, (sm_uc_ptr_t)ss_shm_ptr); /* Fill in the information for this snapshot in database shared memory */ ss_shm_ptr->ss_info.snapshot_tn = csd->trans_hist.curr_tn; ss_shm_ptr->ss_info.free_blks = csd->trans_hist.free_blocks; ss_shm_ptr->ss_info.total_blks = csd->trans_hist.total_blks; ss_shm_ptr->ss_info.db_blk_size = csd->blk_size; ss_shm_ptr->failure_errno = 0; ss_shm_ptr->failed_pid = 0; STRCPY(ss_shm_ptr->ss_info.shadow_file, tempfilename); ss_shm_ptr->ss_info.shadow_vbn = ss_shm_vbn + csd->start_vbn; ss_shm_ptr->ss_info.ss_shmid = cnl->ss_shmid = ss_shmid; ss_shm_ptr->ss_info.ss_shmsize = ss_shmsize; ss_shm_ptr->in_use = 1; /* Fill in the information for the process specific snapshot context information */ lcl_ss_ctx->ss_shm_ptr = (shm_snapshot_ptr_t)(ss_shm_ptr); lcl_ss_ctx->start_shmaddr = (sm_uc_ptr_t)ss_shmaddr; lcl_ss_ctx->bitmap_addr = ((sm_uc_ptr_t)ss_shmaddr + SNAPSHOT_HDR_SIZE); lcl_ss_ctx->shadow_vbn = ss_shm_ptr->ss_info.shadow_vbn; lcl_ss_ctx->total_blks = ss_shm_ptr->ss_info.total_blks; /* Fill snapshot file header information. This data will be persisted if -PRESERVE option is given */ ss_filhdr_ptr = (snapshot_filhdr_ptr_t)(ss_shmaddr); FILL_SS_FILHDR_INFO(ss_shm_ptr, ss_filhdr_ptr) ss_shm_ptr->preserve_snapshot = preserve_snapshot; SET_LATCH_GLOBAL(&ss_shm_ptr->bitmap_latch, LOCK_AVAILABLE); DECR_INHIBIT_KILLS(cnl); /* announce GT.M that it's now ok to write the before images */ if (!SNAPSHOTS_IN_PROG(cnl)) cnl->snapshot_in_prog = TRUE; csa->snapshot_in_prog = TRUE; /* Having a write memory barrier here ensures that whenever GT.M reads a newer value of cnl->ss_shmcycle, it is guaranteed * that the remaining fields that it reads from the shared memory will be the latest ones. */ SHM_WRITE_MEMORY_BARRIER; cnl->ss_shmcycle++; /* indicate that the ss_shmid field of cnl is being reused */ lcl_ss_ctx->ss_shmcycle = cnl->ss_shmcycle; rel_crit(reg); if ((csa->onln_rlbk_cycle != csa->nl->onln_rlbk_cycle) || csa->nl->onln_rlbk_pid) { /* A concurrent online rollback happened since we did the gvcst_init. The INTEG is not reliable. * Cleanup and exit */ gtm_putmsg(VARLSTCNT(1) ERR_DBROLLEDBACK); UNFREEZE_REGION_IF_NEEDED(csa, reg); return FALSE; } /* ============= STEP 8: Write the database file header and the master map ============= * Write the database file header at an offset equal to the database shared memory size. * This is because if we want to preserve the snapshot then we would want to write the * shared memory information at the beginning of the file. */ LSEEKWRITE(shdw_fd, (off_t)ss_shmsize, (sm_uc_ptr_t)util_ss_ptr->header, SGMNT_HDR_LEN, pwrite_res); if (0 != pwrite_res) ISSUE_WRITE_ERROR_AND_EXIT(reg, pwrite_res, csa, tempfilename); dsk_addr += ((int)ss_shmsize + (int)SGMNT_HDR_LEN); /* write the database master map to the shadow file */ LSEEKWRITE(shdw_fd, dsk_addr, (sm_uc_ptr_t)util_ss_ptr->master_map, MASTER_MAP_SIZE_MAX, pwrite_res); if (0 != pwrite_res) ISSUE_WRITE_ERROR_AND_EXIT(reg, pwrite_res, csa, tempfilename); dsk_addr += MASTER_MAP_SIZE_MAX; assert(0 == (dsk_addr % OS_PAGE_SIZE)); lcl_ss_ctx->cur_state = SNAPSHOT_INIT_DONE; /* Same as AFTER_SHM_CREAT but set for clarity of the snapshot state */ call_on_signal = NULL; /* Any further cleanup on signals will be taken care by gds_rundown */ ENABLE_INTERRUPTS(INTRPT_IN_SS_INITIATE); assert(!ss_lock_held_by_us(reg)); /* We should never leave the function with the snapshot latch not being released */ return TRUE; }