/****************************************************************
 *                                                              *
 *      Copyright 2009, 2012 Fidelity Information Services, Inc *
 *                                                              *
 *      This source code contains the intellectual property     *
 *      of its copyright holder(s), and is made available       *
 *      under a license.  If you do not know the terms of       *
 *      the license, please stop and do not read further.       *
 *                                                              *
 ****************************************************************/

#include "mdef.h"

#include "gtm_fcntl.h"
#include "gtm_stat.h"
#include "gtm_unistd.h"
#include "gtm_permissions.h"
#include "gtm_string.h"
#include "gtm_stdio.h"
#include "gtm_stdlib.h"
#include "gtm_tempnam.h"
#include "gtm_time.h"

#include "gdsroot.h"
#include "gdskill.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "iosp.h"
#include "error.h"
#include "cli.h"
#include "eintr_wrappers.h"
#include "gtmio.h"
#include "repl_sp.h"
#include "gtm_file_stat.h"
#include "util.h"
#include "gtm_caseconv.h"
#include "gt_timer.h"
#include "is_proc_alive.h"
#include "wcs_flu.h"
#include "jnl.h"
#include "wcs_sleep.h"
#include "interlock.h"
#include "add_inter.h"
#include "sleep_cnt.h"
#include "trans_log_name.h"
#include "mupint.h"
#include "memcoherency.h"
#include "gtm_logicals.h"
#ifdef __MVS__
#include "gtm_zos_io.h"
#endif
#include "shmpool.h"
#include "db_snapshot.h"
#include "gtm_c_stack_trace.h"
#include "sys/shm.h"
#include "do_shmat.h"
#include "have_crit.h"
#include "ss_lock_facility.h"
#include "gtmimagename.h"

GBLREF uint4            process_id;
GBLREF uint4            mu_int_errknt;
GBLREF boolean_t        ointeg_this_reg;
GBLREF boolean_t        online_specified;
GBLREF gd_region        *gv_cur_region;
GBLREF void             (*call_on_signal)();
GBLREF int              process_exiting;

error_def(ERR_BUFFLUFAILED);
error_def(ERR_DBROLLEDBACK);
error_def(ERR_FILEPARSE);
error_def(ERR_MAXSSREACHED);
error_def(ERR_PERMGENFAIL);
error_def(ERR_SSFILOPERR);
error_def(ERR_SSTMPCREATE);
error_def(ERR_SSTMPDIRSTAT);
error_def(ERR_SSV4NOALLOW);
error_def(ERR_SYSCALL);
ZOS_ONLY(error_def(ERR_BADTAG);)
ZOS_ONLY(error_def(ERR_TEXT);)

#define SNAPSHOT_TMP_PREFIX     "gtm_snapshot_"
#define ISSUE_WRITE_ERROR_AND_EXIT(reg, RC, csa, tempfilename) \
{ \
    gtm_putmsg(VARLSTCNT(7) ERR_SSFILOPERR, 4, LEN_AND_LIT("write"), LEN_AND_STR(tempfilename), RC); \
    if (csa->now_crit) \
        rel_crit(csa->region); \
    UNFREEZE_REGION_IF_NEEDED(csa->hdr, reg); \
    return FALSE; \
}
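/* Note: ISSUE_WRITE_ERROR_AND_EXIT expands to a "return FALSE" from the enclosing function, so it is only usable
 * inside a boolean_t routine such as ss_initiate below.
 */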

#define TOT_BYTES_REQUIRED(BLKS)        DIVIDE_ROUND_UP(BLKS, 8)        /* One byte can represent 8 blocks' before image state */
#define EOF_MARKER_SIZE                 DISK_BLOCK_SIZE
#define MAX_TRY_FOR_TOT_BLKS            3       /* This value is the # of tries we do outside of crit in the hope that no
                                                 * concurrent db extensions occur. This is chosen to be the same as
                                                 * CDB_STAGNATE but is kept as a different name because there is no reason
                                                 * for it to be the same.
                                                 */
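/* Sizing example (illustrative numbers, not from any particular database): for a region with 1,000,000 blocks,
 * TOT_BYTES_REQUIRED yields DIVIDE_ROUND_UP(1000000, 8) = 125,000 bitmap bytes; ss_initiate then rounds
 * (SNAPSHOT_HDR_SIZE + that bitmap size) up to the next OS_PAGE_SIZE multiple to arrive at ss_shmsize.
 */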

#define FILL_SS_FILHDR_INFO(ss_shm_ptr, ss_filhdr_ptr) \
{ \
    MEMCPY_LIT(ss_filhdr_ptr->label, SNAPSHOT_HDR_LABEL); \
    ss_filhdr_ptr->ss_info.ss_pid = ss_shm_ptr->ss_info.ss_pid; \
    ss_filhdr_ptr->ss_info.snapshot_tn = ss_shm_ptr->ss_info.snapshot_tn; \
    ss_filhdr_ptr->ss_info.db_blk_size = ss_shm_ptr->ss_info.db_blk_size; \
    ss_filhdr_ptr->ss_info.free_blks = ss_shm_ptr->ss_info.free_blks; \
    ss_filhdr_ptr->ss_info.total_blks = ss_shm_ptr->ss_info.total_blks; \
    STRCPY(ss_filhdr_ptr->ss_info.shadow_file, ss_shm_ptr->ss_info.shadow_file); \
    ss_filhdr_ptr->shadow_file_len = STRLEN((ss_shm_ptr->ss_info.shadow_file)); \
    ss_filhdr_ptr->ss_info.shadow_vbn = ss_shm_ptr->ss_info.shadow_vbn; \
    ss_filhdr_ptr->ss_info.ss_shmsize = ss_shm_ptr->ss_info.ss_shmsize; \
}

#define GET_CRIT_AND_DECR_INHIBIT_KILLS(REG, CNL) \
{ \
    grab_crit(REG); \
    DECR_INHIBIT_KILLS(CNL); \
    rel_crit(REG); \
}

#define SS_INIT_SHM(SS_SHMSIZE, SS_SHMADDR, SS_SHMID, RESIZE_NEEDED, RC) \
{ \
    RC = 0; \
    if (RESIZE_NEEDED) \
    { \
        assert(INVALID_SHMID != SS_SHMID); \
        assert(NULL != SS_SHMADDR); \
        assert(0 == ((long)SS_SHMADDR % OS_PAGE_SIZE)); \
        if (0 != SHMDT(SS_SHMADDR)) \
        { \
            RC = errno; \
            assert(FALSE); \
            gtm_putmsg(VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_LIT("Error with shmdt"), CALLFROM, RC); \
        } \
        if (0 != shmctl(SS_SHMID, IPC_RMID, 0)) \
        { \
            RC = errno; \
            assert(FALSE); \
            gtm_putmsg(VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_LIT("Error with shmctl"), CALLFROM, RC); \
        } \
    } \
    SS_SHMID = shmget(IPC_PRIVATE, SS_SHMSIZE, RWDALL | IPC_CREAT); \
    if (-1 == SS_SHMID) \
    { \
        RC = errno; \
        assert(FALSE); \
        gtm_putmsg(VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_LIT("Error with shmget"), CALLFROM, RC); \
    } \
    if (-1 == (sm_long_t)(SS_SHMADDR = do_shmat(SS_SHMID, 0, 0))) \
    { \
        RC = errno; \
        assert(FALSE); \
        gtm_putmsg(VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_LIT("Error with shmat"), CALLFROM, RC); \
    } \
}
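/* Usage note: the retry loop in ss_initiate below invokes SS_INIT_SHM with RESIZE_NEEDED = FALSE on the first
 * attempt (create a fresh private segment) and with RESIZE_NEEDED = TRUE on a retry whose existing segment is
 * too small, in which case the old segment is detached and removed before a bigger one is created.
 */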

/* In case of an error, un-freeze the region before returning if we had done the region_freeze
 * in mu_int_reg for a read_only process
 */
#define UNFREEZE_REGION_IF_NEEDED(CSD, REG) \
{ \
    if (process_id == CSD->freeze) \
    { \
        assert(REG->read_only); \
        region_freeze(REG, FALSE, FALSE, FALSE); \
    } \
}

/* The below function is modelled around mupip_backup_call_on_signal. It is invoked to do the snapshot cleanup if
 * ss_initiate gets interrupted by a MUPIP STOP or another interruptible signal. In such cases, the snapshot information
 * would not yet have been transferred to database shared memory and hence gds_rundown cannot do the cleanup.
 */
void ss_initiate_call_on_signal(void)
{
    sgmnt_addrs *csa;

    csa = &FILE_INFO(gv_cur_region)->s_addrs;
    call_on_signal = NULL;      /* Do not recurse via call_on_signal if there is an error */
    process_exiting = TRUE;     /* Signal function "free" (in gtm_malloc_src.h) not to bother with frees as we are anyway
                                 * exiting. This avoids assert failures that would otherwise occur due to nested storage
                                 * management calls in case we came here because of an interrupt (e.g. SIGTERM) while a
                                 * malloc was in progress.
                                 */
    assert(NULL != csa->ss_ctx);
    ss_release(&csa->ss_ctx);
    return;
}

boolean_t ss_initiate(gd_region *reg,                   /* Region in which snapshot has to be started */
                      util_snapshot_ptr_t util_ss_ptr,  /* Utility specific snapshot structure */
                      snapshot_context_ptr_t *ss_ctx,   /* Snapshot context */
                      boolean_t preserve_snapshot,      /* Should the snapshots be preserved ? */
                      char *calling_utility)            /* Who called ss_initiate */
{
    sgmnt_addrs             *csa;
    sgmnt_data_ptr_t        csd;
    node_local_ptr_t        cnl;
    shm_snapshot_ptr_t      ss_shm_ptr;
    snapshot_context_ptr_t  lcl_ss_ctx, ss_orphan_ctx;
    snapshot_filhdr_ptr_t   ss_filhdr_ptr;
    int                     shdw_fd, tmpfd, fclose_res, fstat_res, status, perm, group_id, pwrite_res, dsk_addr = 0;
    int                     retries, idx, this_snapshot_idx, save_errno, ss_shmsize, ss_shm_vbn;
    ZOS_ONLY(int            realfiletag;)
    long                    ss_shmid = INVALID_SHMID;
    char                    tempnamprefix[MAX_FN_LEN + 1], tempdir_full_buffer[GTM_PATH_MAX], *tempfilename;
    char                    tempdir_trans_buffer[GTM_PATH_MAX], eof_marker[EOF_MARKER_SIZE];
    char                    *time_ptr, time_str[CTIME_BEFORE_NL + 2];   /* for GET_CUR_TIME macro */
    struct stat             stat_buf;
    enum db_acc_method      acc_meth;
    void                    *ss_shmaddr;
    gtm_int64_t             db_file_size;
    uint4                   tempnamprefix_len, crit_counter, tot_blks, prev_ss_shmsize, native_size, fstat_status;
    uint4                   *kip_pids_arr_ptr;
    mstr                    tempdir_log, tempdir_full, tempdir_trans;
    boolean_t               debug_mupip = FALSE, wait_for_zero_kip, final_retry;
    now_t                   now;
    struct perm_diag_data   pdd;

    assert(IS_MUPIP_IMAGE);
    assert(NULL != calling_utility);
    csa = &FILE_INFO(reg)->s_addrs;
    csd = csa->hdr;
    cnl = csa->nl;
    acc_meth = csd->acc_meth;
    debug_mupip = (CLI_PRESENT == cli_present("DBG"));

    /* Create a context containing default information pertinent to this initiate invocation */
    lcl_ss_ctx = malloc(SIZEOF(snapshot_context_t));    /* should be free'd by ss_release */
    DEFAULT_INIT_SS_CTX(lcl_ss_ctx);
    call_on_signal = ss_initiate_call_on_signal;
    /* Snapshot context created. Any error below before the next cur_state assignment and a later call to ss_release will
     * have to free the malloc'ed snapshot context instance
     */
    lcl_ss_ctx->cur_state = BEFORE_SHADOW_FIL_CREAT;
    *ss_ctx = lcl_ss_ctx;
    assert(!csa->now_crit);         /* Right now mu_int_reg (which does not hold crit) is the only one that calls ss_initiate */
    assert(!csa->hold_onto_crit);   /* this ensures we can safely do unconditional grab_crit and rel_crit */
    ss_get_lock(reg);               /* Grab hold of the snapshot crit lock (low level latch) */
    ss_shm_ptr = (shm_snapshot_ptr_t)SS_GETSTARTPTR(csa);
    if (MAX_SNAPSHOTS == cnl->num_snapshots_in_effect)
    {
        /* SS_MULTI: If multiple snapshots are supported, then we should run through each of the "possibly" running
         * snapshots
         */
        assert(1 == MAX_SNAPSHOTS);
        /* Check if the existing snapshot is still alive. If not, go ahead and clean it up so we can continue */
        if ((0 != ss_shm_ptr->ss_info.ss_pid) && !is_proc_alive(ss_shm_ptr->ss_info.ss_pid, 0))
            ss_release(NULL);
        else
        {
            gtm_putmsg(VARLSTCNT(5) ERR_MAXSSREACHED, 3, MAX_SNAPSHOTS, REG_LEN_STR(reg));
            ss_release_lock(reg);
            UNFREEZE_REGION_IF_NEEDED(csd, reg);
            return FALSE;
        }
    }
    ss_shm_ptr->ss_info.ss_pid = process_id;
    cnl->num_snapshots_in_effect++;
    assert(ss_lock_held_by_us(reg));
    ss_release_lock(reg);

    /* For a database that is read-only for the current process, we had better have the region frozen */
    assert(!reg->read_only || csd->freeze);
    /* ============================ STEP 1 : Shadow file name construction ==============================
     *
     * --> The directory is taken from GTM_SNAPTMPDIR, if available, else GTM_BAK_TEMPDIR_LOG_NAME_UC, if available,
     *     else the current directory is used.
     * --> Use the template - gtm_snapshot_<region_name>_<pid_in_hex>_XXXXXX
     * --> The last six characters will be replaced by MKSTEMP below.
     * Note: MUPIP RUNDOWN relies on the above template to do the cleanup if a crash led to an improper shutdown
     *       of the snapshot process. So, if the above template changes, make sure MUPIP RUNDOWN is updated as well.
     * Note: most of the shadow file name construction and the creation of the shadow file has been borrowed from
     *       mupip_backup.c. This code should be modularized and used by both mupip_backup and this routine.
     */
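    /* For example (hypothetical region name and pid): a region named DEFAULT scanned by process id 0x4f2e would use
     * the template "gtm_snapshot_DEFAULT_4f2e_XXXXXX" inside the chosen temporary directory.
     */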

    /* set up a prefix for the temporary file. */
    tempnamprefix_len = 0;
    memset(tempnamprefix, 0, MAX_FN_LEN);
    memcpy(tempnamprefix, SNAPSHOT_TMP_PREFIX, STR_LIT_LEN(SNAPSHOT_TMP_PREFIX));
    tempnamprefix_len += STR_LIT_LEN(SNAPSHOT_TMP_PREFIX);
    memcpy(tempnamprefix + tempnamprefix_len, reg->rname, reg->rname_len);
    tempnamprefix_len += reg->rname_len;
    SNPRINTF(&tempnamprefix[tempnamprefix_len], MAX_FN_LEN, "_%x", process_id);

    tempdir_log.addr = GTM_SNAPTMPDIR;
    tempdir_log.len = STR_LIT_LEN(GTM_SNAPTMPDIR);
    tempfilename = tempdir_full.addr = tempdir_full_buffer;
    /* Check if the environment variable is defined or not.
     * Side-effect: tempdir_trans.addr = tempdir_trans_buffer irrespective of whether TRANS_LOG_NAME
     * succeeded or not.
     */
    status = TRANS_LOG_NAME(&tempdir_log, &tempdir_trans, tempdir_trans_buffer, SIZEOF(tempdir_trans_buffer),
                            do_sendmsg_on_log2long);
    if (SS_NORMAL == status && (NULL != tempdir_trans.addr) && (0 != tempdir_trans.len))
        *(tempdir_trans.addr + tempdir_trans.len) = 0;
    else
    {   /* Not found - try GTM_BAK_TEMPDIR_LOG_NAME_UC instead */
        tempdir_log.addr = GTM_BAK_TEMPDIR_LOG_NAME_UC;
        tempdir_log.len = STR_LIT_LEN(GTM_BAK_TEMPDIR_LOG_NAME_UC);
        status = TRANS_LOG_NAME(&tempdir_log, &tempdir_trans, tempdir_trans_buffer, SIZEOF(tempdir_trans_buffer),
                                do_sendmsg_on_log2long);
        if (SS_NORMAL == status && (NULL != tempdir_trans.addr) && (0 != tempdir_trans.len))
            *(tempdir_trans.addr + tempdir_trans.len) = 0;
        else
        {   /* Not found - use the current directory via a relative filespec */
            tempdir_trans_buffer[0] = '.';
            tempdir_trans_buffer[1] = '\0';
            tempdir_trans.len = 1;
        }
    }

    /* Verify if we can stat the temporary directory */
    if (FILE_STAT_ERROR == (fstat_res = gtm_file_stat(&tempdir_trans, NULL, &tempdir_full, FALSE, &fstat_status)))
    {
        gtm_putmsg(VARLSTCNT(5) ERR_SSTMPDIRSTAT, 2, tempdir_trans.len, tempdir_trans.addr, fstat_status);
        UNFREEZE_REGION_IF_NEEDED(csd, reg);
        return FALSE;
    }
    SNPRINTF(tempfilename + tempdir_full.len, GTM_PATH_MAX, "/%s_XXXXXX", tempnamprefix);
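    /* At this point tempfilename (aliased to tempdir_full_buffer) holds
     * "<tempdir>/gtm_snapshot_<region_name>_<pid_in_hex>_XXXXXX"; MKSTEMP below replaces the trailing XXXXXX
     * and creates the file.
     */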

    /* ========================== STEP 2 : Create the shadow file ======================== */
    /* get a unique temporary file name. The file gets created on success */
    DEFER_INTERRUPTS(INTRPT_IN_SS_INITIATE);    /* Defer MUPIP STOP till the file is created */
    MKSTEMP(tempfilename, tmpfd);
    STRCPY(lcl_ss_ctx->shadow_file, tempfilename);
    /* Shadow file created. Any error below before the next cur_state assignment and a later call to ss_release will have
     * to delete at least the shadow file apart from free'ing the malloc'ed snapshot context instance
     */
    lcl_ss_ctx->cur_state = AFTER_SHADOW_FIL_CREAT;
    if (FD_INVALID == tmpfd)
    {
        status = errno;
        gtm_putmsg(VARLSTCNT(5) ERR_SSTMPCREATE, 2, tempdir_trans.len, tempdir_trans.addr, status);
        UNFREEZE_REGION_IF_NEEDED(csd, reg);
        return FALSE;
    }
#   ifdef __MVS__
    if (-1 == gtm_zos_set_tag(tmpfd, TAG_BINARY, TAG_NOTTEXT, TAG_FORCE, &realfiletag))
        TAG_POLICY_GTM_PUTMSG(tempfilename, realfiletag, TAG_BINARY, errno);
#   endif
    /* The temporary (shadow) file was created above using "mkstemp", which on AIX opens the file without
     * large file support enabled. Work around that by closing the file descriptor returned and reopening
     * the file with the "open" system call (which gets correctly translated to "open64"). We need to do
     * this because the temporary file can get > 2GB. Since it is not clear whether mkstemp on other Unix platforms
     * will open the file with large file support, we use this solution for the other Unix flavours as well.
     */
    OPENFILE(tempfilename, O_RDWR, shdw_fd);
    if (FD_INVALID == shdw_fd)
    {
        status = errno;
        gtm_putmsg(VARLSTCNT(7) ERR_SSFILOPERR, 4, LEN_AND_LIT("open"), tempdir_full.len, tempdir_full.addr, status);
        UNFREEZE_REGION_IF_NEEDED(csd, reg);
        return FALSE;
    }
#   ifdef __MVS__
    if (-1 == gtm_zos_set_tag(shdw_fd, TAG_BINARY, TAG_NOTTEXT, TAG_FORCE, &realfiletag))
        TAG_POLICY_GTM_PUTMSG(tempfilename, realfiletag, TAG_BINARY, errno);
#   endif
    /* Now that the temporary file has been opened successfully, close the fd returned by mkstemp */
    F_CLOSE(tmpfd, fclose_res);
    lcl_ss_ctx->shdw_fd = shdw_fd;
    ENABLE_INTERRUPTS(INTRPT_IN_SS_INITIATE);
    tempdir_full.len = STRLEN(tempdir_full.addr);       /* update the length */
    assert(GTM_PATH_MAX >= tempdir_full.len);

    /* give the temporary file the same group and permissions as other shared resources - like journal files */
    FSTAT_FILE(((unix_db_info *)(reg->dyn.addr->file_cntl->file_info))->fd, &stat_buf, fstat_res);
    assert(-1 != fstat_res);
    if (-1 != fstat_res)
        if (gtm_set_group_and_perm(&stat_buf, &group_id, &perm, PERM_FILE, &pdd) < 0)
        {
            send_msg(VARLSTCNT(6+PERMGENDIAG_ARG_COUNT)
                     ERR_PERMGENFAIL, 4, RTS_ERROR_STRING("snapshot file"),
                     RTS_ERROR_STRING(((unix_db_info *)(reg->dyn.addr->file_cntl->file_info))->fn),
                     PERMGENDIAG_ARGS(pdd));
            gtm_putmsg(VARLSTCNT(6+PERMGENDIAG_ARG_COUNT)
                       ERR_PERMGENFAIL, 4, RTS_ERROR_STRING("snapshot file"),
                       RTS_ERROR_STRING(((unix_db_info *)(reg->dyn.addr->file_cntl->file_info))->fn),
                       PERMGENDIAG_ARGS(pdd));
            UNFREEZE_REGION_IF_NEEDED(csd, reg);
            return FALSE;
        }

    if ((-1 == fstat_res) || (-1 == FCHMOD(shdw_fd, perm))
        || ((-1 != group_id) && (-1 == fchown(shdw_fd, -1, group_id))))
    {
        status = errno;
        gtm_putmsg(VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_LIT("fchmod/fchown"), CALLFROM, status);
        UNFREEZE_REGION_IF_NEEDED(csd, reg);
        return FALSE;
    }
    if (debug_mupip)
    {
        util_out_print("!/MUPIP INFO: Successfully created the shadow file: !AD", TRUE,
                       tempdir_full.len, tempdir_full.addr);
    }

    /* STEP 3: Wait for kills-in-progress and initialize the snapshot shadow file
     *
     * Snapshot Shadow File Layout -
     *
     * -----------------------
     * | RESERVE_SHM_SPACE   | <-- Variable sized space reserved in case we would like the snapshot to be preserved
     * -----------------------
     * | DB_FILE_HEADER      | <-- Copy of the database file header taken at snapshot time (in crit) (SGMNT_HDR_LEN)
     * -----------------------
     * | DB_MASTER_MAP       | <-- MASTER_MAP_SIZE_MAX
     * ----------------------- <-- offset of block zero
     * | BLOCK_ZERO_BIMG     | <-- csd->blk_size
     * -----------------------
     * | BLOCK_ONE_BIMG      | <-- csd->blk_size
     * -----------------------
     *            .
     *            .
     * -----------------------
     * | EOF_MARKER          | <-- 512 bytes (DISK_BLOCK_SIZE)
     * -----------------------
     */
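    /* In terms of byte offsets used later in this routine: the database file header is written at offset ss_shmsize
     * (the reserved space), the master map immediately follows it at ss_shmsize + SGMNT_HDR_LEN, before-images are
     * addressed through shadow_vbn (ss_shmsize / DISK_BLOCK_SIZE + csd->start_vbn), and the EOF marker is written at
     * ss_shmsize plus the native database file size in bytes (see the LSEEKWRITE calls below).
     */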
    /* Below, the master map and the file header will be copied, hence check that we have the right
     * pointers set up by the caller
     */
    assert(NULL != util_ss_ptr);
    assert(NULL != util_ss_ptr->master_map);
    assert(NULL != util_ss_ptr->header);
    grab_crit(reg);
    INCR_INHIBIT_KILLS(cnl);
    kip_pids_arr_ptr = cnl->kip_pid_array;
    prev_ss_shmsize = 0;
    for (retries = 0; MAX_TRY_FOR_TOT_BLKS >= retries; ++retries)
    {
        final_retry = (MAX_TRY_FOR_TOT_BLKS == retries);
        /* On all but the final (4th) retry, release crit */
        if (!final_retry)
            rel_crit(reg);
        /* Outside crit (on the final retry, we will be holding crit while calculating the below):
         * 1. total blocks
         * 2. native size (total number of blocks in terms of DISK_BLOCK_SIZE)
         * 3. write an EOF marker at an offset equal to the current database file size
         * 4. create a shared memory segment to be used as a bitmap recording whether a given database block
         *    was before-imaged or not.
         */
        DEFER_INTERRUPTS(INTRPT_IN_SS_INITIATE);        /* Defer MUPIP STOP till the shared memory is created */
        tot_blks = csd->trans_hist.total_blks;
        prev_ss_shmsize = ss_shmsize;
        ss_shmsize = (int)(ROUND_UP((SNAPSHOT_HDR_SIZE + TOT_BYTES_REQUIRED(tot_blks)), OS_PAGE_SIZE));
        assert((0 == retries) || (0 != prev_ss_shmsize));
        if (0 == retries)
        {   /* If this is the first try, then create shared memory anyway. */
            SS_INIT_SHM(ss_shmsize, ss_shmaddr, ss_shmid, FALSE, status);
        } else if ((prev_ss_shmsize - SNAPSHOT_HDR_SIZE) < TOT_BYTES_REQUIRED(tot_blks))
        {   /* Shared memory created with SS_INIT_SHM (in prior retries) is aligned at the OS page size boundary.
             * So, in case of an extension (in a subsequent retry) we check if the newly added blocks would fit in
             * the existing shared memory (possible if the extra bytes allocated for OS page size alignment are
             * enough to hold the newly added database blocks). If not, remove the original shared memory and
             * create a new one.
             */
            if (debug_mupip)
            {
                util_out_print("!/MUPIP INFO: Existing shared memory !UL of size !UL not enough for total blocks !UL."
                               " Creating new shared memory", TRUE, ss_shmid, prev_ss_shmsize, tot_blks);
            }
            SS_INIT_SHM(ss_shmsize, ss_shmaddr, ss_shmid, TRUE, status);
        }
        if (status)
        {   /* error while creating shared memory */
            GET_CRIT_AND_DECR_INHIBIT_KILLS(reg, cnl);
            UNFREEZE_REGION_IF_NEEDED(csd, reg);
            return FALSE;
        }
        /* At this point, we are done with allocating shared memory (aligned with OS_PAGE_SIZE) big enough to hold
         * the total number of blocks. Any error below before the next cur_state assignment and a later call
         * to ss_release will have to detach/rmid the shared memory identifier apart from deleting the shadow
         * file and free'ing the malloc'ed snapshot context instance.
         */
        lcl_ss_ctx->attach_shmid = ss_shmid;
        lcl_ss_ctx->start_shmaddr = ss_shmaddr;
        lcl_ss_ctx->cur_state = AFTER_SHM_CREAT;
        ENABLE_INTERRUPTS(INTRPT_IN_SS_INITIATE);
        if (debug_mupip)
        {
            util_out_print("!/MUPIP INFO: Shared memory created. SHMID = !UL", TRUE, ss_shmid);
        }
        /* Write the EOF block in the snapshot file */
        native_size = gds_file_size(reg->dyn.addr->file_cntl);
        db_file_size = (gtm_int64_t)(native_size) * DISK_BLOCK_SIZE;    /* Size of the database file in bytes */
        LSEEKWRITE(shdw_fd, ((off_t)db_file_size + ss_shmsize), eof_marker, EOF_MARKER_SIZE, status);
        if (0 != status)
        {   /* error while writing the EOF record to the snapshot file */
            GET_CRIT_AND_DECR_INHIBIT_KILLS(reg, cnl);
            ISSUE_WRITE_ERROR_AND_EXIT(reg, status, csa, tempfilename);
        }
        /* Wait for KIP to reset */
        wait_for_zero_kip = csd->kill_in_prog;
        /* Wait for existing kills-in-progress to be reset. Since a database file extension is possible while
         * we don't hold crit during the wcs_sleep below, we will retry if the total blocks have changed since
         * we last checked them. However, in the final retry, the shared memory and shadow file initialization
         * will happen in crit. Also, in the final retry, we won't be waiting for the kills in progress as
         * we would be holding crit and cannot afford to wcs_sleep. So, in such a case, MUKILLIP should be
         * issued by the caller of ss_initiate if csd->kill_in_prog is set to TRUE. But, such a case should
         * be rare.
         */
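        /* The wait below is bounded: the inner while loop sleeps via wcs_sleep(crit_counter) for at most MAX_CRIT_TRY
         * iterations, which the comments further down treat as roughly a one-minute total wait per region.
         */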
        for (crit_counter = 1; wait_for_zero_kip && !final_retry; )
        {
            /* Release crit before going into the wait loop */
            rel_crit(reg);
            if (debug_mupip)
            {
                GET_CUR_TIME;
                util_out_print("!/MUPIP INFO: !AD : Start kill-in-prog wait for database !AD", TRUE,
                               CTIME_BEFORE_NL, time_ptr, DB_LEN_STR(reg));
            }
            while (csd->kill_in_prog && (MAX_CRIT_TRY > crit_counter++))
            {
                GET_C_STACK_FOR_KIP(kip_pids_arr_ptr, crit_counter, MAX_CRIT_TRY, 1, MAX_KIP_PID_SLOTS);
                wcs_sleep(crit_counter);
            }
            if (debug_mupip)
            {
                GET_CUR_TIME;
                util_out_print("!/MUPIP INFO: !AD : Done with kill-in-prog wait on !AD", TRUE,
                               CTIME_BEFORE_NL, time_ptr, DB_LEN_STR(reg));
            }
            wait_for_zero_kip = (MAX_CRIT_TRY > crit_counter);  /* if TRUE, we can wait some more time on this region */
            grab_crit(reg);
            if (csd->kill_in_prog)
            {
                /* It is possible for this to happen in case a concurrent GT.M process is in its 4th retry.
                 * In that case, it will not honor the inhibit_kills flag since it holds crit and therefore
                 * could have set kill-in-prog to a non-zero value while we were outside of crit.
                 * Since we have waited for 1 minute already, proceed with the snapshot. The reasoning is that
                 * once the GT.M process that is in the final retry finishes off the second part of the M-kill,
                 * it will not start a new transaction in the first try (which is outside of crit), so it will honor
                 * the inhibit-kills flag and therefore not increment the kill_in_prog counter any more until
                 * this crit is released. So we could be waiting for at most 1 kip increment per concurrent process
                 * that is updating the database. We expect these kills to be complete within 1 minute.
                 */
                if (!wait_for_zero_kip)
                    break;
            } else
                break;
        }
        grab_crit(reg);
        /* There are two reasons why we might go for another iteration of this loop:
         * (a) After we have created the shared memory and before we grab crit, another process can add new blocks to
         *     the database, in which case csd->trans_hist.total_blks is no longer the value we noted down above.
         *     Check if this is the case and, if so, retry to obtain a consistent copy of the total number of blocks.
         * (b) Similarly, csd->kill_in_prog was FALSE before grab_crit, but non-zero after grab_crit due to concurrency.
         *     Check if this is the case and, if so, retry to wait for KIP to reset.
         */
        if ((tot_blks == csd->trans_hist.total_blks) && !csd->kill_in_prog)
        {   /* We have a consistent copy of the total blocks and csd->kill_in_prog is FALSE inside crit. No need to
             * retry.
             */
            assert(native_size == ((tot_blks * (csd->blk_size / DISK_BLOCK_SIZE)) + (csd->start_vbn)));
            break;
        }
    }
    /* At this point, we are MOST likely guaranteed that kill-in-prog is set to zero for this region and CERTAINLY
     * guaranteed that no more kills will be started for this region. We are still holding crit, so any error
     * that occurs henceforth should do a DECR_INHIBIT_KILLS and rel_crit before exiting to let the other processes
     * proceed gracefully
     */
    assert(csa->now_crit);
    assert(native_size == ((tot_blks * (csd->blk_size / DISK_BLOCK_SIZE)) + (csd->start_vbn)));
    assert(NULL != ss_shmaddr);
    assert(0 == ((long)ss_shmaddr % OS_PAGE_SIZE));
    assert(0 == ss_shmsize % OS_PAGE_SIZE);
    assert(INVALID_SHMID != ss_shmid);
    assert(0 == (ss_shmsize % DISK_BLOCK_SIZE));
    ss_shm_vbn = ss_shmsize / DISK_BLOCK_SIZE;  /* # of DISK_BLOCK_SIZEs the shared memory spans */
    assert(ss_shmid == lcl_ss_ctx->attach_shmid);
    assert(AFTER_SHM_CREAT == lcl_ss_ctx->cur_state);
    if (debug_mupip)
    {
        util_out_print("!/MUPIP INFO: Successfully created shared memory. SHMID = !UL", TRUE, ss_shmid);
    }
    /* It is possible that we saw csd->fully_upgraded as TRUE before invoking ss_initiate, but a MUPIP SET -VER=V4
     * was done after ss_initiate was called. Handle appropriately.
     */
    if (!csd->fully_upgraded)
    {
        /* If -ONLINE was specified explicitly, then it is an ERROR. Issue the error and return FALSE. */
        if (online_specified)
        {
            gtm_putmsg(VARLSTCNT(4) ERR_SSV4NOALLOW, 2, DB_LEN_STR(reg));
            util_out_print(NO_ONLINE_ERR_MSG, TRUE);
            mu_int_errknt++;
            GET_CRIT_AND_DECR_INHIBIT_KILLS(reg, cnl);
            UNFREEZE_REGION_IF_NEEDED(csd, reg);
            return FALSE;
        } else
        {   /* If -ONLINE was assumed implicitly (the default for INTEG -REG), then set ointeg_this_reg to FALSE,
             * relinquish the resources and continue as if it were -NOONLINE
             */
            ointeg_this_reg = FALSE;
            assert(NULL != *ss_ctx);
            ss_release(ss_ctx);
            GET_CRIT_AND_DECR_INHIBIT_KILLS(reg, cnl);
            UNFREEZE_REGION_IF_NEEDED(csd, reg);
            return TRUE;
        }
    }
    /* ===================== STEP 5: Flush the pending updates in the global cache =================== */

    /* For a database that is read-only for the current process, we cannot do wcs_flu. We would have waited for the
     * active queues to complete in mu_int_reg after doing a freeze. Now that we have crit, unfreeze the region
     */
    if (reg->read_only)
    {
        region_freeze(reg, FALSE, FALSE, FALSE);
    }
    else if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_MSYNC_DB))   /* wcs_flu guarantees that all the pending
                                                                                   * phase 2 commits are done before returning */
    {
        assert(process_id != csd->freeze);      /* We would not have frozen the region if the database is read-write */
        gtm_putmsg(VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_STR(calling_utility), DB_LEN_STR(reg));
        GET_CRIT_AND_DECR_INHIBIT_KILLS(reg, cnl);
        return FALSE;
    }
    assert(0 < cnl->num_snapshots_in_effect || (!SNAPSHOTS_IN_PROG(cnl)));
    assert(csa->now_crit);
    /* ========== STEP 6: Copy the file header, master map and the native file size into a private structure =========== */

    memcpy(util_ss_ptr->header, csd, SGMNT_HDR_LEN);
    memcpy(util_ss_ptr->master_map, MM_ADDR(csd), MASTER_MAP_SIZE_MAX);
    util_ss_ptr->native_size = native_size;

    /* We are about to copy the process private variables to shared memory. Although we have done grab_crit above, we take
     * the snapshot crit lock to ensure that no other process attempts snapshot cleanup.
     */
    DEFER_INTERRUPTS(INTRPT_IN_SS_INITIATE);    /* Defer MUPIP STOP until we complete copying to shared memory */
    /* == STEP 7: Populate snapshot context, database shared memory snapshot structure and snapshot file header structure == */
    ss_shm_ptr = (shm_snapshot_ptr_t)SS_GETSTARTPTR(csa);
    DBG_ENSURE_PTR_WITHIN_SS_BOUNDS(csa, (sm_uc_ptr_t)ss_shm_ptr);
    /* Fill in the information for this snapshot in database shared memory */
    ss_shm_ptr->ss_info.snapshot_tn = csd->trans_hist.curr_tn;
    ss_shm_ptr->ss_info.free_blks = csd->trans_hist.free_blocks;
    ss_shm_ptr->ss_info.total_blks = csd->trans_hist.total_blks;
    ss_shm_ptr->ss_info.db_blk_size = csd->blk_size;
    ss_shm_ptr->failure_errno = 0;
    ss_shm_ptr->failed_pid = 0;
    STRCPY(ss_shm_ptr->ss_info.shadow_file, tempfilename);
    ss_shm_ptr->ss_info.shadow_vbn = ss_shm_vbn + csd->start_vbn;
    ss_shm_ptr->ss_info.ss_shmid = cnl->ss_shmid = ss_shmid;
    ss_shm_ptr->ss_info.ss_shmsize = ss_shmsize;
    ss_shm_ptr->in_use = 1;
    /* Fill in the process specific snapshot context information */
    lcl_ss_ctx->ss_shm_ptr = (shm_snapshot_ptr_t)(ss_shm_ptr);
    lcl_ss_ctx->start_shmaddr = (sm_uc_ptr_t)ss_shmaddr;
    lcl_ss_ctx->bitmap_addr = ((sm_uc_ptr_t)ss_shmaddr + SNAPSHOT_HDR_SIZE);
    lcl_ss_ctx->shadow_vbn = ss_shm_ptr->ss_info.shadow_vbn;
    lcl_ss_ctx->total_blks = ss_shm_ptr->ss_info.total_blks;
    /* Fill in the snapshot file header information. This data will be persisted if the -PRESERVE option is given */
    ss_filhdr_ptr = (snapshot_filhdr_ptr_t)(ss_shmaddr);
    FILL_SS_FILHDR_INFO(ss_shm_ptr, ss_filhdr_ptr)
    ss_shm_ptr->preserve_snapshot = preserve_snapshot;
    SET_LATCH_GLOBAL(&ss_shm_ptr->bitmap_latch, LOCK_AVAILABLE);
    DECR_INHIBIT_KILLS(cnl);
    /* Announce to GT.M that it is now OK to write the before images */
    if (!SNAPSHOTS_IN_PROG(cnl))
        cnl->snapshot_in_prog = TRUE;
    csa->snapshot_in_prog = TRUE;
    /* Having a write memory barrier here ensures that whenever GT.M reads a newer value of cnl->ss_shmcycle, it is
     * guaranteed that the remaining fields it reads from the shared memory will be the latest ones.
     */
    SHM_WRITE_MEMORY_BARRIER;
    cnl->ss_shmcycle++; /* indicate that the ss_shmid field of cnl is being reused */
    lcl_ss_ctx->ss_shmcycle = cnl->ss_shmcycle;
    rel_crit(reg);
    if ((csa->onln_rlbk_cycle != csa->nl->onln_rlbk_cycle) || csa->nl->onln_rlbk_pid)
    {   /* A concurrent online rollback happened since we did the gvcst_init. The INTEG is not reliable.
         * Cleanup and exit
         */
        gtm_putmsg(VARLSTCNT(1) ERR_DBROLLEDBACK);
        UNFREEZE_REGION_IF_NEEDED(csd, reg);
        return FALSE;
    }
    /* ============= STEP 8: Write the database file header and the master map =============
     * Write the database file header at an offset equal to the snapshot shared memory size (ss_shmsize).
     * This is because, if we want to preserve the snapshot, we would want to write the
     * shared memory information at the beginning of the file.
     */
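    /* dsk_addr (initialized to 0 above) tracks the running write offset: after the header write below it becomes
     * ss_shmsize + SGMNT_HDR_LEN, which is where the master map is then written.
     */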
    LSEEKWRITE(shdw_fd, (off_t)ss_shmsize, (sm_uc_ptr_t)util_ss_ptr->header, SGMNT_HDR_LEN, pwrite_res);
    if (0 != pwrite_res)
        ISSUE_WRITE_ERROR_AND_EXIT(reg, pwrite_res, csa, tempfilename);
    dsk_addr += ((int)ss_shmsize + (int)SGMNT_HDR_LEN);
    /* write the database master map to the shadow file */
    LSEEKWRITE(shdw_fd, dsk_addr, (sm_uc_ptr_t)util_ss_ptr->master_map, MASTER_MAP_SIZE_MAX, pwrite_res);
    if (0 != pwrite_res)
        ISSUE_WRITE_ERROR_AND_EXIT(reg, pwrite_res, csa, tempfilename);
    dsk_addr += MASTER_MAP_SIZE_MAX;
    assert(0 == (dsk_addr % OS_PAGE_SIZE));
    lcl_ss_ctx->cur_state = SNAPSHOT_INIT_DONE; /* Same as AFTER_SHM_CREAT but set for clarity of the snapshot state */
    call_on_signal = NULL;      /* Any further cleanup on signals will be taken care of by gds_rundown */
    ENABLE_INTERRUPTS(INTRPT_IN_SS_INITIATE);
    assert(!ss_lock_held_by_us(reg));   /* We should never leave this function with the snapshot latch still held */
    return TRUE;
}