1149 lines
48 KiB
C
1149 lines
48 KiB
C
/****************************************************************
|
|
* *
|
|
* Copyright 2001, 2013 Fidelity Information Services, Inc *
|
|
* *
|
|
* This source code contains the intellectual property *
|
|
* of its copyright holder(s), and is made available *
|
|
* under a license. If you do not know the terms of *
|
|
* the license, please stop and do not read further. *
|
|
* *
|
|
****************************************************************/
|
|
|
|
#include "mdef.h"
|
|
#include "min_max.h"
|
|
|
|
#include "gtm_time.h"
|
|
#include "gtm_string.h"
|
|
#include "gtm_stdio.h"
|
|
#ifdef VMS
|
|
#include <descrip.h>
|
|
#endif
|
|
#include "gdsroot.h"
|
|
#include "gdsbt.h"
|
|
#include "gtm_facility.h"
|
|
#include "fileinfo.h"
|
|
#include "gdsfhead.h"
|
|
#include "filestruct.h"
|
|
#include "jnl.h"
|
|
#include "copy.h"
|
|
#include "util.h"
|
|
#include "buddy_list.h"
|
|
#include "hashtab_int4.h" /* needed for muprec.h */
|
|
#include "hashtab_int8.h" /* needed for muprec.h */
|
|
#include "hashtab_mname.h" /* needed for muprec.h */
|
|
#include "muprec.h"
|
|
#include "mur_read_file.h"
|
|
#include "iosp.h"
|
|
#include "gtmmsg.h" /* for gtm_putmsg() prototype */
|
|
#include "dbfilop.h" /* for dbfilop() prototype */
|
|
#include "cli.h"
|
|
#include "mupip_exit.h"
|
|
#include "mur_validate_checksum.h"
|
|
#include "gdsblk.h"
|
|
#ifdef GTM_CRYPT
|
|
#include "gtmcrypt.h"
|
|
#endif
|
|
#include "wbox_test_init.h"
|
|
#include "timers.h"
|
|
#ifdef GTM_TRUNCATE
|
|
#include "gdsfilext_nojnl.h"
|
|
#endif
|
|
#include "have_crit.h"
|
|
|
|
GBLREF mur_gbls_t murgbl;
|
|
GBLREF reg_ctl_list *mur_ctl;
|
|
GBLREF mur_opt_struct mur_options;
|
|
GBLREF jnl_gbls_t jgbl;
|
|
|
|
#ifdef DEBUG
|
|
static boolean_t iterationcnt;
|
|
static jnl_tm_t prev_max_lvrec_time, prev_min_bov_time;
|
|
#endif
|
|
|
|
error_def(ERR_CHNGTPRSLVTM);
|
|
error_def(ERR_DUPTOKEN);
|
|
error_def(ERR_EPOCHTNHI);
|
|
error_def(ERR_JNLBADRECFMT);
|
|
error_def(ERR_JNLREADBOF);
|
|
error_def(ERR_MUINFOSTR);
|
|
error_def(ERR_MUINFOUINT4);
|
|
error_def(ERR_MUINFOUINT8);
|
|
error_def(ERR_MUJNLSTAT);
|
|
error_def(ERR_NOPREVLINK);
|
|
error_def(ERR_RESOLVESEQNO);
|
|
error_def(ERR_RESOLVESEQSTRM);
|
|
error_def(ERR_TEXT);
|
|
|
|
#define MAX_BACK_PROCESS_REDO_CNT 8
|
|
|
|
#define SAVE_PRE_RESOLVE_SEQNO(rectype, rec_time, rec_token_seq, pre_resolve_seqno) \
|
|
{ \
|
|
if ((JRT_EPOCH == rectype) || (JRT_EOF == rectype)) \
|
|
{ \
|
|
if (rec_token_seq > *pre_resolve_seqno) \
|
|
*pre_resolve_seqno = rec_token_seq; \
|
|
} else \
|
|
{ \
|
|
if ((rec_token_seq + 1) > *pre_resolve_seqno) \
|
|
*pre_resolve_seqno = rec_token_seq + 1; \
|
|
} \
|
|
if (mur_options.verbose) \
|
|
{ \
|
|
gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_MUINFOUINT8, 4, LEN_AND_LIT("Pre-resolve seqno"), \
|
|
pre_resolve_seqno, pre_resolve_seqno); \
|
|
} \
|
|
}
|
|
|
|
#define MUR_BACK_PROCESS_ERROR(JCTL, JJCTL, MESSAGE_STRING) \
|
|
{ \
|
|
if (JCTL->after_end_of_data) \
|
|
{ \
|
|
*JJCTL = JCTL; \
|
|
return ERR_JNLBADRECFMT; \
|
|
} \
|
|
gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT(MESSAGE_STRING)); \
|
|
if (!mur_report_error(JCTL, MUR_JNLBADRECFMT)) \
|
|
{ \
|
|
*JJCTL = JCTL; \
|
|
return ERR_JNLBADRECFMT; \
|
|
} else \
|
|
continue; \
|
|
}
|
|
|
|
#define MUR_BACK_PROCESS_ERROR_STR(JCTL, JJCTL, MESSAGE_STRING) \
|
|
{ \
|
|
if (JCTL->after_end_of_data) \
|
|
{ \
|
|
*JJCTL = JCTL; \
|
|
return ERR_JNLBADRECFMT; \
|
|
} \
|
|
gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_STR(MESSAGE_STRING)); \
|
|
if (!mur_report_error(JCTL, MUR_JNLBADRECFMT)) \
|
|
{ \
|
|
*JJCTL = JCTL; \
|
|
return ERR_JNLBADRECFMT; \
|
|
} else \
|
|
continue; \
|
|
}
|
|
|
|
#ifdef VMS
|
|
#define VMS_MUR_BACK_PROCESS_GET_IMAGE_COUNT(JCTL, JNLREC, JJCTL, REC_IMAGE_COUNT, STATUS) \
|
|
{ \
|
|
MUR_GET_IMAGE_COUNT(JCTL, JNLREC, REC_IMAGE_COUNT, STATUS); \
|
|
if (SS_NORMAL != STATUS) \
|
|
{ /* We saw a corrupt journal record. Possible only if journal file had a crash \
|
|
* and have not yet reached the last epoch in backward processing and the \
|
|
* pini_addr should also point to an offset that is after the last epoch. \
|
|
*/ \
|
|
assert(JCTL->jfh->crash && (JCTL->rec_offset >= JNLREC->prefix.pini_addr) \
|
|
&& (JNLREC->prefix.pini_addr >= JCTL->jfh->end_of_data)); \
|
|
/* Register the offset of the corrupt record to be the PINI record (not the \
|
|
* current journal record), this way the "mur_back_processing" redo logic \
|
|
* inside "mur_back_process" restarts mur_fread_eof_crash search from the \
|
|
* lower offset and avoids lots of mur_back_processing redos (GTM-7393). \
|
|
*/ \
|
|
JCTL->rec_offset = JNLREC->prefix.pini_addr; \
|
|
MUR_BACK_PROCESS_ERROR(JCTL, JJCTL, "pini_addr is bad"); \
|
|
} \
|
|
}
|
|
#else
|
|
#define VMS_MUR_BACK_PROCESS_GET_IMAGE_COUNT(JCTL, JNLREC, JJCTL, REC_IMAGE_COUNT, STATUS)
|
|
#endif
|
|
|
|
#define MUR_TCOM_TOKEN_PROCESSING(jctl, jjctl, token, rec_image_count, rec_time, rec_fence, regno, reg_total, jnlrec) \
|
|
{ \
|
|
GBLREF mur_opt_struct mur_options; \
|
|
GBLREF mur_gbls_t murgbl; \
|
|
\
|
|
multi_struct *multi; \
|
|
\
|
|
if (NULL != (multi = MUR_TOKEN_LOOKUP(token, rec_image_count, rec_time, rec_fence))) \
|
|
{ \
|
|
if ((regno == multi->regnum) || (multi->fence != rec_fence) || (0 == multi->partner)) \
|
|
{ \
|
|
assert(0 != multi->partner); \
|
|
assert(!mur_options.rollback); /* jnl_seqno cannot be duplicate */ \
|
|
if (!mur_report_error(jctl, MUR_DUPTOKEN)) \
|
|
{ \
|
|
*jjctl = jctl; \
|
|
return ERR_DUPTOKEN; \
|
|
} \
|
|
SET_THIS_TN_AS_BROKEN(multi, reg_total); /* This is broken */ \
|
|
if (rec_time < multi->time) \
|
|
multi->time = rec_time; \
|
|
} else \
|
|
{ \
|
|
assert(&jnlrec->jrec_tcom.num_participants == &jnlrec->jrec_ztcom.participants); \
|
|
/* We expect each TCOM record to have the same # of participants. Assert that. There is one exception \
|
|
* though in that if the multi structure got created in the hash table as part of a broken transaction \
|
|
* (e.g. a TSET or USET record was seen in backward processing without having seen a TCOM record first) \
|
|
* we would have set the participants count to one more than the total # of regions participating in \
|
|
* the recovery thereby ensuring it gets treated as a broken transaction. \
|
|
*/ \
|
|
DEBUG_ONLY( \
|
|
if (jnlrec->jrec_tcom.num_participants != multi->tot_partner) \
|
|
{ \
|
|
assert(multi->this_is_broken); \
|
|
assert(multi->tot_partner >= (jnlrec->jrec_tcom.num_participants + 1)); \
|
|
} \
|
|
) \
|
|
assert(0 < multi->partner); \
|
|
multi->partner--; \
|
|
assert((TPFENCE != rec_fence) || rec_time == multi->time); \
|
|
assert((ZTPFENCE != rec_fence) || rec_time >= multi->time); \
|
|
if (0 == multi->partner) \
|
|
murgbl.broken_cnt--; /* It is resolved */ \
|
|
multi->regnum = regno; \
|
|
} \
|
|
} else \
|
|
{ \
|
|
assert(&jnlrec->jrec_tcom.num_participants == &jnlrec->jrec_ztcom.participants); \
|
|
MUR_TOKEN_ADD(multi, token, rec_image_count, rec_time, \
|
|
jnlrec->jrec_tcom.num_participants, rec_fence, regno); \
|
|
} \
|
|
}
|
|
|
|
#define TRANS_NUM_CONT_CHK_FAILED "Transaction number continuity check failed: [0x%08X] vs [0x%08X]"
|
|
#define SEQ_NUM_CONT_CHK_FAILED "Sequence number continuity check failed: [0x%08X] vs [0x%08X]"
|
|
#define TRANS_OR_SEQ_NUM_CONT_CHK_FAILED_SZ (MAX(SIZEOF(TRANS_NUM_CONT_CHK_FAILED), SIZEOF(SEQ_NUM_CONT_CHK_FAILED)) + 2 * 20)
|
|
|
|
STATICFNDCL void save_turn_around_point(reg_ctl_list *rctl, jnl_ctl_list *jctl, boolean_t apply_pblk);
|
|
|
|
STATICFNDEF void save_turn_around_point(reg_ctl_list *rctl, jnl_ctl_list *jctl, boolean_t apply_pblk)
|
|
{
|
|
jnl_record *jnlrec;
|
|
DEBUG_ONLY(jnl_ctl_list *tmpjctl;)
|
|
|
|
assert(!mur_options.forward);
|
|
assert(jctl->reg_ctl == rctl);
|
|
jnlrec = rctl->mur_desc->jnlrec;
|
|
assert(JRT_EPOCH == jnlrec->prefix.jrec_type);
|
|
assert(NULL == rctl->jctl_turn_around);
|
|
assert(0 == jctl->turn_around_offset);
|
|
rctl->jctl_turn_around = jctl;
|
|
jctl->turn_around_offset = jctl->rec_offset;
|
|
jctl->turn_around_time = jnlrec->prefix.time;
|
|
jctl->turn_around_seqno = jnlrec->jrec_epoch.jnl_seqno;
|
|
jctl->turn_around_tn = ((jrec_prefix *)jnlrec)->tn;
|
|
/* Note down the fully_upgraded field of the turn around EPOCH record. Later during forward recovery we will use this
|
|
* field to update rctl->csd->fully_upgraded
|
|
*/
|
|
jctl->turn_around_fullyupgraded = jnlrec->jrec_epoch.fully_upgraded;
|
|
DEBUG_ONLY(
|
|
/* before updating, check that previous pblk stop point is later than the final turn-around-point */
|
|
for (tmpjctl = rctl->jctl_apply_pblk; NULL != tmpjctl && tmpjctl != jctl; tmpjctl = tmpjctl->prev_gen)
|
|
;
|
|
assert((NULL == rctl->jctl_apply_pblk)
|
|
|| ((NULL != tmpjctl) && ((tmpjctl != rctl->jctl_apply_pblk)
|
|
|| (tmpjctl->apply_pblk_stop_offset >= jctl->turn_around_offset))));
|
|
)
|
|
if (apply_pblk)
|
|
{ /* we have applied more PBLKs than is already stored in rctl->jctl_apply_pblk. update that and related fields */
|
|
if (NULL != rctl->jctl_apply_pblk)
|
|
{ /* this was set to non-NULL by the previous iteration of mur_back_process. clear that. */
|
|
assert(rctl->jctl_apply_pblk->apply_pblk_stop_offset);
|
|
rctl->jctl_apply_pblk->apply_pblk_stop_offset = 0;
|
|
}
|
|
rctl->jctl_apply_pblk = jctl;
|
|
jctl->apply_pblk_stop_offset = jctl->turn_around_offset;
|
|
}
|
|
}
|
|
|
|
boolean_t mur_back_process(boolean_t apply_pblk, seq_num *pre_resolve_seqno)
|
|
{
|
|
jnl_ctl_list *jctl;
|
|
reg_ctl_list *rctl;
|
|
uint4 status;
|
|
int redo_cnt, regno, reg_total;
|
|
jnl_tm_t alt_tp_resolve_time;
|
|
jnl_record *jnlrec;
|
|
|
|
assert(!mur_options.forward || 0 == mur_options.since_time);
|
|
assert(!mur_options.forward || 0 == mur_options.lookback_time);
|
|
reg_total = murgbl.reg_total;
|
|
alt_tp_resolve_time = 0;
|
|
for ( redo_cnt = 0; ; redo_cnt++)
|
|
{
|
|
assert(MAX_BACK_PROCESS_REDO_CNT > redo_cnt);
|
|
/* ensure we are not doing too many redos of "mur_fread_eof_crash"/"mur_back_processing" */
|
|
*pre_resolve_seqno = 0;
|
|
DEBUG_ONLY(jctl = NULL;)
|
|
status = mur_back_processing(&jctl, apply_pblk, pre_resolve_seqno, alt_tp_resolve_time);
|
|
assert((SS_NORMAL == status) || (NULL != jctl)); /* should have been initialized by "mur_back_processing" */
|
|
if ((ERR_JNLBADRECFMT == status) && jctl->after_end_of_data)
|
|
{
|
|
assert(!jctl->next_gen);
|
|
PRINT_VERBOSE_TAIL_BAD(jctl);
|
|
if (SS_NORMAL != mur_fread_eof_crash(jctl, jctl->jfh->end_of_data, jctl->rec_offset))
|
|
return FALSE;
|
|
} else if (ERR_CHNGTPRSLVTM == status)
|
|
{
|
|
jnlrec = jctl->reg_ctl->mur_desc->jnlrec;
|
|
gtm_putmsg_csa(CSA_ARG(jctl->reg_ctl->csa) VARLSTCNT(6) ERR_CHNGTPRSLVTM, 4, jgbl.mur_tp_resolve_time,
|
|
jnlrec->prefix.time, jctl->jnl_fn_len, jctl->jnl_fn);
|
|
assert(jgbl.mur_tp_resolve_time > jnlrec->prefix.time);
|
|
alt_tp_resolve_time = jnlrec->prefix.time;
|
|
} else /* An error message must have already been printed if status != SS_NORMAL */
|
|
break;
|
|
JNL_PUT_MSG_PROGRESS("Restarting Backward processing");
|
|
REINITIALIZE_LIST(murgbl.multi_list);
|
|
reinitialize_hashtab_int8(&murgbl.token_table);
|
|
murgbl.broken_cnt = 0;
|
|
/* We must restart from latest generation. */
|
|
for (regno = 0; regno < reg_total; regno++)
|
|
{
|
|
rctl = &mur_ctl[regno];
|
|
jctl = rctl->jctl;
|
|
assert(jctl->reg_ctl == rctl);
|
|
for ( ; ;)
|
|
{
|
|
jctl->turn_around_offset = 0;
|
|
jctl->turn_around_time = 0;
|
|
jctl->turn_around_seqno = 0;
|
|
jctl->turn_around_tn = 0;
|
|
if (NULL == jctl->next_gen)
|
|
break;
|
|
jctl = jctl->next_gen;
|
|
assert(jctl->reg_ctl == rctl);
|
|
}
|
|
rctl->jctl = jctl; /* Restore latest generation before the failure */
|
|
rctl->jctl_turn_around = NULL;
|
|
}
|
|
} /* end infinite for loop */
|
|
return (SS_NORMAL == status);
|
|
}
|
|
|
|
uint4 mur_back_processing_one_region(mur_back_opt_t *mur_back_options)
|
|
{
|
|
boolean_t apply_pblk_this_region, first_epoch, reached_trnarnd, skip_rec, this_reg_resolved;
|
|
enum jnl_record_type rectype;
|
|
enum rec_fence_type rec_fence;
|
|
int idx, regno, reg_total, strm_idx;
|
|
int4 rec_image_count = 0; /* This is a dummy variable for UNIX */
|
|
jnl_ctl_list *jctl, **jjctl;
|
|
jnl_record *jnlrec;
|
|
jnl_string *keystr;
|
|
jnl_tm_t rec_time;
|
|
multi_struct *multi;
|
|
mur_read_desc_t *mur_desc;
|
|
reg_ctl_list *rctl;
|
|
seq_num *pre_resolve_seqno, rec_token_seq, save_resync_seqno, save_strm_seqno, strm_seqno;
|
|
token_num token;
|
|
trans_num rec_tn;
|
|
uint4 max_blk_size, max_rec_size;
|
|
uint4 status, val_len;
|
|
unsigned short max_key_size;
|
|
# ifdef GTM_CRYPT
|
|
int gtmcrypt_errno;
|
|
# endif
|
|
char s[TRANS_OR_SEQ_NUM_CONT_CHK_FAILED_SZ]; /* for appending sequence or transaction number */
|
|
# ifdef GTM_TRUNCATE
|
|
uint4 cur_total, old_total;
|
|
# endif
|
|
|
|
jctl = mur_back_options->jctl;
|
|
rctl = jctl->reg_ctl;
|
|
regno = rctl - &mur_ctl[0];
|
|
if (NULL != rctl->csa)
|
|
{
|
|
max_key_size = rctl->gd->max_key_size;
|
|
max_rec_size = rctl->gd->max_rec_size;
|
|
max_blk_size = rctl->csa->hdr->blk_size;
|
|
} else
|
|
{
|
|
max_key_size = MAX_KEY_SZ;
|
|
max_rec_size = MAX_LOGI_JNL_REC_SIZE;
|
|
max_blk_size = MAX_DB_BLK_SIZE;
|
|
}
|
|
mur_desc = rctl->mur_desc;
|
|
jnlrec = mur_desc->jnlrec;
|
|
rec_tn = jnlrec->prefix.tn;
|
|
rec_token_seq = mur_back_options->rec_token_seq;
|
|
first_epoch = mur_back_options->first_epoch;
|
|
status = mur_back_options->status;
|
|
jjctl = mur_back_options->jjctl;
|
|
this_reg_resolved = FALSE;
|
|
apply_pblk_this_region = mur_back_options->apply_pblk && !rctl->jfh_recov_interrupted;
|
|
pre_resolve_seqno = mur_back_options->pre_resolve_seqno;
|
|
reg_total = murgbl.reg_total;
|
|
for ( ; SS_NORMAL == status; status = mur_prev_rec(&jctl))
|
|
{
|
|
jctl->after_end_of_data = jctl->after_end_of_data && (jctl->rec_offset >= jctl->jfh->end_of_data);
|
|
assert(0 == jctl->turn_around_offset);
|
|
jnlrec = mur_desc->jnlrec;
|
|
rectype = (enum jnl_record_type)jnlrec->prefix.jrec_type;
|
|
/* Even if -verify is NOT specified, if the journal file had a crash, do verification until
|
|
* the first epoch is reached as the journal file could be corrupt anywhere until then
|
|
* (mur_fread_eof on the journal file at the start might not have caught it).
|
|
*/
|
|
if (mur_options.verify || (jctl->jfh->crash && jctl->after_end_of_data))
|
|
{
|
|
if (!mur_validate_checksum(jctl))
|
|
MUR_BACK_PROCESS_ERROR(jctl, jjctl, "Checksum validation failed");
|
|
if ((jnlrec->prefix.tn != rec_tn) && (jnlrec->prefix.tn != (rec_tn - 1)))
|
|
{
|
|
SNPRINTF(s, TRANS_OR_SEQ_NUM_CONT_CHK_FAILED_SZ, TRANS_NUM_CONT_CHK_FAILED,
|
|
jnlrec->prefix.tn, rec_tn);
|
|
rec_tn = jnlrec->prefix.tn;
|
|
MUR_BACK_PROCESS_ERROR_STR(jctl, jjctl, s);
|
|
}
|
|
if (mur_options.rollback && REC_HAS_TOKEN_SEQ(rectype) && (GET_JNL_SEQNO(jnlrec) > rec_token_seq))
|
|
{
|
|
SNPRINTF(s, TRANS_OR_SEQ_NUM_CONT_CHK_FAILED_SZ, SEQ_NUM_CONT_CHK_FAILED,
|
|
GET_JNL_SEQNO(jnlrec), rec_token_seq);
|
|
rec_token_seq = GET_JNL_SEQNO(jnlrec);
|
|
MUR_BACK_PROCESS_ERROR_STR(jctl, jjctl, s);
|
|
}
|
|
if (IS_SET_KILL_ZKILL_ZTRIG_ZTWORM(rectype))
|
|
{
|
|
keystr = (jnl_string *)&jnlrec->jrec_set_kill.mumps_node;
|
|
/* Assert that ZTWORMHOLE type record too has same layout as KILL/SET */
|
|
assert((sm_uc_ptr_t)keystr == (sm_uc_ptr_t)&jnlrec->jrec_ztworm.ztworm_str);
|
|
# ifdef GTM_CRYPT
|
|
if (jctl->jfh->is_encrypted)
|
|
{
|
|
MUR_DECRYPT_LOGICAL_RECS(keystr, jnlrec->prefix.forwptr, jctl->encr_key_handle,
|
|
gtmcrypt_errno);
|
|
if (0 != gtmcrypt_errno)
|
|
{
|
|
GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, gtm_putmsg, jctl->jnl_fn_len, jctl->jnl_fn);
|
|
*jjctl = jctl;
|
|
return gtmcrypt_errno;
|
|
}
|
|
}
|
|
# endif
|
|
if (IS_ZTWORM(rectype))
|
|
{ /* ZTWORMHOLE type */
|
|
# ifdef GTM_TRIGGER
|
|
if (MAX_ZTWORMHOLE_SIZE < keystr->length)
|
|
MUR_BACK_PROCESS_ERROR(jctl, jjctl, "ZTWORMHOLE size check failed");
|
|
# endif
|
|
} else
|
|
{ /* SET or KILL type */
|
|
if (keystr->length > max_key_size)
|
|
MUR_BACK_PROCESS_ERROR(jctl, jjctl, "Key size check failed");
|
|
if (0 != keystr->text[keystr->length - 1])
|
|
MUR_BACK_PROCESS_ERROR(jctl, jjctl, "Key null termination check failed");
|
|
if (IS_SET(rectype))
|
|
{
|
|
GET_MSTR_LEN(val_len, &keystr->text[keystr->length]);
|
|
if (val_len > max_rec_size)
|
|
MUR_BACK_PROCESS_ERROR(jctl, jjctl, "Record size check failed");
|
|
}
|
|
}
|
|
} else if (JRT_PBLK == rectype)
|
|
{
|
|
if (jnlrec->jrec_pblk.bsiz > max_blk_size)
|
|
MUR_BACK_PROCESS_ERROR(jctl, jjctl, "PBLK size check failed");
|
|
assert((FALSE == apply_pblk_this_region) || !mur_options.verify);
|
|
/* In case this journal file was crashed it is possible that we see a good PBLK at
|
|
* this point in time but could find bad journal data in the journal file at an
|
|
* EARLIER offset (further in backward processing). If the current recovery has been
|
|
* invoked with -noverify, we dont have a separate pblk application phase. One might
|
|
* wonder if in such a case, it is safe to apply good pblks at this point without
|
|
* knowing if bad pblks could be encountered later in backward processing. Turns out
|
|
* it is safe. If there were bad pblks BEFORE this good pblk, this means the good pblk
|
|
* landed in the journal file on disk because of pure chance (the IO system scheduled
|
|
* this write before the crash whereas the write of the previous bad pblks did not get
|
|
* a chance). As long as the bad pblks were not synced to the file, this means the db
|
|
* blocks corresponding to the good blks did NOT get modified at all (because the
|
|
* function wcs_wtstart ensures a db blk is written ONLY if all journal records until
|
|
* its corresponding pblk have been fsynced to the journal file). So basically the good
|
|
* pblk would be identical to the copy of the block in the unrecovered database at
|
|
* this point so it does not hurt to play it on top. For cases where there are no such
|
|
* bad data gaps in the journal file, playing the pblk when -noverify is specified is
|
|
* necessary as the pblk would be different from the copy of the blk in the database.
|
|
* So it is safe to do the play in both cases.
|
|
*/
|
|
if (apply_pblk_this_region)
|
|
{
|
|
assert(!mur_options.rollback_losttnonly);
|
|
mur_output_pblk(rctl);
|
|
}
|
|
continue;
|
|
}
|
|
} else if (JRT_PBLK == rectype && apply_pblk_this_region)
|
|
{
|
|
assert(!mur_options.rollback_losttnonly);
|
|
mur_output_pblk(rctl);
|
|
continue;
|
|
}
|
|
if (JRT_TRUNC == rectype)
|
|
{
|
|
NON_GTM_TRUNCATE_ONLY(assert(FALSE));
|
|
GTM_TRUNCATE_ONLY(
|
|
if (mur_options.forward)
|
|
continue;
|
|
old_total = jnlrec->jrec_trunc.orig_total_blks;
|
|
cur_total = rctl->csa->ti->total_blks;
|
|
assert(cur_total >= jnlrec->jrec_trunc.total_blks_after_trunc);
|
|
if (cur_total < old_total)
|
|
status = gdsfilext_nojnl(rctl->gd, old_total, cur_total);
|
|
if (0 != status)
|
|
MUR_BACK_PROCESS_ERROR(jctl, jjctl, "File extend for JRT_TRUNC record failed");
|
|
)
|
|
continue;
|
|
}
|
|
rec_tn = jnlrec->prefix.tn;
|
|
rec_time = jnlrec->prefix.time;
|
|
/* In journal records token_seq field is a union of jnl_seqno and token for TP, ZTP or unfenced records.
|
|
* For non-replication (that is, doing recover) token_seq.token field is used as token in hash table.
|
|
* For replication (that is, doing rollback) token_seq.jnl_seqno is used as token in hash table.
|
|
* Note : ZTP is not supported with replication.
|
|
*/
|
|
if (REC_HAS_TOKEN_SEQ(rectype))
|
|
{
|
|
assert(IS_SET_KILL_ZKILL_ZTRIG_ZTWORM(rectype) || IS_COM(rectype) || (JRT_EPOCH == (rectype))
|
|
|| (JRT_EOF == (rectype)) || (JRT_NULL == (rectype)));
|
|
assert(&jnlrec->jrec_set_kill.token_seq == (token_seq_t *)&jnlrec->jrec_epoch.jnl_seqno);
|
|
assert(&jnlrec->jrec_set_kill.token_seq == (token_seq_t *)&jnlrec->jrec_eof.jnl_seqno);
|
|
assert(&jnlrec->jrec_set_kill.token_seq == (token_seq_t *)&jnlrec->jrec_null.jnl_seqno);
|
|
assert(&jnlrec->jrec_set_kill.token_seq == (token_seq_t *)&jnlrec->jrec_tcom.token_seq);
|
|
assert(&jnlrec->jrec_set_kill.token_seq == (token_seq_t *)&jnlrec->jrec_ztcom.token);
|
|
rec_token_seq = GET_JNL_SEQNO(jnlrec);
|
|
if (mur_options.rollback)
|
|
{
|
|
# ifdef UNIX
|
|
/* In case of -rollback with -resync or -fetchresync on a supplementary instance
|
|
* with a <strm_seqno>, map back the input resync_strm_seqno to a resync_seqno
|
|
* as this is needed to set murgbl.losttn_seqno at the end of mur_back_process.
|
|
*/
|
|
if (murgbl.resync_strm_seqno_nonzero && IS_REPLICATED(rectype))
|
|
{
|
|
assert(IS_SET_KILL_ZKILL_ZTRIG_ZTWORM(rectype) || IS_COM(rectype)
|
|
|| (JRT_NULL == (rectype)));
|
|
assert(&jnlrec->jrec_set_kill.strm_seqno == &jnlrec->jrec_null.strm_seqno);
|
|
assert(&jnlrec->jrec_tcom.strm_seqno == &jnlrec->jrec_null.strm_seqno);
|
|
strm_seqno = GET_STRM_SEQNO(jnlrec);
|
|
strm_idx = GET_STRM_INDEX(strm_seqno);
|
|
strm_seqno = GET_STRM_SEQ60(strm_seqno);
|
|
if (murgbl.resync_strm_seqno[strm_idx]
|
|
&& (strm_seqno >= murgbl.resync_strm_seqno[strm_idx])
|
|
&& (murgbl.resync_seqno > rec_token_seq))
|
|
{
|
|
/* Assert that no adjustment of resync_seqno should happen in the second
|
|
* invocation of "mur_back_processing_one_region" for the same region.
|
|
*/
|
|
assert(mur_back_options->first_epoch);
|
|
murgbl.resync_seqno = rec_token_seq;
|
|
}
|
|
}
|
|
# endif
|
|
/* this_reg_resolved is set to true first time a sequence number is seen before the
|
|
* jgbl.mur_tp_resolve_time. This is necessary to find any gap in sequence numbers
|
|
* (C9D11-002465). Any gap will result in broken or lost transactions from the gap.
|
|
*/
|
|
if (!this_reg_resolved && (rec_time < jgbl.mur_tp_resolve_time))
|
|
{
|
|
SAVE_PRE_RESOLVE_SEQNO(rectype, rec_time, rec_token_seq, pre_resolve_seqno);
|
|
this_reg_resolved = TRUE;
|
|
}
|
|
}
|
|
} else
|
|
{
|
|
if (JRT_INCTN == rectype)
|
|
MUR_INCTN_BLKS_TO_UPGRD_ADJUST(rctl);
|
|
continue;
|
|
}
|
|
/* Resolve point is defined as the offset of the earliest journal record whose
|
|
* a) timestamp >= jgbl.mur_tp_resolve_time
|
|
* Turn around point is defined as the offset of the earliest EPOCH whose
|
|
* a) timestamp is < jgbl.mur_tp_resolve_time
|
|
* (if recover OR rollback with murgbl.resync_seqno == 0)
|
|
* b) timestamp is < jgbl.mur_tp_resolve_time AND jnl_seqno is < murgbl.resync_seqno
|
|
* (if rollback with murgbl.resync_seqno != 0)
|
|
* We maintain tokens (hash table) till Resolve Point, though Turn Around Point can be much before this.
|
|
* We apply PBLK till Turn Around Point.
|
|
*/
|
|
if (JRT_EPOCH == rectype)
|
|
{
|
|
if (!mur_options.forward && first_epoch && !rctl->recov_interrupted &&
|
|
(NULL != rctl->csd) && (rec_tn > rctl->csd->trans_hist.curr_tn))
|
|
{
|
|
assert(FALSE);
|
|
gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(7) ERR_EPOCHTNHI, 5, jctl->rec_offset,
|
|
jctl->jnl_fn_len, jctl->jnl_fn, &rec_tn, &rctl->csd->trans_hist.curr_tn);
|
|
MUR_BACK_PROCESS_ERROR(jctl, jjctl, "Epoch transaction number check failed");
|
|
}
|
|
if (first_epoch)
|
|
{
|
|
if (mur_options.verbose)
|
|
{
|
|
gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(6) ERR_MUINFOUINT4, 4,
|
|
LEN_AND_LIT(" First Epoch Record Offset"),
|
|
jctl->rec_offset, jctl->rec_offset);
|
|
gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(6) ERR_MUINFOUINT4, 4,
|
|
LEN_AND_LIT(" First Epoch Record timestamp"), rec_time, rec_time);
|
|
}
|
|
first_epoch = FALSE;
|
|
}
|
|
assert(mur_options.forward || murgbl.intrpt_recovery || (NULL == rctl->csd)
|
|
|| (jnlrec->prefix.tn <= rctl->csd->trans_hist.curr_tn));
|
|
if (rec_time < jgbl.mur_tp_resolve_time)
|
|
{ /* Reached EPOCH before resolve-time. Check if we have reached turnaround point.
|
|
* For no rollback OR for simple rollback with -resync or -fetchresync NOT specified,
|
|
* we need to go to an epoch BEFORE the resolve-time.
|
|
* For simple (i.e. journal files were cleanly shutdown) rollback with -resync or
|
|
* -fetchresync specified, we need to check if the epoch seqno is less than or
|
|
* equal to the input resync_seqno. If yes, then we can stop.
|
|
* For interrupted rollback, we need to additionally check if any of the potentially
|
|
* 16 streams had a resync_seqno specified as part of previous interrupted rollbacks
|
|
* and if so ensure the epoch is before all those points. That is, even if the epoch
|
|
* timestamp is LESSER than the tp_resolve_time, we have to continue backward
|
|
* processing until all the epoch's stream seqnos are less than any resync_seqnos
|
|
* specified as part of this or previous interrupted rollbacks.
|
|
*/
|
|
reached_trnarnd = TRUE; /* Assume we have reached turnaround point.
|
|
* Will be reset if we find otherwise.
|
|
*/
|
|
if (mur_options.rollback && (murgbl.resync_seqno UNIX_ONLY(|| murgbl.resync_strm_seqno_nonzero)))
|
|
{
|
|
assert(!mur_options.forward || !murgbl.resync_seqno);
|
|
if (murgbl.resync_seqno && (rec_token_seq > murgbl.resync_seqno))
|
|
reached_trnarnd = FALSE;
|
|
# ifdef UNIX
|
|
assert(!murgbl.resync_strm_seqno_nonzero || !mur_options.forward);
|
|
if (reached_trnarnd && murgbl.resync_strm_seqno_nonzero)
|
|
{ /* Check if any stream seqnos need to be compared as well */
|
|
if (!rctl->recov_interrupted)
|
|
{ /* For non-interrupted recovery, one strm needs checking */
|
|
idx = murgbl.resync_strm_index;
|
|
if (INVALID_SUPPL_STRM != idx)
|
|
{
|
|
assert((0 <= idx) && (MAX_SUPPL_STRMS > idx));
|
|
strm_seqno = jnlrec->jrec_epoch.strm_seqno[idx];
|
|
if (strm_seqno > murgbl.resync_strm_seqno[idx])
|
|
reached_trnarnd = FALSE;
|
|
}
|
|
} else
|
|
{ /* For interrupted recovery, upto 16 strms need checking */
|
|
for (idx = 0; idx < MAX_SUPPL_STRMS; idx++)
|
|
{
|
|
strm_seqno = jnlrec->jrec_epoch.strm_seqno[idx];
|
|
if (strm_seqno > murgbl.resync_strm_seqno[idx])
|
|
{
|
|
reached_trnarnd = FALSE;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
# endif
|
|
}
|
|
if (reached_trnarnd)
|
|
{
|
|
if (mur_options.rollback && !this_reg_resolved)
|
|
{ /* It is possible this region is not resolved yet in case we found an
|
|
* EPOCH whose rec_time is EQUAL to the tp_resolve_time. In this case
|
|
* use the epoch to note down pre-resolve-seqno.
|
|
*/
|
|
SAVE_PRE_RESOLVE_SEQNO(rectype, rec_time, rec_token_seq, pre_resolve_seqno);
|
|
this_reg_resolved = TRUE;
|
|
}
|
|
if (!mur_options.forward)
|
|
save_turn_around_point(rctl, jctl, apply_pblk_this_region);
|
|
PRINT_VERBOSE_STAT(jctl, "mur_back_processing:save_turn_around_point");
|
|
break;
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
/* Do preliminary checks to see if the jnl record needs to be involved in hashtable token processing */
|
|
if ((FENCE_NONE == mur_options.fences) || (rec_time > mur_options.before_time)
|
|
|| (rec_time < jgbl.mur_tp_resolve_time))
|
|
continue;
|
|
/* Do detailed checks on the jnl record for token processing */
|
|
token = rec_token_seq;
|
|
if (IS_FENCED(rectype))
|
|
{ /* Note for a ZTP if FSET/GSET is present before mur_options.before_time and
|
|
* GUPD/ZTCOM are present after mur_options.before_time, it is considered broken. */
|
|
rec_fence = GET_REC_FENCE_TYPE(rectype);
|
|
VMS_MUR_BACK_PROCESS_GET_IMAGE_COUNT(jctl, jnlrec, jjctl, rec_image_count, status);
|
|
assert(token == ((struct_jrec_upd *)jnlrec)->token_seq.token);
|
|
if (IS_SET_KILL_ZKILL_ZTRIG_ZTWORM(rectype)) /* TUPD/UUPD/FUPD/GUPD */
|
|
{
|
|
if (NULL != (multi = MUR_TOKEN_LOOKUP(token, rec_image_count, rec_time, rec_fence)))
|
|
{
|
|
if (multi->fence != rec_fence)
|
|
{
|
|
assert(!mur_options.rollback); /* jnl_seqno cannot be duplicate */
|
|
if (!(mur_report_error(jctl, MUR_DUPTOKEN)))
|
|
{
|
|
*jjctl = jctl;
|
|
return ERR_DUPTOKEN;
|
|
}
|
|
SET_THIS_TN_AS_BROKEN(multi, reg_total); /* This is broken */
|
|
if (rec_time < multi->time)
|
|
multi->time = rec_time;
|
|
} else
|
|
{
|
|
assert((TPFENCE != rec_fence) || multi->time == rec_time);
|
|
if (ZTPFENCE == rec_fence && multi->time > rec_time)
|
|
multi->time = rec_time;
|
|
if (multi->regnum != regno)
|
|
{ /* No TCOM or ZTCOM was seen in this region but corresponding
|
|
* TUPD/UUPD/FUPD/GUPD records are seen. This is automatically
|
|
* treated as broken because of the absence of TCOM/ZTCOM. But
|
|
* we need to signal to forward processing that this region
|
|
* (even though broken) was seen in backward processing. That is
|
|
* done by incrementing tot_partner.
|
|
*/
|
|
multi->tot_partner++;
|
|
multi->regnum = regno;
|
|
/* Set a debug-only flag indicating this "multi" structure never
|
|
* be treated as a GOOD_TN in forward processing. This will be
|
|
* checked there.
|
|
*/
|
|
DEBUG_ONLY(multi->this_is_broken = TRUE;)
|
|
}
|
|
}
|
|
} else
|
|
{ /* This is broken */
|
|
MUR_TOKEN_ADD(multi, token, rec_image_count,
|
|
rec_time, reg_total + 1, rec_fence, regno);
|
|
/* Set a debug-only flag indicating this "multi" structure never be
|
|
* treated as a GOOD_TN in forward processing. This will be checked there.
|
|
*/
|
|
DEBUG_ONLY(multi->this_is_broken = TRUE;)
|
|
}
|
|
} else /* TCOM/ZTCOM */
|
|
MUR_TCOM_TOKEN_PROCESSING(jctl, jjctl, token, rec_image_count,
|
|
rec_time, rec_fence, regno, reg_total, jnlrec);
|
|
} else if (mur_options.rollback && IS_REPLICATED(rectype))
|
|
{ /* Process unfenced transactions. They are either lost or good.
|
|
* For RESYNC and FETCH_RESYNC qualifiers, all non-tp transactions
|
|
* at or after murgbl.resync_seqno are considered lost.
|
|
* So, we do not need to add them in token(seqnum) table to find gap in sequence number.
|
|
* For consistent rollback murgbl.resync_seqno == 0 and we want to consider all records
|
|
* till tp_resolve_time for broken/lost/good determination so check accordingly..
|
|
*/
|
|
skip_rec = (murgbl.resync_seqno && (rec_token_seq > murgbl.resync_seqno));
|
|
UNIX_ONLY(
|
|
assert(!murgbl.resync_strm_seqno_nonzero || !mur_options.forward);
|
|
if (!skip_rec && murgbl.resync_strm_seqno_nonzero)
|
|
{
|
|
assert(IS_SET_KILL_ZKILL_ZTRIG_ZTWORM(rectype) || (JRT_NULL == (rectype)));
|
|
assert(&jnlrec->jrec_set_kill.strm_seqno == &jnlrec->jrec_null.strm_seqno);
|
|
/* strm_seqno & strm_idx have already been initialized before for this record.
|
|
* Assert that (i.e. they have not been changed since then) before using them.
|
|
*/
|
|
DEBUG_ONLY(save_strm_seqno = GET_STRM_SEQNO(jnlrec);)
|
|
assert(strm_idx == GET_STRM_INDEX(save_strm_seqno));
|
|
assert(strm_seqno == GET_STRM_SEQ60(save_strm_seqno));
|
|
skip_rec = (murgbl.resync_strm_seqno[strm_idx]
|
|
&& (strm_seqno > murgbl.resync_strm_seqno[strm_idx]));
|
|
}
|
|
)
|
|
if (!skip_rec)
|
|
{
|
|
rec_fence = GET_REC_FENCE_TYPE(rectype);
|
|
assert(token == ((struct_jrec_upd *)jnlrec)->token_seq.token);
|
|
/* For rollback, pid/image_type/time are not necessary to establish uniqueness of token
|
|
* as token (which is a seqno) is already guaranteed to be unique for an instance.
|
|
*/
|
|
if (NULL == (multi = MUR_TOKEN_LOOKUP(token, 0, 0, rec_fence)))
|
|
{ /* We reuse same token table. Most of the fields in multi_struct are unused */
|
|
MUR_TOKEN_ADD(multi, token, 0, 0, 1, rec_fence, 0);
|
|
} else
|
|
{
|
|
assert(FALSE);
|
|
if (!(mur_report_error(jctl, MUR_DUPTOKEN)))
|
|
{
|
|
*jjctl = jctl;
|
|
return ERR_DUPTOKEN;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
PRINT_VERBOSE_STAT(jctl, "mur_back_processing:at the end");
|
|
assert((SS_NORMAL != status) || !mur_options.rollback || this_reg_resolved);
|
|
if (SS_NORMAL != status)
|
|
{
|
|
if (!mur_options.forward)
|
|
{
|
|
if (ERR_NOPREVLINK == status)
|
|
{ /* We check if there is an EPOCH with a time EQUAL to the tp_resolve_time. If so we
|
|
* try not to issue the NOPREVLINK error for this boundary condition.
|
|
*/
|
|
assert(JNL_HDR_LEN == jctl->rec_offset);
|
|
if (rec_time <= jgbl.mur_tp_resolve_time)
|
|
{
|
|
jctl->rec_offset = JNL_HDR_LEN + PINI_RECLEN;
|
|
status = mur_prev(jctl, jctl->rec_offset);
|
|
if (SS_NORMAL != status)
|
|
{
|
|
*jjctl = jctl;
|
|
return status;
|
|
}
|
|
jnlrec = mur_desc->jnlrec;
|
|
rectype = (enum jnl_record_type)jnlrec->prefix.jrec_type;
|
|
rec_time = jnlrec->prefix.time;
|
|
rec_token_seq = GET_JNL_SEQNO(jnlrec);
|
|
assert(JRT_EPOCH == rectype);
|
|
/* handle non-epoch (out-of-design) situation in pro nevertheless */
|
|
reached_trnarnd = (JRT_EPOCH == rectype)
|
|
&& (!murgbl.resync_seqno || (rec_token_seq <= murgbl.resync_seqno));
|
|
UNIX_ONLY(
|
|
if (reached_trnarnd && murgbl.resync_strm_seqno_nonzero)
|
|
{ /* Check if any stream seqnos need to be compared as well */
|
|
if (!rctl->recov_interrupted)
|
|
{ /* For non-interupted recovery, one stream needs checking */
|
|
idx = murgbl.resync_strm_index;
|
|
if (INVALID_SUPPL_STRM != idx)
|
|
{
|
|
assert((0 <= idx) && (MAX_SUPPL_STRMS > idx));
|
|
strm_seqno = jnlrec->jrec_epoch.strm_seqno[idx];
|
|
if (strm_seqno > murgbl.resync_strm_seqno[idx])
|
|
reached_trnarnd = FALSE;
|
|
}
|
|
} else
|
|
{ /* For interupted recovery, upto 16 streams need checking */
|
|
for (idx = 0; idx < MAX_SUPPL_STRMS; idx++)
|
|
{
|
|
strm_seqno = jnlrec->jrec_epoch.strm_seqno[idx];
|
|
if (strm_seqno > murgbl.resync_strm_seqno[idx])
|
|
{
|
|
reached_trnarnd = FALSE;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
)
|
|
} else
|
|
reached_trnarnd = FALSE;
|
|
if (reached_trnarnd)
|
|
{
|
|
if (mur_options.rollback && !this_reg_resolved)
|
|
{
|
|
SAVE_PRE_RESOLVE_SEQNO(rectype, rec_time, rec_token_seq, pre_resolve_seqno);
|
|
this_reg_resolved = TRUE;
|
|
}
|
|
save_turn_around_point(rctl, jctl, apply_pblk_this_region);
|
|
} else
|
|
{
|
|
gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(4) ERR_NOPREVLINK,
|
|
2, jctl->jnl_fn_len, jctl->jnl_fn);
|
|
*jjctl = jctl;
|
|
return ERR_NOPREVLINK;
|
|
}
|
|
} else /* mur_read_file should have issued messages as necessary */
|
|
{
|
|
*jjctl = jctl;
|
|
return status;
|
|
}
|
|
} else if (ERR_JNLREADBOF != status) /* mur_read_file should have issued messages */
|
|
{
|
|
*jjctl = jctl;
|
|
return status;
|
|
}
|
|
/* for mur_options.forward ERR_JNLREADBOF is not error but others are */
|
|
}
|
|
if (!mur_options.forward && (NULL == rctl->jctl_turn_around))
|
|
GTMASSERT;
|
|
return SS_NORMAL;
|
|
}
|
|
|
|
/* This routine performs backward processing for forward and backward recover/rollback.
|
|
* This creates list of tokens for broken fenced transactions.
|
|
* For noverify qualifier in backward recovry, it may apply PBLK calling "mur_output_pblk"
|
|
*/
|
|
uint4 mur_back_processing(jnl_ctl_list **jjctl, boolean_t apply_pblk, seq_num *pre_resolve_seqno, jnl_tm_t alt_tp_resolve_time)
|
|
{
|
|
enum jnl_record_type rectype;
|
|
file_control *fc;
|
|
int idx, regno, reg_total, status;
|
|
jnl_ctl_list *jctl;
|
|
jnl_record *jnlrec;
|
|
jnl_tm_t max_lvrec_time, min_bov_time;
|
|
mur_back_opt_t mur_back_options;
|
|
mur_read_desc_t *mur_desc;
|
|
reg_ctl_list *rctl, *rctl_top;
|
|
seq_num rec_token_seq, save_resync_seqno, strm_seqno;
|
|
sgmnt_data_ptr_t csd;
|
|
|
|
reg_total = murgbl.reg_total;
|
|
max_lvrec_time = 0; /* To find maximum of all valid record's timestamp */
|
|
min_bov_time = MAXUINT4; /* For forward qualifier we need to find minimum of bov_timestamps */
|
|
for (regno = 0; regno < reg_total; regno++)
|
|
{
|
|
rctl = &mur_ctl[regno];
|
|
rctl->lvrec_time = mur_ctl[regno].jctl->lvrec_time;
|
|
if (rctl->lvrec_time > max_lvrec_time)
|
|
max_lvrec_time = rctl->lvrec_time;
|
|
/* copy lvrec_time into region structure */
|
|
if (mur_options.forward && (jnl_tm_t)rctl->jctl_head->jfh->bov_timestamp < min_bov_time)
|
|
min_bov_time = (jnl_tm_t)rctl->jctl_head->jfh->bov_timestamp;
|
|
}
|
|
/* Time qualifier processing cannot be done in mur_get_options() as it does not have max_lvrec_time
|
|
* Also this should be done after interrupted recovery processing.
|
|
* Otherwise delta time of previous command and delta time of this recover may not be same. */
|
|
assert(0 == iterationcnt || prev_max_lvrec_time >= max_lvrec_time);
|
|
assert(0 == iterationcnt || prev_min_bov_time >= min_bov_time);
|
|
mur_process_timequal(max_lvrec_time, min_bov_time);
|
|
DEBUG_ONLY(prev_max_lvrec_time = max_lvrec_time;)
|
|
DEBUG_ONLY(prev_min_bov_time = min_bov_time;)
|
|
JNL_PUT_MSG_PROGRESS("Backward processing started");
|
|
mur_tp_resolve_time(max_lvrec_time);
|
|
if (0 != alt_tp_resolve_time && alt_tp_resolve_time < jgbl.mur_tp_resolve_time)
|
|
jgbl.mur_tp_resolve_time = alt_tp_resolve_time;
|
|
if (!mur_options.forward && mur_options.update)
|
|
{
|
|
/* Save murgbl.resync_seqno before it gets modified just in case we needed the original value for debugging */
|
|
DEBUG_ONLY(murgbl.save_resync_seqno = murgbl.resync_seqno;)
|
|
/* Following for loop code block does the same thing for every call to "mur_back_processing".
|
|
* Tail corruption in journal could cause multiple calls to this routine but that case should be very rare.
|
|
* So let's keep it here instead of moving to mur_back_process.
|
|
*/
|
|
for (rctl = mur_ctl, rctl_top = mur_ctl + reg_total; rctl < rctl_top; rctl++)
|
|
{
|
|
csd = rctl->csd;
|
|
assert(NULL != csd);
|
|
/* If we have done interrupted recovery processing (through mur_apply_pblk) already, we
|
|
* would have played all PBLKs until the turn-around-point of last interrupted recovery.
|
|
* We would not have inserted any more journal file generations as part of backward processing.
|
|
* Therefore we expect "rctl->jctl_head" to be equal to "rctl->jctl_apply_pblk" when we come here.
|
|
* There is one exception though and that is if we come here for "iterationcnt > 0". In this case,
|
|
* it is possible that "rctl->jctl_head" is set to a generation earlier than "rctl->jctl_apply_pblk"
|
|
* during the previous iteration of "mur_back_processing" which did complete for this region but
|
|
* later encountered a JNLRECFMT error in a different region and hence had to restart.
|
|
*/
|
|
assert(!rctl->jfh_recov_interrupted || rctl->jctl_head == rctl->jctl_apply_pblk || iterationcnt);
|
|
assert(!rctl->recov_interrupted || murgbl.intrpt_recovery);
|
|
/* assert(!rctl->jfh_recov_interrupted || rctl->recov_interrupted); ???
|
|
* The above assert is temporarily commented out because in mur_close_files we set
|
|
* csd->recov_interrupted = FALSE before we set jctl->jfh->recover_interrupted = FALSE
|
|
* so it can fail if recover crashes in between those two assignments. But the assert is
|
|
* not removed as the implications of the assert not being true have to be handled in
|
|
* the entire recover code before removing it.
|
|
*/
|
|
if (rctl->recov_interrupted)
|
|
{
|
|
if (csd->intrpt_recov_resync_seqno)
|
|
{
|
|
assert(mur_options.rollback); /* otherwise we would have issued a ERR_ROLLBKINTERRUPT
|
|
* error in mur_open_files.c
|
|
*/
|
|
if ((0 == murgbl.resync_seqno) || (csd->intrpt_recov_resync_seqno < murgbl.resync_seqno))
|
|
murgbl.resync_seqno = csd->intrpt_recov_resync_seqno;
|
|
}
|
|
UNIX_ONLY(
|
|
for (idx = 0; idx < MAX_SUPPL_STRMS; idx++)
|
|
{
|
|
strm_seqno = csd->intrpt_recov_resync_strm_seqno[idx];
|
|
if (strm_seqno)
|
|
{
|
|
assert(mur_options.rollback); /* otherwise we would have issued a
|
|
* ERR_ROLLBKINTERRUPT error in
|
|
* mur_open_files.c
|
|
*/
|
|
if ((0 == murgbl.resync_strm_seqno[idx])
|
|
|| (strm_seqno < murgbl.resync_strm_seqno[idx]))
|
|
{
|
|
murgbl.resync_strm_seqno[idx] = strm_seqno;
|
|
murgbl.resync_strm_seqno_nonzero = TRUE;
|
|
}
|
|
}
|
|
}
|
|
)
|
|
}
|
|
}
|
|
if (murgbl.resync_seqno)
|
|
gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_RESOLVESEQNO, 2, &murgbl.resync_seqno, &murgbl.resync_seqno);
|
|
UNIX_ONLY(
|
|
if (murgbl.resync_strm_seqno_nonzero)
|
|
{
|
|
for (idx = 0; idx < MAX_SUPPL_STRMS; idx++)
|
|
{
|
|
if (murgbl.resync_strm_seqno[idx])
|
|
gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(5) ERR_RESOLVESEQSTRM, 3, idx,
|
|
&murgbl.resync_strm_seqno[idx], &murgbl.resync_strm_seqno[idx]);
|
|
}
|
|
/* If -resync=<strm_seqno> is specified, we dont yet know what jnl_seqno it maps back to.
|
|
* To facilitate that determination, set resync_seqno to maximum possible value. It will
|
|
* be adjusted below based on the records we see in backward processing.
|
|
*/
|
|
if (!murgbl.resync_seqno)
|
|
murgbl.resync_seqno = MAXUINT8;
|
|
}
|
|
)
|
|
for (rctl = mur_ctl, rctl_top = mur_ctl + reg_total; rctl < rctl_top; rctl++)
|
|
{
|
|
csd = rctl->csd;
|
|
assert(csd->recov_interrupted); /* mur_open_files set this */
|
|
if (apply_pblk && !rctl->jfh_recov_interrupted)
|
|
{ /* When the 'if' condition is TRUE, we apply PBLKs in mur_back_process.
|
|
* Store the jgbl.mur_tp_resolve_time/murgbl.resync_seqno.
|
|
* So we remember to undo PBLKs at least upto that point,
|
|
* in case this recovery is interrupted/crashes.
|
|
*/
|
|
assert(0 == iterationcnt || csd->intrpt_recov_tp_resolve_time >= jgbl.mur_tp_resolve_time);
|
|
csd->intrpt_recov_tp_resolve_time = jgbl.mur_tp_resolve_time;
|
|
assert(0 == iterationcnt || (csd->intrpt_recov_resync_seqno == murgbl.resync_seqno));
|
|
assert(!csd->intrpt_recov_resync_seqno || (csd->intrpt_recov_resync_seqno >= murgbl.resync_seqno));
|
|
csd->intrpt_recov_resync_seqno = murgbl.resync_seqno;
|
|
UNIX_ONLY(
|
|
assert(!murgbl.resync_strm_seqno_nonzero || rctl->recov_interrupted
|
|
|| (INVALID_SUPPL_STRM == idx) || (-1 == iterationcnt)
|
|
|| ((0 == iterationcnt) && !strm_seqno)
|
|
|| (iterationcnt && (strm_seqno == murgbl.resync_strm_seqno[idx])));
|
|
)
|
|
MUR_SAVE_RESYNC_STRM_SEQNO(rctl, csd);
|
|
/* flush the changed csd to disk */
|
|
fc = rctl->gd->dyn.addr->file_cntl;
|
|
fc->op = FC_WRITE;
|
|
fc->op_buff = (sm_uc_ptr_t)csd;
|
|
fc->op_len = (int)ROUND_UP(SIZEOF_FILE_HDR(csd), DISK_BLOCK_SIZE);
|
|
fc->op_pos = 1;
|
|
dbfilop(fc);
|
|
}
|
|
}
|
|
} /* end else !mur_options.forward */
|
|
DEBUG_ONLY(iterationcnt++;)
|
|
*pre_resolve_seqno = 0;
|
|
save_resync_seqno = murgbl.resync_seqno;
|
|
assert(murgbl.ok_to_update_db == apply_pblk);
|
|
/* At this point we have computed jgbl.mur_tp_resolve_time. It is the time upto which (at least)
|
|
* we need to do token resolution. This is for all kinds of recovery and rollback.
|
|
* Following for loop will do backward processing and resolve token up to this jgbl.mur_tp_resolve_time.
|
|
* (For recover with lower since_time, we already set jgbl.mur_tp_resolve_time as since_time.
|
|
* For interrupted recovery we also considered previous recovery's jgbl.mur_tp_resolve_time.)
|
|
* For rollback command (with resync or fetchresync qualifier) we resolve only upto jgbl.mur_tp_resolve_time.
|
|
*/
|
|
for (regno = 0, rctl = mur_ctl, rctl_top = mur_ctl + reg_total; rctl < rctl_top; rctl++, regno++)
|
|
{
|
|
/* Note that for rctl->jfh_recov_interrupted we do not apply pblks in this routine */
|
|
jctl = rctl->jctl;
|
|
assert(jctl->reg_ctl == rctl);
|
|
assert(NULL == jctl->next_gen);
|
|
if (mur_options.verbose)
|
|
gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(6) ERR_MUINFOSTR, 4,
|
|
LEN_AND_LIT("Processing started for journal file"), jctl->jnl_fn_len, jctl->jnl_fn);
|
|
jctl->rec_offset = jctl->lvrec_off;
|
|
status = mur_prev(jctl, jctl->rec_offset);
|
|
mur_desc = rctl->mur_desc;
|
|
jnlrec = mur_desc->jnlrec;
|
|
if (!mur_options.forward && FENCE_NONE != mur_options.fences)
|
|
{ /* This is for the latest generation only */
|
|
rectype = (enum jnl_record_type)jnlrec->prefix.jrec_type;
|
|
if (JRT_EOF != rectype)
|
|
{ /* When a region is inactive but not closed, that is, no logical updates are done for some
|
|
* period of time (8 second), then EPOCH is written by periodic timer. However, for some
|
|
* existing bug/issue periodic timers can be deferred for long period of time.
|
|
* So we need this check here.
|
|
*/
|
|
for ( ; ; )
|
|
{
|
|
if ((JRT_PFIN == rectype) || (JRT_ALIGN == rectype) || (JRT_INCTN == rectype))
|
|
{
|
|
if (JRT_INCTN == rectype)
|
|
MUR_INCTN_BLKS_TO_UPGRD_ADJUST(rctl);
|
|
if (SS_NORMAL == (status = mur_prev(jctl, 0)))
|
|
{
|
|
jnlrec = mur_desc->jnlrec; /* keep jnlrec uptodate */
|
|
jctl->rec_offset -= mur_desc->jreclen;
|
|
assert(jctl->rec_offset >= mur_desc->cur_buff->dskaddr);
|
|
assert(JNL_HDR_LEN <= jctl->rec_offset);
|
|
rectype = (enum jnl_record_type)jnlrec->prefix.jrec_type;
|
|
} else
|
|
break;
|
|
} else
|
|
break;
|
|
}
|
|
if (SS_NORMAL == status && (JRT_EPOCH != rectype)
|
|
&& (jnlrec->prefix.time < jgbl.mur_tp_resolve_time))
|
|
{
|
|
*jjctl = jctl;
|
|
# ifdef UNIX
|
|
/* Assert that the new about-to-be-set TP resolve time does not differ by more than the
|
|
* twice the free-EPOCH interval (which is defined by TIM_DEFER_SYNC). Twice is not a magic
|
|
* number, but just to allow for some relaxation. The only exception is if this is an
|
|
* interrupted recovery in which case the difference could be significant. One reason we
|
|
* know why this could happen is because mur_close_files calls gds_rundown on all regions
|
|
* AFTER resetting csd->intrpt_recov_tp_resolve_time to 0. So, if we get killed at
|
|
* right AFTER doing gds_rundown on one region, but BEFORE doing gds_rundown on other
|
|
* regions, then a subsequent ROLLBACK finds a higher TP resolve time on one region and
|
|
* sets the value to jgbl.mur_tp_resolve_time but later finds other regions with records
|
|
* having timestamps less than jgbl.mur_tp_resolve_time. See GTM-7204 for more details.
|
|
*/
|
|
assert(((TIM_DEFER_DBSYNC * 2) >= (jgbl.mur_tp_resolve_time - jnlrec->prefix.time))
|
|
|| ((WBTEST_CRASH_SHUTDOWN_EXPECTED == gtm_white_box_test_case_number)
|
|
&& murgbl.intrpt_recovery));
|
|
# endif
|
|
return ERR_CHNGTPRSLVTM;
|
|
}
|
|
}
|
|
}
|
|
/* Do intializations before invoking "mur_back_processing_one_region" function */
|
|
jctl->after_end_of_data = TRUE;
|
|
mur_back_options.jctl = jctl;
|
|
mur_back_options.rec_token_seq = MAXUINT8;
|
|
mur_back_options.first_epoch = TRUE;
|
|
mur_back_options.status = status;
|
|
mur_back_options.jjctl = jjctl;
|
|
mur_back_options.apply_pblk = apply_pblk;
|
|
mur_back_options.pre_resolve_seqno = pre_resolve_seqno;
|
|
status = mur_back_processing_one_region(&mur_back_options);
|
|
if (SS_NORMAL != status)
|
|
return status;
|
|
} /* end rctl for loop */
|
|
if (save_resync_seqno != murgbl.resync_seqno)
|
|
{ /* murgbl.resync_seqno was adjusted in the middle of backward processing due to a -rsync_strm= specification.
|
|
* Check if any regions have to be further involved in backward processing. This is necessary because we might
|
|
* have stopped the first backward processing on seeing an EPOCH record whose strm_seqno is less than or equal
|
|
* to the input resync strm_seqno. But it is possible that murgbl.resync_seqno was initially at a higher value
|
|
* when a particular region stopped its backward processing but later got adjusted to a lower value during
|
|
* processing for the next region. In that case, we should redo processing for the first region with the new
|
|
* murgbl.resync_seqno in case this takes us back to a previous epoch record. <C9J02_003091_strm_seqno_rollback>
|
|
*/
|
|
assert(murgbl.resync_seqno < save_resync_seqno);
|
|
assert(mur_options.rollback);
|
|
UNIX_ONLY(assert(murgbl.resync_strm_seqno_nonzero);)
|
|
JNL_PUT_MSG_PROGRESS("Backward processing Round-II started");
|
|
for (regno = 0, rctl = mur_ctl, rctl_top = mur_ctl + reg_total; rctl < rctl_top; rctl++, regno++)
|
|
{
|
|
jctl = rctl->jctl_turn_around;
|
|
/* Check if this regions turn-around-point-seqno is higher than the final value of murgbl.resync_seqno.
|
|
* If so, we need to do further backward processing on this region.
|
|
*/
|
|
if (jctl->turn_around_seqno > murgbl.resync_seqno)
|
|
{ /* Do intializations before invoking "mur_back_processing_one_region" function */
|
|
/* jctl->after_end_of_data is already set from previous invocation of this function */
|
|
/* rctl->mur_desc already points to the turnaround point so no further adjustment needed */
|
|
mur_back_options.jctl = rctl->jctl_turn_around;
|
|
mur_desc = rctl->mur_desc;
|
|
jnlrec = mur_desc->jnlrec;
|
|
assert(JRT_EPOCH == jnlrec->prefix.jrec_type);
|
|
assert(jctl->turn_around_time == jnlrec->prefix.time);
|
|
assert(jctl->turn_around_seqno == jnlrec->jrec_epoch.jnl_seqno);
|
|
assert(jctl->turn_around_tn == jnlrec->prefix.tn);
|
|
assert(jctl->rec_offset == jctl->turn_around_offset);
|
|
/* Now that jctl->rec_offset points to the same offset as jctl->turn_around_offset, reset
|
|
* the latter as a lot of the code inside "mur_back_processing_one_region" relies on this.
|
|
*/
|
|
jctl->turn_around_offset = 0;
|
|
/* By a similar token, reset "rctl->jctl_turn_around" as later asserts rely on this
|
|
* and we have already stored this in mur_back_options.jctl.
|
|
*/
|
|
rctl->jctl_turn_around = NULL;
|
|
mur_back_options.rec_token_seq = GET_JNL_SEQNO(jnlrec);
|
|
mur_back_options.first_epoch = FALSE; /* since we have already seen at least one EPOCH in
|
|
* previous invocation of "mur_back_processing_one_region"
|
|
*/
|
|
mur_back_options.status = SS_NORMAL;
|
|
mur_back_options.jjctl = jjctl;
|
|
mur_back_options.apply_pblk = apply_pblk;
|
|
mur_back_options.pre_resolve_seqno = pre_resolve_seqno;
|
|
status = mur_back_processing_one_region(&mur_back_options);
|
|
if (SS_NORMAL != status)
|
|
return status;
|
|
}
|
|
}
|
|
}
|
|
/* Since jgbl.mur_tp_resolve_time is one resolve time for all regions, no implicit lookback processing
|
|
* to resolve transactions is necessary */
|
|
*jjctl = NULL;
|
|
return SS_NORMAL;
|
|
}
|