/****************************************************************
 *								*
 *	Copyright 2003, 2012 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license. If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include "gdsroot.h"
#include "gdsblk.h"
#include "gdsbt.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "jnl.h"
#include "buddy_list.h"
#include "hashtab_int4.h"	/* needed for muprec.h */
#include "hashtab_int8.h"	/* needed for muprec.h */
#include "hashtab_mname.h"	/* needed for muprec.h */
#include "muprec.h"
#include "mur_read_file.h"
#include "iosp.h"
#include "gtmmsg.h"
#include "send_msg.h"
#include "dbfilop.h"
#include "gds_blk_downgrade.h"
#include "gdsbml.h"
#include "bit_clear.h"
#include "bit_set.h"
#include "min_max.h"
#include "anticipatory_freeze.h"
#include "eintr_wrappers.h"
#ifdef GTM_CRYPT
#include "gtm_string.h"
#endif
#if defined(UNIX)
#include "gtm_unistd.h"
#include "gdsbgtr.h"
#include "repl_msg.h"
#include "gtmsource.h"

GBLREF	gd_region	*gv_cur_region;
GBLREF	volatile int4	db_fsync_in_prog;	/* for DB_FSYNC macro usage */
GBLREF	sigset_t	block_sigsent;
GBLREF	boolean_t	blocksig_initialized;
GBLREF	jnlpool_addrs	jnlpool;
#endif

GBLREF	reg_ctl_list	*mur_ctl;
GBLREF	mur_gbls_t	murgbl;
GBLREF	mur_opt_struct	mur_options;
GBLREF	seq_num		seq_num_zero;
GBLREF	jnl_gbls_t	jgbl;

error_def(ERR_JNLREAD);
error_def(ERR_JNLREADBOF);
error_def(ERR_JNLBADRECFMT);
error_def(ERR_NOPREVLINK);
error_def(ERR_MUINFOUINT4);
error_def(ERR_MUINFOUINT8);
error_def(ERR_MUINFOSTR);
error_def(ERR_DBFSYNCERR);
error_def(ERR_ORLBKNOSTP);

uint4 mur_apply_pblk(boolean_t apply_intrpt_pblk)
{
	uint4			status;
	reg_ctl_list		*rctl, *rctl_top;
	jnl_ctl_list		*tmpjctl;
	file_control		*fc;
	inctn_opcode_t		opcode;
	struct_jrec_inctn	*inctn_rec;
	jnl_ctl_list		*jctl;
	enum jnl_record_type	rectype;
	int			save_errno;
	jnl_record		*jnlrec;
	UNIX_ONLY(unix_db_info	*udi;)

	for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++)
	{
		if (!apply_intrpt_pblk)
		{
			assert(NULL != rctl->jctl_turn_around);
			if (!rctl->jfh_recov_interrupted)
			{
				if (mur_options.verify)
				{
					jctl = rctl->jctl;
					assert(jctl->reg_ctl == rctl);
					while (NULL != jctl->next_gen)
					{
						jctl = jctl->next_gen;
						assert(jctl->reg_ctl == rctl);
					}
					rctl->jctl = jctl;
					jctl->rec_offset = jctl->lvrec_off;	/* Start from last record */
				} else
				{
					jctl = rctl->jctl = rctl->jctl_apply_pblk;
					assert(NULL != jctl);
					assert(jctl->reg_ctl == rctl);
					jctl->rec_offset = jctl->apply_pblk_stop_offset;
				}
			} else	/* recover interrupted earlier */
			{	/* We already called mur_apply_pblk() to undo recover-generated PBLKs.
				 * Later we followed the next_jnl_file_name links to set up the jctl list for this region.
				 * We later called mur_back_process() to resolve transactions using the new turn-around point,
				 * but mur_back_process() did not apply PBLKs for interrupted recovery (even for NOVERIFY).
				 * Last time we called this routine, we set rctl->jctl_apply_pblk.
				 * Now we are in the phase to apply original GT.M generated PBLKs.
				 * We skip application of PBLKs till the last recover's turn-around point.
				 */
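				/* Illustration with hypothetical offsets : if the interrupted recovery's own PBLK
				 * application had stopped at offset 0x4000 (the previous turn-around point) while the
				 * final turn-around point computed this time lies further back (possibly in an earlier
				 * generation), the scan below starts at 0x4000 (apply_pblk_stop_offset) instead of at
				 * the last record of the latest generation and replays original GT.M PBLKs backwards
				 * from there.
				 */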
				assert(!mur_options.rollback_losttnonly);
				jctl = rctl->jctl = rctl->jctl_apply_pblk;
				assert(jctl->reg_ctl == rctl);
				assert(jctl->apply_pblk_stop_offset);
				jctl->rec_offset = jctl->apply_pblk_stop_offset;
				DEBUG_ONLY(
					/* assert that first pass turn-around-point is later than the final turn-around-point */
					for (tmpjctl = jctl; NULL != tmpjctl && tmpjctl != rctl->jctl_turn_around;
							tmpjctl = tmpjctl->prev_gen)
						;
					assert(NULL != tmpjctl && ((tmpjctl != jctl)
						|| (jctl->rec_offset >= jctl->turn_around_offset)));
				)
			}
			if (mur_options.verify || rctl->jfh_recov_interrupted)
			{	/* If we are going to apply PBLKs, store the prospective turn-around point right away
				 * so we remember to undo PBLKs at least up to here in case this recovery is interrupted.
				 * In case of normal recovery with -noverify, we would have written this information
				 * out in mur_back_process() itself, so we do not need to write it again here.
				 */
				rctl->csd->intrpt_recov_tp_resolve_time = jgbl.mur_tp_resolve_time;
				rctl->csd->intrpt_recov_resync_seqno = murgbl.resync_seqno;
				MUR_SAVE_RESYNC_STRM_SEQNO(rctl, rctl->csd);
				/* flush the changed csd to disk */
				fc = rctl->gd->dyn.addr->file_cntl;
				fc->op = FC_WRITE;
				fc->op_buff = (sm_uc_ptr_t)rctl->csd;
				fc->op_len = ROUND_UP(SGMNT_HDR_LEN, DISK_BLOCK_SIZE);
				fc->op_pos = 1;
				dbfilop(fc);
			}
		} else
		{
			assert(murgbl.ok_to_update_db);
			assert(NULL == rctl->jctl_turn_around);
			if (!rctl->jfh_recov_interrupted)
				continue;
			/* Recover was interrupted earlier. We are in the phase to apply interrupted-recovery-generated PBLKs.
			 * In the interrupted PBLK applying phase, it is possible that we would be playing PBLKs of
			 * recover-created as well as GT.M-created journal files. This is necessary until we reach the saved
			 * turn-around point of the previous interrupted recovery.
			 *
			 * An example of why we need to play GT.M-generated PBLKs (in addition to recover-generated PBLKs)
			 * is below.
			 *
			 * Assume GT.M crashed, and the journal file layout now is a_1.mjl <-- a.mjl.
			 * The first recovery found its turn-around point in a.mjl, so it renamed a.mjl to a_2.mjl, created
			 * a.mjl and had played a few post-turn-around-point records into a.mjl when it was interrupted.
			 * The journal file layout now is a_1.mjl <-- a_2.mjl <-- a.mjl.
			 * The second recovery had a specified turn-around point which was in a_1.mjl, so it took the
			 * minimum of the specified and saved (in a_2.mjl) turn-around points and undid PBLKs
			 * up to a_1.mjl and was about to create a new a.mjl (which pointed back to a_1.mjl) after
			 * renaming the current a.mjl, but crashed before the rename. Note that at this point a_1.mjl
			 * has a non-zero turn-around-offset set and the database has been rolled back to a_1.mjl.
			 * The journal file layout now is a_1.mjl <-- a_2.mjl <-- a.mjl.
			 * A third recovery is now attempted. This will do interrupted PBLK processing (now up to the
			 * saved turn-around-offset which is in a_1.mjl). It has to undo PBLKs of a.mjl, a_2.mjl and
			 * a_1.mjl in the process of reaching there. If instead it undid only PBLKs of recover-created
			 * journal files (which will be only a.mjl) and went to the saved turn-around-offset in
			 * a_1.mjl, we would have rolled back the database to a state as of the end of a_2.mjl,
			 * although a previous recovery had rolled the database back to a previous generation (a_1.mjl).
			 * That would mean we left out playing PBLKs in a_2.mjl and a_1.mjl, which can cause integrity errors.
			 */
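			/* Accordingly, the scan below starts at the last record of the latest generation (a.mjl in the
			 * example above) and mur_prev_rec() follows the prev_gen links backwards through every generation,
			 * recover-created and GT.M-created alike, until the saved turn-around offset (in a_1.mjl above)
			 * is reached.
			 */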
			jctl = rctl->jctl;	/* Latest generation */
			assert(jctl->reg_ctl == rctl);
			assert(NULL == jctl->next_gen);
			jctl->rec_offset = jctl->lvrec_off;	/* Start from last record */
		}
		for ( ; ;)
		{
			assert(0 != jctl->rec_offset);
			if (!apply_intrpt_pblk)
			{
				PRINT_VERBOSE_STAT(jctl, "mur_apply_blk:start");
			} else
			{
				PRINT_VERBOSE_STAT(jctl, "mur_apply_blk:start : Apply Interrupted PBLK");
			}
			for (status = mur_prev(jctl, jctl->rec_offset), jctl->after_end_of_data = TRUE;
				SS_NORMAL == status; status = mur_prev_rec(&jctl))
			{
				jnlrec = rctl->mur_desc->jnlrec;
				rectype = (enum jnl_record_type)jnlrec->prefix.jrec_type;
				jctl->after_end_of_data = jctl->after_end_of_data
					&& (jctl->rec_offset >= jctl->jfh->end_of_data);
				if (apply_intrpt_pblk)
				{
					if (NULL == rctl->jctl_alt_head && !jctl->jfh->recover_interrupted)
					{
						assert(NULL != jctl->next_gen);
						assert(jctl->next_gen->jfh->recover_interrupted);
						/* Save the recover generated journal files we finished processing */
						rctl->jctl_alt_head = jctl->next_gen;
						jctl->next_gen = NULL;	/* Since we do not want to process them again */
					}
					if ((JRT_INCTN == rectype) && jctl->jfh->recover_interrupted)
					{
						MUR_INCTN_BLKS_TO_UPGRD_ADJUST(rctl);
					}
				}
				if (JRT_EPOCH == rectype)
				{
					assert(NULL != rctl->csd);
					if (!apply_intrpt_pblk)
					{
						if ((jctl == rctl->jctl_turn_around)
							&& (jctl->rec_offset <= jctl->turn_around_offset))
						{	/* jctl->rec_offset can be different from jctl->turn_around_offset in
							 * case of mur_ztp_lookback() processing. But we are guaranteed an epoch
							 * at the start of every journal file, so we should encounter an epoch
							 * in the same journal file as rctl->jctl_turn_around. We have now reached
							 * the turn-around point.
							 * Note that the following assignments should parallel those done in
							 * mur_back_process on reaching the turn-around point.
							 */
							assert((jctl->rec_offset != jctl->turn_around_offset)
								|| (jctl->turn_around_time == jnlrec->prefix.time));
							assert((jctl->rec_offset != jctl->turn_around_offset)
								|| (jctl->turn_around_seqno == jnlrec->jrec_epoch.jnl_seqno));
							assert((jctl->rec_offset != jctl->turn_around_offset)
								|| (jctl->turn_around_tn == ((jrec_prefix *)jnlrec)->tn));
							rctl->jctl_turn_around = jctl;
							jctl->turn_around_offset = jctl->rec_offset;
							jctl->turn_around_time = jnlrec->prefix.time;
							jctl->turn_around_seqno = jnlrec->jrec_epoch.jnl_seqno;
							jctl->turn_around_tn = jnlrec->prefix.tn;
							break;
						}
					} else
					{
						if (jctl->rec_offset == jctl->jfh->turn_around_offset)
						{	/* we reached the turn-around point of last interrupted recovery */
							assert(jctl->jfh->turn_around_time == jnlrec->prefix.time);
							assert(rctl->jctl_head == jctl);
							/* note down the fact that we have applied PBLKs upto this point */
							rctl->jctl_apply_pblk = jctl;
							jctl->apply_pblk_stop_offset = jctl->rec_offset;
							break;
						} else if (jctl->rec_offset < jctl->jfh->turn_around_offset)
						{
							PRINT_VERBOSE_STAT(jctl, "mur_apply_blk:turn_around_offset is bad");
							gtm_putmsg(VARLSTCNT(5) ERR_JNLBADRECFMT, 3,
								jctl->jnl_fn_len, jctl->jnl_fn, jctl->rec_offset);
							return ERR_JNLBADRECFMT;
						}
					}
				} else if ((JRT_PBLK == rectype) && (SS_NORMAL != (status = mur_output_pblk(rctl))))
				{
					PRINT_VERBOSE_STAT(jctl, "mur_apply_blk:mur_output_pblk failed");
					return status;
				}
			}
			PRINT_VERBOSE_STAT(jctl, "mur_apply_blk:end");
			if (SS_NORMAL == status)
				break;
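			/* status is not SS_NORMAL : the backward scan stopped before a turn-around point was reached.
			 * The checks below classify the failure : a missing previous-generation link or a read failure
			 * is fatal, while damaged records in the tail of the generation that was active at the time of
			 * the crash are tolerated further down and skipped over.
			 */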
			if (ERR_NOPREVLINK == status)
			{
				gtm_putmsg(VARLSTCNT(4) ERR_NOPREVLINK, 2, jctl->jnl_fn_len, jctl->jnl_fn);
				return ERR_NOPREVLINK;
			} else if (ERR_JNLREADBOF == status)
			{
				gtm_putmsg(VARLSTCNT(4) ERR_JNLREADBOF, 2, jctl->jnl_fn_len, jctl->jnl_fn);
				return ERR_JNLREADBOF;
			} else if (ERR_JNLREAD == status)	/* This message is already issued in mur_read_file */
				return ERR_JNLREAD;
			if ((NULL != jctl->next_gen) || (jctl->rec_offset < jctl->jfh->end_of_data))
			{
				gtm_putmsg(VARLSTCNT(5) ERR_JNLBADRECFMT, 3, jctl->jnl_fn_len, jctl->jnl_fn, jctl->rec_offset);
				return status;
			}
			/* We are in the interrupted pblk application phase and applying either interrupted recovery
			 * generated pblks or GT.M generated pblks and encounter bad records in the tail of the
			 * last generation journal file that was active during the crash. Skip those and continue.
			 */
			PRINT_VERBOSE_TAIL_BAD(jctl);
			if (SS_NORMAL != mur_fread_eof_crash(jctl, jctl->jfh->end_of_data, jctl->rec_offset))
				return ERR_JNLBADRECFMT;
		}	/* end infinite for */
		UNIX_ONLY(
			gv_cur_region = rctl->csa->region;
			udi = FILE_INFO(gv_cur_region);
			DB_FSYNC(gv_cur_region, udi, rctl->csa, db_fsync_in_prog, save_errno);
			if (0 != save_errno)
			{
				send_msg(VARLSTCNT(5) ERR_DBFSYNCERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
				gtm_putmsg(VARLSTCNT(5) ERR_DBFSYNCERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
				return ERR_DBFSYNCERR;
			}
		)
	}
	return SS_NORMAL;
}
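
/* mur_output_pblk : write the before-image carried by the current PBLK journal record back over the corresponding block
 * in the database file of "rctl", thereby restoring the block to its pre-update state. Along the way this keeps the
 * master map in step for local bitmap blocks, downgrades the before-image if the block was in an older on-disk format,
 * honors the full-block-writes setting and, if the journal and database use different encryption keys, re-encrypts the
 * block contents. Returns SS_NORMAL on success and an error status otherwise.
 */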
uint4 mur_output_pblk(reg_ctl_list *rctl)
{
	jnl_ctl_list		*jctl;
	file_control		*db_ctl;
	struct_jrec_blk		pblkrec;
	uchar_ptr_t		pblkcontents, pblk_jrec_start;
	int4			size, fbw_size, fullblockwrite_len, blks_in_lmap;
	sgmnt_addrs		*csa, *repl_csa;
	node_local		*cnl;
	sgmnt_data_ptr_t	csd;
	jnl_record		*jnlrec;
	GTMCRYPT_ONLY(
		int		req_enc_blk_size;
		int		crypt_status;
		blk_hdr_ptr_t	bp;
	)
	UNIX_ONLY(sigset_t	savemask;)

	/* In case of a LOSTTNONLY rollback, it is still possible to reach here if one region has NOBEFORE_IMAGE
	 * while another has BEFORE_IMAGE. In any case do NOT apply PBLKs.
	 */
	if (mur_options.rollback_losttnonly)
		return SS_NORMAL;
	assert(murgbl.ok_to_update_db);
	jnlrec = rctl->mur_desc->jnlrec;
	pblkrec = jnlrec->jrec_pblk;
	/* Note that all fields in the "jrec_pblk" typedef structure are now referenceable from the local variable "pblkrec".
	 * The only exception is "blk_contents", which is a hung buffer at the end of the structure.
	 * Copy that address into a local variable "pblkcontents" separately.
	 */
	pblkcontents = (uchar_ptr_t)&jnlrec->jrec_pblk.blk_contents[0];
	csa = rctl->csa;
	csd = rctl->csd;
	if (IS_BITMAP_BLK(pblkrec.blknum))
	{	/* Local bitmap block. Determine master map free/busy status and fix it accordingly. */
		if (ROUND_DOWN2(csd->trans_hist.total_blks, BLKS_PER_LMAP) == pblkrec.blknum)
			blks_in_lmap = (csd->trans_hist.total_blks - pblkrec.blknum);
		else
			blks_in_lmap = BLKS_PER_LMAP;
		assert(MM_ADDR(csd) == csa->bmm);
		if (NO_FREE_SPACE == bml_find_free(0, pblkcontents + SIZEOF(blk_hdr), blks_in_lmap))
			bit_clear(pblkrec.blknum / BLKS_PER_LMAP, csa->bmm);
		else
			bit_set(pblkrec.blknum / BLKS_PER_LMAP, csa->bmm);
		if (pblkrec.blknum > csa->nl->highest_lbm_blk_changed)
			csa->nl->highest_lbm_blk_changed = pblkrec.blknum;
	}
	if (IS_GDS_BLK_DOWNGRADE_NEEDED(pblkrec.ondsk_blkver))
	{	/* This block was not in GDSVCURR format before the GT.M update wrote this PBLK record. But since all buffers
		 * in the cache are stored in GDSVCURR format, the before-image in the PBLK record is in GDSVCURR format.
		 * In order to really undo the update, downgrade the before-image before playing it back.
		 * This can thankfully be done inline (i.e. using the same buffer) due to the following reasons.
		 * a) The reformat routine allows for the source and target buffers to be the same AND
		 * b) The block downgrade routine always needs less space for the target buffer than the source buffer AND
		 * c) Recovery does not rely on the blk_contents of a PBLK journal record other than in this routine.
		 */
		gds_blk_downgrade((v15_blk_hdr_ptr_t)pblkcontents, (blk_hdr_ptr_t)pblkcontents);
	}
	db_ctl = rctl->db_ctl;
	/* Apply PBLKs to the database of "rctl".
	 * This only takes place during rollback/recover, and is thus the first restoration being done to the database;
	 * therefore, it will not cause a conflict with the write cache, as the cache will be empty.
	 */
	db_ctl->op = FC_WRITE;
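	/* The write position below is expressed in DISK_BLOCK_SIZE units. Worked example with hypothetical values :
	 * blk_size = 4096, DISK_BLOCK_SIZE = 512, blknum = 100 and start_vbn = 513 give
	 * op_pos = (4096 / 512) * 100 + 513 = 1313, i.e. the 512-byte virtual block at which GDS block 100 begins.
	 */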
	db_ctl->op_pos = ((gtm_int64_t)(csd->blk_size / DISK_BLOCK_SIZE) * pblkrec.blknum) + csd->start_vbn;
	/* Use the jrec size even if the downgrade may have shrunk the block. If the block has an integ error,
	 * we don't run into any trouble.
	 */
	size = pblkrec.bsiz;
	assert(size <= csd->blk_size);
	if (size > csd->blk_size)	/* safety check in pro to avoid buffer overflows */
		size = csd->blk_size;
	/* If full-block-writes are enabled, round size up to the next full logical filesystem block. We want to use "dbfilop"
	 * to do the write, but it does not honour the full-block-writes setting. So prepare the buffer accordingly before
	 * invoking it.
	 */
	if (csa->do_fullblockwrites)
	{	/* Determine the full-block-write size corresponding to the current PBLK record block size (need to write only
		 * as many full blocks as needed for the current block size). For example, with database block size 16K, current
		 * block size (in the pblk record) 3K and filesystem pagesize (fullblockwrite_len) 4K, it is enough to only
		 * write 4K of data out for the current pblk record (instead of the entire 16K).
		 */
		fullblockwrite_len = (int4)csa->fullblockwrite_len;
		assert(fullblockwrite_len);
		fbw_size = (int4)ROUND_UP(size, fullblockwrite_len);
		/* Even though we are going to write full-block-write aligned blocks, we are not going to copy the pblk record
		 * to an alternate buffer. We are going to copy whatever follows the pblk record in the journal file (and has
		 * been read into the mur_desc buffers) into the database block as part of the full-block write. It is ok to do
		 * so since the database does not care about the data that follows the valid end of the block. But we need to
		 * ensure that there is referenceable memory for the entire length of the full-block write. This is guaranteed
		 * because of the layout of the mur_desc buffers. We have a contiguous sequence of 5 buffers (random_buff,
		 * aux_buff1, seq_buff[0], seq_buff[1], aux_buff2) each occupying MUR_BUFF_SIZE bytes. Usually the PBLK record
		 * is expected to lie somewhere in seq_buff[0] or seq_buff[1]. If at all, it can overflow into aux_buff2.
		 * But aux_buff2 is an overflow buffer and therefore can contain at most one PBLK record (overflowing from
		 * seq_buff[1]) and since the current value of MUR_BUFF_SIZE is 128K, we have enough room to hold one
		 * GDS block (given that the maximum database block size is MAX_DB_BLK_SIZE which is 64K). All this is
		 * asserted below so that whenever these constants change, this code is reworked.
		 */
		DEBUG_ONLY(pblk_jrec_start = (uchar_ptr_t)&jnlrec->jrec_pblk;)
		assert(pblk_jrec_start > rctl->mur_desc->aux_buff1);
		/* assert that PBLK record ends AFTER aux_buff1 ends */
		assert((pblk_jrec_start + fbw_size) > rctl->mur_desc->seq_buff[0].base);
		assert(pblk_jrec_start < rctl->mur_desc->aux_buff2.base);	/* assert that PBLK record begins BEFORE aux_buff2 */
		assert((pblk_jrec_start + fbw_size) < rctl->mur_desc->aux_buff2.top);
		assert((pblk_jrec_start + fbw_size) < (rctl->mur_desc->aux_buff2.base + MAX_DB_BLK_SIZE));
		assert(MUR_BUFF_SIZE > MAX_DB_BLK_SIZE);
	} else
		fbw_size = size;
	db_ctl->op_buff = pblkcontents;
	db_ctl->op_len = fbw_size;
	/* During the recovery process, the dat file is recreated by reading the PBLK records from the jnl file and applying
	 * them to the dat file. In case the database is encrypted, the journal file would also have encrypted PBLK records,
	 * so as long as both the journal file and the database file have the same encryption keys (the usual case) we don't
	 * need to do any encryption in this function. But if the keys are different, we need to decrypt the journal record
	 * using the key from the journal file and re-encrypt it using the key from the database file before applying the
	 * PBLK record.
	 */
#	ifdef GTM_CRYPT
	bp = (blk_hdr_ptr_t)pblkcontents;
	req_enc_blk_size = MIN(csd->blk_size, bp->bsiz) - SIZEOF(*bp);
	jctl = rctl->jctl;
	if (!jctl->is_same_hash_as_db && BLOCK_REQUIRE_ENCRYPTION(csd->is_encrypted, bp->levl, req_enc_blk_size))
	{
		ASSERT_ENCRYPTION_INITIALIZED;
		/* The below assert cannot be moved before the BLOCK_REQUIRE_ENCRYPTION check done above as tmp_ptr could
		 * potentially point to a V4 block, in which case the assert might fail when a V4 block is cast to
		 * a V5 block header.
		 */
		assert((bp->bsiz <= csd->blk_size) && (bp->bsiz >= SIZEOF(*bp)));
		GTMCRYPT_DECODE_FAST(jctl->encr_key_handle, (char *)(bp + 1), req_enc_blk_size, NULL, crypt_status);
		if (0 == crypt_status)
			GTMCRYPT_ENCODE_FAST(csa->encr_key_handle, (char *)(bp + 1), req_enc_blk_size, NULL, crypt_status);
		if (0 != crypt_status)
		{
			GC_GTM_PUTMSG(crypt_status, NULL);
			return crypt_status;
		}
	}
#	endif
	rctl->db_updated = TRUE;	/* updated database corresponding to this region */
#	ifdef UNIX
	if (!murgbl.incr_onln_rlbk_cycle && jgbl.onlnrlbk)
	{
		murgbl.incr_onln_rlbk_cycle = TRUE;
		/* Now that we have started updating the database, do NOT honor any more interrupts like MUPIP STOP */
		assert(NULL != jnlpool.repl_inst_filehdr);
		send_msg(VARLSTCNT(1) ERR_ORLBKNOSTP);
		gtm_putmsg(VARLSTCNT(1) ERR_ORLBKNOSTP);
		assert(blocksig_initialized);	/* set to TRUE at process startup time */
		savemask = block_sigsent;
		sigdelset(&savemask, SIGALRM);	/* Block all signals except SIGALRM */
		sigprocmask(SIG_BLOCK, &savemask, NULL);	/* No more MUPIP STOPs until completion */
	}
#	endif
	return (dbfilop(db_ctl));
}