/* fis-gtm/sr_port/mur_apply_pblk.c
 * (463 lines, 20 KiB, C)
 */

/****************************************************************
* *
* Copyright 2003, 2012 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "gdsroot.h"
#include "gdsblk.h"
#include "gdsbt.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "jnl.h"
#include "buddy_list.h"
#include "hashtab_int4.h" /* needed for muprec.h */
#include "hashtab_int8.h" /* needed for muprec.h */
#include "hashtab_mname.h" /* needed for muprec.h */
#include "muprec.h"
#include "mur_read_file.h"
#include "iosp.h"
#include "gtmmsg.h"
#include "send_msg.h"
#include "dbfilop.h"
#include "gds_blk_downgrade.h"
#include "gdsbml.h"
#include "bit_clear.h"
#include "bit_set.h"
#include "min_max.h"
#include "anticipatory_freeze.h"
#include "eintr_wrappers.h"
#ifdef GTM_CRYPT
#include "gtm_string.h"
#endif
/* Headers and global references that are only needed on UNIX builds. */
#if defined(UNIX)
#include "gtm_unistd.h"
#include "gdsbgtr.h"
#include "repl_msg.h"
#include "gtmsource.h"
#include <signal.h>
GBLREF gd_region *gv_cur_region; /* pointed at each region in turn before the DB_FSYNC call in mur_apply_pblk */
GBLREF volatile int4 db_fsync_in_prog; /* for DB_FSYNC macro usage */
GBLREF sigset_t block_sigsent; /* copied (minus SIGALRM) into the mask blocked by mur_output_pblk during online rollback */
GBLREF boolean_t blocksig_initialized; /* asserted TRUE before block_sigsent is used */
GBLREF jnlpool_addrs jnlpool; /* only referenced in an assert in mur_output_pblk */
#endif
/* Global references used on all platforms (presumably defined elsewhere in the project). */
GBLREF reg_ctl_list *mur_ctl; /* array of per-region control entries; murgbl.reg_total of them are iterated */
GBLREF mur_gbls_t murgbl;
GBLREF mur_opt_struct mur_options;
GBLREF seq_num seq_num_zero; /* not referenced by name in the functions of this file */
GBLREF jnl_gbls_t jgbl;
/* Error codes issued or returned by the functions below. */
error_def(ERR_JNLREAD);
error_def(ERR_JNLREADBOF);
error_def(ERR_JNLBADRECFMT);
error_def(ERR_NOPREVLINK);
/* NOTE(review): the three MUINFO* codes are not referenced by name below; presumably they are
 * needed by the PRINT_VERBOSE_* macros -- confirm before removing.
 */
error_def(ERR_MUINFOUINT4);
error_def(ERR_MUINFOUINT8);
error_def(ERR_MUINFOSTR);
error_def(ERR_DBFSYNCERR);
error_def(ERR_ORLBKNOSTP);
/* Apply PBLK (before-image) journal records to the database for every region in mur_ctl.
 *
 * Parameter:
 *	apply_intrpt_pblk - TRUE  : only regions whose rctl->jfh_recov_interrupted is set are processed (others are
 *				    skipped). Processing starts at the last valid record (lvrec_off) of rctl->jctl and
 *				    stops at the EPOCH whose offset equals the journal file header's turn_around_offset.
 *			    FALSE : processing starts either at the last record of the latest generation (VERIFY and
 *				    recovery not previously interrupted) or at rctl->jctl_apply_pblk's
 *				    apply_pblk_stop_offset, and stops at the first EPOCH at or before the turn-around
 *				    offset in rctl->jctl_turn_around.
 *
 * For each region, records are read with mur_prev()/mur_prev_rec(); every JRT_PBLK record is handed to
 * mur_output_pblk(). On UNIX builds the region's database is then synced through the DB_FSYNC macro.
 *
 * Returns SS_NORMAL when all regions were processed. Otherwise returns one of ERR_JNLBADRECFMT, ERR_NOPREVLINK,
 * ERR_JNLREADBOF, ERR_JNLREAD, ERR_DBFSYNCERR, the non-normal status of mur_output_pblk(), or the read status
 * that caused the record scan to stop.
 */
uint4 mur_apply_pblk(boolean_t apply_intrpt_pblk)
{
uint4 status;
reg_ctl_list *rctl, *rctl_top;
jnl_ctl_list *tmpjctl;
file_control *fc;
/* NOTE(review): "opcode" and "inctn_rec" are not referenced by name in this function; presumably they are used
 * by the MUR_INCTN_BLKS_TO_UPGRD_ADJUST macro -- confirm before removing.
 */
inctn_opcode_t opcode;
struct_jrec_inctn *inctn_rec;
jnl_ctl_list *jctl;
enum jnl_record_type rectype;
int save_errno;
jnl_record *jnlrec;
UNIX_ONLY(unix_db_info *udi;)
for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++)
{
/* Step 1 : choose the journal generation (jctl) and the record offset to start reading from */
if (!apply_intrpt_pblk)
{
assert(NULL != rctl->jctl_turn_around);
if (!rctl->jfh_recov_interrupted)
{
if (mur_options.verify)
{
/* advance to the latest generation of this region's journal files */
jctl = rctl->jctl;
assert(jctl->reg_ctl == rctl);
while (NULL != jctl->next_gen)
{
jctl = jctl->next_gen;
assert(jctl->reg_ctl == rctl);
}
rctl->jctl = jctl;
jctl->rec_offset = jctl->lvrec_off; /* Start from last record */
} else
{
/* resume from the generation/offset recorded in jctl_apply_pblk */
jctl = rctl->jctl = rctl->jctl_apply_pblk;
assert(NULL != jctl);
assert(jctl->reg_ctl == rctl);
jctl->rec_offset = jctl->apply_pblk_stop_offset;
}
} else /* recover interrupted earlier */
{ /* We already called mur_apply_pblk() to undo recover generated PBLKs.
* Later we followed the next_jnl_file_name links to setup jctl list for this region.
* We later called mur_back_process() to resolve transactions using the new turn-around point,
* but mur_back_process() did not apply PBLKs for interrupted recovery (even for NOVERIFY).
* Last time we called this routine, we set rctl->jctl_apply_pblk.
* Now we are in the phase to apply original GT.M generated PBLKs.
* We skip application of PBLKs till the last recover's turn-around point.
*/
assert(!mur_options.rollback_losttnonly);
jctl = rctl->jctl = rctl->jctl_apply_pblk;
assert(jctl->reg_ctl == rctl);
assert(jctl->apply_pblk_stop_offset);
jctl->rec_offset = jctl->apply_pblk_stop_offset;
DEBUG_ONLY(
/* assert that first pass turn-around-point is later than the final turn-around-point */
for (tmpjctl = jctl; NULL != tmpjctl && tmpjctl != rctl->jctl_turn_around;
tmpjctl = tmpjctl->prev_gen)
;
assert(NULL != tmpjctl && ((tmpjctl != jctl)
|| (jctl->rec_offset >= jctl->turn_around_offset)));
)
}
if (mur_options.verify || rctl->jfh_recov_interrupted)
{ /* if going to apply pblks then store prospective turnaround point now itself
* so we remember to undo PBLKs at least upto here in case this recovery is interrupted.
* in case of normal recovery with -noverify, we would have written this information
* out in mur_back_process() itself so we do not need to write it again here.
*/
rctl->csd->intrpt_recov_tp_resolve_time = jgbl.mur_tp_resolve_time;
rctl->csd->intrpt_recov_resync_seqno = murgbl.resync_seqno;
MUR_SAVE_RESYNC_STRM_SEQNO(rctl, rctl->csd);
/* flush the changed csd to disk */
fc = rctl->gd->dyn.addr->file_cntl;
fc->op = FC_WRITE;
fc->op_buff = (sm_uc_ptr_t)rctl->csd;
fc->op_len = ROUND_UP(SGMNT_HDR_LEN, DISK_BLOCK_SIZE);
fc->op_pos = 1;
/* NOTE(review): the return value of dbfilop() is not checked here (mur_output_pblk() does propagate it)
 * -- confirm that ignoring a failed file-header write is intentional.
 */
dbfilop(fc);
}
} else
{
assert(murgbl.ok_to_update_db);
assert(NULL == rctl->jctl_turn_around);
if (!rctl->jfh_recov_interrupted)
continue;
/* Recover was interrupted earlier. We are in the phase to apply interrupted recovery generated PBLKs.
* In interrupted pblk applying phase, it is possible that we would be playing PBLKs of recover-created
* as well as GT.M created journal files. this is necessary until we reach the saved turn-around point
* of the previous interrupted recovery.
*
* Example of why we need to play GT.M generated (in addition to recover generated PBLKs) is below.
*
* Assume GT.M crashed and
* journal file layout now is a_1.mjl <-- a.mjl.
* First recovery found turn-around point in a.mjl so it renamed a.mjl to a_2.mjl and created
* a.mjl and played a few post-turn-around-point records into a.mjl when it was interrupted
* journal file layout now is a_1.mjl <-- a_2.mjl <-- a.mjl
* Second recovery had a specified turn-around point which was in a_1.mjl and it took the
* minimum of the specified and saved (in a_2.mjl) turn-around points and undid PBLKs
* upto a_1.mjl and was about to create a new a.mjl (which pointed back to a_1.mjl) after
* renaming the current a.mjl, but crashed before the rename. Note that at this point a_1.mjl
* has a non-zero turn-around-offset set and the database has been rolled back to a_1.mjl.
* journal file layout now is a_1.mjl <-- a_2.mjl <-- a.mjl
* Third recovery is now attempted. This will do interrupted PBLK processing (now upto the
* saved turn-around-offset which is in a_1.mjl). It has to undo PBLKs of a.mjl, a_2.mjl and
* a_1.mjl in the process of reaching there. If instead it undid only PBLKs of recover-created
* journal files (which will be only a.mjl) and went to the saved turn-around-offset in
* a_1.mjl, we would have rolled back the database to a state as of the end of a_2.mjl
* although a previous recovery had rolled the database back to a previous generation (a_1.mjl)
* This will mean we left out playing PBLKs in a_2.mjl and a_1.mjl which can cause integrity errors.
*/
jctl = rctl->jctl; /* Latest generation */
assert(jctl->reg_ctl == rctl);
assert(NULL == jctl->next_gen);
jctl->rec_offset = jctl->lvrec_off; /* Start from last record */
}
/* Step 2 : scan journal records and apply PBLKs. This outer loop iterates again only after bad records in the
 * tail of the journal file have been skipped by mur_fread_eof_crash() at the bottom of the loop body.
 */
for ( ; ;)
{
assert(0 != jctl->rec_offset);
if (!apply_intrpt_pblk)
{
PRINT_VERBOSE_STAT(jctl, "mur_apply_blk:start");
} else
{
PRINT_VERBOSE_STAT(jctl, "mur_apply_blk:start : Apply Interrupted PBLK");
}
/* Note that mur_prev_rec() is passed &jctl, so jctl may point to a different generation after the call */
for (status = mur_prev(jctl, jctl->rec_offset), jctl->after_end_of_data = TRUE;
SS_NORMAL == status; status = mur_prev_rec(&jctl))
{
jnlrec = rctl->mur_desc->jnlrec;
rectype = (enum jnl_record_type)jnlrec->prefix.jrec_type;
/* after_end_of_data stays TRUE only as long as every record seen so far is at or beyond end_of_data */
jctl->after_end_of_data = jctl->after_end_of_data &&
(jctl->rec_offset >= jctl->jfh->end_of_data);
if (apply_intrpt_pblk)
{
if (NULL == rctl->jctl_alt_head && !jctl->jfh->recover_interrupted)
{
assert(NULL != jctl->next_gen);
assert(jctl->next_gen->jfh->recover_interrupted);
rctl->jctl_alt_head = jctl->next_gen;/* Save the recover generated journal
files we finished processing */
jctl->next_gen = NULL; /* Since we do not want to process them again */
}
if ((JRT_INCTN == rectype) && jctl->jfh->recover_interrupted)
{
MUR_INCTN_BLKS_TO_UPGRD_ADJUST(rctl);
}
}
if (JRT_EPOCH == rectype)
{
assert(NULL != rctl->csd);
if (!apply_intrpt_pblk)
{
if ((jctl == rctl->jctl_turn_around)
&& (jctl->rec_offset <= jctl->turn_around_offset))
{ /* jctl->rec_offset can be different from jctl->turn_around_offset in
* case of mur_ztp_lookback() processing. But we are guaranteed an epoch
* at the start of every journal file, so we should encounter an epoch
* in the same journal file as rctl->jctl_turn_around. We have now reached
* the turn-around point.
* Note that the following assignments should parallel those done in
* mur_back_process on reaching the turn-around point.
*/
assert((jctl->rec_offset != jctl->turn_around_offset)
|| (jctl->turn_around_time == jnlrec->prefix.time));
assert((jctl->rec_offset != jctl->turn_around_offset)
|| (jctl->turn_around_seqno == jnlrec->jrec_epoch.jnl_seqno));
assert((jctl->rec_offset != jctl->turn_around_offset)
|| (jctl->turn_around_tn == ((jrec_prefix *)jnlrec)->tn));
rctl->jctl_turn_around = jctl;
jctl->turn_around_offset = jctl->rec_offset;
jctl->turn_around_time = jnlrec->prefix.time;
jctl->turn_around_seqno = jnlrec->jrec_epoch.jnl_seqno;
jctl->turn_around_tn = jnlrec->prefix.tn;
break;
}
} else
{
if (jctl->rec_offset == jctl->jfh->turn_around_offset)
{ /* we reached the turn-around point of last interrupted recovery */
assert(jctl->jfh->turn_around_time == jnlrec->prefix.time);
assert(rctl->jctl_head == jctl);
/* note down the fact that we have applied PBLKs upto this point */
rctl->jctl_apply_pblk = jctl;
jctl->apply_pblk_stop_offset = jctl->rec_offset;
break;
} else if (jctl->rec_offset < jctl->jfh->turn_around_offset)
{
/* an EPOCH below the saved turn-around offset without having hit it exactly */
PRINT_VERBOSE_STAT(jctl, "mur_apply_blk:turn_around_offset is bad");
gtm_putmsg(VARLSTCNT(5) ERR_JNLBADRECFMT, 3, jctl->jnl_fn_len,
jctl->jnl_fn, jctl->rec_offset);
return ERR_JNLBADRECFMT;
}
}
} else if ((JRT_PBLK == rectype) && (SS_NORMAL != (status = mur_output_pblk(rctl))))
{
PRINT_VERBOSE_STAT(jctl, "mur_apply_blk:mur_output_pblk failed");
return status;
}
}
PRINT_VERBOSE_STAT(jctl, "mur_apply_blk:end");
/* status is SS_NORMAL here only if the scan loop was left through one of the turn-around-point "break"s */
if (SS_NORMAL == status)
break;
if (ERR_NOPREVLINK == status)
{
gtm_putmsg(VARLSTCNT(4) ERR_NOPREVLINK, 2, jctl->jnl_fn_len, jctl->jnl_fn);
return ERR_NOPREVLINK;
} else if (ERR_JNLREADBOF == status)
{
gtm_putmsg(VARLSTCNT(4) ERR_JNLREADBOF, 2, jctl->jnl_fn_len, jctl->jnl_fn);
return ERR_JNLREADBOF;
} else if (ERR_JNLREAD == status) /* This message is already issued in mur_read_file */
return ERR_JNLREAD;
/* Any other failure is tolerated only in the latest generation and at or beyond end_of_data */
if ((NULL != jctl->next_gen) || (jctl->rec_offset < jctl->jfh->end_of_data))
{
gtm_putmsg(VARLSTCNT(5) ERR_JNLBADRECFMT, 3, jctl->jnl_fn_len,
jctl->jnl_fn, jctl->rec_offset);
/* NOTE(review): the message issued is JNLBADRECFMT but the value returned is the raw read status */
return status;
}
/* We are in the interrupted pblk application phase and applying either interrupted recovery
* generated pblks or GT.M generated pblks and encounter bad records in the tail of the
* last generation journal file that was active during the crash. Skip those and continue. */
PRINT_VERBOSE_TAIL_BAD(jctl);
if (SS_NORMAL != mur_fread_eof_crash(jctl, jctl->jfh->end_of_data, jctl->rec_offset))
return ERR_JNLBADRECFMT;
} /* end infinite for */
/* Step 3 (UNIX only) : sync this region's database through DB_FSYNC; a non-zero save_errno is reported to
 * both send_msg and gtm_putmsg and aborts the whole routine with ERR_DBFSYNCERR.
 */
UNIX_ONLY(
gv_cur_region = rctl->csa->region;
udi = FILE_INFO(gv_cur_region);
DB_FSYNC(gv_cur_region, udi, rctl->csa, db_fsync_in_prog, save_errno);
if (0 != save_errno)
{
send_msg(VARLSTCNT(5) ERR_DBFSYNCERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
gtm_putmsg(VARLSTCNT(5) ERR_DBFSYNCERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
return ERR_DBFSYNCERR;
}
)
}
return SS_NORMAL;
}
/* Write the before-image carried by the PBLK journal record currently held in rctl->mur_desc->jnlrec
 * into the database file of region "rctl".
 *
 * Steps, in order:
 *	1. Return SS_NORMAL right away (nothing written) for a LOSTTNONLY rollback.
 *	2. If the block is a local bitmap, bring the master map bit for that bitmap in line with whether the
 *	   before-image still has free space, and track the highest local bitmap block changed.
 *	3. Downgrade the before-image in place if its on-disk block version calls for it.
 *	4. Work out the write length (optionally rounded up to the full-block-write unit) and fill in rctl->db_ctl.
 *	5. (GTM_CRYPT builds) if the journal and database hashes differ and the block requires encryption,
 *	   decrypt with the journal key and re-encrypt with the database key.
 *	6. (UNIX builds) on the first database update of an online rollback, issue ORLBKNOSTP and block signals.
 *	7. Issue the write through dbfilop().
 *
 * Returns SS_NORMAL for step 1, the non-zero crypt status if step 5 fails, otherwise whatever dbfilop() returns.
 */
uint4 mur_output_pblk(reg_ctl_list *rctl)
{
	jnl_ctl_list		*jctl;
	file_control		*db_ctl;
	struct_jrec_blk		pblkrec;
	uchar_ptr_t		blk_image, jrec_base;
	int4			blk_bytes, write_bytes, fbw_unit, lmap_blk_cnt;
	sgmnt_addrs		*csa, *repl_csa;	/* repl_csa is not referenced by name in this function */
	node_local		*cnl;			/* cnl is not referenced by name in this function */
	sgmnt_data_ptr_t	csd;
	jnl_record		*jnlrec;
	GTMCRYPT_ONLY(
		int		req_enc_blk_size;
		int		crypt_status;
		blk_hdr_ptr_t	bp;
	)
	UNIX_ONLY(sigset_t	savemask;)

	/* A LOSTTNONLY rollback can still get here when one region is NOBEFORE_IMAGE and another is BEFORE_IMAGE.
	 * PBLKs are never applied in that mode.
	 */
	if (mur_options.rollback_losttnonly)
		return SS_NORMAL;
	assert(murgbl.ok_to_update_db);
	csa = rctl->csa;
	csd = rctl->csd;
	jnlrec = rctl->mur_desc->jnlrec;
	/* Take a local copy of the fixed part of the PBLK record. The block image itself trails the structure
	 * ("blk_contents"), so it is addressed directly inside the journal buffer instead of being copied.
	 */
	pblkrec = jnlrec->jrec_pblk;
	blk_image = (uchar_ptr_t)&jnlrec->jrec_pblk.blk_contents[0];
	if (IS_BITMAP_BLK(pblkrec.blknum))
	{	/* Local bitmap : recompute the free/busy status kept for it in the master map */
		lmap_blk_cnt = BLKS_PER_LMAP;
		if (ROUND_DOWN2(csd->trans_hist.total_blks, BLKS_PER_LMAP) == pblkrec.blknum)
			lmap_blk_cnt = (csd->trans_hist.total_blks - pblkrec.blknum);	/* last (possibly partial) bitmap */
		assert(MM_ADDR(csd) == csa->bmm);
		if (NO_FREE_SPACE != bml_find_free(0, blk_image + SIZEOF(blk_hdr), lmap_blk_cnt))
			bit_set(pblkrec.blknum / BLKS_PER_LMAP, csa->bmm);
		else
			bit_clear(pblkrec.blknum / BLKS_PER_LMAP, csa->bmm);
		if (csa->nl->highest_lbm_blk_changed < pblkrec.blknum)
			csa->nl->highest_lbm_blk_changed = pblkrec.blknum;
	}
	if (IS_GDS_BLK_DOWNGRADE_NEEDED(pblkrec.ondsk_blkver))
	{	/* The before-image is not in the format the block had on disk, so convert it back before replaying it.
		 * Source and target of the downgrade are the same buffer; nothing else in recovery depends on the
		 * blk_contents of this record afterwards.
		 */
		gds_blk_downgrade((v15_blk_hdr_ptr_t)blk_image, (blk_hdr_ptr_t)blk_image);
	}
	/* The length comes from the journal record even if the downgrade shrank the block. In pro builds an
	 * oversized length is clamped to the database block size so that the write cannot overrun.
	 */
	blk_bytes = pblkrec.bsiz;
	assert(blk_bytes <= csd->blk_size);
	if (csd->blk_size < blk_bytes)
		blk_bytes = csd->blk_size;
	/* "dbfilop" does not itself honour full-block-writes, so the rounding is done here */
	write_bytes = blk_bytes;
	if (csa->do_fullblockwrites)
	{	/* Round up only as far as the block in hand needs (e.g. a 3K block with a 4K unit writes 4K, not the
		 * whole database block). The bytes beyond the block are whatever follows the record in the journal
		 * read buffers; the asserts below check that this extra range stays inside those buffers.
		 */
		fbw_unit = (int4)csa->fullblockwrite_len;
		assert(fbw_unit);
		write_bytes = (int4)ROUND_UP(blk_bytes, fbw_unit);
		DEBUG_ONLY(jrec_base = (uchar_ptr_t)&jnlrec->jrec_pblk;)
		assert(jrec_base > rctl->mur_desc->aux_buff1);		/* record starts beyond aux_buff1 */
		assert((jrec_base + write_bytes) > rctl->mur_desc->seq_buff[0].base);
		assert(jrec_base < rctl->mur_desc->aux_buff2.base);	/* record starts before aux_buff2 */
		assert((jrec_base + write_bytes) < rctl->mur_desc->aux_buff2.top);
		assert((jrec_base + write_bytes) < (rctl->mur_desc->aux_buff2.base + MAX_DB_BLK_SIZE));
		assert(MUR_BUFF_SIZE > MAX_DB_BLK_SIZE);
	}
	/* Describe the write : position of block "blknum" in units of DISK_BLOCK_SIZE, offset by start_vbn */
	db_ctl = rctl->db_ctl;
	db_ctl->op = FC_WRITE;
	db_ctl->op_pos = ((gtm_int64_t)(csd->blk_size / DISK_BLOCK_SIZE) * pblkrec.blknum) + csd->start_vbn;
	db_ctl->op_buff = blk_image;
	db_ctl->op_len = write_bytes;
#	ifdef GTM_CRYPT
	/* PBLK images in the journal are encrypted with the journal file's key. That is directly usable when
	 * journal and database share the same key hash; otherwise the image is re-keyed below.
	 */
	bp = (blk_hdr_ptr_t) blk_image;
	req_enc_blk_size = MIN(csd->blk_size, bp->bsiz) - SIZEOF(*bp);
	jctl = rctl->jctl;
	if (!jctl->is_same_hash_as_db)
	{
		if (BLOCK_REQUIRE_ENCRYPTION(csd->is_encrypted, bp->levl, req_enc_blk_size))
		{
			ASSERT_ENCRYPTION_INITIALIZED;
			/* This assert must stay after the BLOCK_REQUIRE_ENCRYPTION check : the buffer could hold a V4
			 * block, whose header would not satisfy it when viewed as a V5 block header.
			 */
			assert((bp->bsiz <= csd->blk_size) && (bp->bsiz >= SIZEOF(*bp)));
			GTMCRYPT_DECODE_FAST(jctl->encr_key_handle, (char *)(bp + 1), req_enc_blk_size, NULL, crypt_status);
			if (0 == crypt_status)
			{
				GTMCRYPT_ENCODE_FAST(csa->encr_key_handle, (char *)(bp + 1), req_enc_blk_size, NULL,
					crypt_status);
			}
			if (0 != crypt_status)
			{
				GC_GTM_PUTMSG(crypt_status, NULL);
				return crypt_status;
			}
		}
	}
#	endif
	rctl->db_updated = TRUE;	/* this region's database is being modified */
#	ifdef UNIX
	if (jgbl.onlnrlbk && !murgbl.incr_onln_rlbk_cycle)
	{	/* First database update of an online rollback : from here on do not honour interrupts such as MUPIP STOP */
		murgbl.incr_onln_rlbk_cycle = TRUE;
		assert(NULL != jnlpool.repl_inst_filehdr);
		send_msg(VARLSTCNT(1) ERR_ORLBKNOSTP);
		gtm_putmsg(VARLSTCNT(1) ERR_ORLBKNOSTP);
		assert(blocksig_initialized);	/* set to TRUE at process startup time */
		savemask = block_sigsent;
		sigdelset(&savemask, SIGALRM);	/* SIGALRM is left out of the set to be blocked */
		sigprocmask(SIG_BLOCK, &savemask, NULL);
	}
#	endif
	return dbfilop(db_ctl);
}