/* fis-gtm/sr_unix/mu_swap_root.c */
/****************************************************************
 *                                                              *
 *     Copyright 2012, 2013 Fidelity Information Services, Inc  *
 *                                                              *
 *     This source code contains the intellectual property      *
 *     of its copyright holder(s), and is made available        *
 *     under a license.  If you do not know the terms of        *
 *     the license, please stop and do not read further.        *
 *                                                              *
 ****************************************************************/
#include "mdef.h"
#include "gtm_string.h"
#include "cdb_sc.h"
#include "gdsroot.h"
#include "gdsblk.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "jnl.h"
#include "gdsblkops.h"
#include "gdskill.h"
#include "gdscc.h"
#include "copy.h"
#include "interlock.h"
#include "muextr.h"
#include "mu_reorg.h"
/* Include prototypes */
#include "t_end.h"
#include "t_retry.h"
#include "mupip_reorg.h"
#include "util.h"
#include "t_begin.h"
#include "op.h"
#include "gvcst_protos.h" /* for gvcst_rtsib,gvcst_search prototype */
#include "gvcst_bmp_mark_free.h"
#include "gvcst_kill_sort.h"
#include "gtmmsg.h"
#include "add_inter.h"
#include "t_abort.h"
#include "sleep_cnt.h"
#include "wcs_sleep.h"
#include "memcoherency.h"
#include "gdsbml.h"
#include "jnl_get_checksum.h"
#include "t_qread.h"
#include "t_create.h"
#include "t_write_map.h"
#include "t_write.h"
#ifdef GTM_TRIGGER
#include "hashtab_mname.h"
#include "gv_trigger.h"
#include "gv_trigger_common.h"
#include "targ_alloc.h"
#endif
GTMTRIG_ONLY(LITREF mval literal_hasht;)
GBLREF sgmnt_data_ptr_t cs_data;
GBLREF sgmnt_addrs *cs_addrs;
GBLREF gd_region *gv_cur_region;
GBLREF gv_key *gv_currkey, *gv_altkey;
GBLREF gv_namehead *gv_target;
GBLREF gv_namehead *reorg_gv_target;
GBLREF unsigned char cw_map_depth;
GBLREF unsigned char cw_set_depth;
GBLREF cw_set_element cw_set[];
GBLREF uint4 t_err;
GBLREF unsigned int t_tries;
GBLREF unsigned char rdfail_detail;
GBLREF inctn_opcode_t inctn_opcode;
GBLREF kill_set *kill_set_tail;
GBLREF sgmnt_addrs *kip_csa;
GBLREF boolean_t mu_reorg_process;
GBLREF boolean_t mu_reorg_upgrd_dwngrd_in_prog;
GBLREF boolean_t need_kip_incr;
GBLREF uint4 update_trans;
GBLREF char *update_array, *update_array_ptr;
GBLREF uint4 update_array_size;
error_def(ERR_DBRDONLY);
error_def(ERR_GBLNOEXIST);
error_def(ERR_MAXBTLEVEL);
error_def(ERR_MUREORGFAIL);
error_def(ERR_MUTRUNCNOTBG);
#define RETRY_SWAP (0)
#define ABORT_SWAP (1)
/* Forward declaration: swap_root_or_directory_block is defined further below but is called by mu_swap_root. */
block_id swap_root_or_directory_block(int parent_blk_lvl, int child_blk_lvl, srch_hist *dir_hist_ptr, block_id child_blk_id,
sm_uc_ptr_t child_blk_ptr, kill_set *kill_set_list, trans_num curr_tn);
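/* mu_swap_root swaps the root block of the named global variable tree, and then each block on the
 * directory tree path leading to it, with a free or recycled block closer to the front of the
 * database file. This is presumably invoked on behalf of MUPIP REORG truncate processing (note the
 * mu_reorg_process assert below) so that mu_truncate can subsequently cut the file back. It always
 * returns TRUE: the "global gone" and MM access method cases are benign and simply end the attempt.
 */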
boolean_t mu_swap_root(mval *gn, int *root_swap_statistic_ptr)
{
sgmnt_data_ptr_t csd;
sgmnt_addrs *csa;
node_local_ptr_t cnl;
srch_hist *dir_hist_ptr, *gvt_hist_ptr;
gv_namehead *save_targ;
block_id root_blk_id, child_blk_id, free_blk_id;
sm_uc_ptr_t root_blk_ptr, child_blk_ptr;
kill_set kill_set_list;
trans_num curr_tn, ret_tn;
int level, root_blk_lvl;
block_id save_root;
boolean_t tn_aborted;
unsigned int lcl_t_tries;
enum cdb_sc status;
# ifdef GTM_TRIGGER
gv_namehead *hasht_tree;
mname_entry gvent;
# endif
DCL_THREADGBL_ACCESS;
SETUP_THREADGBL_ACCESS;
assert(mu_reorg_process);
gv_target->root = 0;
gv_target->clue.end = 0;
dir_hist_ptr = gv_target->alt_hist;
gvt_hist_ptr = &(gv_target->hist);
inctn_opcode = inctn_invalid_op;
# ifdef GTM_TRIGGER
if (IS_MNAME_HASHT_GBLNAME(gn->str))
{ /* Initialize ^#t global for this region. */
csa = cs_addrs; /* needed for SETUP_TRIGGER_GLOBAL and INITIAL_HASHT_ROOT_SEARCH_IF_NEEDED macros */
SETUP_TRIGGER_GLOBAL;
INITIAL_HASHT_ROOT_SEARCH_IF_NEEDED;
DBG_CHECK_GVTARGET_GVCURRKEY_IN_SYNC(CHECK_CSA_TRUE);
if (0 == gv_target->root)
return TRUE;
} else
# endif
op_gvname(VARLSTCNT(1) gn); /* Initialization for current global */
csa = cs_addrs;
cnl = csa->nl;
csd = cs_data; /* Be careful to keep csd up to date. With MM, cs_data can change, and
* dereferencing an older copy can result in a SIG-11.
*/
if (0 == gv_target->root)
{ /* Global does not exist (online rollback). No problem. */
gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_GBLNOEXIST, 2, gn->str.len, gn->str.addr);
return TRUE;
}
if (dba_mm == csd->acc_meth)
/* Return without attempting any swap; for MM databases, mu_truncate will later issue the
 * MUTRUNCNOTBG message.
 */
return TRUE;
SET_GV_ALTKEY_TO_GBLNAME_FROM_GV_CURRKEY; /* set up gv_altkey to be just the gblname */
/* ------------ Swap root block of global variable tree --------- */
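/* The loop below follows the standard GT.M restartable transaction pattern: t_begin opens the
 * transaction, the gvcst_search calls build block histories, and t_end validates those histories
 * against concurrent activity before committing. Any conflict goes through t_retry, which signals
 * a restart, and the loop starts over; t_tries reaching CDB_STAGNATE marks the final retry (hence
 * the asserts below that restarts only occur before that point).
 */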
t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK);
for (;;)
{
curr_tn = csa->ti->curr_tn;
kill_set_list.used = 0;
save_root = gv_target->root;
gv_target->root = csa->dir_tree->root;
gv_target->clue.end = 0;
if (cdb_sc_normal != (status = gvcst_search(gv_altkey, dir_hist_ptr)))
{ /* Assign directory tree path to dir_hist_ptr */
assert(t_tries < CDB_STAGNATE);
gv_target->root = save_root;
t_retry(status);
continue;
}
gv_target->root = save_root;
gv_target->clue.end = 0;
if (cdb_sc_normal != (status = gvcst_search(gv_currkey, NULL)))
{ /* Assign global variable tree path to gvt_hist_ptr */
assert(t_tries < CDB_STAGNATE);
t_retry(status);
continue;
}
/* We have already searched the directory tree in op_gvname/t_retry and obtained gv_target->root.
 * We should restart with cdb_sc_gvtrootmod2 if the two disagree; gvcst_root_search is the final
 * arbiter. We really need that for debugging, and we should also assert that gv_currkey is a
 * global name.
 */
root_blk_lvl = gvt_hist_ptr->depth;
assert(root_blk_lvl > 0);
root_blk_ptr = gvt_hist_ptr->h[root_blk_lvl].buffaddr;
root_blk_id = gvt_hist_ptr->h[root_blk_lvl].blk_num;
assert((CDB_STAGNATE > t_tries) || (gv_target->root == gvt_hist_ptr->h[root_blk_lvl].blk_num));
free_blk_id = swap_root_or_directory_block(0, root_blk_lvl, dir_hist_ptr, root_blk_id,
root_blk_ptr, &kill_set_list, curr_tn);
if (RETRY_SWAP == free_blk_id)
continue;
else if (ABORT_SWAP == free_blk_id)
break;
update_trans = UPDTRNS_DB_UPDATED_MASK;
inctn_opcode = inctn_mu_reorg;
assert(1 == kill_set_list.used);
need_kip_incr = TRUE;
if (!csa->now_crit)
WAIT_ON_INHIBIT_KILLS(cnl, MAXWAIT2KILL);
DEBUG_ONLY(lcl_t_tries = t_tries);
TREF(in_mu_swap_root_state) = MUSWP_INCR_ROOT_CYCLE;
assert(!TREF(in_gvcst_redo_root_search));
if ((trans_num)0 == (ret_tn = t_end(gvt_hist_ptr, dir_hist_ptr, TN_NOT_SPECIFIED)))
{
TREF(in_mu_swap_root_state) = MUSWP_NONE;
need_kip_incr = FALSE;
assert(NULL == kip_csa);
ABORT_TRANS_IF_GBL_EXIST_NOMORE(lcl_t_tries, tn_aborted);
if (tn_aborted)
{ /* It is not an error if the global (that once existed) doesn't exist anymore (due to ROLLBACK) */
gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_GBLNOEXIST, 2, gn->str.len, gn->str.addr);
return TRUE;
}
continue;
}
TREF(in_mu_swap_root_state) = MUSWP_NONE;
/* Note that this particular process's csa->root_search_cycle is now behind cnl->root_search_cycle.
* This forces a cdb_sc_gvtrootmod2 restart in gvcst_bmp_mark_free below.
*/
assert(cnl->root_search_cycle > csa->root_search_cycle);
gvcst_kill_sort(&kill_set_list);
GVCST_BMP_MARK_FREE(&kill_set_list, ret_tn, inctn_mu_reorg, inctn_bmp_mark_free_mu_reorg, inctn_opcode, csa);
DECR_KIP(csd, csa, kip_csa);
*root_swap_statistic_ptr += 1;
break;
}
/* ------------ Swap blocks in branch of directory tree --------- */
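/* Walk the directory tree path to this global bottom-up: h[0] of the search history is the leaf
 * (level-0) block holding the pointer record for the global, and each pass of the outer loop swaps
 * the path block at one level. The walk stops (through the t_abort/return below) once level
 * reaches the path depth; the directory tree root itself is never swapped (see the assert on
 * child_blk_id).
 */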
for (level = 0; level <= MAX_BT_DEPTH; level++)
{
t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK);
for (;;)
{
curr_tn = csa->ti->curr_tn;
kill_set_list.used = 0;
save_root = gv_target->root;
gv_target->root = csa->dir_tree->root;
gv_target->clue.end = 0;
if (cdb_sc_normal != (status = gvcst_search(gv_altkey, dir_hist_ptr)))
{ /* assign branch path of directory tree into dir_hist_ptr */
assert(t_tries < CDB_STAGNATE);
gv_target->root = save_root;
t_retry(status);
continue;
}
gv_target->root = save_root;
gv_target->clue.end = 0;
if (level >= dir_hist_ptr->depth)
{ /* done */
t_abort(gv_cur_region, csa);
return TRUE;
}
child_blk_ptr = dir_hist_ptr->h[level].buffaddr;
child_blk_id = dir_hist_ptr->h[level].blk_num;
assert(csa->dir_tree->root != child_blk_id);
free_blk_id = swap_root_or_directory_block(level + 1, level, dir_hist_ptr, child_blk_id,
child_blk_ptr, &kill_set_list, curr_tn);
if (RETRY_SWAP == free_blk_id)
continue;
else if (ABORT_SWAP == free_blk_id)
break;
if (level == 0)
/* Set level to 1 to mark that this kill-set entry is for a level-0 block in the directory tree.
 * The kill-set level is later used in gvcst_bmp_mark_free to assign a special value to the
 * cw_set_element, which t_end eventually uses to write the block to the snapshot file. Note that
 * this must come after the RETRY/ABORT checks above, because on those paths
 * swap_root_or_directory_block returns without having added a kill-set entry.
 */
kill_set_list.blk[kill_set_list.used - 1].level = 1;
update_trans = UPDTRNS_DB_UPDATED_MASK;
inctn_opcode = inctn_mu_reorg;
assert(1 == kill_set_list.used);
need_kip_incr = TRUE;
if (!csa->now_crit)
WAIT_ON_INHIBIT_KILLS(cnl, MAXWAIT2KILL);
DEBUG_ONLY(lcl_t_tries = t_tries);
TREF(in_mu_swap_root_state) = MUSWP_DIRECTORY_SWAP;
if ((trans_num)0 == (ret_tn = t_end(dir_hist_ptr, NULL, TN_NOT_SPECIFIED)))
{
TREF(in_mu_swap_root_state) = MUSWP_NONE;
need_kip_incr = FALSE;
assert(NULL == kip_csa);
continue;
}
TREF(in_mu_swap_root_state) = MUSWP_NONE;
gvcst_kill_sort(&kill_set_list);
TREF(in_mu_swap_root_state) = MUSWP_FREE_BLK;
GVCST_BMP_MARK_FREE(&kill_set_list, ret_tn, inctn_mu_reorg, inctn_bmp_mark_free_mu_reorg,
inctn_opcode, csa);
TREF(in_mu_swap_root_state) = MUSWP_NONE;
DECR_KIP(csd, csa, kip_csa);
break;
}
}
return TRUE;
}
/* Finds a free (or recycled) block below child_blk_id and adds the information needed to move the
 * child block there to the update array and cw_set. Returns the new block's id on success, or
 * RETRY_SWAP (a restart was signalled via t_retry) or ABORT_SWAP (the swap was abandoned via t_abort).
 */
block_id swap_root_or_directory_block(int parent_blk_lvl, int child_blk_lvl, srch_hist *dir_hist_ptr, block_id child_blk_id,
sm_uc_ptr_t child_blk_ptr, kill_set *kill_set_list, trans_num curr_tn)
{
sgmnt_data_ptr_t csd;
sgmnt_addrs *csa;
node_local_ptr_t cnl;
srch_blk_status bmlhist, freeblkhist;
block_id hint_blk_num, free_blk_id, parent_blk_id;
boolean_t free_blk_recycled;
int4 master_bit, num_local_maps, free_bit, hint_bit, maxbitsthismap;
uint4 total_blks;
int blk_seg_cnt, blk_size;
sm_uc_ptr_t parent_blk_ptr, bn_ptr, saved_blk;
blk_segment *bs1, *bs_ptr;
int parent_blk_size, child_blk_size, bsiz;
int rec_size1, curr_offset, bpntr_end, hdr_len;
int tmp_cmpc;
cw_set_element *tmpcse;
jnl_buffer_ptr_t jbbp; /* jbbp is non-NULL only if before-image journaling */
unsigned short temp_ushort;
unsigned long temp_long;
unsigned char save_cw_set_depth;
DCL_THREADGBL_ACCESS;
SETUP_THREADGBL_ACCESS;
csd = cs_data;
csa = cs_addrs;
cnl = csa->nl;
blk_size = csd->blk_size;
/* Find a free/recycled block for new block location. */
hint_blk_num = 0;
total_blks = csa->ti->total_blks;
num_local_maps = DIVIDE_ROUND_UP(total_blks, BLKS_PER_LMAP);
master_bit = bmm_find_free((hint_blk_num / BLKS_PER_LMAP), csa->bmm, num_local_maps);
if (NO_FREE_SPACE == master_bit)
{
t_abort(gv_cur_region, csa);
return ABORT_SWAP;
}
bmlhist.blk_num = (block_id)master_bit * BLKS_PER_LMAP;
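/* A worked example of the master-map arithmetic above, with illustrative numbers (BLKS_PER_LMAP is
 * 512 in typical GT.M builds): for total_blks = 10000, num_local_maps = DIVIDE_ROUND_UP(10000, 512)
 * = 20. bmm_find_free scans the master map from local map 0 (hint_blk_num is 0 here) and returns
 * the index of the first local map with free space; that map's own block number is master_bit * 512.
 */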
if (NULL == (bmlhist.buffaddr = t_qread(bmlhist.blk_num, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr)))
{
assert(t_tries < CDB_STAGNATE);
t_retry((enum cdb_sc)rdfail_detail);
return RETRY_SWAP;
}
hint_bit = 0;
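/* Only the last local map can cover fewer than BLKS_PER_LMAP blocks; in that case, limit the bit
 * scan to the blocks that actually exist (total_blks - bmlhist.blk_num).
 */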
maxbitsthismap = (master_bit != (num_local_maps - 1)) ? BLKS_PER_LMAP : total_blks - bmlhist.blk_num;
free_bit = bm_find_blk(hint_bit, bmlhist.buffaddr + SIZEOF(blk_hdr), maxbitsthismap, &free_blk_recycled);
free_blk_id = bmlhist.blk_num + free_bit;
if (DIR_ROOT >= free_blk_id)
{ /* Bitmap block 0 and directory tree root block 1 should always be marked busy. */
assert(t_tries < CDB_STAGNATE);
t_retry(cdb_sc_badbitmap);
return RETRY_SWAP;
}
if (child_blk_id <= free_blk_id)
{ /* Stop swapping root or directory tree blocks once the database has been truncated far enough.
 * A good heuristic: if the block would be swapped into an equal or higher block number, do not swap.
 */
t_abort(gv_cur_region, csa);
return ABORT_SWAP;
}
/* ====== begin update array ======
* Four blocks get changed.
* 1. Free block becomes busy and gains the contents of child (root block/directory tree block)
* 2. Parent block in directory tree remains busy, but points to new root block location.
* 3. Free block's corresponding bitmap reflects above change.
* 4. Child block gets marked recycled in bitmap. (GVCST_BMP_MARK_FREE)
*/
parent_blk_ptr = dir_hist_ptr->h[parent_blk_lvl].buffaddr; /* parent_blk_lvl is 0 iff we're moving a gvt root block */
parent_blk_id = dir_hist_ptr->h[parent_blk_lvl].blk_num;
CHECK_AND_RESET_UPDATE_ARRAY;
if (free_blk_recycled)
{ /* Otherwise, it's a completely free block, in which case no need to read. */
freeblkhist.blk_num = (block_id)free_blk_id;
if (NULL == (freeblkhist.buffaddr = t_qread(free_blk_id, (sm_int_ptr_t)&freeblkhist.cycle, &freeblkhist.cr)))
{
assert(t_tries < CDB_STAGNATE);
t_retry((enum cdb_sc)rdfail_detail);
return RETRY_SWAP;
}
}
child_blk_size = ((blk_hdr_ptr_t)child_blk_ptr)->bsiz;
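/* Step 1 of the plan above: build the update-array entry for the new block. Copy the child block's
 * contents into update-array memory first (the buffer under child_blk_ptr can be modified
 * concurrently; t_end revalidates before commit), then describe the new block as a single segment
 * holding everything after the block header.
 */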
BLK_INIT(bs_ptr, bs1);
BLK_ADDR(saved_blk, child_blk_size, unsigned char);
memcpy(saved_blk, child_blk_ptr, child_blk_size);
BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), child_blk_size - SIZEOF(blk_hdr));
assert(blk_seg_cnt == child_blk_size);
if (!BLK_FINI(bs_ptr, bs1))
{
assert(t_tries < CDB_STAGNATE);
t_retry(cdb_sc_blkmod);
return RETRY_SWAP;
}
tmpcse = &cw_set[cw_set_depth];
(free_blk_recycled) ? BIT_SET_RECYCLED_AND_CLEAR_FREE(tmpcse->blk_prior_state)
: BIT_CLEAR_RECYCLED_AND_SET_FREE(tmpcse->blk_prior_state);
t_create(free_blk_id, (unsigned char *)bs1, 0, 0, child_blk_lvl);
tmpcse->mode = gds_t_acquired;
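/* Note: tmpcse was captured as &cw_set[cw_set_depth] before the t_create call, which is expected
 * to fill in that element and advance cw_set_depth; the blk_prior_state bits and mode assignment
 * just above and the old_block fixups below therefore all target the newly created element.
 */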
if (!free_blk_recycled || !cs_data->db_got_to_v5_once)
tmpcse->old_block = NULL;
else
{
tmpcse->old_block = freeblkhist.buffaddr;
tmpcse->cr = freeblkhist.cr;
tmpcse->cycle = freeblkhist.cycle;
jbbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
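/* If before-image journaling is enabled and the recycled block was last updated before the current
 * journal epoch, its before-image must be written to the journal at commit time, so capture the
 * block's checksum now via JNL_GET_CHECKSUM_ACQUIRED_BLK. A bsiz larger than the block size means
 * the buffer is no longer trustworthy (concurrent update), hence the restart below.
 */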
if ((NULL != jbbp) && (((blk_hdr_ptr_t)tmpcse->old_block)->tn < jbbp->epoch_tn))
{
bsiz = ((blk_hdr_ptr_t)(tmpcse->old_block))->bsiz;
if (bsiz > blk_size)
{
assert(CDB_STAGNATE > t_tries);
t_retry(cdb_sc_lostbmlcr);
return RETRY_SWAP;
}
JNL_GET_CHECKSUM_ACQUIRED_BLK(tmpcse, csd, csa, tmpcse->old_block, bsiz);
}
}
/* 2. Parent block in directory tree remains busy, but points to new child block location. */
curr_offset = dir_hist_ptr->h[parent_blk_lvl].curr_rec.offset;
parent_blk_size = ((blk_hdr_ptr_t)parent_blk_ptr)->bsiz;
GET_RSIZ(rec_size1, (parent_blk_ptr + curr_offset));
if ((parent_blk_size < rec_size1 + curr_offset) || (BSTAR_REC_SIZE > rec_size1))
{
assert(t_tries < CDB_STAGNATE);
t_retry(cdb_sc_blkmod);
return RETRY_SWAP;
}
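/* Rebuild the parent block from three segments: everything from just after the block header up to
 * the child pointer, a fresh block_id pointing at free_blk_id, and the remainder of the block
 * after the old pointer. Only the downlink changes; all keys and record contents are preserved.
 */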
BLK_INIT(bs_ptr, bs1);
if (0 == parent_blk_lvl)
/* There can be collation stuff in the record value after the block pointer. See gvcst_root_search. */
hdr_len = SIZEOF(rec_hdr) + gv_altkey->end + 1 - EVAL_CMPC((rec_hdr_ptr_t)(parent_blk_ptr + curr_offset));
else
hdr_len = rec_size1 - SIZEOF(block_id);
bpntr_end = curr_offset + hdr_len + SIZEOF(block_id);
BLK_SEG(bs_ptr, parent_blk_ptr + SIZEOF(blk_hdr), curr_offset + hdr_len - SIZEOF(blk_hdr));
BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
PUT_LONG(bn_ptr, free_blk_id);
BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
BLK_SEG(bs_ptr, parent_blk_ptr + bpntr_end, parent_blk_size - bpntr_end);
assert(blk_seg_cnt == parent_blk_size);
if (!BLK_FINI(bs_ptr, bs1))
{
assert(t_tries < CDB_STAGNATE);
t_retry(cdb_sc_blkmod);
return RETRY_SWAP;
}
t_write(&dir_hist_ptr->h[parent_blk_lvl], (unsigned char *)bs1, 0, 0, parent_blk_lvl, FALSE, TRUE, GDS_WRITE_KILLTN);
/* Mark this as a directory tree block so that the snapshot-file writing logic used during fast
 * integ does not skip writing this block to the snapshot file.
 */
BIT_SET_DIR_TREE(cw_set[cw_set_depth-1].blk_prior_state);
/* 3. Free block's corresponding bitmap reflects above change. */
PUT_LONG(update_array_ptr, free_bit);
save_cw_set_depth = cw_set_depth; /* Bit maps go on end of cw_set (more fake acquired) */
assert(!cw_map_depth);
t_write_map(&bmlhist, (uchar_ptr_t)update_array_ptr, curr_tn, 1);
cw_map_depth = cw_set_depth;
cw_set_depth = save_cw_set_depth;
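/* Bitmap cw-set elements live at the end of cw_set, after the regular update elements: t_write_map
 * just appended one (advancing cw_set_depth), so record its position in cw_map_depth and restore
 * cw_set_depth, keeping the map element segregated from the ordinary elements.
 */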
update_array_ptr += SIZEOF(block_id);
temp_long = 0;
PUT_LONG(update_array_ptr, temp_long);
update_array_ptr += SIZEOF(block_id);
assert(1 == cw_set[cw_map_depth - 1].reference_cnt);
/* 4. Child block gets marked recycled in bitmap. (GVCST_BMP_MARK_FREE) */
kill_set_list->blk[kill_set_list->used].flag = 0;
kill_set_list->blk[kill_set_list->used].level = 0;
kill_set_list->blk[kill_set_list->used++].block = child_blk_id;
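/* The old child block is not freed here. The caller marks it recycled via GVCST_BMP_MARK_FREE only
 * after the swap transaction commits (step 4 of the plan above), with kill-in-progress accounting
 * (need_kip_incr/DECR_KIP) covering the window in between.
 */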
return free_blk_id;
}