fis-gtm/sr_port/gvcst_bmp_mark_free.c

363 lines
16 KiB
C

/****************************************************************
* *
* Copyright 2001, 2013 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
/* gvcst_bmp_mark_free.c
Marks every block in the kill set list as free in its local bitmap.
Note: ks must already be sorted.
*/
#include "mdef.h"
#include "gtm_string.h"
#include "cdb_sc.h"
#include "gdsroot.h"
#include "gdskill.h"
#include "gdsblk.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "gdscc.h"
#include "jnl.h"
#include "buddy_list.h" /* needed for tp.h */
#include "hashtab_int4.h" /* needed for tp.h */
#include "tp.h"
#include "memcoherency.h"
#include "gdsblkops.h" /* for CHECK_AND_RESET_UPDATE_ARRAY macro */
/* Include prototypes */
#include "t_qread.h"
#include "t_end.h"
#include "t_retry.h"
#include "t_begin.h"
#include "t_write_map.h"
#include "mm_read.h"
#include "add_inter.h"
#include "gvcst_bmp_mark_free.h"
#include "t_busy2free.h"
#include "t_abort.h"
#ifdef UNIX
#include "db_snapshot.h"
#endif
#include "muextr.h"
#include "mupip_reorg.h"
/* Global state used by gvcst_bmp_mark_free() below.
 * NOTE(review): GBLREF presumably expands to an "extern" style declaration of a global defined elsewhere - confirm in mdef.h.
 */
GBLREF char *update_array, *update_array_ptr; /* staging area for the bitmap offsets handed to t_write_map */
GBLREF cw_set_element cw_set[]; /* indexed as cw_set[cw_set_depth-1] to reach the most recently added element */
GBLREF unsigned char cw_set_depth;
GBLREF sgmnt_addrs *cs_addrs;
GBLREF sgmnt_data_ptr_t cs_data;
GBLREF unsigned char rdfail_detail; /* passed to t_retry when t_qread returns NULL */
GBLREF sgm_info *sgm_info_ptr;
GBLREF boolean_t mu_reorg_process;
GBLREF inctn_opcode_t inctn_opcode; /* asserted on entry to be one of the two bmp_mark_free opcodes */
GBLREF inctn_detail_t inctn_detail; /* holds detail to fill in to inctn jnl record */
GBLREF uint4 dollar_tlevel; /* asserted to be zero (non-TP) on entry */
#ifdef UNIX
GBLREF unsigned int t_tries; /* sampled before t_end so debug asserts can check the retry count afterwards */
GBLREF unsigned char t_fail_hist[CDB_MAX_TRIES]; /* presumably read through LAST_RESTART_CODE - TODO confirm */
#endif
GBLREF gd_region *gv_cur_region; /* passed to t_abort and reported in the IGNBMPMRKFREE message */
error_def(ERR_GVKILLFAIL); /* passed to t_begin for the bitmap-freeing transactions */
error_def(ERR_IGNBMPMRKFREE); /* sent via send_msg_csa when an online rollback discards the bitmap free-up */
/* Mark the blocks of a kill set as no longer busy in their local bitmaps.
 *
 * Only entries of ks->blk[0 .. ks->used - 1] whose "flag" field is zero are processed; entries with a non-zero
 * flag are skipped. Must be called outside of TP (dollar_tlevel is asserted to be zero). The file header comment
 * states that "ks" must already be sorted.
 *
 * Two strategies are used:
 *	1) If the database is fully upgraded, all kill-set blocks that belong to the same local bitmap are released
 *	   in a single non-TP transaction per bitmap.
 *	2) Otherwise (or if a concurrent desired_db_format change is detected while doing 1), each remaining block is
 *	   read and released in its own transaction so that "blks_to_upgrd" can be adjusted per block.
 *
 * Parameters:
 *	ks	- kill set whose blocks are to be released.
 *
 * Returns:
 *	The non-zero value returned by the last t_end that committed, or 0 if no transaction was committed
 *	(nothing to free) or if the free-up was abandoned because t_end restarted due to an online rollback.
 */
trans_num gvcst_bmp_mark_free(kill_set *ks)
{
	block_id		bit_map, next_bm, *updptr;
	blk_ident		*blk, *blk_top, *nextblk;
	trans_num		ctn, start_db_fmt_tn;
# if defined(UNIX) && defined(DEBUG)
	unsigned int		lcl_t_tries;
# endif
	srch_hist		alt_hist;
	trans_num		ret_tn = 0;
	boolean_t		visit_blks;
	srch_blk_status		bmphist;
	cache_rec_ptr_t		cr;
	enum db_ver		ondsk_blkver;
	enum cdb_sc		status;
	boolean_t		mark_level_as_special;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	TREF(in_gvcst_bmp_mark_free) = TRUE;
	assert(inctn_bmp_mark_free_gtm == inctn_opcode || inctn_bmp_mark_free_mu_reorg == inctn_opcode);
	/* Note down the desired_db_format_tn before you start relying on cs_data->fully_upgraded.
	 * If the db is fully_upgraded, take the optimal path that does not need to read each block being freed.
	 * But in order to detect concurrent desired_db_format changes, note down the tn (when the last format change occurred)
	 * before the fully_upgraded check and after having noted down the database current_tn.
	 * If they are the same, then we are guaranteed no concurrent desired_db_format change occurred.
	 * If they are not, then fall through to the non-optimal path where each to-be-killed block has to be visited.
	 * The reason we need to visit every block in case desired_db_format changes is to take care of the case where
	 * MUPIP REORG DOWNGRADE concurrently changes a block that we are about to free.
	 */
	start_db_fmt_tn = cs_data->desired_db_format_tn;
	visit_blks = (!cs_data->fully_upgraded);	/* Local evaluation */
	assert(!visit_blks || (visit_blks && dba_bg == cs_addrs->hdr->acc_meth)); /* must have blks_to_upgrd == 0 for non-BG */
	assert(!dollar_tlevel);				/* Should NOT be in TP now */
	blk = &ks->blk[0];
	blk_top = &ks->blk[ks->used];
	if (!visit_blks)
	{	/* Database has been completely upgraded. Free all blocks in one bitmap as part of one transaction. */
		assert(cs_data->db_got_to_v5_once); /* assert all V4 fmt blocks (including RECYCLED) have space for V5 upgrade */
		inctn_detail.blknum_struct.blknum = 0; /* to indicate no adjustment to "blks_to_upgrd" necessary */
		/* If any of the mini transactions below restarts because of an online rollback, we don't want the application
		 * refresh to happen (like $ZONLNRLBK++ or rts_error(DBROLLEDBACK)). This is because, although we are {BYPASSOK}
		 * currently in non-tp (dollar_tlevel = 0), we could actually be in a TP transaction and have actually faked
		 * dollar_tlevel. In such a case, we should NOT be issuing a DBROLLEDBACK error as TP transactions are supposed
		 * to just restart in case of an online rollback. So, set the global variable that gtm_onln_rlbk_clnup can check
		 * and skip doing the application refresh, but will reset the clues. The next update will see the cycle mismatch
		 * and will accordingly take the right action.
		 */
		for ( ; blk < blk_top; blk = nextblk)
		{
			if (0 != blk->flag)
			{	/* Flagged entries are not freed here; step over them */
				nextblk = blk + 1;
				continue;
			}
			assert(0 < blk->block);
			assert((int4)blk->block < cs_addrs->ti->total_blks);
			bit_map = ROUND_DOWN2((int)blk->block, BLKS_PER_LMAP);
			next_bm = bit_map + BLKS_PER_LMAP;
			CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
			/* Scan for the next local bitmap, collecting the offsets (relative to bit_map) of all consecutive
			 * unflagged kill-set entries that fall within the current local bitmap.
			 * The bounds check on nextblk must come first so that nextblk->flag and nextblk->block are never
			 * read once nextblk has reached blk_top (one past the last used entry).
			 */
			updptr = (block_id *)update_array_ptr;
			for (nextblk = blk;
				(nextblk < blk_top) && (0 == nextblk->flag) && ((block_id)nextblk->block < next_bm);
				++nextblk)
			{
				assert((block_id)nextblk->block - bit_map);
				*updptr++ = (block_id)nextblk->block - bit_map;
			}
			update_array_ptr = (char *)updptr;
			alt_hist.h[0].blk_num = 0;		/* need for calls to T_END for bitmaps */
			alt_hist.h[0].blk_target = NULL;	/* need to initialize for calls to T_END */
			/* the following assumes SIZEOF(blk_ident) == SIZEOF(int) */
			assert(SIZEOF(blk_ident) == SIZEOF(int));
			*(int *)update_array_ptr = 0;		/* zero entry terminates the list of offsets */
			t_begin(ERR_GVKILLFAIL, UPDTRNS_DB_UPDATED_MASK);
			for (;;)
			{
				ctn = cs_addrs->ti->curr_tn;
				/* Need a read fence before reading fields from cs_data as we are reading outside
				 * of crit and relying on this value to detect desired db format state change.
				 */
				SHM_READ_MEMORY_BARRIER;
				if (start_db_fmt_tn != cs_data->desired_db_format_tn)
				{	/* Concurrent db format change has occurred. Need to visit every block to be killed
					 * to determine its block format. Fall through to the non-optimal path below
					 */
					ret_tn = 0;
					break;
				}
# ifdef GTM_SNAPSHOT
				/* if this is freeing a level-0 directory tree block, we need to transition the block to free
				 * right away and write its before-image thereby enabling fast integ to avoid writing level-0
				 * block before-images altogether. It is possible the fast integ hasn't started at this stage,
				 * so we cannot use FASTINTEG_IN_PROG in the if condition, but fast integ may already start later
				 * at bg/mm update stage, so we always need to prepare cw_set element
				 */
				if ((MUSWP_FREE_BLK == TREF(in_mu_swap_root_state)) && blk->level)
				{	/* blk->level was set as 1 for level-0 DIR tree block in mu_swap_root */
					/* for mu_swap_root, only one block is freed during bmp_mark_free */
					assert(1 == ks->used);
					ctn = cs_addrs->ti->curr_tn;
					alt_hist.h[0].cse = NULL;
					alt_hist.h[0].tn = ctn;
					alt_hist.h[0].blk_num = blk->block;
					alt_hist.h[1].blk_num = 0; /* this is to terminate history reading in t_end */
					if (NULL == (alt_hist.h[0].buffaddr = t_qread(alt_hist.h[0].blk_num,
											(sm_int_ptr_t)&alt_hist.h[0].cycle,
											&alt_hist.h[0].cr)))
					{
						t_retry((enum cdb_sc)rdfail_detail);
						continue;
					}
					t_busy2free(&alt_hist.h[0]);
					/* The special level value will be used later in t_end to indicate
					 * before_image of this block will be written to snapshot file
					 */
					cw_set[cw_set_depth-1].level = CSE_LEVEL_DRT_LVL0_FREE;
					mark_level_as_special = TRUE;
				} else
					mark_level_as_special = FALSE;
# endif
				bmphist.blk_num = bit_map;
				if (NULL == (bmphist.buffaddr = t_qread(bmphist.blk_num, (sm_int_ptr_t)&bmphist.cycle,
									&bmphist.cr)))
				{
					t_retry((enum cdb_sc)rdfail_detail);
					continue;
				}
				t_write_map(&bmphist, (uchar_ptr_t)update_array, ctn, -(int4)(nextblk - blk));
# ifdef GTM_SNAPSHOT
				if (mark_level_as_special)
				{
					/* The special level value will be used later in gvcst_map_build to set the block to be
					 * freed as free rather than recycled
					 */
					cw_set[cw_set_depth-1].level = CSE_LEVEL_DRT_LVL0_FREE;
				}
# endif
				UNIX_ONLY(DEBUG_ONLY(lcl_t_tries = t_tries));
				if ((trans_num)0 == (ret_tn = t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)))
				{
# ifdef UNIX
					assert((CDB_STAGNATE == t_tries) || (lcl_t_tries == t_tries - 1));
					status = LAST_RESTART_CODE;
					if ((cdb_sc_onln_rlbk1 == status) || (cdb_sc_onln_rlbk2 == status)
						|| TREF(rlbk_during_redo_root))
					{	/* t_end restarted due to online rollback. Discard bitmap free-up and return control
						 * to the application. But, before that reset in_gvcst_bmp_mark_free to FALSE
						 */
						TREF(in_gvcst_bmp_mark_free) = FALSE;
						send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) ERR_IGNBMPMRKFREE, 4,
								REG_LEN_STR(gv_cur_region), DB_LEN_STR(gv_cur_region));
						t_abort(gv_cur_region, cs_addrs);
						return ret_tn; /* actually 0 */
					}
# endif
					continue;
				}
				break;
			}
			if (0 == ret_tn) /* db format change occurred. Fall through to below for loop to visit each block */
			{
				/* Abort any active transaction to get rid of lingering Non-TP artifacts */
				t_abort(gv_cur_region, cs_addrs);
				break;	/* "blk" is left unchanged so the loop below resumes with the current block */
			}
		}	/* for all blocks in the kill_set */
	}
	for ( ; blk < blk_top; blk++)
	{	/* Database has NOT been completely upgraded. Have to read every block that is going to be freed
		 * and determine whether it has been upgraded or not. Every block will be freed as part of one
		 * separate update to the bitmap. This will cause as many transactions as the blocks are being freed.
		 * But this overhead will be present only as long as the database is not completely upgraded.
		 * The reason why every block is updated separately is in order to accurately maintain the "blks_to_upgrd"
		 * counter in the database file-header when the block-freeup phase (2nd phase) of the M-kill proceeds
		 * concurrently with a MUPIP REORG UPGRADE/DOWNGRADE. If the bitmap is not updated for every block freeup
		 * then MUPIP REORG UPGRADE/DOWNGRADE should also upgrade/downgrade all blocks in one bitmap as part of
		 * one transaction (only then will we avoid double-decrement of "blks_to_upgrd" counter by the M-kill as
		 * well as the MUPIP REORG UPGRADE/DOWNGRADE). That is a non-trivial task as potentially 512 blocks need
		 * to be modified as part of one non-TP transaction which is unnecessarily making it heavyweight. Compared
		 * to that, incurring a per-block bitmap update overhead in the M-kill is considered acceptable since this
		 * will be the case only as long as we are in compatibility mode which should be hopefully not for long.
		 */
		if (0 != blk->flag)
			continue;
		assert(0 < blk->block);
		assert((int4)blk->block < cs_addrs->ti->total_blks);
		assert(!IS_BITMAP_BLK(blk->block));
		bit_map = ROUND_DOWN2((int)blk->block, BLKS_PER_LMAP);
		assert(dba_bg == cs_addrs->hdr->acc_meth);
		/* We need to check each block we are deleting to see if it is in the format of a previous version.
		 * If it is, then "csd->blks_to_upgrd" needs to be correspondingly adjusted.
		 */
		alt_hist.h[0].level = 0;	/* Initialize for loop below */
		alt_hist.h[1].blk_num = 0;
		alt_hist.h[0].blk_target = NULL;	/* need to initialize for calls to T_END */
		CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
		assert((block_id)blk->block - bit_map);
		assert(SIZEOF(block_id) == SIZEOF(blk_ident));
		*((block_id *)update_array_ptr) = ((block_id)blk->block - bit_map);
		update_array_ptr += SIZEOF(blk_ident);
		/* the following assumes SIZEOF(blk_ident) == SIZEOF(int) */
		assert(SIZEOF(blk_ident) == SIZEOF(int));
		*(int *)update_array_ptr = 0;	/* zero entry terminates the (single element) list of offsets */
		t_begin(ERR_GVKILLFAIL, UPDTRNS_DB_UPDATED_MASK);
		for (;;)
		{
			ctn = cs_addrs->ti->curr_tn;
			alt_hist.h[0].cse = NULL;
			alt_hist.h[0].tn = ctn;
			alt_hist.h[0].blk_num = blk->block;
			if (NULL == (alt_hist.h[0].buffaddr = t_qread(alt_hist.h[0].blk_num,
									(sm_int_ptr_t)&alt_hist.h[0].cycle,
									&alt_hist.h[0].cr)))
			{
				t_retry((enum cdb_sc)rdfail_detail);
				continue;
			}
			/* IF csd->db_got_to_v5_once is FALSE
			 *	a) mark the block as FREE (not RECYCLED to avoid confusing MUPIP REORG UPGRADE with a
			 *		block that was RECYCLED right at the time of MUPIP UPGRADE from a V4 to V5 version).
			 *		MUPIP REORG UPGRADE will mark all existing RECYCLED blocks as FREE.
			 *	b) need to write PBLK
			 * ELSE
			 *	a) mark this block as RECYCLED
			 *	b) no need to write PBLK (it will be written when the block later gets reused).
			 * ENDIF
			 *
			 * Create a cw-set-element with mode gds_t_busy2free that will cause a PBLK to be written in t_end
			 * (the value csd->db_got_to_v5_once will be checked while holding crit) only in the IF case above.
			 * At the same time bg_update will NOT be invoked for this cw-set-element so this block will not be
			 * touched. But the corresponding bitmap block will be updated as part of the same transaction (see
			 * t_write_map below) to mark this block as FREE or RECYCLED depending on whether csd->db_got_to_v5_once
			 * is FALSE or TRUE (actual check done in gvcst_map_build and sec_shr_map_build).
			 */
			t_busy2free(&alt_hist.h[0]);
			cr = alt_hist.h[0].cr;
			ondsk_blkver = cr->ondsk_blkver;	/* Get local copy in case cr->ondsk_blkver changes between
								 * first and second part of the ||
								 */
			assert((GDSV6 == ondsk_blkver) || (GDSV4 == ondsk_blkver));
			if (GDSVCURR != ondsk_blkver)
				inctn_detail.blknum_struct.blknum = blk->block;
			else
				inctn_detail.blknum_struct.blknum = 0; /* i.e. no adjustment to "blks_to_upgrd" necessary */
			bmphist.blk_num = bit_map;
			if (NULL == (bmphist.buffaddr = t_qread(bmphist.blk_num, (sm_int_ptr_t)&bmphist.cycle,
								&bmphist.cr)))
			{
				t_retry((enum cdb_sc)rdfail_detail);
				continue;
			}
			t_write_map(&bmphist, (uchar_ptr_t)update_array, ctn, -1);
# ifdef GTM_SNAPSHOT
			if ((MUSWP_FREE_BLK == TREF(in_mu_swap_root_state)) && blk->level)
			{
				assert(1 == ks->used);
				cw_set[cw_set_depth-1].level = CSE_LEVEL_DRT_LVL0_FREE; /* special level for gvcst_map_build */
				cw_set[cw_set_depth-2].level = CSE_LEVEL_DRT_LVL0_FREE; /* special level for t_end */
				/* Here we do not need to do BIT_SET_DIR_TREE because later the block will be always written to
				 * snapshot file without checking whether it belongs to DIR or GV tree
				 */
			}
# endif
			UNIX_ONLY(DEBUG_ONLY(lcl_t_tries = t_tries));
			if ((trans_num)0 == (ret_tn = t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)))
			{
# ifdef UNIX
				assert((CDB_STAGNATE == t_tries) || (lcl_t_tries == t_tries - 1));
				assert(0 < t_tries);
				DEBUG_ONLY(status = LAST_RESTART_CODE); /* get the recent restart code */
				/* We don't expect online rollback related retries because we are here with the database NOT fully
				 * upgraded. This means, online rollback cannot even start (it issues ORLBKNOV4BLK). Assert that.
				 */
				assert((cdb_sc_onln_rlbk1 != status) && (cdb_sc_onln_rlbk2 != status));
# endif
				continue;
			}
			break;
		}
	}	/* for all blocks in the kill_set */
	TREF(in_gvcst_bmp_mark_free) = FALSE;
	return ret_tn;
}