fis-gtm/sr_port/gvincr_recompute_upd_array.c

/****************************************************************
 *								*
 *	Copyright 2004, 2010 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include "gtm_string.h"

#include "cdb_sc.h"
#include "gdsroot.h"
#include "gtm_facility.h"
#include "gdskill.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "gdscc.h"
#include "min_max.h"		/* needed for gdsblkops.h */
#include "gdsblkops.h"
#include "jnl.h"
#include "copy.h"
#include "gvcst_protos.h"	/* for gvcst_search_blk prototypes */
#include "op.h"			/* for add_mvals prototype */
#include "jnl_get_checksum.h"

/* Global state referenced by gvincr_recompute_upd_array (defined elsewhere).
 * NOTE(review): gv_cur_region, non_tp_jfb_ptr and jgbl are not referenced by name in the function below;
 * presumably they are needed by macros expanded there (e.g. the journaling macros) -- confirm before removing.
 */
GBLREF	uint4			dollar_tlevel;
GBLREF	gd_region		*gv_cur_region;
GBLREF	sgmnt_addrs		*cs_addrs;
GBLREF	sgmnt_data_ptr_t	cs_data;
GBLREF	mval			*post_incr_mval;	/* mval pointing to the post-$INCR value */
GBLREF	jnl_format_buffer       *non_tp_jfb_ptr;
GBLREF	jnl_gbls_t		jgbl;
GBLREF	char			*update_array, *update_array_ptr;
GBLREF	int			gv_fillfactor, rc_set_fragment;	/* Contains offset within data at which data fragment starts */
GBLREF	unsigned char		cw_set_depth;
GBLREF	gv_key			*gv_currkey;
GBLREF	unsigned int		t_tries;
GBLREF	uint4			update_array_size;
GBLREF	gv_namehead		*gv_target;

/* --------------------------------------------------------------------------------------------
 * This code is very similar to the code in gvcst_put for the non-block-split case as well as
 * the code in recompute_upd_array in tp_tend.c. All of these need to be maintained in sync.
 * --------------------------------------------------------------------------------------------
 */

/* Recompute, while holding crit, the update array of a non-TP $INCREMENT whose target leaf block may have changed.
 *
 * The routine re-searches the leaf block for gv_currkey, recomputes the post-increment value (post_incr_mval) through
 * gvincr_compute_post_incr, and rebuilds the block-segment array in the update array so that the existing record for
 * the key is replaced by one carrying the new value. It then refreshes the cw-set-element fields that depend on the
 * rebuilt block and, if journaling requires it, re-formats the logical SET record and recomputes the before-image
 * checksum.
 *
 * Parameters:
 *	bh	- search-history entry of the leaf block; bh->buffaddr is the buffer that is examined and bh is passed to
 *		  gvcst_search_blk, after which bh->curr_rec and bh->prev_rec are used to locate the record.
 *	cse	- the one and only cw-set-element of this transaction (cw_set_depth is asserted to be 1); its upd_addr,
 *		  ondsk_blkver, done and (when before-image journaling applies) blk_checksum fields are updated.
 *	cr	- cache-record of the block; checked for validity, read-in-progress and concurrent phase2 update.
 *
 * Returns:
 *	cdb_sc_normal	- the update array was rebuilt successfully.
 *	cdb_sc_lostcr	- cr is NULL/CR_NOTVALID or a read into the cache-record is in progress.
 *	cdb_sc_blkmod	- cr->in_tend is set, or the key no longer exists in the block.
 *	cdb_sc_blksplit	- the new record would push the block beyond the fill-factor limit.
 *	cdb_sc_mkblk	- rc_set_fragment is non-zero, or BLK_FINI returned 0.
 *	any non-normal status returned by gvcst_search_blk or gvincr_compute_post_incr is passed through unchanged.
 */
enum cdb_sc	gvincr_recompute_upd_array(srch_blk_status *bh, struct cw_set_element_struct *cse, cache_rec_ptr_t cr)
{
	blk_segment		*bs1, *bs_ptr;
	char			*va;
	enum cdb_sc		status;
	/* NOTE(review): "blk_seg_cnt" is not referenced by name in this function; presumably it is used by the BLK_* macros
	 * (just like "blk_size" is used by BLK_FINI) -- check gdsblkops.h before renaming or removing either of them.
	 */
	int4			blk_size, blk_fill_size, cur_blk_size, blk_seg_cnt, delta, tail_len, new_rec_size;
	int4			target_key_size, data_len;
	mstr			value;
	rec_hdr_ptr_t		curr_rec_hdr, rp;
	sm_uc_ptr_t		cp1, buffaddr;
	unsigned short		rec_size;
	jnl_format_buffer	*jfb;
	blk_hdr_ptr_t		old_block;
	sgmnt_addrs		*csa;

	csa = cs_addrs;
	assert(!dollar_tlevel);	/* this recomputation is currently supported only for non-TP */
	/* To support this for TP would require addressing a lot more issues. Examples are
	 * 	a) Currently we format jnl records only for explicit updates and not for implicit updates (updates in trigger code).
	 * 		All such triggers updates currently happen inside of a TP (even if the explicit update is non-TP, there
	 * 		is an implicit TP wrapper). Therefore we need to record more information as to whether this update
	 * 		to the database needs a corresponding format of the logical journal record or not.
	 */
	assert(0 == cse->level);	/* better be a leaf-level block */
	assert(csa->now_crit);
	assert(!cse->level && (gds_t_write == cse->mode) && (NULL == cse->new_buff) && (GDS_WRITE_PLAIN == cse->write_type));
	blk_size = cs_data->blk_size;	/* "blk_size" is also used by the BLK_FINI macro below */
	/* Maximum number of bytes the block may hold after the update: fill-factor percentage of the block size
	 * less the reserved bytes of the database file.
	 */
	blk_fill_size = (blk_size * gv_fillfactor) / 100 - cs_data->reserved_bytes;
	/* clues for gv_target involved in recomputation need not be nullified since only the value changes (not the key) */
	assert(CR_NOTVALID != (sm_long_t)cr);
	/* CR_NOTVALID is not expected (see assert above) but is nevertheless handled here along with a NULL cache-record
	 * and a cache-record that has a read in progress; in all these cases the buffer cannot be relied upon.
	 */
	if (NULL == cr || CR_NOTVALID == (sm_long_t)cr || (0 <= cr->read_in_progress))
	{
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_lostcr;
	}
	if (cr->in_tend)
	{	/* Possible if this cache-record is being modified concurrently by another process in bg_update_phase2.
		 * Normally t_qread would have waited for this to complete before returning. But it is possible in some
		 * cases to bypass t_qread (e.g. gv_target->clue.end is non-zero). In this case we have two options.
		 *	a) Signal a restart. This will cause clue.end to get reset to 0 and will now go through t_qread.
		 *	b) Wait for in_tend to become zero and then proceed. This will save a restart.
		 * Since we are not in TP the overhead of restarting is not that bad.
		 * Since we hold crit at this point, we decide not to wait. We choose (a).
		 */
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_blkmod;
	}
	buffaddr = bh->buffaddr;
	target_key_size = gv_currkey->end + 1;
	/* Re-search the block for the key; the curr_rec/prev_rec fields of "bh" are relied upon below */
	if (cdb_sc_normal != (status = gvcst_search_blk(gv_currkey, bh)))
	{
		assert(CDB_STAGNATE > t_tries);
		return status;
	}
	if (target_key_size != bh->curr_rec.match)	/* key does not exist, nothing doable here, restart transaction */
	{
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_blkmod;
	}
	cur_blk_size = ((blk_hdr_ptr_t)buffaddr)->bsiz;
	rp = (rec_hdr_ptr_t)(buffaddr + bh->curr_rec.offset);	/* existing record for the key */
	GET_USHORT(rec_size, &rp->rsiz);
	/* Length of the value currently stored in the record: record size less the record header and less the part of
	 * the key that is physically stored (full key size minus rp->cmpc, presumably the key compression count).
	 */
	data_len = rec_size + rp->cmpc - SIZEOF(rec_hdr) - target_key_size;
	if (cdb_sc_normal != (status = gvincr_compute_post_incr(bh)))
	{
		assert(CDB_STAGNATE > t_tries);
		return status;
	}
	assert(MV_IS_STRING(post_incr_mval));	/* gvincr_compute_post_incr should have set it to be of type MV_STR */
	value = post_incr_mval->str;
	/* Only the value portion of the record changes, so "delta" is the difference in value lengths */
	new_rec_size = rec_size - data_len + value.len;
	delta = new_rec_size - rec_size;
	if ((cur_blk_size + delta) > blk_fill_size)
	{	/* the updated record no longer fits within the fill-factor limit of this block */
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_blksplit;
	}
	if (0 != rc_set_fragment)
	{
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_mkblk;	/* let gvcst_put do the recomputation out of crit in case of rc_set */
	}
	/* Note that a lot of the code below relies on the fact that we are in non-TP. For TP we need to do extra stuff */
	assert(NULL != update_array);
	assert(NULL != update_array_ptr);
	assert(0 != update_array_size);
	assert(update_array + update_array_size >= update_array_ptr);
	assert(1 == cw_set_depth);
	/* since cw_set_depth is guaranteed to be 1 (by the above assert), we can be sure that the only update array space we would
	 * have used is for the current (and only) cw_set_element "cse" and hence can reuse the space by resetting update_array_ptr
	 */
	assert(ROUND_UP2((INTPTR_T)update_array, UPDATE_ELEMENT_ALIGN_SIZE) == (INTPTR_T)cse->upd_addr);
	RESET_UPDATE_ARRAY; /* do not use CHECK_AND_RESET_UPDATE_ARRAY since we are knowingly resetting an active update array */
	/* Build the segment array describing the new block contents, in this order:
	 *	1) everything in the existing block between the block header and the record being replaced
	 *	2) a new record header (new size, compression count taken from the match with the previous record)
	 *	3) the portion of the key that is not shared with the previous record
	 *	4) the post-increment value
	 *	5) everything in the existing block that follows the record being replaced (if anything)
	 */
	BLK_INIT(bs_ptr, bs1);
	BLK_SEG(bs_ptr, buffaddr + SIZEOF(blk_hdr), bh->curr_rec.offset - SIZEOF(blk_hdr));
	BLK_ADDR(curr_rec_hdr, SIZEOF(rec_hdr), rec_hdr);
	curr_rec_hdr->rsiz = new_rec_size;
	curr_rec_hdr->cmpc = bh->prev_rec.match;
	BLK_SEG(bs_ptr, (sm_uc_ptr_t)curr_rec_hdr, SIZEOF(rec_hdr));
	BLK_ADDR(cp1, target_key_size - bh->prev_rec.match, unsigned char);
	memcpy(cp1, gv_currkey->base + bh->prev_rec.match, target_key_size - bh->prev_rec.match);
	BLK_SEG(bs_ptr, cp1, target_key_size - bh->prev_rec.match);
	assert(0 != value.len);
	BLK_ADDR(va, value.len, char);
	memcpy(va, value.addr, value.len);
	BLK_SEG(bs_ptr, (unsigned char *)va, value.len);
	rp = (rec_hdr_ptr_t)((sm_uc_ptr_t)rp + rec_size);	/* "rp" now points just past the record being replaced */
	tail_len = (int4)(cur_blk_size - ((sm_uc_ptr_t)rp - buffaddr));
	assert(tail_len >= 0); /* else gvincr_compute_post_incr would have returned cdb_sc_rmisalign and we will not be here */
	if (tail_len > 0)
	{
		BLK_SEG(bs_ptr, (sm_uc_ptr_t)rp, tail_len);
	}
	if (0 == BLK_FINI(bs_ptr, bs1))
	{
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_mkblk;
	}
	cse->upd_addr = (unsigned char *)bs1;	/* point the cw-set-element at the rebuilt segment array */
	/* assert that cse->old_block is indeed pointing to the buffer that the cache-record is pointing to.
	 * this is necessary to ensure that we are copying "ondsk_blkver" from the correct cache-record.
	 * there is a possibility that this assert might not hold true which is if we are in a restartable situation.
	 * but in that case do the same check that t_end will perform to determine this.
	 */
	assert((cse->old_block == (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr)) || (bh->cycle != cr->cycle) || (bh->cr != cr));
	cse->ondsk_blkver = cr->ondsk_blkver;
	cse->done = FALSE;
	/* Reformat the logical SET jnl-record if we need to write logical records. But recompute checksums for PBLK record
	 * ONLY IF journaling is enabled. Do not need to do this in the case REPL_WAS_ENABLED(csa) is TRUE as replication
	 * only cares about logical records. Hence the separation of the code below into two "if" blocks.
	 */
	/* Note: the value returned by jnl_format is stored in "jfb" but is not otherwise used in this function */
	if (JNL_WRITE_LOGICAL_RECS(csa))
		jfb = jnl_format(JNL_SET, gv_currkey, post_incr_mval, 0); /* Re-format the logical SET jnl-record */
	if (JNL_ENABLED(csa))
	{	/* Recompute checksums in case necessary */
		if (csa->jnl_before_image && (NULL != cse->old_block))
		{
			old_block = (blk_hdr_ptr_t)cse->old_block;
			/* A checksum is computed only if the block was last updated before the current epoch;
			 * otherwise the checksum is recorded as 0.
			 */
			if (old_block->tn < csa->jnl->jnl_buff->epoch_tn)
				cse->blk_checksum = jnl_get_checksum((uint4 *)old_block, csa, old_block->bsiz);
			else
				cse->blk_checksum = 0;
		}
	}
	assert(NULL != gv_target);
	/* If clue is known to be non-zero, we have the potential for the first_rec part of it to be unreliable.
	 * Reset it to be safe. See comment in similar section in tp_hist for details on why.
	 */
	if (gv_target->clue.end)
		GVT_CLUE_INVALIDATE_FIRST_REC(gv_target);
	return cdb_sc_normal;
}