fis-gtm/sr_port/gvincr_recompute_upd_array.c

207 lines
9.0 KiB
C

/****************************************************************
* *
* Copyright 2004, 2010 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "gtm_string.h"
#include "cdb_sc.h"
#include "gdsroot.h"
#include "gtm_facility.h"
#include "gdskill.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "gdscc.h"
#include "min_max.h" /* needed for gdsblkops.h */
#include "gdsblkops.h"
#include "jnl.h"
#include "copy.h"
#include "gvcst_protos.h" /* for gvcst_search_blk prototypes */
#include "op.h" /* for add_mvals prototype */
#include "jnl_get_checksum.h"
/* Globals defined elsewhere and referenced by gvincr_recompute_upd_array().
* NOTE(review): gv_cur_region, non_tp_jfb_ptr and jgbl are not named directly in the function body; presumably they are
* required by macros expanded there -- confirm before removing any of these references.
*/
GBLREF uint4 dollar_tlevel; /* asserted to be zero in this module : the recomputation is supported only for non-TP */
GBLREF gd_region *gv_cur_region;
GBLREF sgmnt_addrs *cs_addrs; /* copied into the local "csa"; consulted for now_crit and the journaling state */
GBLREF sgmnt_data_ptr_t cs_data; /* supplies blk_size and reserved_bytes used to derive the block fill limit */
GBLREF mval *post_incr_mval; /* mval pointing to the post-$INCR value */
GBLREF jnl_format_buffer *non_tp_jfb_ptr;
GBLREF jnl_gbls_t jgbl;
GBLREF char *update_array, *update_array_ptr; /* update array base and current position; the array is reset and reused */
/* gv_fillfactor : percentage applied to blk_size when computing the usable block fill size.
* rc_set_fragment : the original note here reads "Contains offset within data at which data fragment starts"
* (presumably describing rc_set_fragment); a non-zero value makes the recomputation bail out.
*/
GBLREF int gv_fillfactor, rc_set_fragment; /* Contains offset within data at which data fragment starts */
GBLREF unsigned char cw_set_depth; /* asserted to be 1 before the update array is reset */
GBLREF gv_key *gv_currkey; /* key that is searched for in the block, copied into the rebuilt record and journaled */
GBLREF unsigned int t_tries; /* retry counter; every restart-code return asserts it is below CDB_STAGNATE */
GBLREF uint4 update_array_size;
GBLREF gv_namehead *gv_target; /* its clue has the first_rec part invalidated on success if clue.end is non-zero */
/* --------------------------------------------------------------------------------------------
* gvincr_recompute_upd_array
*
* Recomputes, while holding crit, the update array of the one and only cw-set-element of a non-TP $INCR
* operation against the current contents of the leaf block. The key (gv_currkey) is searched for again in the block,
* the post-increment value is recomputed (gvincr_compute_post_incr), the block-segment array describing the
* new block image is rebuilt in the (reset) update array and the journaling information (logical SET record
* and before-image checksum) is refreshed.
*
* Parameters
*	bh  : search status of the block; its buffaddr, curr_rec, prev_rec, cycle and cr fields are used.
*	cse : the cw-set-element being recomputed; upd_addr, ondsk_blkver, done and (possibly) blk_checksum are updated.
*	cr  : cache-record of the block; validated before the block contents are relied upon.
*
* Returns
*	cdb_sc_normal   : recomputation succeeded.
*	cdb_sc_lostcr   : cr is NULL or CR_NOTVALID, or cr->read_in_progress is non-negative.
*	cdb_sc_blkmod   : cr->in_tend is set, or the key no longer exists in the block.
*	cdb_sc_blksplit : the block with the new value would exceed the fill size.
*	cdb_sc_mkblk    : rc_set_fragment is non-zero, or BLK_FINI reported failure.
*	any non-normal status returned by gvcst_search_blk or gvincr_compute_post_incr.
*
* This code is very similar to the code in gvcst_put for the non-block-split case as well as
* the code in recompute_upd_array in tp_tend.c. All of these need to be maintained in sync.
* --------------------------------------------------------------------------------------------
*/
enum cdb_sc gvincr_recompute_upd_array(srch_blk_status *bh, struct cw_set_element_struct *cse, cache_rec_ptr_t cr)
{
blk_segment *bs1, *bs_ptr;
char *va;
enum cdb_sc status;
/* NOTE(review): blk_seg_cnt is not named directly below; presumably it is used by the BLK_* macros -- confirm */
int4 blk_size, blk_fill_size, cur_blk_size, blk_seg_cnt, delta, tail_len, new_rec_size;
int4 target_key_size, data_len;
mstr value;
rec_hdr_ptr_t curr_rec_hdr, rp;
sm_uc_ptr_t cp1, buffaddr;
unsigned short rec_size;
jnl_format_buffer *jfb;
blk_hdr_ptr_t old_block;
sgmnt_addrs *csa;
csa = cs_addrs;
assert(!dollar_tlevel); /* this recomputation is currently supported only for non-TP */
/* To support this for TP would require addressing a lot more issues. Examples are
* a) Currently we format jnl records only for explicit updates and not for implicit updates (updates in trigger code).
* All such trigger updates currently happen inside of a TP (even if the explicit update is non-TP, there
* is an implicit TP wrapper). Therefore we need to record more information as to whether this update
* to the database needs a corresponding format of the logical journal record or not.
*/
assert(0 == cse->level); /* better be a leaf-level block */
assert(csa->now_crit);
assert(!cse->level && (gds_t_write == cse->mode) && (NULL == cse->new_buff) && (GDS_WRITE_PLAIN == cse->write_type));
blk_size = cs_data->blk_size; /* "blk_size" is also used by the BLK_FINI macro below */
/* usable size of a block : fill-factor percentage of the block size less the reserved bytes */
blk_fill_size = (blk_size * gv_fillfactor) / 100 - cs_data->reserved_bytes;
/* clues for gv_target involved in recomputation need not be nullified since only the value changes (not the key) */
assert(CR_NOTVALID != (sm_long_t)cr);
/* the cache-record must be usable and must not have a read in progress; otherwise restart */
if (NULL == cr || CR_NOTVALID == (sm_long_t)cr || (0 <= cr->read_in_progress))
{
assert(CDB_STAGNATE > t_tries);
return cdb_sc_lostcr;
}
if (cr->in_tend)
{ /* Possible if this cache-record is being modified concurrently by another process in bg_update_phase2.
* Normally t_qread would have waited for this to complete before returning. But it is possible in some
* cases to bypass t_qread (e.g. gv_target->clue.end is non-zero). In this case we have two options.
* a) Signal a restart. This will cause clue.end to get reset to 0 and will now go through t_qread.
* b) Wait for in_tend to become zero and then proceed. This will save a restart.
* Since we are not in TP the overhead of restarting is not that bad.
* Since we hold crit at this point, we decide not to wait. We choose (a).
*/
assert(CDB_STAGNATE > t_tries);
return cdb_sc_blkmod;
}
buffaddr = bh->buffaddr;
target_key_size = gv_currkey->end + 1;
/* search for the key again in the block; the results in bh->curr_rec and bh->prev_rec are used below */
if (cdb_sc_normal != (status = gvcst_search_blk(gv_currkey, bh)))
{
assert(CDB_STAGNATE > t_tries);
return status;
}
if (target_key_size != bh->curr_rec.match) /* key does not exist, nothing doable here, restart transaction */
{
assert(CDB_STAGNATE > t_tries);
return cdb_sc_blkmod;
}
cur_blk_size = ((blk_hdr_ptr_t)buffaddr)->bsiz;
rp = (rec_hdr_ptr_t)(buffaddr + bh->curr_rec.offset); /* header of the record holding the key */
GET_USHORT(rec_size, &rp->rsiz);
/* length of the existing value : the record holds its header, the key less its first "cmpc" bytes and the value */
data_len = rec_size + rp->cmpc - SIZEOF(rec_hdr) - target_key_size;
if (cdb_sc_normal != (status = gvincr_compute_post_incr(bh)))
{
assert(CDB_STAGNATE > t_tries);
return status;
}
assert(MV_IS_STRING(post_incr_mval)); /* gvincr_compute_post_incr should have set it to be of type MV_STR */
value = post_incr_mval->str;
/* only the value part of the record changes, so the record grows (or shrinks) by the difference in value lengths */
new_rec_size = rec_size - data_len + value.len;
delta = new_rec_size - rec_size;
if ((cur_blk_size + delta) > blk_fill_size)
{ /* new value does not fit within the fill size of this block */
assert(CDB_STAGNATE > t_tries);
return cdb_sc_blksplit;
}
if (0 != rc_set_fragment)
{
assert(CDB_STAGNATE > t_tries);
return cdb_sc_mkblk; /* let gvcst_put do the recomputation out of crit in case of rc_set */
}
/* Note that a lot of the code below relies on the fact that we are in non-TP. For TP we need to do extra stuff */
assert(NULL != update_array);
assert(NULL != update_array_ptr);
assert(0 != update_array_size);
assert(update_array + update_array_size >= update_array_ptr);
assert(1 == cw_set_depth);
/* since cw_set_depth is guaranteed to be 1 (by the above assert), we can be sure that the only update array space we would
* have used is for the current (and only) cw_set_element "cse" and hence can reuse the space by resetting update_array_ptr
*/
assert(ROUND_UP2((INTPTR_T)update_array, UPDATE_ELEMENT_ALIGN_SIZE) == (INTPTR_T)cse->upd_addr);
RESET_UPDATE_ARRAY; /* do not use CHECK_AND_RESET_UPDATE_ARRAY since we are knowingly resetting an active update array */
/* Describe the new block image as a sequence of segments (assumes each BLK_SEG adds one segment in order -- TODO confirm) :
* 1) the part of the block between the block header and the record being changed
* 2) a new record header
* 3) the part of the key not shared with the previous record
* 4) the new (post-increment) value
* 5) whatever follows the old record in the block, if anything
*/
BLK_INIT(bs_ptr, bs1);
BLK_SEG(bs_ptr, buffaddr + SIZEOF(blk_hdr), bh->curr_rec.offset - SIZEOF(blk_hdr));
BLK_ADDR(curr_rec_hdr, SIZEOF(rec_hdr), rec_hdr);
curr_rec_hdr->rsiz = new_rec_size;
curr_rec_hdr->cmpc = bh->prev_rec.match; /* leading key bytes in common with the previous record are not stored */
BLK_SEG(bs_ptr, (sm_uc_ptr_t)curr_rec_hdr, SIZEOF(rec_hdr));
BLK_ADDR(cp1, target_key_size - bh->prev_rec.match, unsigned char);
memcpy(cp1, gv_currkey->base + bh->prev_rec.match, target_key_size - bh->prev_rec.match);
BLK_SEG(bs_ptr, cp1, target_key_size - bh->prev_rec.match);
assert(0 != value.len);
BLK_ADDR(va, value.len, char);
memcpy(va, value.addr, value.len);
BLK_SEG(bs_ptr, (unsigned char *)va, value.len);
rp = (rec_hdr_ptr_t)((sm_uc_ptr_t)rp + rec_size); /* step past the old record */
tail_len = (int4)(cur_blk_size - ((sm_uc_ptr_t)rp - buffaddr)); /* bytes in the block after the old record */
assert(tail_len >= 0); /* else gvincr_compute_post_incr would have returned cdb_sc_rmisalign and we will not be here */
if (tail_len > 0)
{
BLK_SEG(bs_ptr, (sm_uc_ptr_t)rp, tail_len);
}
if (0 == BLK_FINI(bs_ptr, bs1))
{
assert(CDB_STAGNATE > t_tries);
return cdb_sc_mkblk;
}
cse->upd_addr = (unsigned char *)bs1; /* point the cw-set-element at the freshly built segment array */
/* assert that cse->old_block is indeed pointing to the buffer that the cache-record is pointing to.
* this is necessary to ensure that we are copying "ondsk_blkver" from the correct cache-record.
* there is a possibility that this assert might not hold true which is if we are in a restartable situation.
* but in that case do the same check that t_end will perform to determine this.
*/
assert((cse->old_block == (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr)) || (bh->cycle != cr->cycle) || (bh->cr != cr));
cse->ondsk_blkver = cr->ondsk_blkver;
cse->done = FALSE;
/* Reformat the logical SET jnl-record if we need to write logical records. But recompute checksums for PBLK record
* ONLY IF journaling is enabled. Do not need to do this in the case REPL_WAS_ENABLED(csa) is TRUE as replication
* only cares about logical records. Hence the separation of the code below into two "if" blocks.
*/
if (JNL_WRITE_LOGICAL_RECS(csa))
jfb = jnl_format(JNL_SET, gv_currkey, post_incr_mval, 0); /* Re-format the logical SET jnl-record */
if (JNL_ENABLED(csa))
{ /* Recompute checksums in case necessary */
if (csa->jnl_before_image && (NULL != cse->old_block))
{
old_block = (blk_hdr_ptr_t)cse->old_block;
/* a checksum of the before-image is computed only if the block was last updated before the current
* epoch (old_block->tn < epoch_tn); otherwise the checksum is set to 0
*/
if (old_block->tn < csa->jnl->jnl_buff->epoch_tn)
cse->blk_checksum = jnl_get_checksum((uint4 *)old_block, csa, old_block->bsiz);
else
cse->blk_checksum = 0;
}
}
assert(NULL != gv_target);
/* If clue is known to be non-zero, we have the potential for the first_rec part of it to be unreliable.
* Reset it to be safe. See comment in similar section in tp_hist for details on why.
*/
if (gv_target->clue.end)
GVT_CLUE_INVALIDATE_FIRST_REC(gv_target);
return cdb_sc_normal;
}