/****************************************************************
 *								*
 *	Copyright 2004, 2010 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include "gtm_string.h"

#include "cdb_sc.h"
#include "gdsroot.h"
#include "gtm_facility.h"
#include "gdskill.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "gdscc.h"
#include "min_max.h"		/* needed for gdsblkops.h */
#include "gdsblkops.h"
#include "jnl.h"
#include "copy.h"
#include "gvcst_protos.h"	/* for gvcst_search_blk prototypes */
#include "op.h"			/* for add_mvals prototype */
#include "jnl_get_checksum.h"

GBLREF	uint4			dollar_tlevel;
GBLREF	gd_region		*gv_cur_region;
GBLREF	sgmnt_addrs		*cs_addrs;
GBLREF	sgmnt_data_ptr_t	cs_data;
GBLREF	mval			*post_incr_mval;	/* mval pointing to the post-$INCR value */
GBLREF	jnl_format_buffer	*non_tp_jfb_ptr;
GBLREF	jnl_gbls_t		jgbl;
GBLREF	char			*update_array, *update_array_ptr;
GBLREF	int			gv_fillfactor,
				rc_set_fragment;	/* Contains offset within data at which data fragment starts */
GBLREF	unsigned char		cw_set_depth;
GBLREF	gv_key			*gv_currkey;
GBLREF	unsigned int		t_tries;
GBLREF	uint4			update_array_size;
GBLREF	gv_namehead		*gv_target;

/* --------------------------------------------------------------------------------------------
 * This code is very similar to the code in gvcst_put for the non-block-split case as well as
 * the code in recompute_upd_array in tp_tend.c. All of these need to be maintained in sync.
 * --------------------------------------------------------------------------------------------
 *
 * gvincr_recompute_upd_array
 *
 * Rebuilds, while holding crit, the update array (block-segment list) of the single non-TP
 * cw-set-element "cse" so that the record for gv_currkey in the block described by "bh"
 * carries the freshly recomputed post-increment value (post_incr_mval).
 *
 * Parameters:
 *	bh  - search-block status of the leaf block holding the key; bh->buffaddr is the block
 *	      image that is re-searched and copied from.
 *	cse - the cw-set-element being recomputed. On success its upd_addr, ondsk_blkver, done and
 *	      (when before-image journaling applies) blk_checksum fields are rewritten.
 *	cr  - cache-record of the block; used for validity checks and as the source of ondsk_blkver.
 *
 * Returns:
 *	cdb_sc_normal   - update array was rebuilt.
 *	cdb_sc_lostcr   - "cr" is NULL or CR_NOTVALID, or cr->read_in_progress is >= 0.
 *	cdb_sc_blkmod   - cr->in_tend is set, or gv_currkey is no longer an exact match in the block.
 *	cdb_sc_blksplit - the resized record would push the block beyond the fill-factor limit.
 *	cdb_sc_mkblk    - rc_set_fragment is non-zero, or BLK_FINI returned 0.
 *	any non-normal status returned by gvcst_search_blk or gvincr_compute_post_incr.
 *
 * Every non-normal return asserts (CDB_STAGNATE > t_tries), i.e. none of these conditions is
 * expected to arise once t_tries has reached CDB_STAGNATE.
 *
 * Outline:
 *	1) validate the cache-record;
 *	2) re-search the block for gv_currkey and locate the existing record;
 *	3) recompute the post-increment value and the resulting record size;
 *	4) bail out if the block would overflow or if an rc_set fragment is involved;
 *	5) reset the update array and lay out the new block image as a list of segments;
 *	6) refresh the cse bookkeeping, the logical journal record and the before-image checksum.
 */
enum cdb_sc	gvincr_recompute_upd_array(srch_blk_status *bh, struct cw_set_element_struct *cse, cache_rec_ptr_t cr)
{
	blk_segment		*bs1, *bs_ptr;
	char			*va;
	enum cdb_sc		status;
	/* NOTE(review): blk_seg_cnt is never referenced directly in this function; presumably the BLK_INIT/BLK_SEG/BLK_FINI
	 * macros use it by name (as they do blk_size) - confirm in gdsblkops.h before renaming or removing either local.
	 */
	int4			blk_size, blk_fill_size, cur_blk_size, blk_seg_cnt, delta, tail_len, new_rec_size;
	int4			target_key_size, data_len;
	mstr			value;
	rec_hdr_ptr_t		curr_rec_hdr, rp;
	sm_uc_ptr_t		cp1, buffaddr;
	unsigned short		rec_size;
	jnl_format_buffer	*jfb;
	blk_hdr_ptr_t		old_block;
	sgmnt_addrs		*csa;

	csa = cs_addrs;
	assert(!dollar_tlevel);	/* this recomputation is currently supported only for non-TP */
	/* To support this for TP would require addressing a lot more issues. Examples are
	 * a) Currently we format jnl records only for explicit updates and not for implicit updates (updates in trigger code).
	 *	All such trigger updates currently happen inside of a TP (even if the explicit update is non-TP, there
	 *	is an implicit TP wrapper). Therefore we need to record more information as to whether this update
	 *	to the database needs a corresponding format of the logical journal record or not.
	 */
	assert(0 == cse->level);	/* better be a leaf-level block (the next assert re-checks this via !cse->level) */
	assert(csa->now_crit);
	assert(!cse->level && (gds_t_write == cse->mode) && (NULL == cse->new_buff) && (GDS_WRITE_PLAIN == cse->write_type));
	blk_size = cs_data->blk_size;	/* "blk_size" is also used by the BLK_FINI macro below */
	/* Usable bytes in a block: the gv_fillfactor percentage of the block size less the region's reserved bytes */
	blk_fill_size = (blk_size * gv_fillfactor) / 100 - cs_data->reserved_bytes;
	/* clues for gv_target involved in recomputation need not be nullified since only the value changes (not the key) */
	assert(CR_NOTVALID != (sm_long_t)cr);
	/* The CR_NOTVALID test is repeated here so that builds without asserts still refuse an unusable cache-record */
	if (NULL == cr || CR_NOTVALID == (sm_long_t)cr || (0 <= cr->read_in_progress))
	{
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_lostcr;
	}
	if (cr->in_tend)
	{	/* Possible if this cache-record is being modified concurrently by another process in bg_update_phase2.
		 * Normally t_qread would have waited for this to complete before returning. But it is possible in some
		 * cases to bypass t_qread (e.g. gv_target->clue.end is non-zero). In this case we have two options.
		 *	a) Signal a restart. This will cause clue.end to get reset to 0 and will now go through t_qread.
		 *	b) Wait for in_tend to become zero and then proceed. This will save a restart.
		 * Since we are not in TP the overhead of restarting is not that bad.
		 * Since we hold crit at this point, we decide not to wait. We choose (a).
		 */
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_blkmod;
	}
	buffaddr = bh->buffaddr;
	target_key_size = gv_currkey->end + 1;
	/* Re-search the block so that bh->curr_rec and bh->prev_rec describe the record as it is right now */
	if (cdb_sc_normal != (status = gvcst_search_blk(gv_currkey, bh)))
	{
		assert(CDB_STAGNATE > t_tries);
		return status;
	}
	if (target_key_size != bh->curr_rec.match)	/* key does not exist, nothing doable here, restart transaction */
	{
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_blkmod;
	}
	cur_blk_size = ((blk_hdr_ptr_t)buffaddr)->bsiz;
	rp = (rec_hdr_ptr_t)(buffaddr + bh->curr_rec.offset);	/* existing record for gv_currkey */
	GET_USHORT(rec_size, &rp->rsiz);
	/* Length of the value part of the existing record. Presumably the record consists of the record header, the
	 * (target_key_size - rp->cmpc) key bytes that are not compressed away, and then the data - TODO confirm layout.
	 */
	data_len = rec_size + rp->cmpc - SIZEOF(rec_hdr) - target_key_size;
	if (cdb_sc_normal != (status = gvincr_compute_post_incr(bh)))
	{
		assert(CDB_STAGNATE > t_tries);
		return status;
	}
	assert(MV_IS_STRING(post_incr_mval));	/* gvincr_compute_post_incr should have set it to be of type MV_STR */
	value = post_incr_mval->str;
	new_rec_size = rec_size - data_len + value.len;	/* same header and key bytes, new value length */
	delta = new_rec_size - rec_size;		/* growth (or shrinkage if negative) of the block */
	if ((cur_blk_size + delta) > blk_fill_size)
	{	/* the new value no longer fits within the fill factor; this routine does not handle block splits */
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_blksplit;
	}
	if (0 != rc_set_fragment)
	{
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_mkblk;	/* let gvcst_put do the recomputation out of crit in case of rc_set */
	}
	/* Note that a lot of the code below relies on the fact that we are in non-TP. For TP we need to do extra stuff */
	assert(NULL != update_array);
	assert(NULL != update_array_ptr);
	assert(0 != update_array_size);
	assert(update_array + update_array_size >= update_array_ptr);
	assert(1 == cw_set_depth);
	/* since cw_set_depth is guaranteed to be 1 (by the above assert), we can be sure that the only update array space we would
	 * have used is for the current (and only) cw_set_element "cse" and hence can reuse the space by resetting update_array_ptr
	 */
	assert(ROUND_UP2((INTPTR_T)update_array, UPDATE_ELEMENT_ALIGN_SIZE) == (INTPTR_T)cse->upd_addr);
	RESET_UPDATE_ARRAY; /* do not use CHECK_AND_RESET_UPDATE_ARRAY since we are knowingly resetting an active update array */
	/* Describe the new block image as an ordered list of segments. The order of the BLK_ADDR/BLK_SEG calls below is
	 * the order of the bytes in the resulting block, so these statements must not be rearranged.
	 */
	BLK_INIT(bs_ptr, bs1);
	/* segment 1 : everything between the block header and the record being replaced, taken from the existing buffer */
	BLK_SEG(bs_ptr, buffaddr + SIZEOF(blk_hdr), bh->curr_rec.offset - SIZEOF(blk_hdr));
	/* segment 2 : a newly built record header carrying the new record size and the match count with the previous record */
	BLK_ADDR(curr_rec_hdr, SIZEOF(rec_hdr), rec_hdr);
	curr_rec_hdr->rsiz = new_rec_size;
	curr_rec_hdr->cmpc = bh->prev_rec.match;
	BLK_SEG(bs_ptr, (sm_uc_ptr_t)curr_rec_hdr, SIZEOF(rec_hdr));
	/* segment 3 : the bytes of gv_currkey that follow the prefix shared with the previous record */
	BLK_ADDR(cp1, target_key_size - bh->prev_rec.match, unsigned char);
	memcpy(cp1, gv_currkey->base + bh->prev_rec.match, target_key_size - bh->prev_rec.match);
	BLK_SEG(bs_ptr, cp1, target_key_size - bh->prev_rec.match);
	/* segment 4 : a private copy of the post-increment value */
	assert(0 != value.len);
	BLK_ADDR(va, value.len, char);
	memcpy(va, value.addr, value.len);
	BLK_SEG(bs_ptr, (unsigned char *)va, value.len);
	/* segment 5 (optional) : whatever follows the old record in the existing buffer, up to the current block size */
	rp = (rec_hdr_ptr_t)((sm_uc_ptr_t)rp + rec_size);
	tail_len = (int4)(cur_blk_size - ((sm_uc_ptr_t)rp - buffaddr));
	assert(tail_len >= 0);	/* else gvincr_compute_post_incr would have returned cdb_sc_rmisalign and we will not be here */
	if (tail_len > 0)
	{
		BLK_SEG(bs_ptr, (sm_uc_ptr_t)rp, tail_len);
	}
	if (0 == BLK_FINI(bs_ptr, bs1))
	{
		assert(CDB_STAGNATE > t_tries);
		return cdb_sc_mkblk;
	}
	cse->upd_addr = (unsigned char *)bs1;
	/* assert that cse->old_block is indeed pointing to the buffer that the cache-record is pointing to.
	 * this is necessary to ensure that we are copying "ondsk_blkver" from the correct cache-record.
	 * there is a possibility that this assert might not hold true which is if we are in a restartable situation.
	 * but in that case do the same check that t_end will perform to determine this.
	 */
	assert((cse->old_block == (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr)) || (bh->cycle != cr->cycle) || (bh->cr != cr));
	cse->ondsk_blkver = cr->ondsk_blkver;
	cse->done = FALSE;
	/* Reformat the logical SET jnl-record if we need to write logical records. But recompute checksums for PBLK record
	 * ONLY IF journaling is enabled. Do not need to do this in the case REPL_WAS_ENABLED(csa) is TRUE as replication
	 * only cares about logical records. Hence the separation of the code below into two "if" blocks.
	 */
	/* NOTE(review): jfb is assigned here but never read afterwards in this function - confirm whether a check of the
	 * jnl_format return value (e.g. against non_tp_jfb_ptr, which is GBLREF'd above but otherwise unused) is intended.
	 */
	if (JNL_WRITE_LOGICAL_RECS(csa))
		jfb = jnl_format(JNL_SET, gv_currkey, post_incr_mval, 0);	/* Re-format the logical SET jnl-record */
	if (JNL_ENABLED(csa))
	{	/* Recompute checksums in case necessary */
		if (csa->jnl_before_image && (NULL != cse->old_block))
		{
			old_block = (blk_hdr_ptr_t)cse->old_block;
			/* A checksum is computed only when the old block's tn is older than the journal buffer's epoch_tn;
			 * otherwise the checksum is cleared to 0.
			 */
			if (old_block->tn < csa->jnl->jnl_buff->epoch_tn)
				cse->blk_checksum = jnl_get_checksum((uint4 *)old_block, csa, old_block->bsiz);
			else
				cse->blk_checksum = 0;
		}
	}
	assert(NULL != gv_target);
	/* If clue is known to be non-zero, we have the potential for the first_rec part of it to be unreliable.
	 * Reset it to be safe. See comment in similar section in tp_hist for details on why.
	 */
	if (gv_target->clue.end)
		GVT_CLUE_INVALIDATE_FIRST_REC(gv_target);
	return cdb_sc_normal;
}