/****************************************************************
 *								*
 *	Copyright 2001, 2011 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license. If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include <stddef.h>

#include "gtm_stdio.h"
#include "gtm_stdlib.h"
#include "gtm_string.h"
#include "gtm_inet.h"	/* Required for gtmsource.h */
#ifdef VMS
#include <descrip.h>	/* Required for gtmsource.h */
#endif

#include "gdsroot.h"
#include "gdskill.h"
#include "gdsblk.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "cdb_sc.h"
#include "min_max.h"		/* needed for gdsblkops.h */
#include "gdsblkops.h"
#include "jnl.h"
#include "gdscc.h"
#include "copy.h"
#include "buddy_list.h"		/* needed for tp.h */
#include "hashtab_int4.h"	/* needed for tp.h */
#include "tp.h"
#include "rc_oflow.h"
#include "repl_msg.h"
#include "gtmsource.h"
#include "rtnhdr.h"
#include "stack_frame.h"
#ifdef GTM_TRIGGER
# include "gv_trigger.h"
# include "gtm_trigger.h"
# include "gv_trigger_protos.h"
# include "subscript.h"
# include "mv_stent.h"
# include "stringpool.h"
#endif
#include "tp_frame.h"
#include "tp_restart.h"

/* Include prototypes */
#include "t_write.h"
#include "t_write_root.h"
#include "t_end.h"
#include "t_retry.h"
#include "t_begin.h"
#include "t_create.h"
#include "gvcst_blk_build.h"
#include "gvcst_expand_key.h"
#include "gvcst_protos.h"	/* for gvcst_search,gvcst_search_blk,gvcst_put prototype */
#include "op.h"			/* for op_add & op_tstart prototype */
#include "format_targ_key.h"	/* for format_targ_key prototype */
#include "gvsub2str.h"		/* for gvsub2str prototype */
#include "tp_set_sgm.h"		/* for tp_set_sgm prototype */
#include "op_tcommit.h"		/* for op_tcommit prototype */
#include "have_crit.h"

#ifdef GTM_TRIGGER
LITREF	mval	literal_null;
LITREF	mval	literal_one;
LITREF	mval	literal_zero;
#endif

/* Globals that will not change in value across nested trigger calls of gvcst_put OR, even if they might change in value,
 * the change is such that they don't need save/restore logic surrounding the "gtm_trigger" call. Any new GBLREFs that are
 * added in this module need to be examined for interference between gvcst_put and nested trigger calls, and any save/restore
 * logic (if needed) should be appropriately added surrounding the "gtm_trigger" invocation.
 */
GBLREF	boolean_t		gvdupsetnoop;	/* if TRUE, duplicate SETs update journal but not database (except for curr_tn++) */
GBLREF	boolean_t		horiz_growth;
GBLREF	boolean_t		in_gvcst_incr;
GBLREF	char			*update_array, *update_array_ptr;
GBLREF	gv_key			*gv_altkey;
GBLREF	gv_namehead		*reset_gv_target;
GBLREF	inctn_opcode_t		inctn_opcode;
GBLREF	int			gv_fillfactor;
GBLREF	int			rc_set_fragment;	/* Contains offset within data at which data fragment starts */
GBLREF	int4			gv_keysize;
GBLREF	int4			prev_first_off, prev_next_off;
GBLREF	uint4			update_trans;
GBLREF	jnl_format_buffer	*non_tp_jfb_ptr;
GBLREF	jnl_gbls_t		jgbl;
GBLREF	jnlpool_addrs		jnlpool;
GBLREF	uint4			dollar_tlevel;
GBLREF	uint4			process_id;
GBLREF	uint4			update_array_size, cumul_update_array_size;	/* the current total size of the update array */
GBLREF	unsigned char		t_fail_hist[CDB_MAX_TRIES];
GBLREF	unsigned int		t_tries;
GBLREF	cw_set_element		cw_set[CDB_CW_SET_SIZE];	/* create write set */
GBLREF	boolean_t		skip_dbtriggers;	/* see gbldefs.c for description of this global */
GBLREF	stack_frame		*frame_pointer;
#ifdef GTM_TRIGGER
GBLREF	int			tprestart_state;
GBLREF	int4			gtm_trigger_depth;
GBLREF	int4			tstart_trigger_depth;
GBLREF	boolean_t		skip_INVOKE_RESTART;
GBLREF	boolean_t		ztwormhole_used;	/* TRUE if $ztwormhole was used by trigger code */
#endif
#ifdef DEBUG
GBLREF	boolean_t		skip_block_chain_tail_check;
#endif

/* Globals that could change in value across nested trigger calls of gvcst_put AND need to be saved/restored */
GBLREF	boolean_t		is_dollar_incr;
GBLREF	gd_region		*gv_cur_region;
GBLREF	gv_key			*gv_currkey;
GBLREF	gv_namehead		*gv_target;
GBLREF	mval			*post_incr_mval;
GBLREF	mval			increment_delta_mval;
GBLREF	sgm_info		*sgm_info_ptr;
GBLREF	sgmnt_addrs		*cs_addrs;
GBLREF	sgmnt_data_ptr_t	cs_data;

error_def(ERR_GVINCRISOLATION);
error_def(ERR_GVIS);
error_def(ERR_GVPUTFAIL);
error_def(ERR_REC2BIG);
error_def(ERR_RSVDBYTE2HIGH);
error_def(ERR_TPRETRY);

/* Before issuing an error, add GVT to the list of known gvts in this TP transaction in case it is not already done.
 * This GVT addition is usually done by "tp_hist" but that function has most likely not yet been invoked in gvcst_put.
 * Doing this addition will ensure we remember to reset any non-zero clue in dir_tree as part of tp_clean_up when a TROLLBACK
 * or TRESTART (implicit or explicit) occurs. Not doing so could cause transfer of control from the current gvcst_put action
 * to a user-defined error trap which, if it does further database references, could end up using invalid clues from GVT
 * and potentially incorrectly commit the transaction, causing db integ errors as well.
 */
#define ENSURE_VALUE_WITHIN_MAX_REC_SIZE(value, GVT)								\
{														\
	if (dollar_tlevel)											\
		ADD_TO_GVT_TP_LIST(GVT);	/* note: macro also updates read_local_tn if necessary */	\
	if (gv_currkey->end + 1 + value.len + SIZEOF(rec_hdr) > gv_cur_region->max_rec_size)			\
	{													\
		if (0 == (end = format_targ_key(buff, MAX_ZWR_KEY_SZ, gv_currkey, TRUE)))			\
			end = &buff[MAX_ZWR_KEY_SZ - 1];							\
		rts_error(VARLSTCNT(10) ERR_REC2BIG, 4, gv_currkey->end + 1 + value.len + SIZEOF(rec_hdr),	\
			(int4)gv_cur_region->max_rec_size,							\
			REG_LEN_STR(gv_cur_region), ERR_GVIS, 2, end - buff, buff);				\
	}													\
}

/* See comment before ENSURE_VALUE_WITHIN_MAX_REC_SIZE macro definition for why the ADD_TO_GVT_TP_LIST call below is necessary */
#define ISSUE_RSVDBYTE2HIGH_ERROR(GVT)										\
{														\
	if (dollar_tlevel)											\
		ADD_TO_GVT_TP_LIST(GVT);	/* note: macro also updates read_local_tn if necessary */	\
	/* The record that is newly inserted/updated does not fit by itself in a separate block		\
	 * if the current reserved-bytes for this database is taken into account. Cannot go on.		\
	 */													\
	if (0 == (end = format_targ_key(buff, MAX_ZWR_KEY_SZ, gv_currkey, TRUE)))				\
		end = &buff[MAX_ZWR_KEY_SZ - 1];								\
	rts_error(VARLSTCNT(11) ERR_RSVDBYTE2HIGH, 5, new_blk_size_single,					\
		REG_LEN_STR(gv_cur_region), blk_size, blk_reserved_bytes,					\
		ERR_GVIS, 2, end - buff, buff);									\
}

#define RESTORE_ZERO_GVT_ROOT_ON_RETRY(LCL_ROOT, GV_TARGET, TP_ROOT, DIR_HIST, DIR_TREE)			\
{														\
	if (!LCL_ROOT)												\
	{													\
		assert(NULL != DIR_HIST);									\
		assert(DIR_TREE == GV_TARGET->gd_csa->dir_tree);						\
		/* t_retry only resets gv_target->clue and not the clue of the directory tree.			\
		 * But DIR_HIST non-null implies the directory tree was used in a gvcst_search and hence	\
		 * was validated (in t_end/tp_hist), so we need to reset its clue before the next try.		\
		 */												\
		DIR_TREE->clue.end = 0;										\
		/* Check if LCL_ROOT & GV_TARGET->root are in sync. If not, make them so.			\
*/ \ if (GV_TARGET->root) \ { /* We had reset the root block from zero to a non-zero value within \ * this function, but since we are restarting, we can no longer be \ * sure of the validity of the root block. Reset it to 0 so it will \ * be re-determined in the next global reference. \ */ \ assert((TP_ROOT == GV_TARGET->root) \ || ((0 == TP_ROOT) GTMTRIG_ONLY(&& (0 < gvtr_parms.num_triggers_invoked)))); \ GV_TARGET->root = 0; \ } \ } \ } #ifdef DEBUG # define DBG_SAVE_VAL_AT_FUN_ENTRY \ { /* Save copy of "val" at function entry. \ * Make sure this is not touched by any nested trigger code */ \ dbg_lcl_val = val; \ dbg_vallen = val->str.len; \ memcpy(dbg_valbuff, val->str.addr, MIN(ARRAYSIZE(dbg_valbuff), dbg_vallen)); \ } # define DBG_CHECK_VAL_AT_FUN_EXIT \ { /* Check "val" is same as what it was at function entry.(i.e. was not touched by nested trigger code). \ * The only exception is if $ZTVAL changed "val" in which case gvcst_put would have been redone. */ \ assert(dbg_vallen == dbg_lcl_val->str.len); \ assert(0 == memcmp(dbg_valbuff, dbg_lcl_val->str.addr, MIN(ARRAYSIZE(dbg_valbuff), dbg_vallen))); \ } #else # define DBG_SAVE_VAL_AT_FUN_ENTRY # define DBG_CHECK_VAL_AT_FUN_EXIT #endif #define GOTO_RETRY \ { \ GTMTRIG_DBG_ONLY(dbg_trace_array[dbg_num_iters].retry_line = __LINE__); \ goto retry; \ } void gvcst_put(mval *val) { sgmnt_addrs *csa; sgmnt_data_ptr_t csd; node_local_ptr_t cnl; int4 blk_size, blk_fill_size, blk_reserved_bytes; const int4 zeroes = 0; boolean_t jnl_format_done; blk_segment *bs1, *bs_ptr, *new_blk_bs; block_id allocation_clue, tp_root, gvt_for_root, blk_num, last_split_blk_num[MAX_BT_DEPTH]; block_index left_hand_index, ins_chain_index, root_blk_cw_index, next_blk_index; block_offset next_offset, first_offset, ins_off1, ins_off2, old_curr_chain_next_off; cw_set_element *cse, *cse_new, *old_cse; gv_namehead *save_targ, *split_targ, *dir_tree; enum cdb_sc status; gv_key *temp_key; mstr value; off_chain chain1, curr_chain, prev_chain, chain2; rec_hdr_ptr_t curr_rec_hdr, extra_rec_hdr, next_rec_hdr, new_star_hdr, rp; srch_blk_status *bh, *bq, *tp_srch_status; srch_hist *dir_hist; int cur_blk_size, blk_seg_cnt, delta, i, j, left_hand_offset, n, ins_chain_offset, new_blk_size_l, new_blk_size_r, new_blk_size_single, new_blk_size, blk_reserved_size, last_possible_left_offset, new_rec_size, next_rec_shrink, next_rec_shrink1, offset_sum, rec_cmpc, target_key_size, tp_lev, undo_index, cur_val_offset, curr_offset, bh_level; uint4 segment_update_array_size, key_top, cp2_len, bs1_2_len, bs1_3_len; char *va, last_split_direction[MAX_BT_DEPTH]; sm_uc_ptr_t cp1, cp2, curr; unsigned short extra_record_orig_size, rec_size, temp_short; unsigned int prev_rec_offset, prev_rec_match, curr_rec_offset, curr_rec_match; boolean_t copy_extra_record, level_0, new_rec, no_pointers, succeeded, key_exists; boolean_t make_it_null, gbl_target_was_set, duplicate_set, new_rec_goes_to_right, need_extra_block_split; key_cum_value *tempkv; jnl_format_buffer *jfb, *ztworm_jfb; jnl_action *ja; mval *set_val; /* actual right-hand-side value of the SET or $INCR command */ ht_ent_int4 *tabent; unsigned char buff[MAX_ZWR_KEY_SZ], *end, old_ch, new_ch; sm_uc_ptr_t buffaddr; block_id lcl_root, last_split_bnum; sgm_info *si; uint4 nodeflags; boolean_t write_logical_jnlrecs, can_write_logical_jnlrecs, blk_match, is_split_dir_left; int split_depth; mval *ja_val; int rc; int4 cse_first_off; enum split_dir last_split_dir; # ifdef GTM_TRIGGER boolean_t is_tpwrap; boolean_t ztval_gvcst_put_redo, skip_hasht_read; 
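	/* A note on the two flags above: per the $ZTVAL comment in DBG_CHECK_VAL_AT_FUN_EXIT earlier in this file,
	 * ztval_gvcst_put_redo appears to be set when trigger code changes $ZTVALUE, causing this put to be redone
	 * with the new value; skip_hasht_read then appears to suppress re-reading the trigger definitions on that
	 * redo (see its use with GVTR_INIT_AND_TPWRAP_IF_NEEDED below).
	 */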
	gtm_trigger_parms	trigparms;
	gvt_trigger_t		*gvt_trigger;
	gvtr_invoke_parms_t	gvtr_parms;
	int			gtm_trig_status;
	int4			data_len;
	unsigned char		*save_msp;
	mv_stent		*save_mv_chain;
	mval			*ztold_mval = NULL;
	mval			*ztval_mval;
	boolean_t		lcl_implicit_tstart;		/* local copy of the global variable "implicit_tstart" */
	mval			lcl_increment_delta_mval;	/* local copy of "increment_delta_mval" */
	boolean_t		lcl_is_dollar_incr;	/* local copy of is_dollar_incr taken at start of module.
							 * used to restore is_dollar_incr in case of TP restarts */
	mval			*lcl_post_incr_mval;	/* local copy of "post_incr_mval" at function entry.
							 * used to restore "post_incr_mval" in case of TP restarts */
	mval			*lcl_val;		/* local copy of "val" at function entry.
							 * used to restore "val" in case of TP restarts */
#	endif
#	ifdef DEBUG
	char			dbg_valbuff[256];
	mstr_len_t		dbg_vallen;
	mval			*dbg_lcl_val;
	int			dbg_num_iters = -1;	/* number of iterations through gvcst_put */
	int			lcl_dollar_tlevel, lcl_t_tries;
	typedef struct
	{
		unsigned int	t_tries;
		int		retry_line;
		boolean_t	is_fresh_tn_start;
		boolean_t	is_dollar_incr;
		boolean_t	ztval_gvcst_put_redo;
		boolean_t	is_extra_block_split;
		mval		*val;
		boolean_t	lcl_implicit_tstart;
	} dbg_trace;
	/* We want to capture all pertinent information across each iteration of gvcst_put.
	 * There are 3 things that can contribute to a new iteration.
	 *	a) restarts from the primary set. Max of 4 iterations.
	 *	b) extra_block_split from the primary set. It can have its own set of restarts too.
	 *	   Max of 4 iterations per extra_block_split. The # of extra block splits could be arbitrary
	 *	   in case of non-TP but cannot be more than 1 for TP, because in TP we would have grabbed crit
	 *	   in the final retry, preventing any more concurrent updates.
	 *	c) ztval_gvcst_put_redo. This in turn can have its own set of restarts and extra_block_split iterations.
	 *	   Could take a max of (a) + (b) = 4 + 4 = 8 iterations.
	 * Total of 16 max iterations. If ever a transaction goes for more than this # of iterations (theoretically
	 * possible in non-TP if a lot of extra block splits occur), we assert fail.
	 */
	dbg_trace		dbg_trace_array[16];
	boolean_t		is_fresh_tn_start;
	boolean_t		is_mm;
#	endif

	is_dollar_incr = in_gvcst_incr;
	in_gvcst_incr = FALSE;
	csa = cs_addrs;
	csd = csa->hdr;
	cnl = csa->nl;
	assert(csd == cs_data);
	DEBUG_ONLY(is_mm = (dba_mm == csd->acc_meth);)
#	ifdef GTM_TRIGGER
	TRIG_CHECK_REPLSTATE_MATCHES_EXPLICIT_UPDATE(gv_cur_region, csa);
	assert(!dollar_tlevel || (tstart_trigger_depth <= gtm_trigger_depth));
	if (!dollar_tlevel || (gtm_trigger_depth == tstart_trigger_depth))
	{	/* This is an explicit update. Set ztwormhole_used to FALSE. Note that we initialize this only at the
		 * beginning of the transaction and not at the beginning of each try/retry. If the application used
		 * $ztwormhole in any restarting try of the transaction, we consider it necessary to write the
		 * TZTWORM/UZTWORM record even though it was not used in the succeeding/committing try.
		 */
		ztwormhole_used = FALSE;
	}
#	endif
	JNLPOOL_INIT_IF_NEEDED(csa, csd, cnl);
	blk_size = csd->blk_size;
	blk_reserved_bytes = csd->reserved_bytes;
	blk_fill_size = (blk_size * gv_fillfactor) / 100 - blk_reserved_bytes;
	jnl_format_done = FALSE;	/* do "jnl_format" only once per logical non-tp transaction irrespective of number of retries */
	GTMTRIG_ONLY(
		ztval_gvcst_put_redo = FALSE;
		skip_hasht_read = FALSE;
	)
	assert(('\0' != gv_currkey->base[0]) && gv_currkey->end);
	DBG_CHECK_GVTARGET_GVCURRKEY_IN_SYNC;
	/* this needs to be initialized before any code that does a "goto retry" since this gets used there */
	save_targ = gv_target;
	if (INVALID_GV_TARGET != reset_gv_target)
		gbl_target_was_set = TRUE;
	else
	{
		gbl_target_was_set = FALSE;
		reset_gv_target = save_targ;
	}
	DBG_SAVE_VAL_AT_FUN_ENTRY;
	GTMTRIG_ONLY(
		lcl_implicit_tstart = FALSE;
		DEBUG_ONLY(gvtr_parms.num_triggers_invoked = -1;)	/* set to an out-of-design value; checked by an assert */
	)
	DEBUG_ONLY(
		status = cdb_sc_normal;
		lcl_dollar_tlevel = dollar_tlevel;
	)
fresh_tn_start:
	DEBUG_ONLY(lcl_t_tries = -1;)
	DEBUG_ONLY(is_fresh_tn_start = TRUE;)
	assert(!jnl_format_done || (dollar_tlevel GTMTRIG_ONLY(&& ztval_gvcst_put_redo)));
	T_BEGIN_SETORKILL_NONTP_OR_TP(ERR_GVPUTFAIL);
tn_restart:
	/* t_tries should never decrease - it either increases or stays the same. If it should decrease, we could live-lock
	 * with an oscillating t_tries and never reach CDB_STAGNATE (i.e. never go from optimistic to pessimistic concurrency).
	 * Since we typically do a normal increment and then, for certain conditions, do a complementary decrement, we assert
	 * that the net effect is never a decrease.
	 */
	assert(csa == cs_addrs);	/* no amount of retries should change cs_addrs from what it was at entry into gvcst_put */
	assert((((int)t_tries) > lcl_t_tries) || (CDB_STAGNATE == t_tries));
	DEBUG_ONLY(lcl_t_tries = t_tries;)	/* update lcl_t_tries */
	DEBUG_ONLY(
		dbg_num_iters++;
		assert(dbg_num_iters < ARRAYSIZE(dbg_trace_array));
		dbg_trace_array[dbg_num_iters].is_fresh_tn_start = is_fresh_tn_start;
		dbg_trace_array[dbg_num_iters].t_tries = t_tries;
		is_fresh_tn_start = FALSE;
		dbg_trace_array[dbg_num_iters].is_dollar_incr = is_dollar_incr;
		GTMTRIG_ONLY(dbg_trace_array[dbg_num_iters].ztval_gvcst_put_redo = ztval_gvcst_put_redo;)
		dbg_trace_array[dbg_num_iters].val = val;
		GTMTRIG_ONLY(dbg_trace_array[dbg_num_iters].lcl_implicit_tstart = lcl_implicit_tstart;)
		dbg_trace_array[dbg_num_iters].is_extra_block_split = FALSE;
		dbg_trace_array[dbg_num_iters].retry_line = 0;
		split_targ = NULL;
	)
	/* If MM and a file extension occurred, reset csd to cs_data to avoid an out-of-date value. If BG, we don't need the
	 * reset, but 'if' checks are costlier than unconditional sets on a pipelined architecture, so we choose not to do
	 * the check.
	 */
	assert(is_mm || (csd == cs_data));
	csd = cs_data;
#	ifdef GTM_TRIGGER
	gvtr_parms.num_triggers_invoked = 0;	/* clear any leftover value */
	assert(!ztval_gvcst_put_redo || IS_PTR_INSIDE_M_STACK(val));
	is_tpwrap = FALSE;
	if (!skip_dbtriggers && !skip_hasht_read)
	{
		GVTR_INIT_AND_TPWRAP_IF_NEEDED(csa, csd, gv_target, gvt_trigger, lcl_implicit_tstart, is_tpwrap, ERR_GVPUTFAIL);
		assert(gvt_trigger == gv_target->gvt_trigger);
		if (is_tpwrap)
		{	/* The above call to the GVTR_INIT* macro created a TP transaction (by invoking op_tstart).
			 * Save all pertinent global variable information that needs to be restored in case of
			 * a restart.
Note that the restart could happen in a nested trigger so these global * variables could have changed in value from what they were at gvcst_put entry, hence * the need to save/restore them. If this is not an implicitly tp wrapped transaction, * there is no need to do this save/restore because a restart will transfer control * back to the M code corresponding to the start of the transaction which would * automatically initialize these global variables to the appropriate values. */ assert(lcl_implicit_tstart); lcl_is_dollar_incr = is_dollar_incr; lcl_val = val; lcl_post_incr_mval = post_incr_mval; lcl_increment_delta_mval = increment_delta_mval; } if (NULL != gvt_trigger) PUSH_ZTOLDMVAL_ON_M_STACK(ztold_mval, save_msp, save_mv_chain); } # endif assert(csd == cs_data); /* assert csd is in sync with cs_data even if there were MM db file extensions */ si = sgm_info_ptr; /* Cannot be moved before GVTR_INIT_AND_TPWRAP_IF_NEEDED macro since we could enter gvcst_put * with sgm_info_ptr NULL but could tpwrap a non-tp transaction due to triggers. In that case * we want the updated sgm_info_ptr to be noted down in si and used later. */ assert((NULL == si) || (si->update_trans)); assert(NULL != update_array); assert(NULL != update_array_ptr); assert(0 != update_array_size); assert(update_array + update_array_size >= update_array_ptr); /* When the following two asserts trip, we should change the data types of prev_first_off * and prev_next_off, so they satisfy the assert. */ assert(SIZEOF(prev_first_off) >= SIZEOF(block_offset)); assert(SIZEOF(prev_next_off) >= SIZEOF(block_offset)); prev_first_off = prev_next_off = PREV_OFF_INVALID; horiz_growth = FALSE; assert(t_tries < CDB_STAGNATE || csa->now_crit); /* we better hold crit in the final retry (TP & non-TP) */ /* level_0 == true and no_pointers == false means that this is a directory tree data block containing pointers to roots */ level_0 = no_pointers = TRUE; assert(gv_altkey->top == gv_currkey->top); assert(gv_altkey->top == gv_keysize); assert(gv_currkey->end < gv_currkey->top); assert(gv_altkey->end < gv_altkey->top); temp_key = gv_currkey; dir_hist = NULL; ins_chain_index = 0; lcl_root = gv_target->root; tp_root = lcl_root; if (!dollar_tlevel) { CHECK_AND_RESET_UPDATE_ARRAY; /* reset update_array_ptr to update_array */ } else { segment_update_array_size = UA_NON_BM_SIZE(csd); ENSURE_UPDATE_ARRAY_SPACE(segment_update_array_size); curr_chain = *(off_chain *)&lcl_root; if (curr_chain.flag == 1) { tp_get_cw(si->first_cw_set, (int)curr_chain.cw_index, &cse); tp_root = cse->blk; } } if (0 == tp_root) { /* Global does not exist as far as we know. Creating a new one requires validating the directory tree path which * led us to this conclusion. So scan the directory tree here and validate its history at the end of this function. * If we decide to restart due to a concurrency conflict, remember to reset gv_target->root to 0 before restarting. 
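	 * (The RESTORE_ZERO_GVT_ROOT_ON_RETRY macro defined above encapsulates that root/clue reset for the retry path.)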
*/ gv_target = dir_tree = csa->dir_tree; for (cp1 = temp_key->base, cp2 = gv_altkey->base; 0 != *cp1;) *cp2++ = *cp1++; *cp2++ = 0; *cp2 = 0; gv_altkey->end = cp2 - gv_altkey->base; assert(gv_altkey->end <= gv_altkey->top); dir_hist = &gv_target->hist; status = gvcst_search(gv_altkey, NULL); RESET_GV_TARGET_LCL(save_targ); if (cdb_sc_normal != status) GOTO_RETRY; if (gv_altkey->end + 1 == dir_hist->h[0].curr_rec.match) { GET_LONG(tp_root, (dir_hist->h[0].buffaddr + SIZEOF(rec_hdr) + dir_hist->h[0].curr_rec.offset + gv_altkey->end + 1 - ((rec_hdr_ptr_t)(dir_hist->h[0].buffaddr + dir_hist->h[0].curr_rec.offset))->cmpc)); if (dollar_tlevel) { gvt_for_root = dir_hist->h[0].blk_num; curr_chain = *(off_chain *)&gvt_for_root; if (curr_chain.flag == 1) tp_get_cw(si->first_cw_set, curr_chain.cw_index, &cse); else { if (NULL != (tabent = lookup_hashtab_int4(si->blks_in_use, (uint4 *)&gvt_for_root))) tp_srch_status = tabent->value; else tp_srch_status = NULL; cse = tp_srch_status ? tp_srch_status->cse : NULL; } assert(!cse || !cse->high_tlevel); } assert(0 == gv_target->root); gv_target->root = tp_root; } } blk_reserved_size = blk_size - blk_reserved_bytes; if (0 == tp_root) { /* there is no entry in the GVT (and no root), so create a new empty tree and put the name in the GVT */ /* Create the data block */ key_exists = FALSE; if (is_dollar_incr) { /* The global variable that is being $INCREMENTed does not exist. * $INCREMENT() should not signal UNDEF error but proceed with an implicit $GET(). */ assert(dollar_tlevel ? si->update_trans : update_trans); *post_incr_mval = *val; MV_FORCE_NUM(post_incr_mval); post_incr_mval->mvtype &= ~MV_STR; /* needed to force any alphanumeric string to numeric */ MV_FORCE_STR(post_incr_mval); assert(post_incr_mval->str.len); value = post_incr_mval->str; /* The MAX_REC_SIZE check could not be done in op_gvincr (like is done in op_gvput) because * the post-increment value is not known until here. so do the check here. */ ENSURE_VALUE_WITHIN_MAX_REC_SIZE(value, dir_tree); } else value = val->str; /* Potential size of a GVT leaf block containing just the new/updated record */ new_blk_size_single = SIZEOF(blk_hdr) + SIZEOF(rec_hdr) + temp_key->end + 1 + value.len; if (new_blk_size_single > blk_reserved_size) { /* The record that is newly inserted/updated does not fit by itself in a separate block * if the current reserved-bytes for this database is taken into account. Cannot go on. 
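		 * Illustrative arithmetic only (header sizes are platform defined): with blk_size = 1024 and
		 * blk_reserved_bytes = 64, blk_reserved_size is 960; a key with temp_key->end + 1 = 12 and a value.len
		 * of 940 would need SIZEOF(blk_hdr) + SIZEOF(rec_hdr) + 12 + 940 bytes (972 with, say, a 16-byte blk_hdr
		 * and a 4-byte rec_hdr), which exceeds 960, so this error is issued.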
*/ ISSUE_RSVDBYTE2HIGH_ERROR(dir_tree); } BLK_ADDR(curr_rec_hdr, SIZEOF(rec_hdr), rec_hdr); curr_rec_hdr->rsiz = SIZEOF(rec_hdr) + temp_key->end + 1 + value.len; curr_rec_hdr->cmpc = 0; BLK_INIT(bs_ptr, new_blk_bs); BLK_SEG(bs_ptr, (sm_uc_ptr_t)curr_rec_hdr, SIZEOF(rec_hdr)); BLK_ADDR(cp1, temp_key->end + 1, unsigned char); memcpy(cp1, temp_key->base, temp_key->end + 1); BLK_SEG(bs_ptr, cp1, temp_key->end + 1); if (0 != value.len) { BLK_ADDR(va, value.len, char); memcpy(va, value.addr, value.len); BLK_SEG(bs_ptr, (unsigned char *)va, value.len); } if (0 == BLK_FINI(bs_ptr, new_blk_bs)) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_mkblk; GOTO_RETRY; } assert(new_blk_bs[0].len <= blk_reserved_size); /* Assert that new block has space for reserved bytes */ /* Create the index block */ BLK_ADDR(curr_rec_hdr, SIZEOF(rec_hdr), rec_hdr); curr_rec_hdr->rsiz = BSTAR_REC_SIZE; curr_rec_hdr->cmpc = 0; BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, (sm_uc_ptr_t)curr_rec_hdr, SIZEOF(rec_hdr)); BLK_SEG(bs_ptr, (unsigned char *)&zeroes, SIZEOF(block_id)); if (0 == BLK_FINI(bs_ptr, bs1)) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_mkblk; GOTO_RETRY; } assert(bs1[0].len <= blk_reserved_size); /* Assert that new block has space for reserved bytes */ allocation_clue = ALLOCATION_CLUE(csd->trans_hist.total_blks); next_blk_index = t_create(allocation_clue, (uchar_ptr_t)new_blk_bs, 0, 0, 0); ++allocation_clue; ins_chain_index = t_create(allocation_clue, (uchar_ptr_t)bs1, SIZEOF(blk_hdr) + SIZEOF(rec_hdr), next_blk_index, 1); root_blk_cw_index = ins_chain_index; temp_key = gv_altkey; gv_target->hist.h[0].blk_num = HIST_TERMINATOR; gv_target = dir_tree; bh = &gv_target->hist.h[0]; value.len = SIZEOF(block_id); value.addr = (char *)&zeroes; no_pointers = FALSE; } else { if (cdb_sc_normal != (status = gvcst_search(gv_currkey, NULL))) GOTO_RETRY; target_key_size = gv_currkey->end + 1; bh = &gv_target->hist.h[0]; key_exists = (target_key_size == bh->curr_rec.match); if (is_dollar_incr) { if (key_exists) { /* $INCR is being done on an existing global variable key in the database. * the value to set the key to has to be determined by adding the existing value * with the increment passed as the input parameter "val" (of type (mval *)) to gvcst_put */ if (cdb_sc_normal != (status = gvincr_compute_post_incr(bh))) { assert(CDB_STAGNATE > t_tries); GOTO_RETRY; } } else { /* The global variable that is being $INCREMENTed does not exist. $INCREMENT() should not * signal UNDEF error but proceed with an implicit $GET() */ *post_incr_mval = *val; MV_FORCE_NUM(post_incr_mval); post_incr_mval->mvtype &= ~MV_STR; /* needed to force any alphanumeric string to numeric */ MV_FORCE_STR(post_incr_mval); assert(post_incr_mval->str.len); } assert(MV_IS_STRING(post_incr_mval)); assert(dollar_tlevel ? si->update_trans : update_trans); value = post_incr_mval->str; /* The MAX_REC_SIZE check could not be done in op_gvincr (like is done in op_gvput) because * the post-increment value is not known until here. so do the check here. */ ENSURE_VALUE_WITHIN_MAX_REC_SIZE(value, gv_target); } else value = val->str; } /* -------------------------------------------------------------------------------------------- * The code for the non-block-split case is very similar to the code in recompute_upd_array. * Any changes in either place should be reflected in the other. 
	 * --------------------------------------------------------------------------------------------
	 */
	need_extra_block_split = FALSE;	/* Assume we don't require an additional block split (most common case) */
	duplicate_set = FALSE;		/* Assume this is NOT a duplicate set (most common case) */
	split_depth = 0;
	split_targ = gv_target;
	for (succeeded = FALSE; !succeeded; no_pointers = level_0 = FALSE)
	{
		buffaddr = bh->buffaddr;
		cur_blk_size = ((blk_hdr_ptr_t)buffaddr)->bsiz;
		target_key_size = temp_key->end + 1;
		/* Potential size of a block containing just the new/updated record */
		new_blk_size_single = SIZEOF(blk_hdr) + SIZEOF(rec_hdr) + target_key_size + value.len;
		if (new_blk_size_single > blk_reserved_size)
		{	/* The record that is newly inserted/updated does not fit by itself in a separate block
			 * if the current reserved-bytes for this database is taken into account. If this is not a
			 * GVT leaf block, this situation is possible if we are not in the final retry (and hence
			 * don't hold crit on the region) and "temp_key->end" (and in turn "target_key_size") was
			 * computed from a stale copy (due to concurrent updates or buffer reuse) of the global buffer
			 * (effectively a restartable situation). If so, restart; if not, issue the error.
			 */
			if (no_pointers || (CDB_STAGNATE <= t_tries))
			{
				ISSUE_RSVDBYTE2HIGH_ERROR(gv_target);
			} else
			{
				status = cdb_sc_mkblk;
				GOTO_RETRY;
			}
		}
		curr_rec_match = bh->curr_rec.match;
		curr_rec_offset = bh->curr_rec.offset;
		new_rec = (target_key_size != curr_rec_match);
		if (!new_rec && !no_pointers)
		{
			assert(CDB_STAGNATE > t_tries);
			status = cdb_sc_lostcr;	/* would a new cdb_sc status be better? */
			GOTO_RETRY;
		}
		rp = (rec_hdr_ptr_t)(buffaddr + curr_rec_offset);
		if (curr_rec_offset == cur_blk_size)
		{
			if ((FALSE == new_rec) && dollar_tlevel)
			{
				assert(CDB_STAGNATE > t_tries);
				status = cdb_sc_mkblk;
				GOTO_RETRY;
			}
			rec_cmpc = 0;
			rec_size = 0;
		} else
		{
			GET_USHORT(rec_size, &rp->rsiz);
			rec_cmpc = rp->cmpc;
			if ((sm_uc_ptr_t)rp + rec_size > (sm_uc_ptr_t)buffaddr + cur_blk_size)
			{
				assert(CDB_STAGNATE > t_tries);
				status = cdb_sc_mkblk;
				GOTO_RETRY;
			}
		}
		prev_rec_match = bh->prev_rec.match;
		if (new_rec)
		{
			new_rec_size = SIZEOF(rec_hdr) + target_key_size - prev_rec_match + value.len;
			if (cur_blk_size <= (signed int)curr_rec_offset)	/* typecast necessary to enforce "signed int" comparison */
				next_rec_shrink = 0;
			else
				next_rec_shrink = curr_rec_match - rec_cmpc;
			delta = new_rec_size - next_rec_shrink;
		} else
		{
			if (rec_cmpc != prev_rec_match)
			{
				assert(CDB_STAGNATE > t_tries);
				status = cdb_sc_mkblk;
				GOTO_RETRY;
			}
			assert(target_key_size > rec_cmpc);
			cur_val_offset = SIZEOF(rec_hdr) + (target_key_size - rec_cmpc);
#			ifdef GTM_TRIGGER
			if (no_pointers && (NULL != ztold_mval) && !skip_hasht_read)
			{	/* Complete initialization of ztold_mval */
				assert(!skip_dbtriggers);
				data_len = rec_size - cur_val_offset;
				if (0 > data_len)
				{
					assert(CDB_STAGNATE > t_tries);
					status = cdb_sc_rmisalign;
					GOTO_RETRY;
				}
				if (data_len)
				{
					ENSURE_STP_FREE_SPACE(data_len);
					ztold_mval->str.addr = (char *)stringpool.free;
					memcpy(ztold_mval->str.addr, (sm_uc_ptr_t)rp + cur_val_offset, data_len);
					stringpool.free += data_len;
				}
				ztold_mval->str.len = data_len;
				ztold_mval->mvtype = MV_STR;	/* ztold_mval is now completely initialized */
			}
#			endif
			new_rec_size = cur_val_offset + value.len;
			delta = new_rec_size - rec_size;
			if (!delta && gvdupsetnoop && value.len
				&& !memcmp(value.addr, (sm_uc_ptr_t)rp + new_rec_size - value.len, value.len))
			{
				duplicate_set = TRUE;
				succeeded = TRUE;
				break;	/* duplicate SET */
			}
			next_rec_shrink = 0;
		}
		blk_num =
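		/* under TP, this may be an off_chain reference (chain flag == 1) rather than an actual block number;
		 * see the chain1 checks just below */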
bh->blk_num; bh_level = bh->level; if (dollar_tlevel) { if ((SIZEOF(rec_hdr) + target_key_size - prev_rec_match + value.len) != new_rec_size) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_mkblk; GOTO_RETRY; } chain1 = *(off_chain *)&blk_num; if ((1 == chain1.flag) && ((int)chain1.cw_index >= si->cw_set_depth)) { assert(si->tp_csa == csa); assert(FALSE == csa->now_crit); status = cdb_sc_blknumerr; GOTO_RETRY; } } next_rec_shrink1 = next_rec_shrink; /* Potential size of the current block including the new/updated record */ new_blk_size = cur_blk_size + delta; /* It is possible due to concurrency issues (for example if the buffer that we are planning on updating * in shared memory got reused for a different block) that "new_blk_size" is lesser than "new_blk_size_single" * In those cases, we will go into the non-block-split case but eventually we will restart. */ assert((new_blk_size >= new_blk_size_single) || (CDB_STAGNATE > t_tries)); if ((new_blk_size <= blk_fill_size) || (new_blk_size <= new_blk_size_single)) { /* Update can be done without overflowing the block's fillfactor OR the record to be updated * is the only record in the new block. Do not split block in either case. This means we might * not honour the desired FillFactor if the only record in a block exceeds the blk_fill_size, * but in this case we are guaranteed the block has room for the current reserved bytes. */ if (no_pointers) /* level zero (normal) data block: no deferred pointer chains */ ins_chain_offset = 0; else /* index or directory level block */ ins_chain_offset =(int)((sm_uc_ptr_t)rp - buffaddr + new_rec_size - SIZEOF(block_id)); BLK_INIT(bs_ptr, bs1); if (0 == rc_set_fragment) { BLK_SEG(bs_ptr, buffaddr + SIZEOF(blk_hdr), curr_rec_offset - SIZEOF(blk_hdr)); BLK_ADDR(curr_rec_hdr, SIZEOF(rec_hdr), rec_hdr); curr_rec_hdr->rsiz = new_rec_size; curr_rec_hdr->cmpc = prev_rec_match; BLK_SEG(bs_ptr, (sm_uc_ptr_t)curr_rec_hdr, SIZEOF(rec_hdr)); BLK_ADDR(cp1, target_key_size - prev_rec_match, unsigned char); memcpy(cp1, temp_key->base + prev_rec_match, target_key_size - prev_rec_match); BLK_SEG(bs_ptr, cp1, target_key_size - prev_rec_match); if (0 != value.len) { BLK_ADDR(va, value.len, char); memcpy(va, value.addr, value.len); BLK_SEG(bs_ptr, (unsigned char *)va, value.len); } if (!new_rec) rp = (rec_hdr_ptr_t)((sm_uc_ptr_t)rp + rec_size); n = (int)(cur_blk_size - ((sm_uc_ptr_t)rp - buffaddr)); if (n > 0) { if (new_rec) { BLK_ADDR(next_rec_hdr, SIZEOF(rec_hdr), rec_hdr); next_rec_hdr->rsiz = rec_size - next_rec_shrink; next_rec_hdr->cmpc = curr_rec_match; BLK_SEG(bs_ptr, (sm_uc_ptr_t)next_rec_hdr, SIZEOF(rec_hdr)); next_rec_shrink += SIZEOF(rec_hdr); } if (n >= next_rec_shrink) { BLK_SEG(bs_ptr, (sm_uc_ptr_t)rp + next_rec_shrink, n - next_rec_shrink); } else { assert(CDB_STAGNATE > t_tries); status = cdb_sc_mkblk; GOTO_RETRY; } } } else { /* With GT.M TRIGGERS, it is not clear how the RC protocol will work. The below assert is to * be informed whenever such usage happens (expected to be really rare) and handle it right * then instead of worrying about it during the initial trigger implementation. 
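					 * (rc_set_fragment, per its GBLREF comment near the top of this file, is the offset
					 * within the data at which an RC data fragment starts; this else-branch runs only
					 * for such nonzero fragment offsets.)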
*/ assert(FALSE); curr_rec_hdr = (rec_hdr_ptr_t)(buffaddr + curr_rec_offset); /* First piece is block prior to record + key + data prior to fragment */ BLK_SEG(bs_ptr, buffaddr + SIZEOF(blk_hdr), curr_rec_offset - SIZEOF(blk_hdr) + SIZEOF(rec_hdr) + rc_set_fragment + gv_currkey->end + 1 - curr_rec_hdr->cmpc); /* Second piece is fragment itself */ BLK_ADDR(va, value.len, char); memcpy(va, value.addr, value.len); BLK_SEG(bs_ptr, (unsigned char *)va, value.len); /* Third piece is data after fragment + rest of block after record */ n = (int)(cur_blk_size - ((sm_uc_ptr_t)curr_rec_hdr - buffaddr) - SIZEOF(rec_hdr) - (gv_currkey->end + 1 - curr_rec_hdr->cmpc) - rc_set_fragment - value.len); if (0 < n) BLK_SEG(bs_ptr, (sm_uc_ptr_t)curr_rec_hdr + gv_currkey->end + 1 - curr_rec_hdr->cmpc + rc_set_fragment + value.len, n); } if (0 == BLK_FINI(bs_ptr, bs1)) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_mkblk; GOTO_RETRY; } assert(bs1[0].len <= blk_reserved_size); /* Assert that new block has space for reserved bytes */ cse = t_write(bh, (unsigned char *)bs1, ins_chain_offset, ins_chain_index, bh_level, FALSE, FALSE, GDS_WRITE_PLAIN); assert(!dollar_tlevel || !cse->high_tlevel); if ((0 != ins_chain_offset) && (NULL != cse) && (0 != cse->first_off)) { /* formerly tp_offset_chain - inserts a new_entry in the chain */ assert((NULL != cse->new_buff) || horiz_growth && cse->low_tlevel->new_buff && (buffaddr == cse->low_tlevel->new_buff)); assert(0 == cse->next_off); assert(ins_chain_offset > (signed)SIZEOF(blk_hdr)); /* we want signed comparison */ assert((curr_rec_offset - SIZEOF(off_chain)) == (ins_chain_offset - new_rec_size)); offset_sum = cse->first_off; curr = buffaddr + offset_sum; /* The typecast is needed below to enforce a "signed int" (versus "unsigned int") comparison */ if (offset_sum >= (signed int)curr_rec_offset) { /* the new record is prior to the first existing chain record, id the new one as first */ /* first_off-------------v--------------------v * [blk_hdr]...[new rec ( )]...[existing rec ( )]... */ cse->next_off = cse->first_off - (ins_chain_offset - new_rec_size) - next_rec_shrink1; cse->first_off = ins_chain_offset; } else { if (horiz_growth) { old_cse = cse->low_tlevel; assert(old_cse->first_off); assert(old_cse && old_cse->done); assert(!old_cse->undo_next_off[0] && !old_cse->undo_offset[0]); } /* find chain records before and after the new one */ for ( ; ; curr += curr_chain.next_off) { /* try to make offset_sum identify the first chain entry after the new record */ GET_LONGP(&curr_chain, curr); assert(curr_chain.flag == 1); if (0 == curr_chain.next_off) break; offset_sum += curr_chain.next_off; /* The typecast is needed below to enforce a "signed int" comparison */ if (offset_sum >= (signed int)curr_rec_offset) break; } /* store the next_off in old_cse before changing it in the buffer (for rolling back) */ if (horiz_growth) { old_cse->undo_next_off[0] = curr_chain.next_off; old_cse->undo_offset[0] = (block_offset)(curr - buffaddr); assert(old_cse->undo_offset[0]); } if (0 == curr_chain.next_off) { /* the last chain record precedes the new record: just update it */ /* ---|---------------v * [blk_hdr]...[existing rec ( )]...[new rec ( )]... 
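					 * (In these diagrams, ( ) marks the off_chain field within each record; first_off is
					 * the offset from the start of the block to the first such field and each next_off
					 * is the byte distance from one chained field to the next.)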
					 */
					curr_chain.next_off = ins_chain_offset - offset_sum;
					GET_LONGP(curr, &curr_chain);
				} else
				{	/* update the chain record before the new one */
					/* ---|---------------v--------------------v
					 * [blk_hdr]...[existing rec ( )]...[new rec ( )]...[existing rec ( )]
					 */
					curr_chain.next_off = (unsigned int)(ins_chain_offset - (curr - buffaddr));
					GET_LONGP(curr, &curr_chain);
					cse->next_off = offset_sum - (ins_chain_offset - new_rec_size) - next_rec_shrink1;
				}
			}
			assert((ins_chain_offset + (int)cse->next_off) <= (delta + (sm_long_t)cur_blk_size - SIZEOF(off_chain)));
		}
		succeeded = TRUE;
		if (level_0)
		{
			if (new_rec)
			{	/* New record insertion at leaf level. gvcst_search would have already updated clue to
				 * reflect the new key, but we need to fix the search history to keep it in sync with clue.
				 * This search history (and clue) will be used by the NEXT call to gvcst_search.
				 * Note that clue.end could be 0 at this point (see "Clue less than first rec, invalidate"
				 * comment in gvcst_search) in which case the below assignment is unnecessary (though does
				 * not hurt) but we want to avoid the if check (since we expect clue to be non-zero mostly).
				 */
				assert((0 == gv_target->clue.end) || (gv_target->clue.end + 1 == target_key_size));
				assert(1 < target_key_size);
				assert(bh->curr_rec.match != target_key_size);
				bh->curr_rec.match = target_key_size;
			}
			/* -------------------------------------------------------------------------------------------------
			 * We have to maintain information for future recomputation only if the following are satisfied:
			 *	1) The block is a leaf-level block
			 *	2) We are in TP (indicated by non-null cse)
			 *	3) The global has NOISOLATION turned ON
			 *	4) The cw_set_element hasn't encountered a block-split or a kill
			 *	5) We don't need an extra_block_split
			 *
			 * We could also add an optimization that only cse's of mode gds_t_write need to have such updates,
			 * but because of the belief that for a nonisolated variable we will very rarely encounter a
			 * situation where a created block (in TP) will have some new keys added to it, and that adding
			 * the check slows down the normal code, we don't do that check here.
			 * -------------------------------------------------------------------------------------------------
			 */
			if (cse && gv_target->noisolation && !cse->write_type && !need_extra_block_split)
			{
				assert(dollar_tlevel);
				if (is_dollar_incr)
				{
					ADD_TO_GVT_TP_LIST(gv_target);	/* See comment in ENSURE_VALUE_WITHIN_MAX_REC_SIZE
									 * macro definition for why this macro call is necessary */
					rts_error(VARLSTCNT(4) ERR_GVINCRISOLATION, 2,
						gv_target->gvname.var_name.len, gv_target->gvname.var_name.addr);
				}
				if (NULL == cse->recompute_list_tail
					|| 0 != memcmp(gv_currkey->base, cse->recompute_list_tail->key.base, gv_currkey->top))
				{
					tempkv = (key_cum_value *)get_new_element(si->recompute_list, 1);
					tempkv->key = *gv_currkey;
					tempkv->next = NULL;
					memcpy(tempkv->key.base, gv_currkey->base, gv_currkey->end + 1);
					if (NULL == cse->recompute_list_head)
					{
						assert(NULL == cse->recompute_list_tail);
						cse->recompute_list_head = tempkv;
					} else
						cse->recompute_list_tail->next = tempkv;
					cse->recompute_list_tail = tempkv;
				} else
					tempkv = cse->recompute_list_tail;
				assert(0 == val->str.len
					|| ((val->str.len == bs1[4].len) && 0 == memcmp(val->str.addr, bs1[4].addr, val->str.len)));
				tempkv->value.len = val->str.len;		/* bs1[4].addr is undefined if val->str.len is 0 */
				tempkv->value.addr = (char *)bs1[4].addr;	/* but not used in that case, so ok */
			}
		}
	} else
	{	/* Block split required */
		split_depth++;
		gv_target->clue.end = 0;	/* invalidate clue */
		/* Potential size of the left and right blocks, including the new record */
		new_blk_size_l = curr_rec_offset + new_rec_size;
		new_blk_size_r = SIZEOF(blk_hdr) + SIZEOF(rec_hdr) + target_key_size + value.len
					+ cur_blk_size - curr_rec_offset - (new_rec ? next_rec_shrink : rec_size);
		assert(new_blk_size_single <= blk_reserved_size);
		assert(blk_reserved_size >= blk_fill_size);
		extra_record_orig_size = 0;
		prev_rec_offset = bh->prev_rec.offset;
		assert(new_blk_size_single <= new_blk_size_r);
		/* Decide which side (left or right) the new record goes. Ensure either side has at least one record.
		 * This means we might not honor the desired FillFactor if the only record in a block exceeds the
		 * blk_fill_size, but in this case we are guaranteed the block has room for the current reserved bytes.
		 * The typecast of curr_rec_offset is needed below to enforce a "signed int" comparison.
		 */
		if (new_blk_size_r > blk_fill_size)
		{
			new_rec_goes_to_right = (new_blk_size_r == new_blk_size_single);
			last_split_dir = NEWREC_DIR_FORCED;	/* no choice in split direction */
		} else if (new_blk_size_l > blk_fill_size)
		{
			new_rec_goes_to_right = TRUE;
			last_split_dir = NEWREC_DIR_FORCED;	/* no choice in split direction */
		} else
		{	/* new_rec can go in either direction without any issues of fitting in.
			 * This is where we need to use a few heuristics to ensure good block space utilization.
			 * We note down which direction (left or right) the new record went in after the split.
			 * We use that as the heuristic to identify the direction of data loading and do the
			 * splits accordingly for future updates.
			 */
			last_split_dir = (enum split_dir)gv_target->last_split_direction[bh_level];
			if (NEWREC_DIR_FORCED == last_split_dir)
			{	/* Don't have prior information to use the heuristic. Choose whichever side is less full.
				 * If this turns out to not be the correct choice, we will correct ourselves at the
				 * time of the next block split at the same level.
				 */
				last_split_dir = (new_blk_size_l < new_blk_size_r) ? NEWREC_DIR_LEFT : NEWREC_DIR_RIGHT;
			} else
			{	/* The last block split at this level chose a specific direction for new_rec. See if
				 * that heuristic worked. This is done by checking if the block # that new_rec went
				 * into previously is the same block that is being split now. If so, that means the
				 * previous choice of direction was actually not optimal. So try the other direction now.
				 */
				last_split_bnum = gv_target->last_split_blk_num[bh_level];
				if (dollar_tlevel)
				{
					chain2 = *(off_chain *)&last_split_bnum;
					if (chain1.flag == chain2.flag)
					{
						if (!chain1.flag)
							blk_match = (blk_num == last_split_bnum);
						else
						{
							assert(chain1.cw_index < si->cw_set_depth);
							blk_match = (chain1.cw_index == chain2.cw_index);
						}
					} else
						blk_match = FALSE;
				} else
				{
					DEBUG_ONLY(chain1 = *(off_chain *)&last_split_bnum;)
					assert(!chain1.flag);
					blk_match = (blk_num == last_split_bnum);
				}
				is_split_dir_left = (NEWREC_DIR_LEFT == last_split_dir);
				if (blk_match)
					/* switch direction since the last choice did not seem to have worked */
					last_split_dir = is_split_dir_left ? NEWREC_DIR_RIGHT : NEWREC_DIR_LEFT;
				else
				{	/* blk# did not match, which means there is a high likelihood that the current split
					 * is happening in the OTHER sibling block from the previous block split operation
					 * at the same level. There is no easy way of confirming this so we assume the
					 * heuristic is doing its job, unless we see evidence otherwise. And that evidence
					 * is IF the block sizes of the left and right halves don't match the direction of
					 * choice (e.g. if we choose NEWREC_DIR_LEFT, we expect the right block to be
					 * almost full and the left block to be almost empty and vice versa).
					 * In this case too, switch the direction.
					 */
					if (is_split_dir_left)
					{
						if (new_blk_size_l > new_blk_size_r)
							last_split_dir = NEWREC_DIR_RIGHT;
					} else
					{
						if (new_blk_size_l < new_blk_size_r)
							last_split_dir = NEWREC_DIR_LEFT;
					}
				}
			}
			new_rec_goes_to_right = (NEWREC_DIR_RIGHT == last_split_dir);
		}
		last_split_direction[bh_level] = (char)last_split_dir;
		if (new_rec_goes_to_right)
		{	/* Left side of this block will be split off into a new block.
			 * The new record and the right side of this block will remain in this block.
			 */
			/* prepare new block */
			BLK_INIT(bs_ptr, bs1);
			if (level_0)
			{
				BLK_SEG(bs_ptr, buffaddr + SIZEOF(blk_hdr), curr_rec_offset - SIZEOF(blk_hdr));
			} else
			{	/* for index records, the record before the split becomes a new *-key */
				/* Note: If the block split was caused by our appending the new record
				 * to the end of the block, this code causes the record PRIOR to the
				 * current *-key to become the new *-key.
				 */
				BLK_SEG(bs_ptr, buffaddr + SIZEOF(blk_hdr), prev_rec_offset - SIZEOF(blk_hdr));
				BLK_ADDR(new_star_hdr, SIZEOF(rec_hdr), rec_hdr);
				new_star_hdr->rsiz = BSTAR_REC_SIZE;
				new_star_hdr->cmpc = 0;
				BLK_SEG(bs_ptr, (sm_uc_ptr_t)new_star_hdr, SIZEOF(rec_hdr));
				BLK_SEG(bs_ptr, (sm_uc_ptr_t)rp - SIZEOF(block_id), SIZEOF(block_id));
			}
			new_blk_bs = bs1;
			if (0 == BLK_FINI(bs_ptr, bs1))
			{
				assert(CDB_STAGNATE > t_tries);
				status = cdb_sc_mkblk;
				GOTO_RETRY;
			}
			/* We want to assert that the left block has enough space for reserved bytes but
			 * it is possible that it DOES NOT have enough space for reserved bytes if the pre-split
			 * block was previously populated with a very low reserved bytes setting and if the current
			 * reserved bytes setting is much higher than what the chosen split point would free up.
			 * This is an issue waiting to be fixed by C9K01-003221. Until then the following assert
			 * has to remain commented out.
			 *
			 * assert(bs1[0].len <= blk_reserved_size);
			 */
			/* prepare the existing block */
			BLK_INIT(bs_ptr, bs1);
			ins_chain_offset = no_pointers ?
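				/* leaf-level block: no block pointers, so there is no chain offset to record */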
0 : (int)(SIZEOF(blk_hdr) + SIZEOF(rec_hdr) + target_key_size); left_hand_offset = left_hand_index = 0; if (!new_rec) rp = (rec_hdr_ptr_t)((sm_uc_ptr_t)rp + rec_size); BLK_ADDR(curr_rec_hdr, SIZEOF(rec_hdr), rec_hdr); curr_rec_hdr->rsiz = target_key_size + SIZEOF(rec_hdr) + value.len; curr_rec_hdr->cmpc = 0; BLK_SEG(bs_ptr, (sm_uc_ptr_t)curr_rec_hdr, SIZEOF(rec_hdr)); BLK_ADDR(cp1, target_key_size, unsigned char); memcpy(cp1, temp_key->base, target_key_size); BLK_SEG(bs_ptr, cp1, target_key_size); if (0 != value.len) { BLK_ADDR(va, value.len, char); memcpy(va, value.addr, value.len); BLK_SEG(bs_ptr, (unsigned char *)va, value.len); } if (buffaddr + cur_blk_size > (sm_uc_ptr_t)rp) { BLK_ADDR(next_rec_hdr, SIZEOF(rec_hdr), rec_hdr); GET_USHORT(next_rec_hdr->rsiz, &rp->rsiz); next_rec_hdr->rsiz -= next_rec_shrink; next_rec_hdr->cmpc = new_rec ? curr_rec_match : rp->cmpc; BLK_SEG(bs_ptr, (sm_uc_ptr_t)next_rec_hdr, SIZEOF(rec_hdr)); next_rec_shrink += SIZEOF(rec_hdr); n = cur_blk_size - INTCAST(((sm_uc_ptr_t)rp - buffaddr)) - next_rec_shrink; if (0 > n) /* want signed compare as 'n' can be negative */ { assert(CDB_STAGNATE > t_tries); status = cdb_sc_mkblk; GOTO_RETRY; } BLK_SEG(bs_ptr, (sm_uc_ptr_t)rp + next_rec_shrink, n); } if (0 == BLK_FINI(bs_ptr, bs1)) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_mkblk; GOTO_RETRY; } assert(bs1[0].len <= blk_reserved_size); /* Assert that right block has space for reserved bytes */ assert(gv_altkey->top == gv_currkey->top); assert(gv_altkey->end < gv_altkey->top); temp_key = gv_altkey; if (cdb_sc_normal != (status = gvcst_expand_key((blk_hdr_ptr_t)buffaddr, prev_rec_offset, temp_key))) GOTO_RETRY; } else { /* Insert in left hand (new) block */ if (!level_0) { /* In case of an index block, as long as the current record is not a *-record * (i.e. last record in the block) and copying an extra record into the left * block does not cause it to exceed the fill factor, copy an additional record. * Not doing the extra record copy for index blocks (was the case pre-V54002) has * been seen to create suboptimally filled index blocks (as low as 15% fillfactor) * depending on the patterns of updates. */ assert(new_rec); copy_extra_record = ((BSTAR_REC_SIZE != rec_size) && ((new_blk_size_l + BSTAR_REC_SIZE) <= blk_fill_size)); } else { copy_extra_record = ((0 == prev_rec_offset) && (NEWREC_DIR_LEFT == last_split_dir) && new_rec && (SIZEOF(blk_hdr) < cur_blk_size)); } BLK_INIT(bs_ptr, bs1); if (no_pointers) left_hand_offset = 0; else { left_hand_offset = curr_rec_offset + SIZEOF(rec_hdr); if (level_0 || copy_extra_record) left_hand_offset += target_key_size - prev_rec_match; } left_hand_index = ins_chain_index; ins_chain_index = ins_chain_offset = 0; BLK_SEG(bs_ptr, buffaddr + SIZEOF(blk_hdr), curr_rec_offset - SIZEOF(blk_hdr)); if (level_0) { /* After the initial split, will this record fit into the new left block? * If not, this pass will make room and we will do another block split on the next pass. 
*/ assert((blk_seg_cnt + SIZEOF(rec_hdr) + target_key_size - prev_rec_match + value.len) == new_blk_size_l); assert((new_blk_size_single <= new_blk_size_l) || (CDB_STAGNATE > t_tries)); assert((new_blk_size_single != new_blk_size_l) || ((0 == prev_rec_offset) && (SIZEOF(blk_hdr) == curr_rec_offset))); assert((new_blk_size_single >= new_blk_size_l) || ((SIZEOF(blk_hdr) <= prev_rec_offset) && (SIZEOF(blk_hdr) < curr_rec_offset))); if ((new_blk_size_l > blk_fill_size) && (new_blk_size_l > new_blk_size_single)) { /* There is at least one existing record to the left of the split point. * Do the initial split this pass and make an extra split next pass. */ need_extra_block_split = TRUE; DEBUG_ONLY(dbg_trace_array[dbg_num_iters].is_extra_block_split = TRUE;) } else { BLK_ADDR(curr_rec_hdr, SIZEOF(rec_hdr), rec_hdr); curr_rec_hdr->rsiz = new_rec_size; curr_rec_hdr->cmpc = prev_rec_match; BLK_SEG(bs_ptr, (sm_uc_ptr_t)curr_rec_hdr, SIZEOF(rec_hdr)); BLK_ADDR(cp1, target_key_size - prev_rec_match, unsigned char); memcpy(cp1, temp_key->base + prev_rec_match, target_key_size - prev_rec_match); BLK_SEG(bs_ptr, cp1, target_key_size - prev_rec_match); if (0 != value.len) { BLK_ADDR(va, value.len, char); memcpy(va, value.addr, value.len); BLK_SEG(bs_ptr, (unsigned char *)va, value.len); } if (copy_extra_record) { n = rec_size - curr_rec_match; /* typecast needed below to enforce a "signed int" comparison */ if ((n + (signed int)curr_rec_offset + new_rec_size) > blk_fill_size) copy_extra_record = FALSE; else { BLK_ADDR(extra_rec_hdr, SIZEOF(rec_hdr), rec_hdr); extra_rec_hdr->rsiz = n; extra_rec_hdr->cmpc = curr_rec_match; BLK_SEG(bs_ptr, (sm_uc_ptr_t)extra_rec_hdr, SIZEOF(rec_hdr)); if (n < (signed)SIZEOF(rec_hdr)) /* want signed compare */ { /* as 'n' can be negative */ assert(CDB_STAGNATE > t_tries); status = cdb_sc_mkblk; GOTO_RETRY; } BLK_SEG(bs_ptr, buffaddr + SIZEOF(blk_hdr) + SIZEOF(rec_hdr) + curr_rec_match, n - SIZEOF(rec_hdr)); new_blk_size_l += n; } } } } else { if (copy_extra_record) { BLK_ADDR(curr_rec_hdr, SIZEOF(rec_hdr), rec_hdr); curr_rec_hdr->rsiz = new_rec_size; curr_rec_hdr->cmpc = prev_rec_match; BLK_SEG(bs_ptr, (sm_uc_ptr_t)curr_rec_hdr, SIZEOF(rec_hdr)); BLK_ADDR(cp1, target_key_size - prev_rec_match, unsigned char); memcpy(cp1, temp_key->base + prev_rec_match, target_key_size - prev_rec_match); BLK_SEG(bs_ptr, cp1, target_key_size - prev_rec_match); assert(value.len); BLK_ADDR(va, value.len, char); memcpy(va, value.addr, value.len); BLK_SEG(bs_ptr, (unsigned char *)va, value.len); new_blk_size_l += BSTAR_REC_SIZE; } else new_blk_size_l = curr_rec_offset + BSTAR_REC_SIZE; BLK_ADDR(new_star_hdr, SIZEOF(rec_hdr), rec_hdr); new_star_hdr->rsiz = BSTAR_REC_SIZE; new_star_hdr->cmpc = 0; BLK_SEG(bs_ptr, (sm_uc_ptr_t)new_star_hdr, SIZEOF(rec_hdr)); if (!copy_extra_record) { BLK_SEG(bs_ptr, (unsigned char *)&zeroes, SIZEOF(block_id)); } else BLK_SEG(bs_ptr, (sm_uc_ptr_t)rp + rec_size - SIZEOF(block_id), SIZEOF(block_id)); } new_blk_bs = bs1; if (0 == BLK_FINI(bs_ptr, bs1)) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_mkblk; GOTO_RETRY; } /* We want to assert that the left block has enough space for reserved bytes but * it is possible that it DOES NOT have enough space for reserved bytes if the pre-split * block was previously populated with a very low reserved bytes setting and if the current * reserved bytes setting is much higher than what the chosen split point would free up. * This is an issue waiting to be fixed by C9K01-003221. 
Until then the following assert * has to remain commented out. * * assert(bs1[0].len <= blk_reserved_size); */ /* assert that both !new_rec and copy_extra_record can never be TRUE at the same time */ assert(new_rec || !copy_extra_record); if (!new_rec || copy_extra_record) { /* Should guard for empty block??? */ rp = (rec_hdr_ptr_t)((sm_uc_ptr_t)rp + rec_size); rec_cmpc = rp->cmpc; temp_short = rec_size; GET_USHORT(rec_size, &rp->rsiz); } if (copy_extra_record) { extra_record_orig_size = temp_short; assert(gv_altkey->top == gv_currkey->top); assert(gv_altkey->end < gv_altkey->top); temp_key = gv_altkey; if (cdb_sc_normal != (status = gvcst_expand_key((blk_hdr_ptr_t)buffaddr, curr_rec_offset, temp_key))) GOTO_RETRY; } else if (temp_key != gv_altkey) { memcpy(gv_altkey, temp_key, SIZEOF(gv_key) + temp_key->end); temp_key = gv_altkey; } rec_size += rec_cmpc; BLK_INIT(bs_ptr, bs1); BLK_ADDR(next_rec_hdr, SIZEOF(rec_hdr), rec_hdr); next_rec_hdr->rsiz = rec_size; next_rec_hdr->cmpc = 0; BLK_SEG(bs_ptr, (sm_uc_ptr_t)next_rec_hdr, SIZEOF(rec_hdr)); BLK_ADDR(cp1, rec_cmpc, unsigned char); memcpy(cp1, temp_key->base, rec_cmpc); BLK_SEG(bs_ptr, cp1, rec_cmpc); n = cur_blk_size - INTCAST(((sm_uc_ptr_t)rp - buffaddr)) - SIZEOF(rec_hdr); if (0 > n) /* want signed compare as 'n' can be negative */ { assert(CDB_STAGNATE > t_tries); status = cdb_sc_mkblk; GOTO_RETRY; } BLK_SEG(bs_ptr, (sm_uc_ptr_t)(rp + 1), n); if (0 == BLK_FINI(bs_ptr, bs1)) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_mkblk; GOTO_RETRY; } /* We want to assert that the right block has enough space for reserved bytes but * it is possible that it DOES NOT have enough space for reserved bytes if the pre-split * block was previously populated with a very low reserved bytes setting and if the current * reserved bytes setting is much higher than what the chosen split point would free up. * This is an issue waiting to be fixed by C9K01-003221. Until then the following assert * has to remain commented out. * * assert(bs1[0].len <= blk_reserved_size); */ } next_blk_index = t_create(blk_num, (uchar_ptr_t)new_blk_bs, left_hand_offset, left_hand_index, bh_level); if (!no_pointers && dollar_tlevel) { /* there may be chains */ assert(new_rec); curr_chain = *(off_chain *)&blk_num; if (curr_chain.flag == 1) tp_get_cw(si->first_cw_set, curr_chain.cw_index, &cse); else { if (NULL != (tabent = lookup_hashtab_int4(si->blks_in_use, (uint4 *)&blk_num))) tp_srch_status = tabent->value; else tp_srch_status = NULL; cse = tp_srch_status ? 
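				/* pick up any cw_set element already covering the block being split, so its TP block
				 * chain can be fixed up for the split below */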
tp_srch_status->cse : NULL; } assert(!cse || !cse->high_tlevel); if ((NULL != cse) && (0 != cse->first_off)) { /* there is an existing chain: fix to account for the split */ assert(NULL != cse->new_buff); assert(cse->done); assert(0 == cse->next_off); cse_new = si->last_cw_set; assert(!cse_new->high_tlevel); assert(0 == cse_new->next_off); assert(0 == cse_new->first_off); assert(cse_new->ins_off == left_hand_offset); assert(cse_new->index == left_hand_index); assert(cse_new->level == cse->level); cse_first_off = (int4)cse->first_off; offset_sum = cse_first_off; curr = buffaddr + offset_sum; GET_LONGP(&curr_chain, curr); assert(curr_chain.flag == 1); last_possible_left_offset = curr_rec_offset + extra_record_orig_size - SIZEOF(off_chain); /* some of the following logic used to be in tp_split_chain which was nixed */ if (offset_sum <= last_possible_left_offset) { /* the split falls within or after the chain; otherwise entire chain stays right */ assert((cse_first_off < curr_rec_offset) || (cse_first_off == last_possible_left_offset)); if (left_hand_offset && (curr_rec_offset < cse_first_off)) { /* We are inserting the new record (with the to-be-filled child block * number) AND an extra record in the left block and the TP block * chain of the block to be split starts AFTER the new record's offset * in the current block. This means the left block (cse_new) will have a * block chain starting with the newly inserted record's block pointer. */ cse_new->first_off = left_hand_offset; } else { cse_new->first_off = cse_first_off; assert(0 == cse_new->next_off); } if (level_0) /* if no *-key issue stop after, rather than at, a match */ last_possible_left_offset += SIZEOF(off_chain); if (offset_sum < last_possible_left_offset) { /* it's not an immediate hit */ for ( ; ; curr += curr_chain.next_off, GET_LONGP(&curr_chain, curr)) { /* follow chain upto split point */ assert(1 == curr_chain.flag); if (0 == curr_chain.next_off) break; offset_sum += curr_chain.next_off; if (offset_sum >= last_possible_left_offset) break; } /* end of search chain loop */ } assert(curr >= (buffaddr + cse_first_off)); if (level_0) /* restore match point to "normal" */ last_possible_left_offset -= SIZEOF(off_chain); if ((offset_sum == last_possible_left_offset) && !level_0) { /* The last record in the left side of the pre-split block is where * the search stopped. If no extra record copy was done, then this * record will end up BEFORE the inserted record in the post-split * left block. Otherwise this will be AFTER the inserted record. * * In case of copy_extra_record, the extra record will become the *-key * ---|------------v-----------------v * [blk_hdr]...[curr rec( )][new rec ( )] [extra rec (*-key)] * * In case of no extra record copy, the new record will become the *-key * ---|-------------------v * [blk_hdr]...[curr rec( )][new rec (*-key)( )] * * Take this into account during the calculations below. */ assert(cse_first_off <= last_possible_left_offset); if (left_hand_offset) { assert(!ins_chain_offset); if (!extra_record_orig_size && (offset_sum != cse_first_off)) { /* bring curr up to the match */ curr += curr_chain.next_off; GET_LONGP(&curr_chain, curr); } curr_offset = curr - buffaddr; undo_index = 0; if (curr_offset < curr_rec_offset) { /* The chain starts before the curr_rec_offset. Fix * next_off field from the last element in the chain * before this offset. 
								 */
								prev_chain = curr_chain;
								assert(extra_record_orig_size
									|| (BSTAR_REC_SIZE == (left_hand_offset - curr_offset)));
								prev_chain.next_off = left_hand_offset - curr_offset;
								assert((curr_offset + prev_chain.next_off)
									<= (new_blk_size_l - SIZEOF(off_chain)));
								if (dollar_tlevel != cse->t_level)
								{
									assert(dollar_tlevel > cse->t_level);
									assert(!cse->undo_next_off[0] && !cse->undo_offset[0]);
									assert(!cse->undo_next_off[1] && !cse->undo_offset[1]);
									cse->undo_next_off[0] = curr_chain.next_off;
									cse->undo_offset[0] = (block_offset)curr_offset;
									undo_index = 1;
								}
								GET_LONGP(curr, &prev_chain);
							}
							if (extra_record_orig_size)
							{
								if (offset_sum != cse_first_off)
								{	/* bring curr up to the match */
									curr += curr_chain.next_off;
									curr_offset += curr_chain.next_off;
									GET_LONGP(&curr_chain, curr);
								}
								if (dollar_tlevel != cse->t_level)
								{
									assert(dollar_tlevel > cse->t_level);
									assert(!cse->undo_next_off[undo_index]
										&& !cse->undo_offset[undo_index]);
									cse->undo_next_off[undo_index] = curr_chain.next_off;
									cse->undo_offset[undo_index] = (block_offset)curr_offset;
								}
								prev_chain = curr_chain;
								prev_chain.next_off = 0;
								GET_LONGP(curr, &prev_chain);
								cse_new->next_off = BSTAR_REC_SIZE;
							}
							offset_sum += curr_chain.next_off;
						} else
						{
							undo_index = 0;
							/* the last record turns into the *-key */
							if (offset_sum == cse_first_off)
							{	/* it's all there is */
								/*	 first_off --------------------v
								 *	[blk_hdr]...[curr rec (*-key)( )]
								 */
								assert(prev_rec_offset >= SIZEOF(blk_hdr));
								cse_new->first_off
									= (block_offset)(prev_rec_offset + SIZEOF(rec_hdr));
							} else
							{	/* update the next_off of the previous chain record */
								/*	          ---|--------------------v
								 *	[blk_hdr]...[prev rec( )][curr rec (*-key)( )]
								 */
								assert((buffaddr + prev_rec_offset) > curr);
								prev_chain = curr_chain;
								assert((offset_sum - prev_chain.next_off)	/* check old */
									== (curr - buffaddr));			/* method equivalent */
								prev_chain.next_off = (unsigned int)((prev_rec_offset
									+ (unsigned int)(SIZEOF(rec_hdr)) - (curr - buffaddr)));
								assert((curr - buffaddr + prev_chain.next_off)
									<= ((new_blk_size_l < blk_reserved_size ?
										new_blk_size_l : blk_reserved_size)
											- SIZEOF(off_chain)));
								if (dollar_tlevel != cse->t_level)
								{
									assert(dollar_tlevel > cse->t_level);
									assert(!cse->undo_next_off[0] && !cse->undo_offset[0]);
									assert(!cse->undo_next_off[1] && !cse->undo_offset[1]);
									cse->undo_next_off[0] = curr_chain.next_off;
									cse->undo_offset[0] = (block_offset)(curr - buffaddr);
									undo_index = 1;
								}
								GET_LONGP(curr, &prev_chain);
								/* bring curr up to the match */
								curr += curr_chain.next_off;
								GET_LONGP(&curr_chain, curr);
							}
							offset_sum += curr_chain.next_off;
							if (dollar_tlevel != cse->t_level)
							{
								assert(dollar_tlevel > cse->t_level);
								assert(!cse->undo_next_off[undo_index]
									&& !cse->undo_offset[undo_index]);
								cse->undo_next_off[undo_index] = curr_chain.next_off;
								cse->undo_offset[undo_index] = (block_offset)(curr - buffaddr);
							}
							curr_chain.next_off = 0;
							GET_LONGP(curr, &curr_chain);
						}
					} else
					{	/* found the split and no *-key issue: just terminate before the split */
						if (offset_sum == cse_first_off)
							offset_sum += curr_chain.next_off;	/* put it in the lead */
						old_curr_chain_next_off = curr_chain.next_off;
						if (left_hand_offset)
						{	/* there's a new chain rec in left */
							curr_offset = curr - buffaddr;
							if (extra_record_orig_size
								&& (curr_offset == last_possible_left_offset))
							{
								assert(level_0);	/* else *-key issues */
								cse_new->next_off = extra_record_orig_size - next_rec_shrink1;
							}
							assert(!ins_chain_offset);
							/* put the new one at the end of the chain */
							/*	          ---|---------------v
							 *	[blk_hdr]...[curr rec( )]...[new rec ( )]
							 */
							/* the new rec may or may not be a *-key */
							assert((offset_sum - curr_chain.next_off) == curr_offset);
							assert(left_hand_offset > curr_offset);
							curr_chain.next_off = (block_offset)(left_hand_offset - curr_offset);
						} else
							curr_chain.next_off = 0;
						assert((curr - buffaddr + curr_chain.next_off)
							<= ((new_blk_size_l < blk_reserved_size
								? new_blk_size_l : blk_reserved_size) - SIZEOF(off_chain)));
						if (dollar_tlevel != cse->t_level)
						{
							assert(dollar_tlevel > cse->t_level);
							assert(!cse->undo_next_off[0] && !cse->undo_offset[0]);
							assert(!cse->undo_next_off[1] && !cse->undo_offset[1]);
							cse->undo_next_off[0] = old_curr_chain_next_off;
							cse->undo_offset[0] = (block_offset)(curr - buffaddr);
						}
						GET_LONGP(curr, &curr_chain);
					}	/* end of *-key or not alternatives */
					assert((left_hand_offset + (int)cse_new->next_off)
						<= ((new_blk_size_l < blk_reserved_size
							? new_blk_size_l : blk_reserved_size) - SIZEOF(off_chain)));
				}	/* end of buffer and cse_new adjustments */
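				/* A note on the undo_next_off/undo_offset bookkeeping used in the chain surgery above:
				 * whenever a chain field is rewritten in the buffer of a cse belonging to a shallower
				 * transaction level than the current one (dollar_tlevel != cse->t_level), the
				 * pre-modification value and its offset are saved first, apparently so that an incremental
				 * rollback to that shallower level can restore the chain in the already-built buffer.
				 * (Inference from the surrounding code, not an independent specification.)
				 */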
				prev_first_off = cse_first_off;
				if (ins_chain_offset)
				{	/* if there is a new chain rec in the old block, put it first */
					/*	 first_off---------v
					 *	[blk_hdr][new rec( )]...
					 */
					assert(!left_hand_offset);
					assert(0 == extra_record_orig_size);
					assert(ins_chain_offset >= (SIZEOF(blk_hdr) + SIZEOF(rec_hdr)));
					cse->first_off = ins_chain_offset;
					assert(0 == cse->next_off);
					if (offset_sum > last_possible_left_offset)
					{	/* there are existing chain records after the split */
						/*	 first_off---------v--------------------v
						 *	[blk_hdr][new rec( )]...[existing rec ( )]
						 */
						prev_next_off = cse->next_off;
						cse->next_off = offset_sum - last_possible_left_offset - next_rec_shrink1;
						assert((int)(cse->next_off + ins_chain_offset) < new_blk_size_r);
					}
				} else if (offset_sum <= last_possible_left_offset)
				{	/* the last chain record went left with the split */
					cse->first_off = 0;
				} else
				{	/* just adjust the anchor for the split */
					/*	 first_off------------------v
					 *	[blk_hdr]...[existing rec ( )]
					 */
					assert(offset_sum >= (int)cse_first_off);
					cse->first_off = (block_offset)(offset_sum - last_possible_left_offset
						+ rec_cmpc + SIZEOF(blk_hdr) - SIZEOF(off_chain));
					assert(cse->first_off >= (SIZEOF(blk_hdr) + SIZEOF(rec_hdr)));
				}
				assert((ins_chain_offset + (int)cse->next_off)
					<= ((new_blk_size_r < blk_reserved_size ? new_blk_size_r : blk_reserved_size)
						- SIZEOF(off_chain)));
			}	/* end of split processing */
		}	/* end of tp only code */
		if (!dollar_tlevel)
			cse = NULL;
		else
		{
			cse_new = si->last_cw_set;
			assert(!cse_new->high_tlevel);
			gvcst_blk_build(cse_new, NULL, 0);
			cse_new->done = TRUE;
		}
		/* Record block split heuristic info that will be used in the next block split */
		if (!new_rec_goes_to_right)
		{
			chain1.flag = 1;
			chain1.cw_index = next_blk_index;
			chain1.next_off = 0;
			assert(SIZEOF(gv_target->last_split_blk_num[bh_level]) == SIZEOF(off_chain));
			last_split_blk_num[bh_level] = *(block_id *)&chain1;
		} else
			last_split_blk_num[bh_level] = blk_num;
		assert(temp_key == gv_altkey);
		/* If new_rec_goes_to_right is TRUE, then it almost always implies that the left side of
		 * the block is almost full (i.e. adding the new record there caused it to exceed the fill
		 * factor), therefore direct all future updates to keys in between (which lie between the
		 * last key of the left block and the first key of the right block) to the right block.
		 *
		 * If not, direct those updates to the left block thereby preventing it from staying at a
		 * low capacity for a long period of time.
		 *
		 * This direction of future updates is implemented by controlling what key gets passed for
		 * record addition into the parent index block. For directing all in-between updates to the
		 * right block, pass in the last key of the left block to the parent index block. For directing
		 * all in-between updates to the left block, back off 1 spot from the first key of the right
		 * block and pass that to the parent index block.
		 *
		 * Doing this backoff accurately would imply finding the last non-zero byte in the key and taking
		 * 1 off from it. In case the length of the right key is less than the left key, it is possible
		 * that this backoff causes the new key to be less than even the left key (e.g. if the left side has
		 * "C2 13 93 00" as key sequence corresponding to the number 1292 and the right side has "C2 14 00"
		 * corresponding to the number 1300, taking one off the right side would give "C2 13 00" which
		 * corresponds to the number 12 and is less than the left side). In this case, we would have to start
		 * adding in FF bytes to the key as needed until we reach the left key length. In the above example,
		 * we would get "C2 13 FF 00".
		 *
		 * In the end, because of the complexities involved in getting an accurate backoff (see above
		 * paragraph), we instead implement a simplified backoff by examining just the first byte that differs
		 * and the immediately following byte (if needed). If it turns out that we cannot get a backoff with
		 * just those 2 bytes (should be rare), we then let the left key go unmodified. In such cases, we do
		 * not expect many intervening possible keys and therefore it does not matter that much whether we
		 * pass the left or (right-1) key to the parent.
		 *
		 * temp_key already holds the key corresponding to the last record of the left block.
		 * bs1[2] and bs1[3] hold the key corresponding to the first record of the right block.
		 */
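		/* Worked example of the simplified backoff implemented below (illustrative byte values, not from the
		 * source). Suppose the left block's last key is 41 42 00 00 and the right block's first key is
		 * 41 45 00 00: the keys first differ at the second byte, with n = 0x45 - 0x42 = 3, i.e. the "1 < n"
		 * case. To direct in-between keys to the left block the separator byte becomes 0x42 + (n - 1) = 0x44
		 * (one below the right key's byte); to direct them to the right block it becomes 0x42 + 1 = 0x43
		 * (one above the left key's byte). In both cases the following two bytes are zeroed to re-terminate
		 * the key, exactly as the "*cp1++ = 0; *cp1 = 0;" statements below do.
		 */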
		if (level_0)
		{	/* Determine the key for the record to pass on to the parent index block */
			cp1 = temp_key->base;
			cp2 = (unsigned char *)bs1[2].addr;
			bs1_2_len = bs1[2].len;
			for (i = 0; (i < bs1_2_len) && (*cp2 == *cp1); ++i)
			{
				++cp2;
				++cp1;
			}
			if (i == bs1_2_len)
			{
				cp2 = (unsigned char *)bs1[3].addr;
				bs1_3_len = bs1[3].len;
				for (j = 0; (j < bs1_3_len) && (*cp2 == *cp1); ++j)
				{
					++cp2;
					++cp1;
				}
			}
			n = (int)((sm_long_t)*cp2 - (sm_long_t)*cp1);
			if (0 > n)
			{
				assert(CDB_STAGNATE > t_tries);
				status = cdb_sc_mkblk;
				GOTO_RETRY;
			} else if (1 < n)
			{
				temp_key->end = cp1 - temp_key->base + 2;
				if (temp_key->end < temp_key->top)
				{
					*cp1++ += (!new_rec_goes_to_right ? (n - 1) : 1);
					*cp1++ = 0;
					*cp1 = 0;
				} else
				{
					temp_key->end = temp_key->prev;
					assert(temp_key->end < temp_key->top);
					assert(CDB_STAGNATE > t_tries);
					status = cdb_sc_mkblk;
					GOTO_RETRY;
				}
			} else if (1 == n)
			{
				cp1++;
				if ((cp1 - temp_key->base + 2) < temp_key->top)
				{
					if (i == (bs1_2_len - 1))
						cp2 = (unsigned char *)bs1[3].addr;
					else
						cp2++;
					if ((STR_SUB_MAXVAL != *cp1) || (KEY_DELIMITER != *cp2))
					{
						if (!new_rec_goes_to_right)
						{
							old_ch = *cp2;
							new_ch = old_ch - 1;
							*cp1 = new_ch;
							if (KEY_DELIMITER != old_ch)
								*(cp1 - 1) = *(cp2 - 1);
						} else
						{
							old_ch = *cp1;
							new_ch = old_ch + 1;
							*cp1 = new_ch;
							if (STR_SUB_MAXVAL == old_ch)
								*(cp1 - 1) = *(cp2 - 1);
						}
						cp1++;
						if (KEY_DELIMITER == new_ch)
							temp_key->end--;
						else
							*cp1++ = KEY_DELIMITER;
						*cp1 = KEY_DELIMITER;
						temp_key->end = cp1 - temp_key->base;
					}
				} else
				{
					temp_key->end = temp_key->prev;
					assert(temp_key->end < temp_key->top);
					assert(CDB_STAGNATE > t_tries);
					status = cdb_sc_mkblk;
					GOTO_RETRY;
				}
			}
		}
		assert(temp_key->end < temp_key->top);
		assert(KEY_DELIMITER == temp_key->base[temp_key->end]);
		assert(KEY_DELIMITER == temp_key->base[temp_key->end - 1]);
		assert(KEY_DELIMITER != temp_key->base[temp_key->end - 2]);
		bq = bh + 1;
		if (HIST_TERMINATOR != bq->blk_num)
		{	/* Not root; write blocks and continue */
			if (cdb_sc_normal != (status = gvcst_search_blk(temp_key, bq)))
				GOTO_RETRY;
			cse = t_write(bh, (unsigned char *)bs1, ins_chain_offset, ins_chain_index, bh_level, TRUE, FALSE,
				GDS_WRITE_PLAIN);
			assert(!dollar_tlevel || !cse->high_tlevel);
			if (cse)
			{
				assert(dollar_tlevel);
				cse->write_type |= GDS_WRITE_BLOCK_SPLIT;
			}
			value.len = SIZEOF(block_id);
			value.addr = (char *)&zeroes;
			++bh;
			ins_chain_index = next_blk_index;
		} else
		{	/* Create new root */
			if ((bh_level + 1) == MAX_BT_DEPTH)
			{
				assert(CDB_STAGNATE > t_tries);
				status = cdb_sc_maxlvl;
				GOTO_RETRY;
			}
			ins_chain_index = t_create(blk_num, (uchar_ptr_t)bs1, ins_chain_offset, ins_chain_index, bh_level);
			make_it_null = FALSE;
			if (NULL != cse)
			{	/* adjust the block to use the buffer and offsets worked out for the old root */
				assert(cse->done);
				assert(NULL != cse->new_buff);
				cse_new = si->last_cw_set;
				assert(!cse_new->high_tlevel);
				cse_new->blk_target = cse->blk_target;
				cse_new->first_off = cse->first_off;
				cse_new->next_off = cse->next_off;
				/* to be able to incrementally rollback, we need another copy of new_buff;
				 * pointer copying wouldn't suffice
				 */
				cse_new->new_buff = (unsigned char *)get_new_free_element(si->new_buff_list);
				memcpy(cse_new->new_buff, cse->new_buff, ((blk_hdr_ptr_t)cse->new_buff)->bsiz);
				cse_new->old_block = NULL;
				make_it_null = TRUE;
			}
			/* Build the right child of the new root right now since it is possible that before the commit
			 * the root block may have been recycled in the global buffer, which wouldn't cause a restart
			 * since it has been built already (see the gvcst_blk_build below). Otherwise, we may be relying
			 * on incorrect data in the root block when we build this right child finally in bg_update.
			 * Note that this needs to be done only in TP since only tp_tend allows for a block with a
			 * cse not to be in the global buffer if a new_buff already exists.
			 */
			if (dollar_tlevel)
			{
				DEBUG_ONLY(tp_get_cw(si->first_cw_set, ins_chain_index, &cse_new);)
				assert(cse_new == si->last_cw_set);
				cse_new = si->last_cw_set;
				assert(FALSE == cse_new->done);
				assert(!cse_new->high_tlevel);
				gvcst_blk_build(cse_new, NULL, 0);
				cse_new->done = TRUE;
			}
			target_key_size = temp_key->end + 1;
			BLK_INIT(bs_ptr, bs1);
			BLK_ADDR(curr_rec_hdr, SIZEOF(rec_hdr), rec_hdr);
			curr_rec_hdr->rsiz = target_key_size + SIZEOF(rec_hdr) + SIZEOF(block_id);
			curr_rec_hdr->cmpc = 0;
			BLK_SEG(bs_ptr, (sm_uc_ptr_t)curr_rec_hdr, SIZEOF(rec_hdr));
			BLK_ADDR(cp1, target_key_size, unsigned char);
			memcpy(cp1, temp_key->base, target_key_size);
			BLK_SEG(bs_ptr, cp1, target_key_size);
			BLK_SEG(bs_ptr, (unsigned char *)&zeroes, SIZEOF(block_id));
			BLK_ADDR(next_rec_hdr, SIZEOF(rec_hdr), rec_hdr);
			next_rec_hdr->rsiz = BSTAR_REC_SIZE;
			next_rec_hdr->cmpc = 0;
			BLK_SEG(bs_ptr, (sm_uc_ptr_t)next_rec_hdr, SIZEOF(rec_hdr));
			BLK_SEG(bs_ptr, (unsigned char *)&zeroes, SIZEOF(block_id));
			if (0 == BLK_FINI(bs_ptr, bs1))
			{
				assert(CDB_STAGNATE > t_tries);
				status = cdb_sc_mkblk;
				GOTO_RETRY;
			}
			assert(bs1[0].len <= blk_reserved_size);	/* Assert that the new block has space for
									 * reserved bytes */
			ins_off1 = (block_offset)(SIZEOF(blk_hdr) + SIZEOF(rec_hdr) + target_key_size);
			ins_off2 = (block_offset)(SIZEOF(blk_hdr) + (2 * SIZEOF(rec_hdr)) + SIZEOF(block_id)
				+ target_key_size);
			assert(ins_off1 < ins_off2);
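			/* Layout of the new root image just built (derived from the BLK_SEG calls above; widths not to
			 * scale). ins_off1 and ins_off2 are the offsets of the two child block_id slots, which get
			 * resolved to the actual child block numbers (next_blk_index and ins_chain_index respectively)
			 * by the t_write/t_write_root/block-chain processing below:
			 *
			 *	                              ins_off1---------v                   ins_off2---v
			 *	[blk_hdr][rec_hdr][key: target_key_size][block_id][rec_hdr (*-key)][block_id]
			 */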
			/* Since a new root block is not created but two new children are created, this update to the
			 * root block should disable the "indexmod" optimization (C9B11-001813).
			 */
			cse = t_write(bh, (unsigned char *)bs1, ins_off1, next_blk_index, bh_level + 1, TRUE, FALSE,
				GDS_WRITE_KILLTN);
			if (make_it_null)
				cse->new_buff = NULL;
			assert(!dollar_tlevel || !cse->high_tlevel);
			if (!dollar_tlevel)
			{	/* create a sibling cw-set-element to store ins_off2/ins_chain_index */
				t_write_root(ins_off2, ins_chain_index);
			} else
			{
				cse->write_type |= GDS_WRITE_BLOCK_SPLIT;
				assert(NULL == cse->new_buff);
				cse->first_off = 0;
				cse->next_off = ins_off2 - ins_off1;
				/* the following is the only place where the buffer is not completely built by
				 * gvcst_blk_build. this means that the block chain seen by gvcst_blk_build will
				 * have a bad value (that is fixed below) at the end of the list. therefore the
				 * block chain integrity checking code in gvcst_blk_build will error out normally
				 * in this case. signal that routine to skip checking just this tail element.
				 */
				DEBUG_ONLY(skip_block_chain_tail_check = TRUE;)
				gvcst_blk_build(cse, NULL, 0);
				DEBUG_ONLY(skip_block_chain_tail_check = FALSE;)
				curr_chain.flag = 1;
				curr_chain.cw_index = ins_chain_index;
				curr_chain.next_off = 0;
				curr = cse->new_buff + ins_off2;
				GET_LONGP(curr, &curr_chain);
				cse->done = TRUE;
				gv_target->clue.end = 0;
			}
			succeeded = TRUE;
		}
		}
	}
	assert(succeeded);
	horiz_growth = FALSE;
	assert((csa->dir_tree == gv_target) || tp_root);
	RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ);
	/* The only case where gv_target is still csa->dir_tree after the above RESET macro is if op_gvput was invoked
	 * with gv_target being set to cs_addrs->dir_tree. In that case gbl_target_was_set would have been set to TRUE.
	 * Assert.
	 */
	assert((csa->dir_tree != gv_target) || gbl_target_was_set);
	/* Format the journal records only once for non-TP (irrespective of the number of restarts).
	 * We remember this through the variable "jnl_format_done". If TRUE, we do not redo the jnl_format.
	 * The only exception is if we are in $INCREMENT in which case we need to reformat since the
	 * current value (and hence the post-increment value) of the key might be different in different tries.
	 * In this case, the restart code checks and resets "jnl_format_done" to FALSE.
	 */
	if (!dollar_tlevel)
	{
		nodeflags = 0;
		if (skip_dbtriggers)
			nodeflags |= JS_SKIP_TRIGGERS_MASK;
		assert(!jnl_format_done || (!is_dollar_incr && (JNL_SET == non_tp_jfb_ptr->ja.operation)));
		if (need_extra_block_split)
			inctn_opcode = inctn_gvcstput_extra_blk_split;
		else if (JNL_WRITE_LOGICAL_RECS(csa) && !jnl_format_done)
		{
			jfb = jnl_format(JNL_SET, gv_currkey, (!is_dollar_incr ? val : post_incr_mval), nodeflags);
			assert(NULL != jfb);
			jnl_format_done = TRUE;
		}
		succeeded = ((trans_num)0 != t_end(&gv_target->hist, dir_hist, TN_NOT_SPECIFIED));
		inctn_opcode = inctn_invalid_op;
		if (succeeded)
		{
			if (NULL != dir_hist)
			{	/* The Global Variable Tree was created in this transaction so clear its clue to be safe.
				 * The directory tree though will have a non-zero clue and that can stay as it is since
				 * it was validated in this transaction and was found good enough for us to commit.
				 */
				assert(dir_tree != gv_target);
				gv_target->clue.end = 0;
			}
		} else
		{	/* "t_retry" would have already been invoked by "t_end".
			 * So instead of going to "retry:", do only whatever steps from there are necessary here.
			 */
			RESTORE_ZERO_GVT_ROOT_ON_RETRY(lcl_root, gv_target, tp_root, dir_hist, dir_tree);
			jnl_format_done = FALSE;	/* need to reformat jnl records for $INCR even in case of non-TP */
			GTMTRIG_DBG_ONLY(dbg_trace_array[dbg_num_iters].retry_line = __LINE__);
			goto tn_restart;
		}
	} else
	{
		status = tp_hist(dir_hist);
		if (NULL != dir_hist)
		{	/* Note that although "tp_hist" processes the "dir_hist" history, it only adds "gv_target" to
			 * gvt_tp_list. But csa->dir_tree might have had its clue, blk-split related info etc. modified
			 * as part of this gvcst_put invocation and might also need cleanup (just like any other
			 * gv_target), so add csa->dir_tree to gvt_tp_list (if not already done).
			 */
			assert(dir_tree == csa->dir_tree);
			ADD_TO_GVT_TP_LIST(dir_tree);	/* note: macro also updates read_local_tn if necessary */
		}
		if (cdb_sc_normal != status)
			GOTO_RETRY;
		jnl_format_done = FALSE;
	}
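	/* A recap of when the logical journal record gets (re)formatted across retries (summarizing the logic above
	 * and the "retry:" code below, not an independent specification):
	 *	non-TP, plain SET : formatted once; "jnl_format_done" stays TRUE across retries
	 *	non-TP, $INCREMENT: reformatted on each retry (the restart paths reset "jnl_format_done")
	 *	TP (incl. wrapped): reformatted unconditionally on each retry
	 */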
	if (succeeded)
	{
		if (0 == tp_root)
		{	/* Fill in gv_target->root with the newly created root block's value.
			 * Previously, root remained at 0 at the end of the transaction and it was left to the
			 * NEXT transaction to do a gvcst_root_search and determine the new root block.
			 * This was fine until recently when op_gvrectarg was reworked to NOT do a gvcst_root_search
			 * (to avoid potential TP restarts while unwinding the M stack). This meant that gv_target->root
			 * needed to be kept up to date as otherwise it was possible for gv_target->root to be stale
			 * after an op_gvrectarg, causing incorrect behavior of the following M code (see the
			 * v52000/C9B10001765 subtest for an example where $order(^gvn,$$extrinsic) is done and the
			 * extrinsic CREATES <^gvn>).
			 */
			GTMTRIG_ONLY(assert(!ztval_gvcst_put_redo);)
			assert(0 == gv_target->root);
			if (!dollar_tlevel)
			{
				tp_root = cw_set[root_blk_cw_index].blk;
				assert(gds_t_acquired == cw_set[root_blk_cw_index].old_mode);
				assert(gds_t_committed == cw_set[root_blk_cw_index].mode);
				assert(!IS_BITMAP_BLK(tp_root));
			} else
			{
				chain1.flag = 1;
				chain1.cw_index = root_blk_cw_index;
				chain1.next_off = 0;	/* does not matter what value we set this field to */
				assert(SIZEOF(tp_root) == SIZEOF(chain1));
				tp_root = *(block_id *)&chain1;
			}
			gv_target->root = tp_root;
		}
		if (need_extra_block_split)
		{	/* The logical update required an extra block split operation first (which succeeded) so
			 * get back to doing the logical update before doing any trigger invocations etc.
			 */
			GTMTRIG_ONLY(skip_hasht_read = TRUE;)
			goto fresh_tn_start;
		}
		for (bh_level = 0; bh_level < split_depth; bh_level++)
		{
			blk_num = last_split_blk_num[bh_level];
			assert(0 != blk_num);
			split_targ->last_split_blk_num[bh_level] = blk_num;
			assert((NEWREC_DIR_FORCED == last_split_direction[bh_level])
				|| (NEWREC_DIR_LEFT == last_split_direction[bh_level])
				|| (NEWREC_DIR_RIGHT == last_split_direction[bh_level]));
			split_targ->last_split_direction[bh_level] = last_split_direction[bh_level];
			/* Fix blk_num if it was created in this transaction. In case of non-TP, we have the real block
			 * number corresponding to the created block. In case of TP, we can know that only at
			 * tp_clean_up time so defer.
			 */
			chain1 = *(off_chain *)&blk_num;
			if (chain1.flag)
			{
				if (!dollar_tlevel)
				{
					assert(chain1.cw_index < ARRAYSIZE(cw_set));
					split_targ->last_split_blk_num[bh_level] = cw_set[chain1.cw_index].blk;
				} else
					split_targ->split_cleanup_needed = TRUE;	/* phantom blk# will be fixed at
											 * tp_clean_up time */
			}
		}
		if (dollar_tlevel)
		{
			nodeflags = 0;
			if (skip_dbtriggers)
				nodeflags |= JS_SKIP_TRIGGERS_MASK;
			ja_val = (!is_dollar_incr ? val : post_incr_mval);
			write_logical_jnlrecs = JNL_WRITE_LOGICAL_RECS(csa);
#			ifdef GTM_TRIGGER
			if (!skip_dbtriggers)
			{	/* Since we are about to invoke the trigger, we had better have gv_target->gvt_trigger and
				 * the local variable gvt_trigger in sync. The only exception is when we are here because
				 * of a $ztvalue update and are redoing the gvcst_put. In this case, it is possible that
				 * the trigger code that was previously executed deleted the trigger and did an update
				 * on the global which would have set gv_target->gvt_trigger to NULL. Assert accordingly.
				 */
				assert(ztval_gvcst_put_redo || (gvt_trigger == gv_target->gvt_trigger));
				if ((NULL != gvt_trigger) && !ztval_gvcst_put_redo)
				{
					assert(dollar_tlevel);
					/* Format ZTWORM and SET journal records.
					 * "ztworm_jfb", "jfb" and "jnl_format_done" are set by the below macro.
					 */
					JNL_FORMAT_ZTWORM_IF_NEEDED(csa, write_logical_jnlrecs, JNL_SET, gv_currkey,
							ja_val, ztworm_jfb, jfb, jnl_format_done);
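					/* A $ZTWORMHOLE journal record, when called for, is formatted here so that it
					 * precedes the SET record in the journal stream; if the invoked triggers never
					 * reference $ztwormhole, the REMOVE_ZTWORM_JFB_IF_NEEDED invocation further below
					 * takes it back out. (Recap of the macro pair used in this function, not an
					 * independent specification.)
					 */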
					/* Initialize trigger parms that don't depend on the context of the matching
					 * trigger */
					trigparms.ztoldval_new = key_exists ? ztold_mval : (mval *)&literal_null;
					PUSH_MV_STENT(MVST_MVAL);	/* protect $ztval from stp_gcol */
					ztval_mval = &mv_chain->mv_st_cont.mvs_mval;
					if (!is_dollar_incr)
						*ztval_mval = *val;
					else
					{
						*ztval_mval = *post_incr_mval;
						/* Since this is pointing to a malloced buffer, we need to repoint it to the
						 * stringpool to prevent a nested trigger call (that does a $INCR) from
						 * overwriting this buffer. This way buffers corresponding to the $ztvals of
						 * nested triggers can coexist.
						 */
						s2pool(&ztval_mval->str);
					}
					trigparms.ztvalue_new = ztval_mval;
					trigparms.ztdata_new = key_exists ? &literal_one : &literal_zero;
					gvtr_parms.gvtr_cmd = GVTR_CMDTYPE_SET;
					gvtr_parms.gvt_trigger = gvt_trigger;
					gvtr_parms.duplicate_set = duplicate_set;
					/* Now that we have filled in minimal information, let "gvtr_match_n_invoke" do
					 * the rest */
					gtm_trig_status = gvtr_match_n_invoke(&trigparms, &gvtr_parms);
					assert((0 == gtm_trig_status) || (ERR_TPRETRY == gtm_trig_status));
					if (ERR_TPRETRY == gtm_trig_status)
					{	/* A restart has been signaled that we need to handle or complete the
						 * handling of. This restart could have occurred reading the trigger in
						 * which case no tp_restart() has yet been done or it could have occurred
						 * in trigger code in which case we need to finish the incomplete
						 * tp_restart. In both cases this must be an implicitly TP wrapped
						 * transaction. Our action is to complete the necessary tp_restart()
						 * logic (t_retry is already completed so should be skipped) and then
						 * redo the gvcst_put logic.
						 */
						assert(lcl_implicit_tstart);
						assert(CDB_STAGNATE >= t_tries);
						status = cdb_sc_normal;	/* signal "retry:" to avoid t_retry call */
						GOTO_RETRY;
					}
					REMOVE_ZTWORM_JFB_IF_NEEDED(ztworm_jfb, jfb, si);
					if (trigparms.ztvalue_changed)
					{	/* At least one of the invoked triggers changed $ztval.
						 * Redo the gvcst_put with $ztval as the right side of the SET.
						 * Also make sure gtm_trigger calls are NOT done this time around.
						 */
						assert(0 < gvtr_parms.num_triggers_invoked);
						val = trigparms.ztvalue_new;
						MV_FORCE_STR(val);	/* in case the updated value happens to be a
									 * numeric quantity */
						ztval_gvcst_put_redo = TRUE;
						skip_hasht_read = TRUE;
						/* In case the current gvcst_put invocation was for $INCR, reset the
						 * corresponding global variable that indicates a $INCR is in progress
						 * since the redo of the gvcst_put is a SET command (no longer $INCR).
						 */
						is_dollar_incr = FALSE;
						/* Don't pop the mvals as we want ztval_mval (which points to the mval
						 * containing "val" for the redo iteration) protected-from-stp_gcol and
						 * accessible until the redo is complete.
						 */
						goto fresh_tn_start;
					}
				}
				POP_MVALS_FROM_M_STACK_IF_NEEDED(ztold_mval, save_msp, save_mv_chain);
					/* pop any stacked mvals before op_tcommit as it does its own popping */
			}
#			endif
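			/* The nodeflags accompanying the logical SET journal record distinguish the update flavors
			 * handled in this function (recap of the masks used above and below): JS_SKIP_TRIGGERS_MASK
			 * marks updates that deliberately bypassed triggers, while JS_NOT_REPLICATED_MASK marks
			 * trigger-driven or $ztval-redo updates that should not be replicated; the assert below
			 * enforces that the two are never set together.
			 */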
			if (write_logical_jnlrecs && !jnl_format_done)
			{
				assert(dollar_tlevel);
#				ifdef GTM_TRIGGER
				/* Do not replicate an implicit update or a $ztval redo update */
				assert(tstart_trigger_depth <= gtm_trigger_depth);
				if ((gtm_trigger_depth > tstart_trigger_depth) || ztval_gvcst_put_redo)
				{	/* Ensure that JS_SKIP_TRIGGERS_MASK and JS_NOT_REPLICATED_MASK are mutually
					 * exclusive.
					 */
					assert(!(nodeflags & JS_SKIP_TRIGGERS_MASK));
					nodeflags |= JS_NOT_REPLICATED_MASK;
				}
#				endif
				jfb = jnl_format(JNL_SET, gv_currkey, ja_val, nodeflags);
				assert(NULL != jfb);
				jnl_format_done = TRUE;
			}
#			ifdef GTM_TRIGGER
			/* Go ahead with the commit of any implicit TP wrapped transaction */
			if (lcl_implicit_tstart)
			{
				GVTR_OP_TCOMMIT(status);
				if (cdb_sc_normal != status)
					GOTO_RETRY;
			}
#			endif
		}
		assert(!JNL_WRITE_LOGICAL_RECS(csa) || jnl_format_done);
		/* Now that the SET/$INCR is finally complete, increment the corresponding GVSTAT counter */
		INCR_GVSTATS_COUNTER(csa, cnl, n_set, 1);
		DBG_CHECK_VAL_AT_FUN_EXIT;
		assert(lcl_dollar_tlevel == dollar_tlevel);
		return;
	}
retry:
	/* Note that it is possible cs_addrs is not equal to csa at this point in case we restarted due to trigger
	 * invocations and in case those triggers referenced globals in different regions. But this should be fixed
	 * by a call to t_retry/tp_restart below (it does a TP_CHANGE_REG(tp_pointer->gd_reg)).
	 */
	RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ);
	/* Need to restart. If the directory tree was used in this transaction, nullify its clue as well (not normally
	 * done by t_retry). The RESTORE_ZERO_GVT_ROOT_ON_RETRY macro call below takes care of that for us.
	 */
	RESTORE_ZERO_GVT_ROOT_ON_RETRY(lcl_root, gv_target, tp_root, dir_hist, dir_tree);
#	ifdef GTM_TRIGGER
	if (!skip_dbtriggers)
	{
		if (lcl_implicit_tstart)
		{
			assert(!skip_INVOKE_RESTART);
			assert((cdb_sc_normal != status) || (ERR_TPRETRY == gtm_trig_status));
			if (cdb_sc_normal != status)
				skip_INVOKE_RESTART = TRUE; /* causes t_retry to invoke only tp_restart without any
							     * rts_error */
			/* else: t_retry has already been done by gtm_trigger so no need to do it again for this try */
			/* If an implicitly TP wrapped transaction is restarting, restore things to what they were
			 * at entry into gvcst_put. Note that we could have done multiple iterations of gvcst_put for
			 * extra_block_split/retry/ztval_gvcst_put_redo.
			 */
			ztval_gvcst_put_redo = FALSE;
			skip_hasht_read = FALSE;
			val = lcl_val;
			/* $increment related fields need to be restored */
			is_dollar_incr = lcl_is_dollar_incr;
			post_incr_mval = lcl_post_incr_mval;
			increment_delta_mval = lcl_increment_delta_mval;
		}
	}
#	endif
	assert((cdb_sc_normal != status) GTMTRIG_ONLY(|| lcl_implicit_tstart));
	if (cdb_sc_normal != status)
	{
		GTMTRIG_ONLY(POP_MVALS_FROM_M_STACK_IF_NEEDED(ztold_mval, save_msp, save_mv_chain));
		t_retry(status);
	} else
	{	/* t_retry has already been done so no need to do that again but we still need to invoke tp_restart
		 * to complete the pending "tprestart_state" related work.
		 */
#		ifdef GTM_TRIGGER
		assert(ERR_TPRETRY == gtm_trig_status);
		TRIGGER_BASE_FRAME_UNWIND_IF_NOMANSLAND;
		POP_MVALS_FROM_M_STACK_IF_NEEDED(ztold_mval, save_msp, save_mv_chain);
#		endif
		rc = tp_restart(1, !TP_RESTART_HANDLES_ERRORS);
		assert(0 == rc GTMTRIG_ONLY(&& TPRESTART_STATE_NORMAL == tprestart_state));
	}
	GTMTRIG_ONLY(assert(!skip_INVOKE_RESTART);)	/* if set to TRUE a few statements above, should have been reset
							 * by t_retry */
	/* At this point, we can be in TP only if we implicitly did a tstart in gvcst_put (as part of a trigger update).
	 * Assert that. Since the t_retry/tp_restart would have reset si->update_trans, we need to set it again.
	 * So reinvoke the T_BEGIN call only in case of TP. For non-TP, update_trans is unaffected by t_retry.
	 */
	assert(!dollar_tlevel GTMTRIG_ONLY(|| lcl_implicit_tstart));
	if (dollar_tlevel)
	{
		jnl_format_done = FALSE;	/* need to reformat jnl records unconditionally in case of TP */
		tp_set_sgm();	/* set sgm_info_ptr & first_sgm_info for TP start */
		T_BEGIN_SETORKILL_NONTP_OR_TP(ERR_GVPUTFAIL);	/* set update_trans and t_err for wrapped TP */
	} else if (is_dollar_incr)
		jnl_format_done = FALSE;	/* need to reformat jnl records for $INCR even in case of non-TP */
	assert(dollar_tlevel || update_trans);
	goto tn_restart;
}
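/* A recap of the two restart flavors handled at the "retry:" label above (summary of the code, not an independent
 * specification):
 *	status != cdb_sc_normal : a restart detected by this routine itself; t_retry(status) does the full retry
 *				  bookkeeping (for an implicitly TP wrapped transaction, skip_INVOKE_RESTART makes
 *				  t_retry drive tp_restart without raising an rts_error).
 *	status == cdb_sc_normal : a trigger invocation already did the t_retry half (ERR_TPRETRY), so only the
 *				  pending tp_restart work remains before control transfers back to tn_restart.
 */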