fis-gtm/sr_port/t_write.c

242 lines
9.4 KiB
C

/****************************************************************
* *
* Copyright 2001, 2010 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "gtm_string.h"
#include "gdsroot.h"
#include "gdskill.h"
#include "gdsblk.h"
#include "gdsbt.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsfhead.h"
#include "gdscc.h"
#include "filestruct.h"
#include "copy.h"
#include "jnl.h"
#include "hashtab_int4.h" /* needed for tp.h */
#include "buddy_list.h" /* needed for tp.h */
#include "tp.h"
#include "t_write.h"
#include "min_max.h"
#include "jnl_get_checksum.h"
GBLREF cw_set_element cw_set[];
GBLREF unsigned char cw_set_depth;
GBLREF sgmnt_addrs *cs_addrs;
GBLREF sgm_info *sgm_info_ptr;
GBLREF uint4 dollar_tlevel;
GBLREF trans_num local_tn; /* transaction number for THIS PROCESS */
GBLREF gv_namehead *gv_target;
GBLREF uint4 t_err;
GBLREF unsigned int t_tries;
GBLREF boolean_t horiz_growth;
GBLREF int4 prev_first_off, prev_next_off;
GBLREF boolean_t mu_reorg_process;
/* t_write : record in the current write set that an existing block is going to be modified.
 *
 * Non-TP (dollar_tlevel is zero):
 *	The next free slot of the global cw_set[] array is used and cw_set_depth is incremented.
 * TP (dollar_tlevel is non-zero):
 *	An existing cw-set-element (cse) for the block is looked for, either through the cw_index carried in the
 *	block number (chain.flag == 1) or through the sgm_info_ptr->blks_in_use hash table.
 *	- If none is found, a new cse is obtained from tp_cw_list().
 *	- If one is found whose t_level differs from dollar_tlevel, a copy of it is made for the current level and
 *	  linked to the older one through low_tlevel/high_tlevel (horiz_growth is set to TRUE in this case).
 *	- Otherwise the existing cse is reused as is.
 *
 * Globals modified by this routine (as visible in the code below):
 *	horiz_growth				reset to FALSE on entry, set to TRUE only on the copy path described above
 *	prev_first_off, prev_next_off		reset to PREV_OFF_INVALID before returning
 *	cw_set[], cw_set_depth			non-TP only
 *	sgm_info_ptr->cw_set_depth		TP only, when a new cse is created
 * In addition blkhist->cse is pointed at the cse that was filled in.
 *
 * Returns the cse that was filled in when in TP, and NULL when not in TP.
 * This routine has no way to report a failure; the comments below note where a restartable situation is
 * left for a later routine (e.g. tp_tend) to detect.
 */
cw_set_element *t_write (
srch_blk_status *blkhist, /* Search History of the block to be written. Currently the
* following members in this structure are used by "t_write"
* "blk_num" --> Block number being modified
* "buffaddr" --> Address of before image of the block
* "cr->ondsk_blkver" --> Actual block version on disk
* In addition, the "cse" member is set by "t_write" before returning.
*/
unsigned char *upd_addr, /* Address of the update array that contains the changes for this block */
block_offset ins_off, /* Offset to the position in the buffer that is to receive
* a block number when one is created. */
block_index index, /* Index into the create/write set. The specified entry is
* always a create entry. When the create gets assigned a
* block number, the block number is inserted into this
* buffer at the location specified by ins_off. */
char level, /* Level of the block in the tree */
boolean_t first_copy, /* Is first copy needed if overlaying same buffer? */
boolean_t forward, /* Is forward processing required? */
uint4 write_type) /* Whether "killtn" of the bt needs to be simultaneously updated or not */
{
cw_set_element *cse, *tp_cse, *old_cse;
off_chain chain;
uint4 iter;
srch_blk_status *tp_srch_status;
ht_ent_int4 *tabent;
block_id blk;
cache_rec_ptr_t cr;
boolean_t new_cse; /* TRUE if we had to create a new cse for the input block */
jnl_buffer_ptr_t jbbp; /* jbbp is non-NULL only if before-image journaling */
sgmnt_addrs *csa;
blk_hdr_ptr_t old_block;
unsigned int bsiz;
csa = cs_addrs;
horiz_growth = FALSE; /* becomes TRUE below only if an existing cse gets copied for the current dollar_tlevel */
/* When the following two asserts trip, we should change the data types of prev_first_off
* and prev_next_off, so they satisfy the assert.
*/
assert(SIZEOF(prev_first_off) >= SIZEOF(block_offset));
assert(SIZEOF(prev_next_off) >= SIZEOF(block_offset));
blk = blkhist->blk_num;
if (!dollar_tlevel)
{ /* Non-TP : always use a fresh slot at the end of the cw_set[] array */
if (blk >= csa->ti->total_blks)
GTMASSERT; /* block number is not below csa->ti->total_blks */
cse = &cw_set[cw_set_depth];
cse->mode = gds_t_noop; /* initialize it to a value that is not "gds_t_committed" before incrementing
* cw_set_depth as secshr_db_clnup relies on it */
cw_set_depth++;
assert(cw_set_depth < CDB_CW_SET_SIZE);
assert(index < (int)cw_set_depth);
new_cse = TRUE;
tp_cse = NULL; /* do not bother returning tp_cse for non-TP; it's almost never needed and it distinguishes the cases */
} else
{ /* TP : look for a cse that already exists for this block before creating a new one */
assert(!index || index < sgm_info_ptr->cw_set_depth);
chain = *(off_chain *)&blk; /* view the bits of the block number as an off_chain in order to examine its flag */
if (chain.flag == 1)
{ /* The input block number carries a cw_index instead of being used directly : fetch the cse at that index
 * in this segment's cw-set list and take the block number from that cse.
 * NOTE(review): presumably this identifies a block created earlier in this transaction -- confirm.
 */
tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain.cw_index, &cse);
blk = cse->blk;
} else
{ /* Look the block number up in the hash table of blocks already in use by this transaction */
if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
tp_srch_status = (srch_blk_status *)tabent->value;
else
tp_srch_status = NULL;
cse = tp_srch_status ? tp_srch_status->cse : NULL;
/* tp_srch_status->cse always returns latest in the horizontal list */
}
assert(!cse || !cse->high_tlevel); /* whatever cse was found must be the last one of its horizontal list */
if (cse == NULL)
{ /* No cse exists yet for this block : get a new one from tp_cw_list() and count it in the segment's cw-set depth */
tp_cw_list(&cse);
sgm_info_ptr->cw_set_depth++;
assert(gv_target);
cse->blk_target = gv_target;
new_cse = TRUE;
} else
{ /* A cse already exists for this block */
new_cse = FALSE;
assert(cse->done);
assert(dollar_tlevel >= cse->t_level);
if (cse->t_level != dollar_tlevel)
{ /* The existing cse has a different (asserted above to be lower) t_level : create a copy of it for the
 * current dollar_tlevel and append the copy to the horizontal list of the older cse.
 */
/* this part of the code is similar to that in gvcst_delete_blk(),
* any changes in one should be reflected in the other */
horiz_growth = TRUE;
old_cse = cse;
cse = (cw_set_element *)get_new_free_element(sgm_info_ptr->tlvl_cw_set_list);
memcpy(cse, old_cse, SIZEOF(cw_set_element)); /* start out as an exact copy; fields that must differ are reset below */
cse->low_tlevel = old_cse;
cse->high_tlevel = NULL;
old_cse->high_tlevel = cse;
cse->t_level = dollar_tlevel;
/* The loop below hardcodes the number of entries in undo_offset[] and undo_next_off[]; assert that */
assert(2 == (SIZEOF(cse->undo_offset) / SIZEOF(cse->undo_offset[0])));
assert(2 == (SIZEOF(cse->undo_next_off) / SIZEOF(cse->undo_next_off[0])));
for (iter = 0; iter < 2; iter++)
cse->undo_next_off[iter] = cse->undo_offset[iter] = 0;
assert(old_cse->new_buff);
assert(old_cse->done);
cse->new_buff = NULL; /* the copy does not share the older cse's new_buff */
/* If prev_first_off/prev_next_off hold a value (i.e. are not PREV_OFF_INVALID), store them in the OLDER cse's
 * first_off/next_off. The copy keeps whatever values the memcpy above gave it.
 * NOTE(review): presumably these globals were saved by the caller before it changed the chain offsets -- confirm.
 */
if (PREV_OFF_INVALID != prev_first_off)
old_cse->first_off = prev_first_off;
if (PREV_OFF_INVALID != prev_next_off)
old_cse->next_off = prev_next_off;
}
/* cse->mode can be kill_t_create or kill_t_write only if we have a restartable situation.
* this is because a TP transaction should never try modifying a block that is no longer visible in the
* tree. the only exception is if due to concurrency issues, we read a stale copy of a buffer that
* incorrectly led us to this child block number. this is a restartable situation.
* since this routine does not return a failure code, we continue and expect tp_tend to detect this.
*/
switch (cse->mode)
{ /* map the kill_t_* modes back to the corresponding gds_t_* mode; leave any other mode untouched */
case kill_t_create:
assert(CDB_STAGNATE > t_tries);
cse->mode = gds_t_create;
break;
case kill_t_write:
assert(CDB_STAGNATE > t_tries);
cse->mode = gds_t_write;
break;
default:
;
}
}
tp_cse = cse;
}
if (new_cse)
{ /* Brand new cse (non-TP slot or new TP element) : fill in the fields that describe the block being overwritten */
cse->blk_checksum = 0;
cse->blk = blk;
cse->mode = gds_t_write;
cse->new_buff = NULL;
cse->old_block = blkhist->buffaddr;
old_block = (blk_hdr_ptr_t)cse->old_block;
assert(NULL != old_block);
jbbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
/* The checksum is computed only with before-image journaling and only if the block's tn is less than the epoch_tn
 * of the journal buffer; otherwise blk_checksum stays 0 as set above.
 */
if ((NULL != jbbp) && (old_block->tn < jbbp->epoch_tn))
{ /* Pre-compute CHECKSUM. Since we do not necessarily hold crit at this point, ensure we never try to
* access the buffer more than the db blk_size.
*/
bsiz = MIN(old_block->bsiz, csa->hdr->blk_size);
cse->blk_checksum = jnl_get_checksum((uint4*)old_block, csa, bsiz);
}
/* the buffer in shared memory holding the GDS block contents currently does not have in its block header the
* on-disk format of that block. if it had, we could have easily copied that over to the cw-set-element.
* until then, we have to use the cache-record's field "ondsk_blkver". but the cache-record is available only in BG.
* thankfully, in MM, we do not allow GDSV4 type blocks, so we can safely assign GDSV5 (or GDSVCURR) to this field.
*/
cr = blkhist->cr;
assert((NULL != cr) || (dba_mm == csa->hdr->acc_meth));
cse->ondsk_blkver = (NULL == cr) ? GDSVCURR : cr->ondsk_blkver;
} else
{ /* we did not create a new cse. assert the integrity of few fields filled in when this cse was created */
assert(cse->blk == blk);
assert(0 == cse->reference_cnt);
/* If we did not create a new cse, check that the level already stored in the cse is the same as the input level.
* It is possible that they are different but that would mean we are in one of two situations
* 1) A restartable situation. Since this routine does not currently return a failure code,
* we do not restart here but instead wait for some other failure-code-returning-function
* (if nothing else, the function tp_tend) to catch this situation and trigger a restart.
* 2) This block number is the root block of a GVT or Directory Tree and the height of the tree
* is increasing now. In either case cse->blk_target points to the gv_target for that tree.
* The only exception to this is if the global's root is being created.
*/
assert(cse->level == level || (CDB_STAGNATE > t_tries) || gds_t_create == cse->mode
|| cse->blk_target->root == cse->blk);
}
/* The fields below are (re)set on every call, whether the cse is new, reused or a copy for the current dollar_tlevel */
cse->upd_addr = upd_addr;
cse->ins_off = ins_off;
cse->index = index;
cse->reference_cnt = 0;
cse->level = level;
cse->was_free = FALSE; /* t_write operates on BUSY blocks and hence cse->was_free is set to FALSE unconditionally */
/* A cse that was just copied for the current dollar_tlevel has no new_buff of its own (set to NULL above),
 * so first_copy is forced to TRUE for it irrespective of what the caller passed in.
 */
if (horiz_growth)
cse->first_copy = TRUE;
else
cse->first_copy = first_copy;
cse->done = FALSE;
cse->forward_process = forward;
cse->jnl_freeaddr = 0; /* reset jnl_freeaddr that previous transaction might have filled in */
cse->t_level = dollar_tlevel;
/* All REORG operations should disable the "indexmod" optimization (C9B11-001813/C9H12-002934). Assert that. */
assert(!mu_reorg_process || (GDS_WRITE_KILLTN == write_type));
/* In TP the input write_type is OR-ed into whatever the cse already holds; in non-TP it is simply assigned.
 * NOTE(review): for a cse newly obtained from tp_cw_list() this relies on write_type having been initialized
 * there, since this routine never assigns it in the TP case -- confirm.
 */
if (dollar_tlevel)
cse->write_type |= write_type;
else
cse->write_type = write_type;
prev_first_off = prev_next_off = PREV_OFF_INVALID;
blkhist->cse = cse; /* indicate to t_end/tp_tend that this block is part of the write-set */
return tp_cse; /* NULL if non-TP, else the cse filled in above */
}