242 lines
9.4 KiB
C
242 lines
9.4 KiB
C
|
/****************************************************************
|
||
|
* *
|
||
|
* Copyright 2001, 2010 Fidelity Information Services, Inc *
|
||
|
* *
|
||
|
* This source code contains the intellectual property *
|
||
|
* of its copyright holder(s), and is made available *
|
||
|
* under a license. If you do not know the terms of *
|
||
|
* the license, please stop and do not read further. *
|
||
|
* *
|
||
|
****************************************************************/
|
||
|
|
||
|
#include "mdef.h"
|
||
|
|
||
|
#include "gtm_string.h"
|
||
|
|
||
|
#include "gdsroot.h"
|
||
|
#include "gdskill.h"
|
||
|
#include "gdsblk.h"
|
||
|
#include "gdsbt.h"
|
||
|
#include "gtm_facility.h"
|
||
|
#include "fileinfo.h"
|
||
|
#include "gdsfhead.h"
|
||
|
#include "gdscc.h"
|
||
|
#include "filestruct.h"
|
||
|
#include "copy.h"
|
||
|
#include "jnl.h"
|
||
|
#include "hashtab_int4.h" /* needed for tp.h */
|
||
|
#include "buddy_list.h" /* needed for tp.h */
|
||
|
#include "tp.h"
|
||
|
#include "t_write.h"
|
||
|
#include "min_max.h"
|
||
|
#include "jnl_get_checksum.h"
|
||
|
|
||
|
GBLREF cw_set_element cw_set[];
|
||
|
GBLREF unsigned char cw_set_depth;
|
||
|
GBLREF sgmnt_addrs *cs_addrs;
|
||
|
GBLREF sgm_info *sgm_info_ptr;
|
||
|
GBLREF uint4 dollar_tlevel;
|
||
|
GBLREF trans_num local_tn; /* transaction number for THIS PROCESS */
|
||
|
GBLREF gv_namehead *gv_target;
|
||
|
GBLREF uint4 t_err;
|
||
|
GBLREF unsigned int t_tries;
|
||
|
GBLREF boolean_t horiz_growth;
|
||
|
GBLREF int4 prev_first_off, prev_next_off;
|
||
|
GBLREF boolean_t mu_reorg_process;
|
||
|
|
||
|
cw_set_element *t_write (
|
||
|
srch_blk_status *blkhist, /* Search History of the block to be written. Currently the
|
||
|
* following members in this structure are used by "t_write"
|
||
|
* "blk_num" --> Block number being modified
|
||
|
* "buffaddr" --> Address of before image of the block
|
||
|
* "cr->ondsk_blkver" --> Actual block version on disk
|
||
|
*/
|
||
|
unsigned char *upd_addr, /* Address of the update array that contains the changes for this block */
|
||
|
block_offset ins_off, /* Offset to the position in the buffer that is to receive
|
||
|
* a block number when one is created. */
|
||
|
block_index index, /* Index into the create/write set. The specified entry is
|
||
|
* always a create entry. When the create gets assigned a
|
||
|
* block number, the block number is inserted into this
|
||
|
* buffer at the location specified by ins_off. */
|
||
|
char level, /* Level of the block in the tree */
|
||
|
boolean_t first_copy, /* Is first copy needed if overlaying same buffer? */
|
||
|
boolean_t forward, /* Is forward processing required? */
|
||
|
uint4 write_type) /* Whether "killtn" of the bt needs to be simultaneously updated or not */
|
||
|
{
|
||
|
cw_set_element *cse, *tp_cse, *old_cse;
|
||
|
off_chain chain;
|
||
|
uint4 iter;
|
||
|
srch_blk_status *tp_srch_status;
|
||
|
ht_ent_int4 *tabent;
|
||
|
block_id blk;
|
||
|
cache_rec_ptr_t cr;
|
||
|
boolean_t new_cse; /* TRUE if we had to create a new cse for the input block */
|
||
|
jnl_buffer_ptr_t jbbp; /* jbbp is non-NULL only if before-image journaling */
|
||
|
sgmnt_addrs *csa;
|
||
|
blk_hdr_ptr_t old_block;
|
||
|
unsigned int bsiz;
|
||
|
|
||
|
csa = cs_addrs;
|
||
|
horiz_growth = FALSE;
|
||
|
|
||
|
/* When the following two asserts trip, we should change the data types of prev_first_off
|
||
|
* and prev_next_off, so they satisfy the assert.
|
||
|
*/
|
||
|
assert(SIZEOF(prev_first_off) >= SIZEOF(block_offset));
|
||
|
assert(SIZEOF(prev_next_off) >= SIZEOF(block_offset));
|
||
|
|
||
|
blk = blkhist->blk_num;
|
||
|
if (!dollar_tlevel)
|
||
|
{
|
||
|
if (blk >= csa->ti->total_blks)
|
||
|
GTMASSERT;
|
||
|
cse = &cw_set[cw_set_depth];
|
||
|
cse->mode = gds_t_noop; /* initialize it to a value that is not "gds_t_committed" before incrementing
|
||
|
* cw_set_depth as secshr_db_clnup relies on it */
|
||
|
cw_set_depth++;
|
||
|
assert(cw_set_depth < CDB_CW_SET_SIZE);
|
||
|
assert(index < (int)cw_set_depth);
|
||
|
new_cse = TRUE;
|
||
|
tp_cse = NULL; /* dont bother returning tp_cse for non-TP; it's almost never needed and it distiguishes the cases */
|
||
|
} else
|
||
|
{
|
||
|
assert(!index || index < sgm_info_ptr->cw_set_depth);
|
||
|
chain = *(off_chain *)&blk;
|
||
|
if (chain.flag == 1)
|
||
|
{
|
||
|
tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain.cw_index, &cse);
|
||
|
blk = cse->blk;
|
||
|
} else
|
||
|
{
|
||
|
if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
|
||
|
tp_srch_status = (srch_blk_status *)tabent->value;
|
||
|
else
|
||
|
tp_srch_status = NULL;
|
||
|
cse = tp_srch_status ? tp_srch_status->cse : NULL;
|
||
|
/* tp_srch_status->cse always returns latest in the horizontal list */
|
||
|
}
|
||
|
assert(!cse || !cse->high_tlevel);
|
||
|
if (cse == NULL)
|
||
|
{
|
||
|
tp_cw_list(&cse);
|
||
|
sgm_info_ptr->cw_set_depth++;
|
||
|
assert(gv_target);
|
||
|
cse->blk_target = gv_target;
|
||
|
new_cse = TRUE;
|
||
|
} else
|
||
|
{
|
||
|
new_cse = FALSE;
|
||
|
assert(cse->done);
|
||
|
assert(dollar_tlevel >= cse->t_level);
|
||
|
if (cse->t_level != dollar_tlevel)
|
||
|
{
|
||
|
/* this part of the code is similar to that in gvcst_delete_blk(),
|
||
|
* any changes in one should be reflected in the other */
|
||
|
horiz_growth = TRUE;
|
||
|
old_cse = cse;
|
||
|
cse = (cw_set_element *)get_new_free_element(sgm_info_ptr->tlvl_cw_set_list);
|
||
|
memcpy(cse, old_cse, SIZEOF(cw_set_element));
|
||
|
cse->low_tlevel = old_cse;
|
||
|
cse->high_tlevel = NULL;
|
||
|
old_cse->high_tlevel = cse;
|
||
|
cse->t_level = dollar_tlevel;
|
||
|
assert(2 == (SIZEOF(cse->undo_offset) / SIZEOF(cse->undo_offset[0])));
|
||
|
assert(2 == (SIZEOF(cse->undo_next_off) / SIZEOF(cse->undo_next_off[0])));
|
||
|
for (iter = 0; iter < 2; iter++)
|
||
|
cse->undo_next_off[iter] = cse->undo_offset[iter] = 0;
|
||
|
assert(old_cse->new_buff);
|
||
|
assert(old_cse->done);
|
||
|
cse->new_buff = NULL;
|
||
|
if (PREV_OFF_INVALID != prev_first_off)
|
||
|
old_cse->first_off = prev_first_off;
|
||
|
if (PREV_OFF_INVALID != prev_next_off)
|
||
|
old_cse->next_off = prev_next_off;
|
||
|
}
|
||
|
/* cse->mode can be kill_t_create or kill_t_write only if we have a restartable situation.
|
||
|
* this is because a TP transaction should never try modifying a block that is no longer visible in the
|
||
|
* tree. the only exception is if due to concurrency issues, we read a stale copy of a buffer that
|
||
|
* incorrectly led us to this child block number. this is a restartable situation.
|
||
|
* since this routine does not return a failure code, we continue and expect tp_tend to detect this.
|
||
|
*/
|
||
|
switch (cse->mode)
|
||
|
{
|
||
|
case kill_t_create:
|
||
|
assert(CDB_STAGNATE > t_tries);
|
||
|
cse->mode = gds_t_create;
|
||
|
break;
|
||
|
case kill_t_write:
|
||
|
assert(CDB_STAGNATE > t_tries);
|
||
|
cse->mode = gds_t_write;
|
||
|
break;
|
||
|
default:
|
||
|
;
|
||
|
}
|
||
|
}
|
||
|
tp_cse = cse;
|
||
|
}
|
||
|
if (new_cse)
|
||
|
{
|
||
|
cse->blk_checksum = 0;
|
||
|
cse->blk = blk;
|
||
|
cse->mode = gds_t_write;
|
||
|
cse->new_buff = NULL;
|
||
|
cse->old_block = blkhist->buffaddr;
|
||
|
old_block = (blk_hdr_ptr_t)cse->old_block;
|
||
|
assert(NULL != old_block);
|
||
|
jbbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
|
||
|
if ((NULL != jbbp) && (old_block->tn < jbbp->epoch_tn))
|
||
|
{ /* Pre-compute CHECKSUM. Since we dont necessarily hold crit at this point, ensure we never try to
|
||
|
* access the buffer more than the db blk_size.
|
||
|
*/
|
||
|
bsiz = MIN(old_block->bsiz, csa->hdr->blk_size);
|
||
|
cse->blk_checksum = jnl_get_checksum((uint4*)old_block, csa, bsiz);
|
||
|
}
|
||
|
/* the buffer in shared memory holding the GDS block contents currently does not have in its block header the
|
||
|
* on-disk format of that block. if it had, we could have easily copied that over to the cw-set-element.
|
||
|
* until then, we have to use the cache-record's field "ondsk_blkver". but the cache-record is available only in BG.
|
||
|
* thankfully, in MM, we do not allow GDSV4 type blocks, so we can safely assign GDSV5 (or GDSVCURR) to this field.
|
||
|
*/
|
||
|
cr = blkhist->cr;
|
||
|
assert((NULL != cr) || (dba_mm == csa->hdr->acc_meth));
|
||
|
cse->ondsk_blkver = (NULL == cr) ? GDSVCURR : cr->ondsk_blkver;
|
||
|
} else
|
||
|
{ /* we did not create a new cse. assert the integrity of few fields filled in when this cse was created */
|
||
|
assert(cse->blk == blk);
|
||
|
assert(0 == cse->reference_cnt);
|
||
|
/* If we did not create a new cse, check that the level already stored in the cse is the same as the input level.
|
||
|
* It is possible that they are different but that would mean we are in one of two situations
|
||
|
* 1) A restartable situation. Since this routine does not currently return a failure code,
|
||
|
* we do not restart here but instead wait for some other failure-code-returning-function
|
||
|
* (if nothing else, the function tp_tend) to catch this situation and trigger a restart.
|
||
|
* 2) This block number is the root block of a GVT or Directory Tree and the height of the tree
|
||
|
* is increasing now. In either case cse->blk_target points to the gv_target for that tree.
|
||
|
* The only exception to this is if the global's root is being created.
|
||
|
*/
|
||
|
assert(cse->level == level || (CDB_STAGNATE > t_tries) || gds_t_create == cse->mode
|
||
|
|| cse->blk_target->root == cse->blk);
|
||
|
}
|
||
|
cse->upd_addr = upd_addr;
|
||
|
cse->ins_off = ins_off;
|
||
|
cse->index = index;
|
||
|
cse->reference_cnt = 0;
|
||
|
cse->level = level;
|
||
|
cse->was_free = FALSE; /* t_write operates on BUSY blocks and hence cse->was_free is set to FALSE unconditionally */
|
||
|
if (horiz_growth)
|
||
|
cse->first_copy = TRUE;
|
||
|
else
|
||
|
cse->first_copy = first_copy;
|
||
|
cse->done = FALSE;
|
||
|
cse->forward_process = forward;
|
||
|
cse->jnl_freeaddr = 0; /* reset jnl_freeaddr that previous transaction might have filled in */
|
||
|
cse->t_level = dollar_tlevel;
|
||
|
/* All REORG operations should disable the "indexmod" optimization (C9B11-001813/C9H12-002934). Assert that. */
|
||
|
assert(!mu_reorg_process || (GDS_WRITE_KILLTN == write_type));
|
||
|
if (dollar_tlevel)
|
||
|
cse->write_type |= write_type;
|
||
|
else
|
||
|
cse->write_type = write_type;
|
||
|
prev_first_off = prev_next_off = PREV_OFF_INVALID;
|
||
|
blkhist->cse = cse; /* indicate to t_end/tp_tend that this block is part of the write-set */
|
||
|
return tp_cse;
|
||
|
}
|