/****************************************************************
 *                                                              *
 *      Copyright 2003, 2012 Fidelity Information Services, Inc *
 *                                                              *
 *      This source code contains the intellectual property     *
 *      of its copyright holder(s), and is made available       *
 *      under a license.  If you do not know the terms of       *
 *      the license, please stop and do not read further.       *
 *                                                              *
 ****************************************************************/

#include "mdef.h"
|
|
|
|
#include "gtm_string.h"
|
|
#include "gtm_inet.h"
|
|
|
|
#include <stddef.h> /* for offsetof() macro */
|
|
|
|
#ifdef VMS
|
|
#include <descrip.h> /* Required for gtmsource.h */
|
|
#endif
|
|
|
|
#include "gdsroot.h"
|
|
#include "gtm_facility.h"
|
|
#include "fileinfo.h"
|
|
#include "gdsbt.h"
|
|
#include "gdsblk.h"
|
|
#include "gdsfhead.h"
|
|
#include "filestruct.h"
|
|
#include "ccp.h"
|
|
#include "iosp.h"
|
|
#include "jnl.h"
|
|
#include "repl_msg.h"
|
|
#include "gtmsource.h"
|
|
#include "min_max.h"
|
|
#include "sleep_cnt.h"
|
|
#include "jnl_write.h"
|
|
#include "copy.h"
|
|
#include "jnl_get_checksum.h"
|
|
#include "memcoherency.h"
|
|
#include "is_proc_alive.h"
|
|
#include "wbox_test_init.h"
|
|
#include "gtmimagename.h"
|
|
|
|
GBLREF  jnlpool_ctl_ptr_t       temp_jnlpool_ctl;
GBLREF  uint4                   process_id;
GBLREF  sm_uc_ptr_t             jnldata_base;
GBLREF  jnlpool_addrs           jnlpool;
GBLREF  jnlpool_ctl_ptr_t       jnlpool_ctl;
GBLREF  jnl_gbls_t              jgbl;
GBLREF  boolean_t               is_src_server;
GBLREF  boolean_t               in_jnl_file_autoswitch;

error_def(ERR_JNLWRTNOWWRTR);
error_def(ERR_JNLWRTDEFER);

#ifdef DEBUG
/* The fancy ordering of operators/operands in the JNL_SPACE_AVAILABLE calculation is to avoid overflows. */
#define JNL_SPACE_AVAILABLE(jb, lcl_dskaddr, lcl_freeaddr, lcl_size, jnl_wrt_start_mask)                        \
(                                                                                                               \
        assert(((jb)->dskaddr <= lcl_freeaddr)                                                                  \
                || (gtm_white_box_test_case_enabled                                                             \
                        && (WBTEST_JNL_FILE_LOST_DSKADDR == gtm_white_box_test_case_number))),                  \
        /* the following assert is an || to take care of 4G value overflows or 0 underflows */                  \
        assert((lcl_freeaddr <= lcl_size) || ((jb)->dskaddr >= lcl_freeaddr - lcl_size)                         \
                || (gtm_white_box_test_case_enabled                                                             \
                        && (WBTEST_JNL_FILE_LOST_DSKADDR == gtm_white_box_test_case_number))),                  \
        (lcl_size - (lcl_freeaddr - ((lcl_dskaddr = (jb)->dskaddr) & jnl_wrt_start_mask)))                      \
)
#else
#define JNL_SPACE_AVAILABLE(jb, dummy, lcl_freeaddr, lcl_size, jnl_wrt_start_mask)                              \
        (lcl_size - (lcl_freeaddr - ((jb)->dskaddr & jnl_wrt_start_mask)))
#endif

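/* Illustrative note on JNL_SPACE_AVAILABLE: freeaddr and dskaddr are byte offsets into the journal file, while the
 * journal buffer of lcl_size bytes holds only the not-yet-flushed tail [dskaddr, freeaddr).  Rounding dskaddr down
 * to a write-start boundary with jnl_wrt_start_mask treats the partially written disk block as still in use, so the
 * space usable for new records is lcl_size - (freeaddr - ROUND_DOWN(dskaddr)).  For example, with lcl_size = 1024,
 * freeaddr = 5000 and dskaddr = 4100 (rounded down to 4096), the macro yields 1024 - (5000 - 4096) = 120 bytes of
 * room before new records would overwrite unflushed data.
 */
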
#define JNL_PUTSTR(lcl_free, lcl_buff, src, len, lcl_size)                      \
{                                                                               \
        int     size_before_wrap;                                               \
                                                                                \
        size_before_wrap = lcl_size - lcl_free;                                 \
        if (len <= size_before_wrap)                                            \
        {                                                                       \
                memcpy(&lcl_buff[lcl_free], src, len);                          \
                lcl_free += len;                                                \
                if (len == size_before_wrap)                                    \
                        lcl_free = 0;                                           \
        } else                                                                  \
        {                                                                       \
                memcpy(&lcl_buff[lcl_free], src, size_before_wrap);             \
                lcl_free = len - size_before_wrap;                              \
                memcpy(&lcl_buff[0], src + size_before_wrap, lcl_free);         \
        }                                                                       \
}

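/* JNL_PUTSTR treats lcl_buff as a circular buffer: a copy that does not fit before the end of the buffer is split
 * into two memcpy()s, the second one starting at offset 0.  For example, with lcl_size = 100, lcl_free = 90 and
 * len = 25, the first memcpy writes 10 bytes at offsets 90..99 and the second writes the remaining 15 bytes at
 * offsets 0..14, leaving lcl_free = 15.
 */
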
#define SET_JREC_LEN_PADDING_IF_NEEDED(JREC_LEN, JREC_LEN_PADDED)                       \
{                                                                                       \
        /* Before writing a journal record, check if we have some padding space        \
         * to close the journal file in case we are on the verge of an autoswitch.     \
         * If we are about to autoswitch the journal file at this point, don't         \
         * do the padding check since the padding space has already been checked       \
         * in jnl_write calls before this autoswitch invocation. We can safely         \
         * write the input record without worrying about autoswitch limit overflow.    \
         */                                                                             \
        JREC_LEN_PADDED = JREC_LEN;                                                     \
        if (!in_jnl_file_autoswitch)                                                    \
                JREC_LEN_PADDED = JREC_LEN + JNL_FILE_TAIL_PRESERVE;                    \
}

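/* The JNL_FILE_TAIL_PRESERVE headroom demanded above is understood to cover the records written while closing or
 * switching a journal file (PINI/PFIN/EPOCH/INCTN/EOF); see the related assert on in_jnl_file_autoswitch further
 * below.  Hence the extra space is insisted upon only when we are NOT already in the middle of an autoswitch.
 */
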
/* jpc     : Journal private control
 * rectype : Record type
 * jnl_rec : Contains the fixed part of a variable-size record or the complete fixed-size record.
 * blk_ptr : For JRT_PBLK and JRT_AIMG this has the block image.
 * jfb     : For SET/KILL/ZKILL/ZTWORM records the entire record is formatted in this.
 *           For JRT_PBLK and JRT_AIMG it contains partial records (the alignment padding and record suffix).
 */
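/* High-level flow of this routine:
 *   1. If the input record would span an alignsize boundary (jb->log2_of_alignsize), first write a filler
 *      JRT_ALIGN record so that no other record type straddles such a boundary.
 *   2. For both the ALIGN record and the input record: wait via jnl_write_attempt() for journal buffer space
 *      to be flushed if needed, and extend or autoswitch the journal file via jnl_file_extend() if the file
 *      itself lacks room.
 *   3. Copy the record into the circular journal buffer (JNL_PUTSTR handles wraparound) and, for replicated
 *      records, also into the journal pool.
 *   4. Publish the new write position: update freeaddr, issue a write memory barrier, then update free,
 *      bracketing the update with free_update_pid so an interrupted update can be detected and repaired.
 */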
void jnl_write(jnl_private_control *jpc, enum jnl_record_type rectype, jnl_record *jnl_rec, blk_hdr_ptr_t blk_ptr,
                jnl_format_buffer *jfb)
{
        int4                    align_rec_len, rlen, rlen_with_align, dstlen, lcl_size, lcl_free, lcl_orig_free;
        int4                    align_rec_len_padded, rlen_padded;
        jnl_buffer_ptr_t        jb;
        sgmnt_addrs             *csa;
        sgmnt_data_ptr_t        csd;
        node_local_ptr_t        cnl;
        struct_jrec_align       align_rec;
        uint4                   status;
        jrec_suffix             suffix;
        boolean_t               nowrap, is_replicated;
        struct_jrec_blk         *jrec_blk;
        uint4                   checksum, jnlpool_size, lcl_freeaddr;
        sm_uc_ptr_t             lcl_buff;
        gd_region               *reg;
        char                    *ptr;
        int                     jnl_wrt_start_modulus, jnl_wrt_start_mask;
        uint4                   jnl_fs_block_size, aligned_lcl_free, padding_size;
        uint4                   tmp_csum1, tmp_csum2;
#       ifdef DEBUG
        uint4                   lcl_dskaddr, mumps_node_sz;
        char                    *mumps_node_ptr;
#       endif

        reg = jpc->region;
        csa = &FILE_INFO(reg)->s_addrs;
        csd = csa->hdr;
        is_replicated = jrt_is_replicated[rectype];
        /* Ensure that no replicated journal record is written by this routine if REPL_WAS_ENABLED(csa) is TRUE */
        assert((JNL_ENABLED(csa) && !REPL_WAS_ENABLED(csa)) || !is_replicated);
        /* Assert that the only journal records that the source server ever writes are PINI/PFIN/EPOCH/EOF
         * which it does at the very end when the database is about to be shut down.
         */
        assert(!is_src_server || (JRT_EOF == rectype) || (JRT_PINI == rectype) || (JRT_EPOCH == rectype) || (JRT_PFIN == rectype));
        assert(csa->now_crit || (csd->clustered && csa->nl->ccp_state == CCST_CLOSED));
        assert(rectype > JRT_BAD && rectype < JRT_RECTYPES && JRT_ALIGN != rectype);
        jb = jpc->jnl_buff;
        /* Before taking a copy of jb->freeaddr, determine if both free and freeaddr are in sync. If not, fix that first. */
        if (jb->free_update_pid)
        {
                FIX_NONZERO_FREE_UPDATE_PID(csa, jb);
        }
        lcl_freeaddr = jb->freeaddr;
        lcl_free = jb->free;
        lcl_size = jb->size;
        lcl_buff = &jb->buff[jb->buff_off];
        DBG_CHECK_JNL_BUFF_FREEADDR(jb);
        ++jb->reccnt[rectype];
        assert(NULL != jnl_rec);
        rlen = jnl_rec->prefix.forwptr;
        /* Do a high-level check on rlen */
        assert(rlen <= jb->max_jrec_len);
        /* Do fine-grained checks on rlen */
        GTMTRIG_ONLY(assert(!IS_ZTWORM(rectype) || (MAX_ZTWORM_JREC_LEN >= rlen));)     /* ZTWORMHOLE */
        assert(!IS_SET_KILL_ZKILL_ZTRIG(rectype) || (JNL_MAX_SET_KILL_RECLEN(csd) >= rlen));    /* SET, KILL, ZKILL */
        assert((NULL == blk_ptr) || (JNL_MAX_PBLK_RECLEN(csd) >= rlen));        /* PBLK and AIMG */
        jb->bytcnt += rlen;
        assert(0 == rlen % JNL_REC_START_BNDRY);
        rlen_with_align = rlen + (int4)MIN_ALIGN_RECLEN;
        assert(0 == rlen_with_align % JNL_REC_START_BNDRY);
        assert((uint4)rlen_with_align < ((uint4)1 << jb->log2_of_alignsize));
        if ((lcl_freeaddr >> jb->log2_of_alignsize) == ((lcl_freeaddr + rlen_with_align - 1) >> jb->log2_of_alignsize))
                rlen_with_align = rlen;
        else
        {
                align_rec.align_str.length = ROUND_UP2(lcl_freeaddr, ((uint4)1 << jb->log2_of_alignsize))
                                                - lcl_freeaddr - (uint4)MIN_ALIGN_RECLEN;
                align_rec_len = (int4)(MIN_ALIGN_RECLEN + align_rec.align_str.length);
                assert(0 == align_rec_len % JNL_REC_START_BNDRY);
                rlen_with_align = rlen + align_rec_len;
        }
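        /* ALIGN illustration: with log2_of_alignsize = 21 (an alignsize of 2MiB), the input record stays inside one
         * alignsize chunk only if lcl_freeaddr and (lcl_freeaddr + rlen_with_align - 1) shift down to the same chunk
         * number.  Otherwise align_str.length is chosen above so that the JRT_ALIGN filler ends exactly at the next
         * alignsize boundary, i.e. align_rec_len = ROUND_UP2(lcl_freeaddr, alignsize) - lcl_freeaddr.
         */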
        jnl_wrt_start_mask = JNL_WRT_START_MASK(jb);
        jnl_wrt_start_modulus = JNL_WRT_START_MODULUS(jb);
        cnl = csa->nl;
        /* If we are currently extending the journal file and writing the closing part of journal records,
         * it better be the records that we expect. This is because we will skip the padding check for these
         * records. The macro JNL_FILE_TAIL_PRESERVE already takes into account padding space for these.
         */
        assert(!in_jnl_file_autoswitch
                || (JRT_PINI == rectype) || (JRT_PFIN == rectype) || (JRT_EPOCH == rectype)
                || (JRT_INCTN == rectype) || (JRT_EOF == rectype));
        if (rlen_with_align != rlen)
        {       /* the calls below to jnl_write_attempt() and jnl_file_extend() are duplicated for the ALIGN record and the
                 * non-ALIGN journal record instead of making it a function. this is purely for performance reasons.
                 */
                assert((!jb->blocked) || (FALSE == is_proc_alive(jb->blocked, 0))
                        VMS_ONLY(|| ((jb->blocked == process_id) && lib$ast_in_prog())));
                jb->blocked = process_id;
                /* We should differentiate between a full and an empty journal buffer, hence the pessimism reflected in the <=
                 * check below. Hence also the -1 in lcl_freeaddr - (lcl_size - align_rec_len - 1).
                 * This means that although we have space we might still be invoking jnl_write_attempt (very unlikely).
                 */
                if (JNL_SPACE_AVAILABLE(jb, lcl_dskaddr, lcl_freeaddr, lcl_size, jnl_wrt_start_mask) <= align_rec_len)
                {       /* The fancy ordering of operators/operands in the calculation done below is to avoid overflows. */
                        if (SS_NORMAL != jnl_write_attempt(jpc,
                                        ROUND_UP2(lcl_freeaddr - (lcl_size - align_rec_len - 1), jnl_wrt_start_modulus)))
                        {
                                assert(NOJNL == jpc->channel);  /* jnl file lost */
                                return;                         /* let the caller handle the error */
                        }
                }
                jb->blocked = 0;
                SET_JREC_LEN_PADDING_IF_NEEDED(align_rec_len, align_rec_len_padded);    /* sets align_rec_len_padded */
                if (jb->filesize < DISK_BLOCKS_SUM(lcl_freeaddr, align_rec_len_padded)) /* not enough room in jnl file, extend it */
                {       /* We should never reach here if we are called from t_end/tp_tend. We check that by using the fact that
                         * early_tn is different from curr_tn in the t_end/tp_tend case. The only exception is wcs_recover which
                         * also sets these to be different in case of writing an INCTN record. For this case though it is okay to
                         * extend/autoswitch the file. So allow that.
                         */
                        assertpro((csa->ti->early_tn == csa->ti->curr_tn) || (JRT_INCTN == rectype));
                        assert(!IS_REPLICATED(rectype));        /* all replicated jnl records should have gone through t_end/tp_tend */
                        assert(jrt_fixed_size[rectype]);        /* this is used later in re-computing checksums */
                        assert(NULL == blk_ptr);        /* as otherwise it is a PBLK or AIMG record which is of variable record
                                                         * length that conflicts with the immediately above assert.
                                                         */
                        assert(NULL == jfb);            /* as otherwise it is a logical record with formatted journal records which
                                                         * is of variable record length (conflicts with the jrt_fixed_size assert).
                                                         */
                        assertpro(!in_jnl_file_autoswitch);     /* avoid recursion of jnl_file_extend */
                        if (SS_NORMAL != jnl_flush(reg))
                        {
                                assert(NOJNL == jpc->channel);  /* jnl file lost */
                                return;                         /* let the caller handle the error */
                        }
                        assert(lcl_freeaddr == jb->dskaddr);
                        if (EXIT_ERR == jnl_file_extend(jpc, align_rec_len))    /* if extension fails, not much we can do */
                        {
                                assert(FALSE);
                                return;
                        }
                        if (0 == jpc->pini_addr)
                        {       /* This can happen only if jnl got switched in jnl_file_extend above.
                                 * Write a PINI record in the new journal file and then continue writing the input record.
                                 * Basically we need to redo the processing in jnl_write because a lot of the local variables
                                 * have changed state (e.g. jb->freeaddr etc.). So we instead call jnl_write()
                                 * recursively and then return immediately.
                                 */
                                jnl_put_jrt_pini(csa);
                                assertpro(jpc->pini_addr);      /* should have been set in "jnl_put_jrt_pini" */
                                if (JRT_PINI != rectype)
                                {
                                        jnl_rec->prefix.pini_addr = jpc->pini_addr;
                                        /* Checksum needs to be recomputed since prefix.pini_addr is changed in above statement */
                                        jnl_rec->prefix.checksum = INIT_CHECKSUM_SEED;
                                        jnl_rec->prefix.checksum = compute_checksum(INIT_CHECKSUM_SEED,
                                                                        (uint4 *)jnl_rec, jnl_rec->prefix.forwptr);
                                        jnl_write(jpc, rectype, jnl_rec, NULL, NULL);
                                }
                                return;
                        }
                }
                align_rec.prefix.jrec_type = JRT_ALIGN;
                assert(align_rec_len <= jb->max_jrec_len);
                align_rec.prefix.forwptr = suffix.backptr = align_rec_len;
                align_rec.prefix.time = jnl_rec->prefix.time;
                align_rec.prefix.tn = jnl_rec->prefix.tn;
                /* we have to write an ALIGN record here before writing the PINI record but we do not have a non-zero
                 * pini_addr for the ALIGN since we have not yet written the PINI. we use the pini_addr field of the
                 * first PINI journal record in the journal file which is nothing but JNL_FILE_FIRST_RECORD.
                 */
                align_rec.prefix.pini_addr = (JRT_PINI == rectype) ? JNL_FILE_FIRST_RECORD : jnl_rec->prefix.pini_addr;
                align_rec.prefix.checksum = INIT_CHECKSUM_SEED;
                suffix.suffix_code = JNL_REC_SUFFIX_CODE;
                align_rec.prefix.checksum = compute_checksum(INIT_CHECKSUM_SEED, (uint4 *)&align_rec, SIZEOF(jrec_prefix));
                ADJUST_CHECKSUM(align_rec.prefix.checksum, lcl_freeaddr, align_rec.prefix.checksum);
                ADJUST_CHECKSUM(align_rec.prefix.checksum, csd->jnl_checksum, align_rec.prefix.checksum);
                assert(lcl_free >= 0 && lcl_free < lcl_size);
                if (lcl_size >= (lcl_free + align_rec_len))
                {       /* before the string for zeroes */
                        memcpy(lcl_buff + lcl_free, (uchar_ptr_t)&align_rec, FIXED_ALIGN_RECLEN);
                        lcl_free += (int4)(FIXED_ALIGN_RECLEN + align_rec.align_str.length);    /* zeroing is not necessary */
                } else
                {
                        JNL_PUTSTR(lcl_free, lcl_buff, (uchar_ptr_t)&align_rec, (int4)FIXED_ALIGN_RECLEN, lcl_size);
                        if (lcl_size >= (lcl_free + align_rec.align_str.length + SIZEOF(jrec_suffix)))
                                lcl_free += align_rec.align_str.length;         /* zeroing is not necessary */
                        else
                        {
                                if (lcl_size >= (lcl_free + align_rec.align_str.length))
                                {
                                        lcl_free += align_rec.align_str.length; /* zeroing is not necessary */
                                        if (lcl_size == lcl_free)
                                                lcl_free = 0;
                                } else
                                        lcl_free = lcl_free + align_rec.align_str.length - lcl_size;
                        }
                }
                /* Now copy suffix */
                assert(0 == (UINTPTR_T)(&lcl_buff[0] + lcl_free) % SIZEOF(jrec_suffix));
                *(jrec_suffix *)(lcl_buff + lcl_free) = *(jrec_suffix *)&suffix;
                lcl_free += SIZEOF(jrec_suffix);
                if (lcl_size == lcl_free)
                        lcl_free = 0;
                jpc->new_freeaddr = lcl_freeaddr + align_rec_len;
                INCR_GVSTATS_COUNTER(csa, cnl, n_jrec_other, 1);
                INCR_GVSTATS_COUNTER(csa, cnl, n_jbuff_bytes, align_rec_len);
                assert(jgbl.gbl_jrec_time >= align_rec.prefix.time);
                assert(align_rec.prefix.time >= jb->prev_jrec_time);
                jb->prev_jrec_time = align_rec.prefix.time;
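                /* Publish protocol (implemented here and again near the end of this routine): temp_free is set first,
                 * then free_update_pid is set to our pid, freeaddr is updated, a write memory barrier is issued, free
                 * is updated and free_update_pid is cleared.  A non-zero free_update_pid therefore marks a half-done
                 * update of freeaddr/free, which FIX_NONZERO_FREE_UPDATE_PID (invoked at the top of this routine) or
                 * secshr_db_clnup is there to repair if the updating process dies mid-way.
                 */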
                jpc->temp_free = lcl_free;      /* set jpc->temp_free BEFORE setting free_update_pid (secshr_db_clnup relies on this) */
                assert(lcl_free == jpc->new_freeaddr % lcl_size);
                /* Note that freeaddr should be updated ahead of free since jnl_output_sp.c does computation of wrtsize
                 * based on free, and asserts that follow later there use freeaddr.
                 */
                jb->free_update_pid = process_id;
                lcl_freeaddr = jpc->new_freeaddr;
                jb->freeaddr = lcl_freeaddr;
                /* Write memory barrier here to enforce the fact that freeaddr *must* be seen to be updated before
                   free is updated. It is less important if free is stale so we do not require a 2nd barrier for that
                   and will let the lock release (crit lock required since clustering not currently supported) do the
                   2nd memory barrier for us. This barrier takes care of this process's responsibility to broadcast
                   cache changes. It is up to readers to also specify a read memory barrier if necessary to receive
                   this broadcast.
                 */
                SHM_WRITE_MEMORY_BARRIER;
                jb->free = lcl_free;
                jb->free_update_pid = 0;
                DBG_CHECK_JNL_BUFF_FREEADDR(jb);
                if (JRT_PINI == rectype)
                {
                        jnl_rec->prefix.pini_addr = lcl_freeaddr;
                        /* Checksum needs to be recomputed since prefix.pini_addr is changed in above statement */
                        jnl_rec->prefix.checksum = INIT_CHECKSUM_SEED;
                        jnl_rec->prefix.checksum = compute_checksum(INIT_CHECKSUM_SEED,
                                                        (uint4 *)&jnl_rec->jrec_pini, SIZEOF(struct_jrec_pini));
                }
        }
        checksum = jnl_rec->prefix.checksum;
        assert(checksum);
#       ifdef DEBUG
        /* Ensure that the checksum computed earlier (in jnl_format, jnl_write_pblk, jnl_write_aimg_rec, or for
         * fixed-size records) matches the record's content.
         */
        if ((JRT_PBLK == rectype) || (JRT_AIMG == rectype))
        {
                COMPUTE_COMMON_CHECKSUM(tmp_csum2, jnl_rec->prefix);
                tmp_csum1 = jnl_get_checksum((uint4 *)blk_ptr, NULL, jnl_rec->jrec_pblk.bsiz);
                COMPUTE_PBLK_CHECKSUM(tmp_csum1, &jnl_rec->jrec_pblk, tmp_csum2, tmp_csum1);
                assert(checksum == tmp_csum1);
        } else if (IS_SET_KILL_ZKILL_ZTRIG_ZTWORM(rectype))
        {
                COMPUTE_COMMON_CHECKSUM(tmp_csum2, jnl_rec->prefix);
                mumps_node_ptr = jfb->buff + FIXED_UPD_RECLEN;
                mumps_node_sz = jfb->record_size - (FIXED_UPD_RECLEN + JREC_SUFFIX_SIZE);
                tmp_csum1 = jnl_get_checksum((uint4 *)mumps_node_ptr, NULL, mumps_node_sz);
                COMPUTE_LOGICAL_REC_CHECKSUM(tmp_csum1, &jnl_rec->jrec_set_kill, tmp_csum2, tmp_csum1);
                assert(checksum == tmp_csum1);
        } else if (jrt_fixed_size[rectype] || (JRT_ALIGN == rectype))
        {
                jnl_rec->prefix.checksum = INIT_CHECKSUM_SEED;
                switch (rectype)
                {
                        case JRT_ALIGN:
                                tmp_csum1 = compute_checksum(INIT_CHECKSUM_SEED, (uint4 *)&jnl_rec->jrec_align,
                                                SIZEOF(jrec_prefix));
                                break;
                        default:
                                if ((JRT_TRIPLE != rectype) && (JRT_HISTREC != rectype))
                                        tmp_csum1 = compute_checksum(INIT_CHECKSUM_SEED, (uint4 *)&jnl_rec->jrec_set_kill,
                                                        jnl_rec->prefix.forwptr);
                                break;
                }
                assert(checksum == tmp_csum1);
                jnl_rec->prefix.checksum = checksum;
        }
#       endif
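        /* The on-disk record checksum is seeded from the record content and then folded together (via ADJUST_CHECKSUM
         * below) with the offset at which the record is being written (lcl_freeaddr) and the journal file's checksum
         * seed (csd->jnl_checksum), so the same record content at a different offset or in a different journal file is
         * expected to produce a different stored checksum.
         */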
        ADJUST_CHECKSUM(checksum, lcl_freeaddr, checksum);
        ADJUST_CHECKSUM(checksum, csd->jnl_checksum, checksum);
        jnl_rec->prefix.checksum = checksum;
        UNIX_ONLY(assert((!jb->blocked) || (FALSE == is_proc_alive(jb->blocked, 0)));)
        VMS_ONLY(assert(!jb->blocked || (jb->blocked == process_id) && lib$ast_in_prog()));     /* wcs_wipchk_ast can set jb->blocked */
        jb->blocked = process_id;
        /* We should differentiate between a full and an empty journal buffer, hence the pessimism reflected in the <= check below.
         * Hence also the -1 in lcl_freeaddr - (lcl_size - rlen - 1).
         * This means that although we have space we might still be invoking jnl_write_attempt (very unlikely).
         */
        if (JNL_SPACE_AVAILABLE(jb, lcl_dskaddr, lcl_freeaddr, lcl_size, jnl_wrt_start_mask) <= rlen)
        {       /* The fancy ordering of operators/operands in the calculation done below is to avoid overflows. */
                if (SS_NORMAL != jnl_write_attempt(jpc, ROUND_UP2(lcl_freeaddr - (lcl_size - rlen - 1), jnl_wrt_start_modulus)))
                {
                        assert(NOJNL == jpc->channel);  /* jnl file lost */
                        return;                         /* let the caller handle the error */
                }
        }
        jb->blocked = 0;
        SET_JREC_LEN_PADDING_IF_NEEDED(rlen, rlen_padded);      /* sets rlen_padded */
        if (jb->filesize < DISK_BLOCKS_SUM(lcl_freeaddr, rlen_padded))  /* not enough room in jnl file, extend it. */
        {       /* We should never reach here if we are called from t_end/tp_tend. We check that by using the fact that
                 * early_tn is different from curr_tn in the t_end/tp_tend case. The only exception is wcs_recover which
                 * also sets these to be different in case of writing an INCTN record. For this case though it is okay to
                 * extend/autoswitch the file. So allow that.
                 */
                assertpro((csa->ti->early_tn == csa->ti->curr_tn) || (JRT_INCTN == rectype));
                assert(!IS_REPLICATED(rectype));        /* all replicated jnl records should have gone through t_end/tp_tend */
                assert(jrt_fixed_size[rectype]);        /* this is used later in re-computing checksums */
                assert(NULL == blk_ptr);        /* as otherwise it is a PBLK or AIMG record which is of variable record
                                                 * length that conflicts with the immediately above assert.
                                                 */
                assert(NULL == jfb);            /* as otherwise it is a logical record with formatted journal records which
                                                 * is of variable record length (conflicts with the jrt_fixed_size assert).
                                                 */
                if (SS_NORMAL != jnl_flush(reg))
                {
                        assert(NOJNL == jpc->channel);  /* jnl file lost */
                        return;                         /* let the caller handle the error */
                }
                assert(lcl_freeaddr == jb->dskaddr);
                if (EXIT_ERR == jnl_file_extend(jpc, rlen))     /* if extension fails, not much we can do */
                {
                        assert(FALSE);
                        return;
                }
                if (0 == jpc->pini_addr)
                {       /* This can happen only if jnl got switched in jnl_file_extend above.
                         * Write a PINI record in the new journal file and then continue writing the input record.
                         * Basically we need to redo the processing in jnl_write because a lot of the local variables
                         * have changed state (e.g. jb->freeaddr etc.). So we instead call jnl_write()
                         * recursively and then return immediately.
                         */
                        jnl_put_jrt_pini(csa);
                        assertpro(jpc->pini_addr);      /* should have been set in "jnl_put_jrt_pini" */
                        if (JRT_PINI != rectype)
                        {
                                jnl_rec->prefix.pini_addr = jpc->pini_addr;
                                /* Checksum needs to be recomputed since prefix.pini_addr is changed in above statement */
                                jnl_rec->prefix.checksum = INIT_CHECKSUM_SEED;
                                jnl_rec->prefix.checksum = compute_checksum(INIT_CHECKSUM_SEED,
                                                                (uint4 *)jnl_rec, jnl_rec->prefix.forwptr);
                                jnl_write(jpc, rectype, jnl_rec, NULL, NULL);
                        }
                        return;
                }
        }
        lcl_orig_free = lcl_free;
        nowrap = (lcl_size >= (lcl_free + rlen));
        assert(jrt_fixed_size[JRT_EOF]);
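        /* Three copy paths follow: fixed-size records are copied straight from jnl_rec; PBLK/AIMG records are copied
         * in three pieces (fixed prefix from jnl_rec, GDS block image from blk_ptr, alignment padding plus suffix
         * from jfb->buff); logical SET/KILL/ZKILL/ZTWORM/ZTRIG records are copied wholly from jfb->buff.  In every
         * path JNL_PUTSTR (or the explicit wrap checks) keeps lcl_free within lcl_size.
         */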
        if (jrt_fixed_size[rectype])
        {
                if (nowrap)
                {
                        memcpy(lcl_buff + lcl_free, (uchar_ptr_t)jnl_rec, rlen);
                        lcl_free += rlen;
                        if (lcl_size == lcl_free)
                                lcl_free = 0;
                } else
                        JNL_PUTSTR(lcl_free, lcl_buff, (uchar_ptr_t)jnl_rec, rlen, lcl_size);
                /* As part of writing the EOF record into the journal buffer, add enough 0-padding needed to reach
                 * a filesystem-block-size aligned boundary. This way later jnl_qio_start can safely do aligned
                 * writes without having to write non-zero garbage after the EOF record. Note that this has to be
                 * done BEFORE updating freeaddr. Otherwise, it is possible that a jnl qio timer pops after freeaddr
                 * gets updated but before the 0-padding is done and flushes the eof record to disk without the 0-padding.
                 */
                if (JRT_EOF == rectype)
                {
                        jnl_fs_block_size = jb->fs_block_size;
                        aligned_lcl_free = ROUND_UP2(lcl_free, jnl_fs_block_size);
                        padding_size = aligned_lcl_free - lcl_free;
                        assert(0 <= (int4)padding_size);
                        if (padding_size)
                                memset(lcl_buff + lcl_free, 0, padding_size);
                }
                /* Note: Cannot easily use ? : syntax below as INCR_GVSTATS_COUNTER macro
                 * is not an arithmetic expression but a sequence of statements.
                 */
                if (JRT_EPOCH != rectype)
                        INCR_GVSTATS_COUNTER(csa, cnl, n_jrec_other, 1);
                /* else for EPOCH, the increment of JRE or JRI is done after "jnl_write_epoch_rec" in caller */
        } else
        {
                if (NULL != blk_ptr)    /* PBLK and AIMG */
                {
                        assert(FIXED_BLK_RECLEN == FIXED_PBLK_RECLEN);
                        assert(FIXED_BLK_RECLEN == FIXED_AIMG_RECLEN);
                        jrec_blk = (struct_jrec_blk *)jnl_rec;
                        if (nowrap)
                        {       /* write fixed part of record before the actual gds block image */
                                memcpy(lcl_buff + lcl_free, (uchar_ptr_t)jnl_rec, FIXED_BLK_RECLEN);
                                lcl_free += (int4)FIXED_BLK_RECLEN;
                                /* write actual block */
                                memcpy(lcl_buff + lcl_free, (uchar_ptr_t)blk_ptr, jrec_blk->bsiz);
                                lcl_free += jrec_blk->bsiz;
                                /* Now write trailing characters for 8-byte alignment and then suffix */
                                memcpy(lcl_buff + lcl_free, (uchar_ptr_t)jfb->buff, jfb->record_size);
                                lcl_free += jfb->record_size;
                                assert(lcl_free <= lcl_size);
                                if (lcl_size == lcl_free)
                                        lcl_free = 0;
                        } else
                        {       /* write fixed part of record before the actual gds block image */
                                JNL_PUTSTR(lcl_free, lcl_buff, (uchar_ptr_t)jnl_rec, (int4)FIXED_BLK_RECLEN, lcl_size);
                                /* write actual block */
                                JNL_PUTSTR(lcl_free, lcl_buff, (uchar_ptr_t)blk_ptr, jrec_blk->bsiz, lcl_size);
                                /* Now write trailing characters for 8-byte alignment and then suffix */
                                JNL_PUTSTR(lcl_free, lcl_buff, (uchar_ptr_t)jfb->buff, jfb->record_size, lcl_size);
                        }
                        INCR_GVSTATS_COUNTER(csa, cnl, n_jrec_pblk, 1);
                } else
                {       /* SET, KILL, ZKILL for TP, ZTP, non-TP */
                        assert(IS_TP(rectype) || IS_ZTP(rectype) || (0 == ((struct_jrec_upd *)jfb->buff)->update_num));
                        assert((!IS_TP(rectype) && !IS_ZTP(rectype)) || (0 != ((struct_jrec_upd *)jfb->buff)->update_num));
                        assert(((jrec_prefix *)jfb->buff)->forwptr == jfb->record_size);
                        if (nowrap)
                        {
                                memcpy(lcl_buff + lcl_free, (uchar_ptr_t)jfb->buff, rlen);
                                lcl_free += rlen;
                                if (lcl_size == lcl_free)
                                        lcl_free = 0;
                        } else
                                JNL_PUTSTR(lcl_free, lcl_buff, (uchar_ptr_t)jfb->buff, rlen, lcl_size);
                        INCR_GVSTATS_COUNTER(csa, cnl, n_jrec_logical, 1);
                }
        }
        assert((lcl_free - lcl_orig_free + lcl_size) % lcl_size == rlen);
        assert(lcl_buff[lcl_orig_free] == rectype);
        assert(lcl_orig_free < lcl_free || lcl_free < jb->dsk);
        assert((lcl_freeaddr >= jb->dskaddr)
                || (gtm_white_box_test_case_enabled && (WBTEST_JNL_FILE_LOST_DSKADDR == gtm_white_box_test_case_number)));
        jpc->new_freeaddr = lcl_freeaddr + rlen;
        INCR_GVSTATS_COUNTER(csa, cnl, n_jbuff_bytes, rlen);
        assert(lcl_free == jpc->new_freeaddr % lcl_size);
        if (REPL_ENABLED(csa) && is_replicated)
        {       /* If the database is encrypted, then at this point jfb->buff will contain encrypted
                 * data which we don't want to push into the jnlpool. Instead, we make use of the
                 * alternate alt_buff which is guaranteed to contain the original unencrypted data.
                 */
                if (jrt_fixed_size[rectype])
                        ptr = (char *)jnl_rec;
                else
                {
#                       ifdef GTM_CRYPT
                        if (csd->is_encrypted && IS_SET_KILL_ZKILL_ZTRIG_ZTWORM(rectype))
                                ptr = jfb->alt_buff;
                        else
#                       endif
                                ptr = jfb->buff;
                }
                assert(NULL != jnlpool.jnlpool_ctl && NULL != jnlpool_ctl);     /* ensure we haven't yet detached from the jnlpool */
                assert((&FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs)->now_crit);     /* ensure we have the jnl pool lock */
                DEBUG_ONLY(jgbl.cu_jnl_index++;)
                jnlpool_size = temp_jnlpool_ctl->jnlpool_size;
                dstlen = jnlpool_size - temp_jnlpool_ctl->write;
                if (rlen <= dstlen)     /* dstlen >= rlen (most frequent case) */
                        memcpy(jnldata_base + temp_jnlpool_ctl->write, ptr, rlen);
                else                    /* dstlen < rlen */
                {
                        memcpy(jnldata_base + temp_jnlpool_ctl->write, ptr, dstlen);
                        memcpy(jnldata_base, ptr + dstlen, rlen - dstlen);
                }
                temp_jnlpool_ctl->write += rlen;
                if (temp_jnlpool_ctl->write >= jnlpool_size)
                        temp_jnlpool_ctl->write -= jnlpool_size;
        }
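        /* The journal pool is likewise treated as a circular buffer: temp_jnlpool_ctl->write is the tentative write
         * offset used while this process holds the journal pool lock, with a split memcpy on wraparound just as for
         * the journal buffer above; the permanent write pointer in jnlpool_ctl is expected to be advanced by the
         * commit logic in t_end/tp_tend once all records of the transaction have been copied in.
         */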
        assert(jgbl.gbl_jrec_time >= jnl_rec->prefix.time);
        assert(jnl_rec->prefix.time >= jb->prev_jrec_time);
        jb->prev_jrec_time = jnl_rec->prefix.time;
        jpc->temp_free = lcl_free;      /* set jpc->temp_free BEFORE setting free_update_pid (secshr_db_clnup relies on this) */
        /* Note that freeaddr should be updated ahead of free since jnl_output_sp.c does computation of wrtsize
         * based on free, and asserts that follow later there use freeaddr.
         */
        jb->free_update_pid = process_id;
        lcl_freeaddr = jpc->new_freeaddr;
        jb->freeaddr = lcl_freeaddr;
        /* Write memory barrier here to enforce the fact that freeaddr *must* be seen to be updated before
           free is updated. It is less important if free is stale so we do not require a 2nd barrier for that
           and will let the lock release (crit lock required since clustering not currently supported) do the
           2nd memory barrier for us. This barrier takes care of this process's responsibility to broadcast
           cache changes. It is up to readers to also specify a read memory barrier if necessary to receive
           this broadcast.
         */
        SHM_WRITE_MEMORY_BARRIER;
        jb->free = lcl_free;
        jb->free_update_pid = 0;
        DBG_CHECK_JNL_BUFF_FREEADDR(jb);
        VMS_ONLY(
                if (((lcl_freeaddr - jb->dskaddr) > jb->min_write_size)
                        && (SS_NORMAL != (status = jnl_qio_start(jpc)))
                        && (ERR_JNLWRTNOWWRTR != status) && (ERR_JNLWRTDEFER != status))
                {
                        jb->blocked = 0;
                        jnl_file_lost(jpc, status);
                        return;
                }
        )
}