fis-gtm/sr_port/jnl.h

1401 lines
61 KiB
C
Raw Normal View History

/****************************************************************
* *
* Copyright 2001, 2012 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#ifndef JNL_H_INCLUDED
#define JNL_H_INCLUDED
#ifdef DEBUG
#include <stddef.h> /* for offsetof macro (see OFFSETOF usage in assert below) */
#endif
#ifndef JNLSP_H_INCLUDED
#include "jnlsp.h"
#endif
#ifdef GTM_CRYPT
#include "gtmcrypt.h"
#endif
error_def(ERR_JNLBADLABEL);
error_def(ERR_JNLENDIANBIG);
error_def(ERR_JNLENDIANLITTLE);
#define TID_STR_SIZE 8
#define JPV_LEN_NODE 16
#define JPV_LEN_USER 12
#define JPV_LEN_PRCNAM 16
#define JPV_LEN_TERMINAL 15
/* Whenever JNL_LABEL_TEXT changes, also change the following
* 1) Update JNL_VER_THIS
* 2) Add REPL_JNL_Vxx enum to repl_jnl_t typedef AND Vxx_JNL_VER #define in repl_filter.h
* 3) Add an entry each to repl_filter_old2cur & repl_filter_cur2old arrays in repl_filter.c.
* If the FILTER format is also changing, then do the following as well
* 4) Add REPL_FILTER_Vxx enum to repl_filter_t typedef in repl_filter.h
* 5) Add/Edit IF_xTOy macros in repl_filter.h to transform from/to the NEW jnl format version only.
* Remove all entries that dont have the new jnl format in either the from or to part of the conversion.
* 6) Add/Edit prototype and implement functions jnl_xTOy() and jnl_yTOx() in repl_filter.c
* 7) Enhance repl_tr_endian_convert() to endian convert journal records from previous jnl formats to new format.
* This is similar to the jnl_xTOy() filter conversion functions except that lot of byte-swaps are needed.
* 8) Periodically determine if the size of the array repl_filter_old2cur is huge and if so trim support of
* rolling upgrade (using replication internal filters) for older GT.M versions/jnl-formats.
* This would mean bumping the macro JNL_VER_EARLIEST_REPL and examining all arrays that are defined
* using this macro and changing the entries in all those arrays accordingly (e.g. repl_filter_old2cur
* array currently assumes earliest supported version is V15 and hence has the function named IF_curTO15
* which needs to change to say IF_curTO17 if the earliest supported version changes to V17 or so).
*
*/
#define JNL_LABEL_TEXT "GDSJNL22" /* see above comment paragraph for todos whenever this is changed */
#define JNL_VER_THIS 22
#define JNL_VER_EARLIEST_REPL 17 /* Replication filter support starts here GDSJNL17 = GT.M V5.1-000.
* (even though it should be V5.0-000, since that is pre-multisite,
* the replication connection with V55000 will error out at handshake
* time so V5.1-000 is the minimum that will even reach internal filter code)
*/
#define JRT_MAX_V17 JRT_AIMG /* Maximum jnl record type in GDSJNL17 or GDSJNL18 that can be input to replication
* filter. Actually JRT_TRIPLE is a higher record type than JRT_AIMG but it is only
* sent through the replication pipe and never seen by filter routines.
*/
#define JRT_MAX_V19 JRT_UZTWORM /* Max jnlrec type in GDSJNL19/GDSJNL20 that can be input to replication filter */
#define JRT_MAX_V21 JRT_UZTRIG /* Max jnlrec type in GDSJNL21/GDSJNL22 that can be input to replication filter */
#define JRT_MAX_V22 JRT_UZTRIG /* Max jnlrec type in GDSJNL22 that can be input to replication filter.
* Actually JRT_HISTREC is a higher record type than JRT_UZTRIG but it is only
* sent through the replication pipe and never seen by filter routines.
*/
#define ALIGN_KEY 0xdeadbeef
#ifdef UNIX
#define JNL_ALLOC_DEF 2048
#define JNL_ALLOC_MIN 200
#elif defined(VMS)
#define JNL_ALLOC_DEF 100
#define JNL_ALLOC_MIN 10
#endif
/* JNL_BUFFER_MIN database block size / 512 + 1 */
#define JNL_BUFFER_MAX 32768 /* # of 512-byte blocks = 16Mb journal buffer size */
/* JNL_EXTEND_DEF allocation size / 10
#define JNL_EXTEND_DEF_PERC 0.1
* Uncomment this section when code is ready to use extension = 10% of allocation
*/
#define JNL_EXTEND_MIN 0
#ifdef UNIX
#define JNL_EXTEND_DEF 2048
#define JNL_EXTEND_MAX 1073741823
#else
#define JNL_EXTEND_DEF 100
#define JNL_EXTEND_MAX 65535
#endif
#define JNL_MIN_WRITE 32768
#define JNL_MAX_WRITE 65536
/* FE was changed to EB because, the bit pattern there seems to vary more than the one for "FE".
* Also a research in ELWOOD journal file showed that "EB" was one of the few patterns that had the least occurrences */
#define JNL_REC_SUFFIX_CODE 0xEB
/* In Unix, with sync_io, we do journal writes to disk at filesystem block size boundaries.
* In VMS, the writes are at 512-byte boundaries only.
*/
#ifdef UNIX
# define JNL_WRT_START_MODULUS(jb) jb->fs_block_size
#elif defined(VMS)
# define JNL_WRT_START_MODULUS(jb) 512
#endif
#define JNL_WRT_START_MASK(jb) ~(JNL_WRT_START_MODULUS(jb) - 1)
#define JNL_WRT_END_MODULUS 8
#define JNL_WRT_END_MASK ~(JNL_WRT_END_MODULUS - 1)
#ifdef UNIX
#define JNL_MIN_ALIGNSIZE (1 << 8) /* 256 disk blocks effectively 128K alignsize */
#define JNL_DEF_ALIGNSIZE (1 << 11) /* 2048 disk blocks effectively 1M alignsize */
#else
#define JNL_MIN_ALIGNSIZE (1 << 5) /* 32 disk blocks effectively 16K alignsize */
#define JNL_DEF_ALIGNSIZE (1 << 7) /* 128 disk blocks effectively 64K alignsize */
#endif
#define JNL_MAX_ALIGNSIZE (1 << 22) /* 4194304 disk blocks effectively 2G alignsize */
#define JNL_REC_START_BNDRY 8
#define MAX_LOGI_JNL_REC_SIZE (MAX_DB_BLK_SIZE) /* maximum logical journal record size */
#define MAX_JNL_REC_SIZE (MAX_LOGI_JNL_REC_SIZE + DISK_BLOCK_SIZE) /* one more disk-block for PBLK record header/footer */
#ifdef GTM_TRIGGER
/* Define maximum size that $ZTWORMHOLE can be. Since $ZTWORMHOLE should be able to fit in a journal record and the
* minimum alignsize is 128K, we do not want it to go more than 128K (that way irrespective of whatever alignsize the user
* specifies for the journal file, $ZTWORMHOLE will fit in the journal record). Leaving a max of 512 bytes for the
* journal record prefix/suffix (32-byte overhead) and MIN_ALIGN_RECLEN (see comment in JNL_MAX_RECLEN macro for why
* this is needed) we allow for a max of 128K-512 bytes in $ZTWORMHOLE.
*/
#define MAX_ZTWORMHOLE_LEN (128 * 1024)
#define MAX_ZTWORMHOLE_SIZE (MAX_ZTWORMHOLE_LEN - 512)
#define MAX_ZTWORM_JREC_LEN (MAX_ZTWORMHOLE_LEN - MIN_ALIGN_RECLEN)
#endif
#define MIN_YIELD_LIMIT 0
#define MAX_YIELD_LIMIT 2048
#define DEFAULT_YIELD_LIMIT 8
#ifdef UNIX
/* Have a minimum jnl-file-auto-switch-limit of 16 align boundaries (currently each align boundary is 128K) */
#define JNL_AUTOSWITCHLIMIT_MIN (16 * JNL_MIN_ALIGNSIZE)
#define JNL_AUTOSWITCHLIMIT_DEF 8386560 /* Instead of 8388607 it is adjusted for default allocation = extension = 2048 */
#else
/* Have a minimum jnl-file-auto-switch-limit of 128 align boundaries (currently each align boundary is 16K) */
#define JNL_AUTOSWITCHLIMIT_MIN (128 * JNL_MIN_ALIGNSIZE)
#define JNL_AUTOSWITCHLIMIT_DEF 8388600 /* Instead of 8388607 it is adjusted for default allocation = extension = 100 */
#endif
/* options (4-bytes unsigned integer) to wcs_flu() (currently flush_hdr, write_epoch, sync_epoch) are bit-wise ored */
#define WCSFLU_NONE 0
#define WCSFLU_FLUSH_HDR 1
#define WCSFLU_WRITE_EPOCH 2
#define WCSFLU_SYNC_EPOCH 4
#define WCSFLU_FSYNC_DB 8 /* Currently used only in Unix wcs_flu() */
#define WCSFLU_IN_COMMIT 16 /* Set if caller is t_end or tp_tend. See wcs_flu for explanation of when this is set */
#define WCSFLU_MSYNC_DB 32 /* Force a full msync if NO_MSYNC is defined. Currently used only in Unix wcs_flu(). */
/* options for error_on_jnl_file_lost */
#define JNL_FILE_LOST_TURN_OFF 0 /* Turn off journaling. */
#define JNL_FILE_LOST_ERRORS 1 /* Throw an rts_error. */
#define MAX_JNL_FILE_LOST_OPT JNL_FILE_LOST_ERRORS
/* EPOCHs are written unconditionally in Unix (assuming jnl is ON) while they are written only for BEFORE_IMAGE in VMS */
#define JNL_HAS_EPOCH(jnlfile) UNIX_ONLY(TRUE) VMS_ONLY(jnlfile->before_images)
#ifdef DEBUG
#define DEFAULT_EPOCH_INTERVAL_IN_SECONDS 30 /* exercise epoch-syncing code relatively more often in DBG */
#else
#define DEFAULT_EPOCH_INTERVAL_IN_SECONDS 300
#endif
#define DEFAULT_EPOCH_INTERVAL SECOND2EPOCH_SECOND(DEFAULT_EPOCH_INTERVAL_IN_SECONDS) /* ***MUST*** include math.h for VMS */
#define MAX_EPOCH_INTERVAL 32767 /* in seconds. Amounts to nearly 10 hours. Don't want to keep db stale so long */
#define JNL_ENABLED(X) ((X)->jnl_state == jnl_open) /* If TRUE, journal records are to be written */
#define JNL_ALLOWED(X) ((X)->jnl_state != jnl_notallowed) /* If TRUE, journaling is allowed for the file */
#define REPL_ENABLED(X) ((X)->repl_state == repl_open) /* If TRUE, replication records are to be written */
#define REPL_WAS_ENABLED(X) ((X)->repl_state == repl_was_open) /* If TRUE, replication is now closed, but was open earlier */
/* In this state, replication records are not written */
#define REPL_ALLOWED(X) ((X)->repl_state != repl_closed) /* If TRUE, replication records are/were written */
/* Logical records should be written if journaling is enabled in the region OR if replication state is WAS_ON (repl_was_open).
* In the former case, the journal records will be written to the journal pool, journal buffer and journal file.
* In the latter case, the journal records will be written to the journal pool but not to the journal buffer and journal file.
* All code that generates logical journal records should use the below macro instead of JNL_ENABLED macro.
* Note that replication does not care about non-logical records (PBLK/AIMG/INCTN etc.) and hence code that generates them does
* not need to (and should not) use this macro.
*/
#define JNL_WRITE_LOGICAL_RECS(X) (JNL_ENABLED(X) || REPL_WAS_ENABLED(X))
/* The following macro should be used to invoke the function "jnl_write" for any logical record. This macro
* checks if journaling is enabled and if so invokes "jnl_write" else it invokes "jnl_write_poolonly" which
* writes only to the journal pool.
*/
#define JNL_WRITE_APPROPRIATE(CSA, JPC, RECTYPE, JREC, BLKPTR, JFB) \
{ \
assert(JNL_ENABLED(CSA) || REPL_WAS_ENABLED(CSA)); \
assert((NULL == JFB) || (RECTYPE == ((jnl_record *)(((jnl_format_buffer *)JFB)->buff))->prefix.jrec_type)); \
if (JNL_ENABLED(CSA)) \
jnl_write(JPC, RECTYPE, JREC, BLKPTR, JFB); /* write to jnlbuffer, jnlfile, jnlpool */ \
else \
jnl_write_poolonly(JPC, RECTYPE, JREC, JFB); /* write to jnlpool only */ \
}
#define MUEXTRACT_TYPE(A) (((A)[0]-'0')*10 + ((A)[1]-'0')) /* A is a character pointer */
#define PADDED PADDING
/* User must enter this string to ask standard input or output. */
#define JNL_STDO_EXTR "-stdout"
#ifdef BIGENDIAN
#define THREE_LOW_BYTES(x) ((uchar_ptr_t)((uchar_ptr_t)&x + 1))
#else
#define THREE_LOW_BYTES(x) ((uchar_ptr_t)(&x))
#endif
#define EXTTIME(S) extract_len = exttime(S, murgbl.extr_buff, extract_len)
/* This macro should be used to initialize jgbl.gbl_jrec_time to the system time. The reason is that it does additional checks. */
#define SET_GBL_JREC_TIME \
{ \
assert(!jgbl.dont_reset_gbl_jrec_time); \
JNL_SHORT_TIME(jgbl.gbl_jrec_time); \
}
/* This macro ensures that journal records are written in non-decreasing time order in each journal file.
* It is passed the time field to adjust and a pointer to the journal buffer of the region.
* The journal buffer holds the timestamp of the most recently written journal record.
*/
#define ADJUST_GBL_JREC_TIME(jgbl, jbp) \
{ \
if (jgbl.gbl_jrec_time < jbp->prev_jrec_time) \
{ \
assert(!jgbl.dont_reset_gbl_jrec_time); \
jgbl.gbl_jrec_time = jbp->prev_jrec_time; \
} \
}
/* This macro is similar to ADJUST_GBL_JREC_TIME except that this ensures ordering of timestamps across
* ALL replicated regions in a replicated environment. In VMS, we dont maintain this prev_jnlseqno_time
* field.
*/
# define ADJUST_GBL_JREC_TIME_JNLPOOL(jgbl, jpl) \
{ \
if (jgbl.gbl_jrec_time < jpl->prev_jnlseqno_time) \
{ \
assert(!jgbl.dont_reset_gbl_jrec_time); \
jgbl.gbl_jrec_time = jpl->prev_jnlseqno_time; \
} \
jpl->prev_jnlseqno_time = jgbl.gbl_jrec_time; \
}
/* Check if journal file is usable from the fields in the file header.
* Currently, the fields tested are LABEL and ENDIANNESS.
*/
#define JNL_HDR_ENDIAN_OFFSET 8
#define CHECK_JNL_FILE_IS_USABLE(JFH, STATUS, DO_GTMPUTMSG, JNL_FN_LEN, JNL_FN) \
{ \
boolean_t check_failed = FALSE; \
uint4 lcl_status; \
\
assert(JNL_HDR_ENDIAN_OFFSET == OFFSETOF(jnl_file_header, is_little_endian)); \
if (0 != MEMCMP_LIT((JFH)->label, JNL_LABEL_TEXT)) \
{ \
lcl_status = ERR_JNLBADLABEL; \
check_failed = TRUE; \
} \
BIGENDIAN_ONLY( \
else if ((JFH)->is_little_endian) \
{ \
lcl_status = ERR_JNLENDIANLITTLE; \
check_failed = TRUE; \
} \
) \
LITTLEENDIAN_ONLY( \
else if (!(JFH)->is_little_endian) \
{ \
lcl_status = ERR_JNLENDIANBIG; \
check_failed = TRUE; \
} \
) \
/* Currently, we can do one gtm_putmsg for any of the above 3 error messages \
* because all of them have a fao count of 2 and expect jnl_fn_len and jnl_fn \
* as arguments. If a new error gets added and has a different fao format, \
* then the below gtm_putmsg has to be done differently based on that error. \
*/ \
if (check_failed) \
{ \
STATUS = lcl_status; \
if (DO_GTMPUTMSG) \
gtm_putmsg(VARLSTCNT(4) lcl_status, 2, JNL_FN_LEN, JNL_FN); \
} \
}
/* Token generation used in non-replicated journaled environment. Note the assumption here
that SIZEOF(token_split_t) == SIZEOF(token_build) which will be asserted in gvcst_init().
The TOKEN_SET macro below depends on this assumption.
*/
typedef struct token_split_t_struct
{
# ifdef BIGENDIAN
uint4 process_id;
uint4 local_tn;
# else
uint4 local_tn;
uint4 process_id;
# endif
} token_split_t;
typedef union
{
token_split_t t_piece;
token_num token;
} token_build;
/* To assist in setting token value, the following macro is supplied to handle the two token parts */
#define TOKEN_SET(BASE, TN, PID) (((token_build_ptr_t)(BASE))->t_piece.local_tn = (uint4)(TN), \
((token_build_ptr_t)(BASE))->t_piece.process_id = (PID))
enum jpv_types
{
CURR_JPV = 0,
ORIG_JPV,
JPV_COUNT
};
/* Note we have two process verctors now for a pini record */
typedef struct jnl_process_vector_struct /* name needed since this is used in cmmdef.h for "pvec" member */
{
uint4 jpv_pid; /* Process id */
int4 jpv_image_count; /* Image activations [VMS only] */
jnl_proc_time jpv_time; /* Timestamp of the process genarating this.
(This could be different than the journal record timestamp) */
jnl_proc_time jpv_login_time; /* Used for process initialization time */
char jpv_node[JPV_LEN_NODE], /* Node name */
jpv_user[JPV_LEN_USER], /* User name */
jpv_prcnam[JPV_LEN_PRCNAM], /* Process name [VMS only] */
jpv_terminal[JPV_LEN_TERMINAL]; /* Login terminal */
unsigned char jpv_mode; /* a la JPI$_MODE [VMS only] */
int4 filler;
/* SIZEOF(jnl_process_vector) must be a multiple of SIZEOF(int4) */
} jnl_process_vector;
enum pini_rec_stat
{
IGNORE_PROC = 0,
ACTIVE_PROC = 1,
FINISHED_PROC = 2,
BROKEN_PROC = 4
};
typedef struct pini_list
{
uint4 pini_addr;
uint4 new_pini_addr; /* used in forward phase of recovery */
jnl_process_vector jpv; /* CURR_JPV. Current process's JPV. For GTCM server we also use this. */
jnl_process_vector origjpv; /* ORIG_JPV. Used for GTCM client only */
enum pini_rec_stat state; /* used for show qualifier */
} pini_list_struct;
enum jnl_record_type
{
#define JNL_TABLE_ENTRY(rectype, extract_rtn, label, update, fixed_size, is_replicated) rectype,
#include "jnl_rec_table.h"
#undef JNL_TABLE_ENTRY
JRT_RECTYPES /* Total number of JOURNAL record types */
};
#include "jnl_typedef.h"
enum jnl_state_codes
{
jnl_notallowed,
jnl_closed,
jnl_open
};
enum repl_state_codes
{
repl_closed, /* region not replicated, no records are written */
repl_open, /* region is replicated, and records are written */
repl_was_open /* region is currently not replicated, but it was earlier; jnl_file_lost() changes open to was_open */
};
typedef struct
{
trans_num eov_tn; /* curr_tn is saved as eov_tn by jnl_write_epoch. Used by recover/rollback */
volatile trans_num epoch_tn; /* Transaction number for current epoch */
seq_num end_seqno; /* reg_seqno saved by jnl_write_epoch. Used by recover/rollback */
seq_num strm_end_seqno[MAX_SUPPL_STRMS]; /* used to keep jfh->strm_end_seqno uptodate with each epoch.
* Unused in VMS but defined so shared memory layout is similar in Unix & VMS.
*/
int4 min_write_size, /* if unwritten data gets to this size, write it */
max_write_size, /* maximum size of any single write */
size; /* buffer size */
int4 epoch_interval; /* Time between successive epochs in epoch-seconds */
boolean_t before_images; /* If TRUE, before-image processing is enabled */
/* end not volatile QUAD */
uintszofptr_t buff_off; /* relative offset to filesystem-block-size aligned buffer start */
volatile int4 free; /* relative index of first byte to write in buffer */
volatile uint4 freeaddr, /* virtual on-disk address which will correspond to free, when it is written */
end_of_data, /* Synched offset updated by jnl_write_epoch. Used by recover/rollback */
filesize; /* highest virtual address available in the file (units in disk-blocks) */
/* end mainline QUAD */
volatile int4 blocked;
volatile uint4 fsync_dskaddr; /* dskaddr upto which fsync is done */
volatile int4 dsk; /* relative index of 1st byte to write to disk;
* if free == dsk, buffer is empty */
volatile int4 wrtsize; /* size of write in progress */
volatile uint4 dskaddr, /* virtual on-disk address corresponding to dsk */
now_writer, /* current owner of io_in_prog (VMS-only) */
image_count; /* for VMS is_proc_alive */
volatile struct /* must be at least word aligned for memory coherency */
{
short cond;
unsigned short length;
int4 dev_specific;
} iosb;
/* alignsize is removed and log2_of_alignsize introduced */
uint4 log2_of_alignsize; /* Ceiling of log2(alignsize) */
jnl_tm_t eov_timestamp; /* jgbl.gbl_jrec_time saved by jnl_write_epoch. Used by recover/rollback */
uint4 cycle; /* shared copy of the number of the current journal file generation */
volatile int4 qiocnt, /* Number of qio's issued */
bytcnt, /* Number of bytes written */
errcnt, /* Number of errors during writing */
reccnt[JRT_RECTYPES]; /* Number of records written per opcode */
int filler_align[35 - JRT_RECTYPES]; /* So buff below starts on even (QW) keel */
/* Note the above filler will fail if JRT_RECTYPES grows beyond 31 elements and give compiler warning in VMS
* if JRT_RECTYPES equals 31. In that case, change the start num to the next odd number above MAX(31,JRT_RECTYPES).
*/
volatile jnl_tm_t prev_jrec_time; /* to ensure that time never decreases across successive jnl records */
volatile int4 free_update_pid; /* pid that is updating jb->free and jb->freeaddr */
volatile uint4 next_epoch_time; /* Time when next epoch is to be written (in epoch-seconds) */
volatile boolean_t need_db_fsync; /* need an fsync of the db file */
volatile int4 io_in_prog; /* VMS only: write in progress indicator (NOTE: must manipulate
only with interlocked instructions */
uint4 enospc_errcnt; /* number of times jb->errcnt was last incremented due to ENOSPC error
* when writing to this journal file */
uint4 max_jrec_len; /* copy of max_jrec_len from journal file header */
uint4 fs_block_size; /* underlying journal file system block size;
* primarily used in Unix, 512 in VMS */
/* CACHELINE_PAD macros provide spacing between the following latches so that they do
not interfere with each other which can happen if they fall in the same data cacheline
of a processor.
*/
CACHELINE_PAD(SIZEOF(global_latch_t), 0) /* start next latch at a different cacheline than previous fields */
global_latch_t io_in_prog_latch; /* UNIX only: write in progress indicator */
CACHELINE_PAD(SIZEOF(global_latch_t), 1) /* pad enough space so next latch falls in different cacheline */
global_latch_t fsync_in_prog_latch; /* fsync in progress indicator */
CACHELINE_PAD(SIZEOF(global_latch_t), 2) /* pad enough space so next non-filler byte falls in different cacheline */
/**********************************************************************************************/
/* Important: must keep header structure quadword (8 byte) aligned for buffers used in QIO's */
/**********************************************************************************************/
unsigned char buff[1]; /* Actually buff[size] */
} jnl_buffer;
#define FIX_NONZERO_FREE_UPDATE_PID(csa, jbp) \
{ \
assert(csa->now_crit); /* hold crit before manipulating freeaddr/free */ \
assert(jbp->free_update_pid); \
UNIX_ONLY(assert(!is_proc_alive(jbp->free_update_pid, 0));) \
VMS_ONLY(assert(FALSE);) /* secshr_db_clnup should have cleaned up this field even in case of STOP/ID */ \
if ((jbp->freeaddr % jbp->size) != jbp->free) \
{ /* Previous process in jnl_write got killed after incrementing freeaddr but before incrementing \
* free. Recalculate jbp->free based on current value of jbp->freeaddr. */ \
jbp->free = jbp->freeaddr % jbp->size; \
jbp->free_update_pid = 0; \
} \
DBG_CHECK_JNL_BUFF_FREEADDR(jbp); \
}
#define DBG_CHECK_JNL_BUFF_FREEADDR(jbp) \
{ \
assert((jbp->freeaddr % jbp->size) == jbp->free); \
assert((jbp->freeaddr >= jbp->dskaddr) \
|| (gtm_white_box_test_case_enabled \
&& (WBTEST_JNL_FILE_LOST_DSKADDR == gtm_white_box_test_case_number))); \
}
#ifdef DB64
# ifdef __osf__
# pragma pointer_size(save)
# pragma pointer_size(long)
# else
# error UNSUPPORTED PLATFORM
# endif
#endif
typedef jnl_buffer *jnl_buffer_ptr_t;
typedef token_build *token_build_ptr_t;
#ifdef DB64
# ifdef __osf__
# pragma pointer_size(restore)
# endif
#endif
typedef struct jnl_private_control_struct
{
jnl_buffer_ptr_t jnl_buff; /* pointer to shared memory */
gd_region *region; /* backpointer to region head */
fd_type channel, /* output channel, aka fd in UNIX */
old_channel; /* VMS only - for dealing with deferred deassign */
gd_id fileid; /* currently initialized and used only by source-server */
vms_lock_sb *jnllsb; /* VMS only */
uint4 pini_addr, /* virtual on-disk address for JRT_PINI record, if journaling */
new_freeaddr;
int4 temp_free; /* M Temp copy of free relative index until full write done */
double filler_q0; /* reset QUAD end mainline */
int4 new_dsk; /* A VMS only */
uint4 new_dskaddr; /* A VMS only */
int4 status; /* A for error reporting */
volatile boolean_t dsk_update_inprog; /* A VMS only */
volatile boolean_t qio_active; /* jnl buffer write in progress in THIS process (recursion indicator) */
boolean_t fd_mismatch; /* TRUE when jpc->channel does not point to the active journal */
volatile boolean_t sync_io; /* TRUE if the process is using O_SYNC/O_DSYNC for this jnl (UNIX) */
/* TRUE if writers open NOCACHING to bypass XFC cache (VMS) */
boolean_t error_reported; /* TRUE if jnl_file_lost already reported the journaling error */
uint4 status2; /* for secondary error status, currently used only in VMS */
uint4 cycle; /* private copy of the number of this journal file generation */
} jnl_private_control;
typedef enum
{
JNL_KILL,
JNL_SET,
JNL_ZKILL,
# ifdef GTM_TRIGGER
JNL_ZTWORM,
JNL_ZTRIG,
# endif
JA_MAX_TYPES
} jnl_action_code;
typedef enum
{
#define MUEXT_TABLE_ENTRY(muext_rectype, code0, code1) muext_rectype,
#include "muext_rec_table.h"
#undef MUEXT_TABLE_ENTRY
MUEXT_MAX_TYPES /* Total number of EXTRACT JOURNAL record types */
} muextract_type;
typedef struct
{
jnl_action_code operation;
uint4 nodeflags;
} jnl_action;
#define JNL_FENCE_LIST_END ((sgmnt_addrs *)-1L)
typedef struct
{
sgmnt_addrs *fence_list;
int level;
token_num token;
seq_num strm_seqno; /* valid only in case of replication. uninitialized in case of ZTP */
} jnl_fence_control;
typedef struct
{
uint4 jrec_type : 8; /* Offset:0 :: Actually, enum jnl_record_type */
uint4 forwptr : 24; /* Offset:1 :: Offset to beginning of next record */
off_jnl_t pini_addr; /* Offset:4 :: Offset in the journal file which contains pini record */
jnl_tm_t time; /* Offset:8 :: 4-byte time stamp both for UNIX and VMS */
uint4 checksum; /* Offset:12 :: Generated from journal record */
trans_num tn; /* Offset:16 */
} jrec_prefix; /* 24-byte */
typedef struct
{
uint4 backptr : 24; /* Offset to beginning of current record */
uint4 suffix_code : 8; /* JNL_REC_SUFFIX_CODE */
} jrec_suffix; /* 4-byte */
typedef union
{
seq_num jnl_seqno;
token_num token;
} token_seq_t;
typedef struct
{
char label[SIZEOF(JNL_LABEL_TEXT) - 1];
char is_little_endian; /* this field's offset (JNL_HDR_ENDIAN_OFFSET) should not change
* across journal versions and is checked right after reading the header.
*/
char filler_align8[7];
jnl_process_vector who_created, /* Process who created */
who_opened; /* Process who last opened */
jnl_proc_time bov_timestamp, /* 8-byte time when journal was created */
eov_timestamp; /* 8-byte time when journal was last updated
Updated by cre_jnl_file/jnl_file_extend/jnl_file_close */
trans_num bov_tn, /* Beginning journal record's transaction number */
eov_tn; /* End transaction number.
Updated by cre_jnl_file/jnl_file_extend/jnl_file_close */
seq_num start_seqno; /* reg_seqno when this journal file was created */
seq_num end_seqno; /* reg_seqno when this journal file was closed or last extended.
Updated by cre_jnl_file/jnl_file_extend/jnl_file_close */
off_jnl_t end_of_data; /* Offset of beginning of last record.
Updated by cre_jnl_file/jnl_file_extend/jnl_file_close */
off_jnl_t prev_recov_end_of_data; /* Recovered/Rolled back journal's turn around point's offset.
This offset was supposed to have EOF_RECORD before recover switched journal.
A non-zero value means this journal was recovered and had the turn around point. */
off_jnl_t virtual_size; /* Allocation + n * Extension (in blocks). jnl_file_extend updates it */
boolean_t crash; /* crashed before jnl_file_close() completed */
boolean_t recover_interrupted; /* true when recover creates the journal file; false after success. */
off_jnl_t turn_around_offset; /* At turn around point journal record's (EPOCH) offset */
jnl_tm_t turn_around_time; /* At turn around point journal record's timestamp */
boolean_t before_images; /* before image enabled in this journal */
uint4 alignsize; /* align size of journal (where a valid record start) */
int4 epoch_interval; /* Time between successive epochs in epoch-seconds */
int4 repl_state; /* To state whether replication is turned on for this journal file */
uint4 autoswitchlimit;/* Limit in disk blocks (max 4GBytes) when jnl should be auto switched */
uint4 jnl_alq; /* initial allocation (in blocks) */
uint4 jnl_deq; /* extension (in blocks) */
#ifdef VMS
boolean_t update_disabled;/* If the secondary side has database update disabled. For rollback. */
#else
boolean_t filler_update_disabled; /* obsoleted as part of multi-site replication changes */
#endif
int4 max_jrec_len; /* Maximum length in bytes of a journal record.
* Although computed from the database block size, we need this
* stored as well in case database is not available */
uint4 data_file_name_length; /* Length of data_file_name */
uint4 prev_jnl_file_name_length; /* Length of prev_jnl_file_name */
uint4 next_jnl_file_name_length; /* Length of next_jnl_file_name */
uint4 checksum; /* Calculate from journal file id */
uint4 prev_recov_blks_to_upgrd_adjust; /* amount to adjust filehdr "blks_to_upgrd" if ever
* backward recovery goes back past this journal file */
unsigned char data_file_name[JNL_NAME_SIZE]; /* Database file name */
unsigned char prev_jnl_file_name[JNL_NAME_SIZE]; /* Previous generation journal file name */
unsigned char next_jnl_file_name[JNL_NAME_SIZE]; /* Next generation journal file name */
/* encryption related fields */
uint4 is_encrypted;
char encryption_hash[GTMCRYPT_RESERVED_HASH_LEN];
/* The below two arrays are unused in VMS but defined there to keep the layout similar between Unix & VMS */
seq_num strm_start_seqno[MAX_SUPPL_STRMS];
seq_num strm_end_seqno[MAX_SUPPL_STRMS];
/* filler remaining */
char filler[440];
} jnl_file_header;
typedef struct
{
int4 status,
alloc,
extend,
buffer;
sgmnt_data_ptr_t csd;
seq_num reg_seqno;
unsigned char jnl[JNL_NAME_SIZE],
*fn;
uint4 max_jrec_len;
short fn_len,
jnl_len,
jnl_def_len;
bool before_images;
bool filler_bool[1];
uint4 alignsize;
int4 autoswitchlimit; /* limit in disk blocks (8388607 blocks)
* when jnl should be auto switched */
int4 epoch_interval; /* Time between successive epochs in epoch-seconds */
char *prev_jnl;
int4 prev_jnl_len;
int4 jnl_state; /* current csd->jnl_state */
int4 repl_state;
uint4 status2; /* for secondary error status information in VMS */
boolean_t no_rename;
boolean_t no_prev_link;
int4 blks_to_upgrd; /* Blocks not at current block version level */
uint4 checksum;
uint4 free_blocks; /* free blocks counter at time of epoch */
uint4 total_blks; /* total blocks counter at time of epoch */
uint4 is_encrypted;
char encryption_hash[GTMCRYPT_HASH_LEN];
} jnl_create_info;
/* Journal record definitions */
#define jnl_str_len_t uint4 /* 4 byte length (which is in turn split into bit fields below) */
/* Bit masks for the "nodeflags" field. Note that there is no flag to indicate whether this update actually invoked any triggers.
* That is because we have to format the journal record BEFORE invoking any triggers (that way the triggering update comes ahead
* of its corresponding triggered updates in the journal file as this ordering is relied upon by the update process) and as part
* of formatting, we also compute the checksum that includes the "nodeflags" field and so fixing this field AFTER trigger
* invocation to reflect if any triggers were invoked would mean recomputing the checksum all over again. Currently there is no
* need for the "triggers actually invoked" bit. If it is later desired, care should be taken to recompute the checksum.
*/
#define JS_NOT_REPLICATED_MASK (1 << 0) /* 1 if this update should NOT be replicated.
* All updates done inside of a trigger and a SET redo (because of changes to $ztval)
* fall in this category.
*/
#define JS_HAS_TRIGGER_MASK (1 << 1) /* 1 if the global being updated had at least one trigger defined (not necessarily
* invoked for this particular update)
*/
#define JS_NULL_ZTWORM_MASK (1 << 2) /* 1 if $ZTWORMHOLE for this update should be "" string, 0 otherwise */
#define JS_SKIP_TRIGGERS_MASK (1 << 3) /* 1 if MUPIP LOAD update so triggers are not invoked on replay by update process */
#define JS_IS_DUPLICATE (1 << 4) /* 1 if this SET or KILL is a duplicate. In case of a SET, this is a duplicate set.
* In case of a KILL, it is a kill of a non-existing node aka duplicate kill.
* Note that the dupkill occurs only in case of the update process. In case of GT.M,
* the KILL is entirely skipped. In both duplicate sets or kills, only a journal
* record is written, the database is untouched.
*/
#define JS_MAX_MASK (1 << 8) /* max of 8 bits we have for mask */
/* Note that even though mumps_node, ztworm_str, ztrig_str and align_str are members defined as type "jnl_string" below,
* the "nodeflags" field is initialized to non-zero values ONLY in the case of the mumps_node member.
* For ztworm_str and align_str, nodeflags is guaranteed to be zero so the 24-bit "length" member
* can even be used as a 32-bit length (if necessary) without issues. This is why nodeflags is
* defined in a different order (BEFORE or AFTER the "length" member) based on big-endian or little-endian.
*/
typedef struct
{
# ifdef BIGENDIAN
unsigned int nodeflags : 8;
unsigned int length : 24;
# else
unsigned int length : 24;
unsigned int nodeflags : 8;
# endif
char text[1]; /* Actually text[length] */
} jnl_string;
typedef struct jnl_format_buff_struct
{
que_ent free_que;
struct jnl_format_buff_struct *next;
# ifdef GTM_TRIGGER
struct jnl_format_buff_struct *prev;
# endif
enum jnl_record_type rectype;
int4 record_size;
char *buff;
uint4 checksum;
jnl_action ja;
# ifdef GTM_CRYPT
char *alt_buff; /* for storing the unencrypted jnl *SET and *KILL records to be pushed
* into the jnl pool. */
NON_GTM64_ONLY(int4 dummy_filler;) /* for alignment in 32 bit machines. */
# endif
} jnl_format_buffer;
/* All fixed size records are 8-byte-multiple size.
* All variable size records are made 8-byte multiple size by run-time process */
/* struct_jrec_upd for non-TP, TP or ZTP. For replication we use 8-byte jnl_seqno. Otherwise we use 8-byte token.
* Currently we dont support ZTP + replication.
*/
typedef struct /* variable length */
{
jrec_prefix prefix;
token_seq_t token_seq; /* must start at 8-byte boundary */
seq_num strm_seqno; /* non-zero only if this is a supplementary instance in which case this #
* reflects the 60-bit sequence number corresponding to this update on the
* originating primary + higher order 4-bits reflecting the stream #.
*/
uint4 update_num; /* 'n' where this is the nth journaled update (across all regions) in this TP
* transaction. n=1 for the first update inside TP, 2 for the second update
* inside TP and so on. Needed so journal recovery and update process can play
* all the updates inside of one TP transaction in the exact same order as GT.M.
*/
unsigned short filler_short;
unsigned short num_participants; /* # of regions that wrote a TCOM record in their jnl files.
* Currently written only for TSET/TKILL/TZTWORM records.
* Uninitialized for all other types of SET/KILL/ZTWORM records.
*/
jnl_string mumps_node; /* For set/kill/zkill : {jnl_str_len_t key_len, char key[key_len]} */
/* For set additionally : {mstr_len_t data_len, char data[data_len]} */
} struct_jrec_upd;
/* $ztwormhole record */
typedef struct /* variable length */
{
jrec_prefix prefix;
token_seq_t token_seq; /* must start at 8-byte boundary */
seq_num strm_seqno; /* see "struct_jrec_upd" for comment on the purpose of this field */
uint4 update_num; /* 'n' where this is the nth journaled update (across all regions) in this TP
* transaction. n=1 for the first update inside TP, 2 for the second update
* inside TP and so on. Needed so journal recovery and update process can play
* all the updates inside of one TP transaction in the exact same order as GT.M.
*/
unsigned short filler_short;
unsigned short num_participants; /* # of regions that wrote a TCOM record in their jnl files.
* Currently written only for TSET/TKILL/TZTWORM records.
* Uninitialized for all other types of SET/KILL/ZTWORM records.
*/
jnl_string ztworm_str; /* jnl_str_len_t ztworm_str_len, char ztworm_str[ztworm_str_len]} */
} struct_jrec_ztworm;
#define INVALID_UPDATE_NUM (uint4)-1
typedef struct /* variable length */
{
jrec_prefix prefix;
block_id blknum;
uint4 bsiz;
enum db_ver ondsk_blkver; /* Previous version of block from cache_rec */
int4 filler;
char blk_contents[1]; /* Actually blk_contents[bsiz] */
} struct_jrec_blk;
typedef struct /* variable length */
{
jrec_prefix prefix;
jnl_string align_str;
/* Note: Actual string (potentially 0-length too) follows the align_string and then jrec_suffix */
} struct_jrec_align;
/* Please change the "GBLDEF struct_jrec_tcom" initialization, if below is changed */
typedef struct /* fixed length */
{
jrec_prefix prefix;
token_seq_t token_seq; /* must start at 8-byte boundary */
seq_num strm_seqno; /* see "struct_jrec_upd" for comment on the purpose of this field */
unsigned short filler_short;
unsigned short num_participants; /* # of regions that wrote a TCOM record in their jnl files */
char jnl_tid[TID_STR_SIZE];
jrec_suffix suffix;
} struct_jrec_tcom;
/* Please change the "static struct_jrec_ztcom" initialization in op_ztcommit.c, if below is changed */
typedef struct /* fixed length */
{
jrec_prefix prefix;
token_num token; /* must start at 8-byte boundary */
seq_num filler_8bytes; /* To mirror tcom layout. It is ok to waste space because ztcom is
* obsoleted record. This keeps logic (e.g. MUR_TCOM_TOKEN_PROCESSING) faster
* by avoiding if checks (of whether the rectype is TCOM or ZTCOM and accordingly
* taking the appropriate offset).
*/
unsigned short filler_short;
unsigned short participants; /* # of regions that wrote ZTCOM record in their jnl files for this fenced tn */
jrec_suffix suffix;
} struct_jrec_ztcom;
/* Below are different inctn_detail_*_t type definitions based on the inctn record opcode.
* Each of them need to ensure the following.
* a) SIZEOF(inctn_detail_*_t) is identical.
* b) "opcode" member is at the same offset.
* c) "suffix" is the last member.
* Any new inctn_detail_*_t type definitions should have corresponding code changes in jnl_write_inctn_rec.c
*/
typedef struct
{
block_id blknum; /* block that got upgraded or downgraded (opcode = inctn_blk*grd) */
uint4 filler_uint4;
unsigned short filler_short;
unsigned short opcode;
jrec_suffix suffix;
} inctn_detail_blknum_t;
typedef struct
{
int4 blks_to_upgrd_delta; /* Delta to adjust csd->blks_to_upgrade (opcode = inctn_gdsfilext_*) */
uint4 filler_uint4;
unsigned short filler_short;
unsigned short opcode;
jrec_suffix suffix;
} inctn_detail_blks2upgrd_t;
typedef union
{
inctn_detail_blknum_t blknum_struct;
inctn_detail_blks2upgrd_t blks2upgrd_struct;
} inctn_detail_t;
typedef struct /* fixed length */
{
jrec_prefix prefix;
inctn_detail_t detail;
/* jrec_suffix is already part of inctn_detail_t */
} struct_jrec_inctn;
typedef struct /* fixed length */
{
jrec_prefix prefix;
jnl_process_vector process_vector[JPV_COUNT];
int4 filler;
jrec_suffix suffix;
} struct_jrec_pini;
typedef struct /* fixed length */
{
jrec_prefix prefix;
uint4 filler;
jrec_suffix suffix;
} struct_jrec_pfin;
/* Following 3 are same structures. In case we change it in future, let's define them separately */
typedef struct /* fixed length */
{
jrec_prefix prefix;
seq_num jnl_seqno; /* must start at 8-byte boundary */
seq_num strm_seqno; /* see "struct_jrec_upd" for comment on the purpose of this field */
uint4 filler;
jrec_suffix suffix;
} struct_jrec_null;
typedef struct /* fixed length */
{
jrec_prefix prefix;
seq_num jnl_seqno; /* must start at 8-byte boundary */
uint4 blks_to_upgrd; /* blocks-to-upgrade counter at time of epoch */
uint4 free_blocks; /* free blocks counter at time of epoch */
uint4 total_blks; /* total blocks counter at time of epoch */
boolean_t fully_upgraded; /* cs_data->fully_upgraded at the time of epoch */
seq_num strm_seqno[MAX_SUPPL_STRMS]; /* seqno of each possible supplementary stream at epoch time.
* used by rollback to restore seqnos on the database.
*/
uint4 filler; /* so as to make the EPOCH record aligned to 8 byte boundary */
jrec_suffix suffix;
} struct_jrec_epoch;
typedef struct /* fixed length */
{
jrec_prefix prefix;
seq_num jnl_seqno; /* must start at 8-byte boundary */
uint4 filler;
jrec_suffix suffix;
} struct_jrec_eof;
typedef struct /* fixed length */
{
jrec_prefix prefix; /* 24 bytes */
uint4 orig_total_blks;
uint4 orig_free_blocks;
uint4 total_blks_after_trunc;
jrec_suffix suffix; /* 4 bytes */
} struct_jrec_trunc;
typedef union
{
jrec_prefix prefix;
struct_jrec_upd jrec_set_kill; /* JRT_SET or JRT_KILL or JRT_ZTRIG record will use this format */
struct_jrec_ztworm jrec_ztworm;
struct_jrec_blk jrec_pblk,
jrec_aimg;
struct_jrec_align jrec_align;
/** All below are fixed size and above are variable size records */
struct_jrec_tcom jrec_tcom;
struct_jrec_ztcom jrec_ztcom;
struct_jrec_inctn jrec_inctn;
struct_jrec_pini jrec_pini;
struct_jrec_pfin jrec_pfin;
struct_jrec_null jrec_null;
struct_jrec_epoch jrec_epoch;
struct_jrec_eof jrec_eof;
struct_jrec_trunc jrec_trunc;
} jnl_record;
/* Macro to access fixed size record's size */
#define TCOM_RECLEN SIZEOF(struct_jrec_tcom)
#define ZTCOM_RECLEN SIZEOF(struct_jrec_ztcom)
#define INCTN_RECLEN SIZEOF(struct_jrec_inctn)
#define PINI_RECLEN SIZEOF(struct_jrec_pini)
#define PFIN_RECLEN SIZEOF(struct_jrec_pfin)
#define NULL_RECLEN SIZEOF(struct_jrec_null)
#define EPOCH_RECLEN SIZEOF(struct_jrec_epoch)
#define EOF_RECLEN SIZEOF(struct_jrec_eof)
#define TRUNC_RECLEN SIZEOF(struct_jrec_trunc)
/* Macro to access variable size record's fixed part's size */
#define FIXED_ZTWORM_RECLEN OFFSETOF(struct_jrec_ztworm, ztworm_str)
#define FIXED_UPD_RECLEN OFFSETOF(struct_jrec_upd, mumps_node)
#define MIN_ALIGN_RECLEN (OFFSETOF(struct_jrec_align, align_str.text[0]) + JREC_SUFFIX_SIZE)
#define FIXED_ALIGN_RECLEN OFFSETOF(struct_jrec_align, align_str.text[0])
#define FIXED_BLK_RECLEN OFFSETOF(struct_jrec_blk, blk_contents[0])
#define FIXED_PBLK_RECLEN OFFSETOF(struct_jrec_blk, blk_contents[0])
#define FIXED_AIMG_RECLEN OFFSETOF(struct_jrec_blk, blk_contents[0])
#define MIN_PBLK_RECLEN (OFFSETOF(struct_jrec_blk, blk_contents[0]) + JREC_SUFFIX_SIZE)
#define MIN_AIMG_RECLEN (OFFSETOF(struct_jrec_blk, blk_contents[0]) + JREC_SUFFIX_SIZE)
#define JREC_PREFIX_SIZE SIZEOF(jrec_prefix)
#define JREC_SUFFIX_SIZE SIZEOF(jrec_suffix)
#define MIN_JNLREC_SIZE (JREC_PREFIX_SIZE + JREC_SUFFIX_SIZE)
#define JREC_PREFIX_UPTO_LEN_SIZE (offsetof(jrec_prefix, pini_addr))
typedef struct set_jnl_options_struct
{
int cli_journal, cli_enable, cli_on, cli_replic_on;
boolean_t alignsize_specified,
allocation_specified,
autoswitchlimit_specified,
image_type_specified, /* beofre/nobefore option specified */
buffer_size_specified,
epoch_interval_specified,
extension_specified,
filename_specified,
sync_io_specified,
yield_limit_specified;
/* since jnl_create_info does not have following fields, we need them here */
boolean_t sync_io;
int4 yield_limit;
} set_jnl_options;
/* rlist_state needed to be moved here to use with mu_set_reglist */
enum rlist_state {
NONALLOCATED,
ALLOCATED,
DEALLOCATED
};
/* mu_set_reglist needed to be moved here for the journal specific fields */
/* ATTN: the first four items in this structure need to be identical to those
* in structure tp_region in tp.h.
*/
typedef struct mu_set_reglist
{
struct mu_set_reglist *fPtr; /* all fields after this are used for mupip_set_journal.c */
gd_region *reg;
char unique_id[UNIQUE_ID_SIZE];
enum rlist_state state;
sgmnt_data_ptr_t sd;
bool exclusive; /* standalone access is required for this region */
int fd;
enum jnl_state_codes jnl_new_state;
enum repl_state_codes repl_new_state;
boolean_t before_images;
} mu_set_rlist;
/* The enum codes below correspond to code-paths that can call set_jnl_file_close() in VMS */
typedef enum
{
SET_JNL_FILE_CLOSE_BACKUP = 1, /* just for safety a non-zero value to start with */
SET_JNL_FILE_CLOSE_SETJNL,
SET_JNL_FILE_CLOSE_EXTEND,
SET_JNL_FILE_CLOSE_RUNDOWN,
SET_JNL_FILE_CLOSE_INVALID_OP
} set_jnl_file_close_opcode_t;
typedef void (*pini_addr_reset_fnptr)(sgmnt_addrs *csa);
typedef struct
{
token_num mur_jrec_seqno; /* This is jnl_seqno of the current record that backward
* recovery/rollback is playing in its forward phase.
*/
token_num mur_jrec_strm_seqno; /* This is the strm_seqno of the current record that backward
* recovery/rollback is playing in its forward phase.
*/
VMS_ONLY(seq_num max_resync_seqno;) /* for update process and rollback fetchresync */
unsigned short filler_short;
unsigned short mur_jrec_participants;
jnl_tm_t gbl_jrec_time;
jnl_tm_t mur_tp_resolve_time; /* tp resolve time as determined by journal recovery.
* Time of the point upto which a region will be processed for
* TP token resolution for backward or forward recover.
* Note : This is what prevents user to change system time.
*/
boolean_t forw_phase_recovery;
boolean_t mur_rollback; /* a copy of mur_options.rollback to be accessible to runtime code */
boolean_t mupip_journal; /* the current command is a MUPIP JOURNAL command */
boolean_t dont_reset_gbl_jrec_time; /* Do not reset gbl_jrec_time */
pini_addr_reset_fnptr mur_pini_addr_reset_fnptr; /* function pointer to invoke "mur_pini_addr_reset" */
uint4 cumul_jnl_rec_len; /* cumulative length of the replicated journal records
* for the current TP or non-TP transaction */
boolean_t wait_for_jnl_hard;
uint4 tp_ztp_jnl_upd_num; /* Incremented whenever a journaled update happens inside of
* TP or ZTP. Copied over to the corresponding journal record
* to record the sequence of all updates inside TP/ZTP transaction.
*/
uint4 mur_jrec_nodeflags; /* copy of "nodeflags" from jnl record currently being played */
# ifdef GTM_TRIGGER
unsigned char *prev_ztworm_ptr; /* Non-NULL if at least one ztwormhole record was successfully
* formatted in this transaction. Note that ZTWORMHOLE records are
* formatted ONLY in case of journaled & replicated databases.
* 1. If replicated database is unencrypted, this points to
* jfb->buff + FIXED_UPD_RECLEN
* 2. If replicated database is encrypted, this points to
* jfb->alt_buff + FIXED_UPD_RECLEN
* If no ztwormhole record is yet formatted, then points to NULL
*/
unsigned char *save_ztworm_ptr; /* copy of prev_ztworm_ptr saved until we know for sure whether
* a ZTWORMHOLE journal record will be written or not.
*/
# endif
# ifdef DEBUG
boolean_t mur_fences_none; /* a copy of mur_options.fences to be accessible to runtime code */
uint4 cumul_index;
uint4 cu_jnl_index;
uint4 max_tp_ztp_jnl_upd_num; /* Max of all <jgbl.tp_ztp_jnl_upd_num> values processed in this
* potentially multi-region transaction. Used only by jnl recovery.
*/
boolean_t mur_options_forward; /* a copy of mur_options.forward to be accessible to GT.M runtime */
# endif
# ifdef UNIX
boolean_t onlnrlbk; /* TRUE if ONLINE ROLLBACK */
# endif
} jnl_gbls_t;
#define JNL_SHARE_SIZE(X) (JNL_ALLOWED(X) ? \
(ROUND_UP(JNL_NAME_EXP_SIZE + SIZEOF(jnl_buffer), OS_PAGE_SIZE) \
+ ROUND_UP(((sgmnt_data_ptr_t)X)->jnl_buffer_size * DISK_BLOCK_SIZE, \
OS_PAGE_SIZE) + OS_PAGE_SIZE) : 0)
/* pass address of jnl_buffer to get address of expanded jnl file name */
#define JNL_GDID_PVT(CSA) ((CSA)->jnl->fileid)
#ifdef UNIX
#define JNL_GDID_PTR(CSA) ((gd_id_ptr_t)(&((CSA)->nl->jnl_file.u)))
#else
#define JNL_GDID_PTR(CSA) ((gd_id_ptr_t)(&((CSA)->nl->jnl_file.jnl_file_id)))
#endif
/* Note that since "cycle" (in jpc and jb below) can rollover the 4G limit back to 0, it should
* only be used to do "!=" checks and never to do ordered checks like "<", ">", "<=" or ">=".
*/
#define JNL_FILE_SWITCHED(JPC) ((JPC)->cycle != (JPC)->jnl_buff->cycle)
#define REG_STR "region"
#define FILE_STR "database file"
/* Given a journal record, get_jnl_seqno returns the jnl_seqno field
* Now all replication type records, EOF and EPOCH have the jnl_seqno at the same offset.
* Modify the macro GET_JNL_SEQNO if offset of jnl_seqno is changed for any journal records
*/
#define GET_JNL_SEQNO(j) (((jnl_record *)(j))->jrec_null.jnl_seqno)
#define GET_STRM_SEQNO(j) (((jnl_record *)(j))->jrec_null.strm_seqno)
#define GET_REPL_JNL_SEQNO(j) (IS_REPLICATED(((jrec_prefix *)j)->jrec_type) ? GET_JNL_SEQNO(j) : 0)
/* For MUPIP JOURNAL -ROLLBACK, getting the strm_reg_seqno from the file header is not as straightforward
* as accessing csd->strm_reg_seqno[idx]. This is because it increments this field in mur_output_record even
* before we reach t_end/tp_tend. That is done for convenience of the implementation. But this assumes that the
* commit has actually completed. Therefore, in case we need to invoke jnl_file_extend() inside t_end/tp_tend even
* before the commit, we would see an incorrect value of csd->strm_reg_seqno[idx]. In that case, we use the
* global variable jgbl.mur_jrec_strm_seqno to identify if the strm_reg_seqno[idx] value is 1 more than that and
* if so return 1 lesser than that as the real strm_reg_seqno[idx]. This is used by routines that write journal
* records (EPOCH, jfh->strm_end_seqno etc.) to write the correct strm_seqno. Not doing so will cause the strm_seqno
* to be higher than necessary and confuse everything else (including rollback) as far as replication is concerned.
* Note: We check for process_exiting to differentiate between calls made from mur_close_files() to before. Once we
* reach mur_close_files, we should no longer be in an active transaction and so we dont need to make any adjustments.
* VMS does not support supplementary instances so the below macro does not apply there at all.
*/
#ifdef UNIX
#define MUR_ADJUST_STRM_REG_SEQNO_IF_NEEDED(CSD, DST) \
{ \
int strm_num; \
seq_num strm_seqno; \
\
GBLREF int process_exiting; \
\
if (jgbl.mur_jrec_strm_seqno && !process_exiting) \
{ \
assert(jgbl.mur_rollback); \
VMS_ONLY(assert(FALSE);) \
strm_seqno = jgbl.mur_jrec_strm_seqno; \
strm_num = GET_STRM_INDEX(strm_seqno); \
strm_seqno = GET_STRM_SEQ60(strm_seqno); \
if (CSD->strm_reg_seqno[strm_num] == (strm_seqno + 1)) \
{ \
assert(DST[strm_num] == (strm_seqno + 1)); \
DST[strm_num] = strm_seqno; \
} \
} \
}
#else
#define MUR_ADJUST_STRM_REG_SEQNO_IF_NEEDED(CSD, DST)
#endif
/* Given a journal record, GET_TN returns the tn field
*/
#define GET_TN(j) (((*jrec_prefix)(j))->prefix.tn)
/* In t_end(), we need to write the after-image if DSE or mupip recover/rollback is playing it.
* But to write it out, we should have it already built before bg_update().
* Hence, we pre-build the block here itself before invoking t_end().
*/
#define BUILD_AIMG_IF_JNL_ENABLED(CSD, JFB, TN) \
{ \
GBLREF cw_set_element cw_set[]; \
GBLREF unsigned char cw_set_depth; \
\
cw_set_element *cse; \
\
if (JNL_ENABLED(CSD)) \
{ \
assert(1 == cw_set_depth); /* Only DSE uses this macro and it updates one block at a time */ \
cse = (cw_set_element *)(&cw_set[0]); \
cse->new_buff = JFB; \
gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, TN); \
cse->done = TRUE; \
} \
}
/* In Unix, the journal file header size is currently set to 64K so it is aligned with any possible filesystem block size
* known at this point. This will help us do aligned writes to the journal file header as well as the journal file contents
* without needing to mix both of them in the same aligned disk write. In VMS, we continue with 512-byte alignment so no change.
* Note that the journal_file_header structure is only 2K currently and is captured using the REAL_JNL_HDR_LEN macro while
* the padded 64K file header is captured using the JNL_HDR_LEN macro. Use either one as appropriate in the code. Both of them
* are identical in VMS where it is currently 2K.
*/
#define REAL_JNL_HDR_LEN SIZEOF(jnl_file_header)
#ifdef UNIX
# define JNL_HDR_LEN 64*1024
#elif defined(VMS)
# define JNL_HDR_LEN REAL_JNL_HDR_LEN
#endif
#define JNL_FILE_FIRST_RECORD JNL_HDR_LEN
/* Minimum possible journal file size */
#define MIN_JNL_FILE_SIZE (JNL_HDR_LEN + PINI_RECLEN + EPOCH_RECLEN + PFIN_RECLEN + EOF_RECLEN)
/* maximum required journal file size (in 512-byte blocks), if the current transaction was the only one in a fresh journal file */
#define MAX_REQD_JNL_FILE_SIZE(tot_jrec_size) DIVIDE_ROUND_UP((tot_jrec_size + MIN_JNL_FILE_SIZE), DISK_BLOCK_SIZE)
/* this macro aligns the input size to account that journal file sizes can increase only in multiples of the extension size */
#define ALIGNED_ROUND_UP(tmp_tot_jrec_size, jnl_alq, jnl_deq) \
(((tmp_tot_jrec_size) <= (jnl_alq) || !(jnl_deq)) \
? (jnl_alq) \
: ((jnl_alq) + ROUND_UP((tmp_tot_jrec_size) - (jnl_alq), (jnl_deq))))
/* this macro aligns the input size to account that journal file sizes can increase only in multiples of the extension size */
#define ALIGNED_ROUND_DOWN(tmp_tot_jrec_size, jnl_alq, jnl_deq) \
(((tmp_tot_jrec_size) <= (jnl_alq) || !(jnl_deq)) \
? (jnl_alq) \
: ((jnl_alq) + ROUND_DOWN((tmp_tot_jrec_size) - (jnl_alq), (jnl_deq))))
/* the following macro uses 8-byte quantities (gtm_uint64_t) to perform additions that might cause a 4G overflow */
#define DISK_BLOCKS_SUM(freeaddr, jrec_size) DIVIDE_ROUND_UP((((gtm_uint64_t)(freeaddr)) + (jrec_size)), DISK_BLOCK_SIZE)
#if defined(UNIX)
/* For future portability JNLBUFF_ALLOC is defined in jnl.h instead of jnlsp.h */
#define JPC_ALLOC(csa) \
{ \
csa->jnl = (jnl_private_control *)malloc(SIZEOF(*csa->jnl)); \
memset(csa->jnl, 0, SIZEOF(*csa->jnl)); \
}
#define ASSERT_JNLFILEID_NOT_NULL(csa) \
{ \
assert(0 != csa->nl->jnl_file.u.inode); \
assert(0 != csa->nl->jnl_file.u.device); \
}
#define NULLIFY_JNL_FILE_ID(csa) \
{ \
csa->nl->jnl_file.u.inode = 0; \
csa->nl->jnl_file.u.device = 0; \
}
#elif defined(VMS)
#define JPC_ALLOC(csa) \
{ \
vms_lock_sb *tmp_jnllsb; \
if (NULL == csa->jnl) \
{ \
csa->jnl = (jnl_private_control *)malloc(SIZEOF(*csa->jnl)); \
memset(csa->jnl, 0, SIZEOF(*csa->jnl)); \
csa->jnl->jnllsb = malloc(SIZEOF(vms_lock_sb)); \
} else \
{ \
tmp_jnllsb = csa->jnl->jnllsb; \
memset(csa->jnl, 0, SIZEOF(*csa->jnl)); \
csa->jnl->jnllsb = tmp_jnllsb; \
} \
memset(csa->jnl->jnllsb, 0, SIZEOF(vms_lock_sb)); \
}
#define ASSERT_JNLFILEID_NOT_NULL(csa) assert(0 != memcmp(csa->nl->jnl_file.jnl_file_id.fid, zero_fid, SIZEOF(zero_fid)));
#define NULLIFY_JNL_FILE_ID(csa) memset(&csa->nl->jnl_file.jnl_file_id, 0, SIZEOF(gds_file_id))
#endif
#define JNL_INIT(csa, reg, csd) \
{ \
csa->jnl_state = csd->jnl_state; \
csa->jnl_before_image = csd->jnl_before_image; \
csa->repl_state = csd->repl_state; \
if JNL_ALLOWED(csa) \
{ \
JPC_ALLOC(csa); \
csa->jnl->region = reg; \
csa->jnl->jnl_buff = (jnl_buffer_ptr_t)((sm_uc_ptr_t)(csa->nl) + NODE_LOCAL_SPACE + JNL_NAME_EXP_SIZE); \
csa->jnl->channel = NOJNL; \
} else \
csa->jnl = NULL; \
}
#define JNL_FD_CLOSE(CHANNEL, RC) \
{ \
fd_type lcl_channel; \
\
/* Reset incoming channel BEFORE closing it. This way, if we get interrupted BEFORE the close but \
* after we have reset channel, we could at most end up with a file descriptor leak. Doing it the \
* other way around could cause us to close the channel but yet have a dangling pointer to it that \
* could result in more than one close of the same file descriptor where the second close could \
* be on some other valid open file descriptor. \
*/ \
lcl_channel = CHANNEL; \
CHANNEL = NOJNL; \
F_CLOSE(lcl_channel, RC); /* resets "lcl_channel" to FD_INVALID */ \
assert(SS_NORMAL == RC); \
}
#define MAX_EPOCH_DELAY 30
#define EXT_NEW "_new"
#define PREFIX_ROLLED_BAK "rolled_bak_"
#define REC_TOKEN(jnlrec) ((struct_jrec_upd *)jnlrec)->token_seq.token
#define REC_JNL_SEQNO(jnlrec) ((struct_jrec_upd *)jnlrec)->token_seq.jnl_seqno
#define REC_LEN_FROM_SUFFIX(ptr, reclen) ((jrec_suffix *)((unsigned char *)ptr + reclen - JREC_SUFFIX_SIZE))->backptr
#define JNL_MAX_SET_KILL_RECLEN(CSD) (uint4)ROUND_UP2(FIXED_UPD_RECLEN + JREC_SUFFIX_SIZE \
+ ((CSD)->blk_size - SIZEOF(blk_hdr) - SIZEOF(rec_hdr)) \
+ SIZEOF(jnl_str_len_t) + SIZEOF(mstr_len_t), JNL_REC_START_BNDRY) \
/* fixed size part of update record + MAX possible (key + data) len + keylen-len + datalen-len */
#define JNL_MAX_PBLK_RECLEN(CSD) (uint4)ROUND_UP2(MIN_PBLK_RECLEN + (CSD)->blk_size, JNL_REC_START_BNDRY)
/* Macro to compute the maximum possible journal record length in the journal file.
* In order to compute the maximum jnl record length, note that an align record is written whenever
* a <non-align-record + an-align-record> would cause the jnl file offset to move past an aligned boundary.
* Therefore after computing the maximum possible non-align-jnl-record-length, we need to add MIN_ALIGN_RECLEN
* as this is the maximum possible align-jnl-record-length and should be the eventual max_jrec_len.
*/
#define JNL_MAX_RECLEN(JINFO, CSD) \
{ \
int4 max_logi_reclen, max_pblk_reclen; \
\
/* A logical record is a SET/KILL record. The SET could be as big as (CSD)->max_rec_size, but since \
* csd->max_rec_size can be changed independent of journal file creation (through DSE), we consider \
* the max possible record size that can fit in a GDS block. After that we consider $ZTWORMHOLE too. \
*/ \
max_logi_reclen = JNL_MAX_SET_KILL_RECLEN(CSD); \
GTMTRIG_ONLY(max_logi_reclen = MAX(max_logi_reclen, MAX_ZTWORM_JREC_LEN);) \
max_pblk_reclen = JNL_MAX_PBLK_RECLEN(CSD); \
(JINFO)->max_jrec_len = MAX(max_logi_reclen, max_pblk_reclen) + MIN_ALIGN_RECLEN; \
}
/* Macro that checks that the region seqno in the filehdr is never more than the seqno in the journal pool */
#define ASSERT_JNL_SEQNO_FILEHDR_JNLPOOL(csd, jnlpool_ctl) \
{ /* The seqno in the file header should be at most 1 greater than that in the journal pool. \
* See step (5) of of commit logic flow in secshr_db_clnup.c for why. Assert that. \
*/ \
assert((NULL == jnlpool_ctl) || (csd->reg_seqno <= (jnlpool_ctl->jnl_seqno + 1))); \
}
#ifdef GTM_CRYPT
#define DECODE_SET_KILL_ZKILL_ZTRIG(mumps_node_ptr, rec_size, key_handle, RC) \
{ \
int span_length, fixed_prefix; \
\
RC = 0; \
assert(FIXED_UPD_RECLEN == FIXED_ZTWORM_RECLEN); \
fixed_prefix = FIXED_UPD_RECLEN; \
ASSERT_ENCRYPTION_INITIALIZED; \
span_length = rec_size - fixed_prefix - JREC_SUFFIX_SIZE; \
GTMCRYPT_DECODE_FAST(key_handle, (char *)mumps_node_ptr, span_length, NULL, RC); \
}
#endif /* GTM_CRYPT */
/* jnl_ prototypes */
uint4 jnl_file_extend(jnl_private_control *jpc, uint4 total_jnl_rec_size);
uint4 jnl_file_lost(jnl_private_control *jpc, uint4 jnl_stat);
uint4 jnl_qio_start(jnl_private_control *jpc);
uint4 jnl_write_attempt(jnl_private_control *jpc, uint4 threshold);
void jnl_prc_vector(jnl_process_vector *pv);
void jnl_send_oper(jnl_private_control *jpc, uint4 status);
uint4 cre_jnl_file(jnl_create_info *info);
uint4 cre_jnl_file_common(jnl_create_info *info, char *rename_fn, int rename_fn_len);
void jfh_from_jnl_info (jnl_create_info *info, jnl_file_header *header);
uint4 jnl_ensure_open(void);
void set_jnl_info(gd_region *reg, jnl_create_info *set_jnl_info);
void jnl_write_epoch_rec(sgmnt_addrs *csa);
void jnl_write_inctn_rec(sgmnt_addrs *csa);
void jnl_write_logical(sgmnt_addrs *csa, jnl_format_buffer *jfb);
void jnl_write_ztp_logical(sgmnt_addrs *csa, jnl_format_buffer *jfb);
void jnl_write_eof_rec(sgmnt_addrs *csa, struct_jrec_eof *eof_record);
void jnl_write_trunc_rec(sgmnt_addrs *csa, uint4 orig_total_blks, uint4 orig_free_blocks, uint4 total_blks_after_trunc);
void jnl_write_poolonly(jnl_private_control *jpc, enum jnl_record_type rectype, jnl_record *jnl_rec, jnl_format_buffer *jfb);
jnl_format_buffer *jnl_format(jnl_action_code opcode, gv_key *key, mval *val, uint4 nodeflags);
#ifdef VMS
void finish_active_jnl_qio(void);
void jnl_start_ast(jnl_private_control *jpc);
uint4 jnl_permit_ast(jnl_private_control *jpc);
void jnl_qio_end(jnl_private_control *jpc);
#endif
void wcs_defer_wipchk_ast(jnl_private_control *jpc);
uint4 set_jnl_file_close(set_jnl_file_close_opcode_t set_jnl_file_close_opcode);
uint4 jnl_file_open_common(gd_region *reg, off_jnl_t os_file_size);
uint4 jnl_file_open_switch(gd_region *reg, uint4 sts);
void jnl_file_close(gd_region *reg, bool clean, bool dummy);
/* Consider putting followings in a mupip only header file : Layek 2/18/2003 */
boolean_t mupip_set_journal_parse(set_jnl_options *jnl_options, jnl_create_info *jnl_info);
uint4 mupip_set_journal_newstate(set_jnl_options *jnl_options, jnl_create_info *jnl_info, mu_set_rlist *rptr);
void mupip_set_journal_fname(jnl_create_info *jnl_info);
uint4 mupip_set_jnlfile_aux(jnl_file_header *header, char *jnl_fname);
void jnl_extr_init(void);
int exttime(uint4 time, char *buffer, int extract_len);
char *ext2jnlcvt(char *ext_buff, int4 ext_len, jnl_record *rec);
char *ext2jnl(char *ptr, jnl_record *rec);
char *jnl2extcvt(jnl_record *rec, int4 jnl_len, char *ext_buff);
char *jnl2ext(char *jnl_buff, char *ext_buff);
#endif /* JNL_H_INCLUDED */