373 lines
15 KiB
C
373 lines
15 KiB
C
/***************************************************************
|
|
* *
|
|
* Copyright 2001, 2013 Fidelity Information Services, Inc *
|
|
* *
|
|
* This source code contains the intellectual property *
|
|
* of its copyright holder(s), and is made available *
|
|
* under a license. If you do not know the terms of *
|
|
* the license, please stop and do not read further. *
|
|
* *
|
|
****************************************************************/
|
|
|
|
#include "mdef.h"
|
|
|
|
#include <errno.h>
|
|
#include "gtm_unistd.h" /* DB_FSYNC macro needs this */
|
|
#include "gtm_string.h"
|
|
|
|
#include "gtmio.h" /* this has to come in before gdsfhead.h, for all "open" to be defined
|
|
to "open64", including the open in header files */
|
|
#include "aswp.h"
|
|
#include "gdsroot.h"
|
|
#include "gtm_facility.h"
|
|
#include "fileinfo.h"
|
|
#include "gdsbt.h"
|
|
#include "gdsblk.h"
|
|
#include "gdsfhead.h"
|
|
#include "filestruct.h"
|
|
#include "gt_timer.h"
|
|
#include "jnl.h"
|
|
#include "lockconst.h"
|
|
#include "interlock.h"
|
|
#include "iosp.h"
|
|
#include "gdsbgtr.h"
|
|
#include "is_file_identical.h"
|
|
#include "dpgbldir.h"
|
|
#include "rel_quant.h"
|
|
#include "repl_sp.h" /* for F_CLOSE used by the JNL_FD_CLOSE macro */
|
|
#include "memcoherency.h"
|
|
#include "gtm_dbjnl_dupfd_check.h"
|
|
#include "anticipatory_freeze.h"
|
|
|
|
GBLREF volatile int4 db_fsync_in_prog;
|
|
GBLREF volatile int4 jnl_qio_in_prog;
|
|
GBLREF uint4 process_id;
|
|
|
|
error_def(ERR_DBFSYNCERR);
|
|
error_def(ERR_ENOSPCQIODEFER);
|
|
error_def(ERR_JNLACCESS);
|
|
error_def(ERR_JNLCNTRL);
|
|
error_def(ERR_JNLRDERR);
|
|
error_def(ERR_JNLWRTDEFER);
|
|
error_def(ERR_JNLWRTNOWWRTR);
|
|
error_def(ERR_PREMATEOF);
|
|
|
|
uint4 jnl_sub_qio_start(jnl_private_control *jpc, boolean_t aligned_write);
|
|
void jnl_mm_timer_write(void);
|
|
|
|
/* If the second argument is TRUE, then the jnl write is done only upto the previous aligned boundary.
|
|
* else the write is done upto the freeaddr */
|
|
|
|
/* Write the dirty portion of the shared-memory journal buffer out to the journal file.
 * If "aligned_write" is TRUE, the write stops at the last filesystem-block-size aligned boundary
 * before "free"; otherwise the write goes all the way up to "free"/freeaddr.
 * Returns SS_NORMAL on success, else an ERR_* code which callers (e.g. jnl_write_attempt) examine.
 * Serialized via jb->io_in_prog_latch; if this process already holds that latch (e.g. timer-driven
 * io in progress), returns ERR_JNLWRTNOWWRTR without doing anything.
 */
uint4 jnl_sub_qio_start(jnl_private_control *jpc, boolean_t aligned_write)
{
	boolean_t	was_wrapped;		/* TRUE if "free" had wrapped around behind "dsk" in the circular buffer */
	int		tsz, close_res;		/* tsz: number of dirty bytes this call will write */
	jnl_buffer_ptr_t jb;
	int4		free_ptr;		/* local snapshot of jb->free */
	sgmnt_addrs	*csa;
	node_local_ptr_t cnl;
	sm_uc_ptr_t	base;			/* start of the dirty data inside the journal buffer */
	unix_db_info	*udi;
	unsigned int	status;
	int		save_errno;
	uint4		aligned_dskaddr, dskaddr;	/* file offsets: raw and rounded down to fs-block boundary */
	int4		aligned_dsk, dsk;		/* buffer offsets: raw and rounded down to fs-block boundary */
	int		aligned_tsz;			/* write size rounded up to fs-block multiple */
	sm_uc_ptr_t	aligned_base;			/* source address rounded down to fs-block boundary */
	uint4		jnl_fs_block_size;
	gd_region	*reg;

	assert(NULL != jpc);
	reg = jpc->region;
	udi = FILE_INFO(reg);
	csa = &udi->s_addrs;
	jb = jpc->jnl_buff;
	if (jb->io_in_prog_latch.u.parts.latch_pid == process_id) /* We already have the lock? */
		return ERR_JNLWRTNOWWRTR; /* timer driven io in progress */
	jnl_qio_in_prog++;
	if (!GET_SWAPLOCK(&jb->io_in_prog_latch))
	{	/* someone else holds the qio latch; let them do the write */
		jnl_qio_in_prog--;
		assert(0 <= jnl_qio_in_prog);
		return ERR_JNLWRTDEFER;
	}
#	ifdef DEBUG
	/* When jnl_sub_qio_start() is called as part of WBTEST_SIGTSTP_IN_JNL_OUTPUT_SP white-box test case,
	 * aligned_write should always be FALSE. But depending upon the filesystem block size, it is possible that
	 * the function could also be called with aligned_write being TRUE. This could lead to sending SIGTSTP
	 * twice. Hence ensure that SIGTSTP is sent only for the unaligned write.
	 */
	if (gtm_white_box_test_case_enabled && (WBTEST_SIGTSTP_IN_JNL_OUTPUT_SP == gtm_white_box_test_case_number)
		&& !aligned_write)
		kill(process_id, SIGTSTP);
#	endif
	if (jb->dsk != (jb->dskaddr % jb->size))
	{	/* dsk must always equal dskaddr modulo the buffer size; if not, the journal buffer
		 * control fields are inconsistent - release the latch and report a control error.
		 */
		RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
		jnl_qio_in_prog--;
		assert(0 <= jnl_qio_in_prog);
		return ERR_JNLCNTRL;
	}
	if (!JNL_FILE_SWITCHED(jpc))
		jpc->fd_mismatch = FALSE;
	else
	{	/* journal file has been switched; release io_in_prog lock and return */
		jpc->fd_mismatch = TRUE;
		RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
		jnl_qio_in_prog--;
		assert(0 <= jnl_qio_in_prog);
		return SS_NORMAL;
	}
	/* Currently we overload io_in_prog_latch to perform the db fsync too. Anyone trying to do a
	 * jnl_qio_start will first check if a db_fsync is needed and if so sync that before doing any jnl qio.
	 * Note that since an epoch record is written when need_db_fsync is set to TRUE, we are guaranteed that
	 * (dskaddr < freeaddr) which is necessary for the jnl_wait --> jnl_write_attempt mechanism (triggered
	 * by wcs_flu) to actually initiate a call to jnl_qio_start().
	 */
	if (jb->need_db_fsync)
	{
		DB_FSYNC(reg, udi, csa, db_fsync_in_prog, save_errno);
		GTM_WHITE_BOX_TEST(WBTEST_ANTIFREEZE_DBFSYNCERR, save_errno, EIO);
		if (0 != save_errno)
		{
			RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
			jnl_qio_in_prog--;
			assert(0 <= jnl_qio_in_prog);
			/* DBFSYNCERR can potentially cause syslog flooding. Remove the following line if it becomes an issue. */
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFSYNCERR, 2, DB_LEN_STR(reg), save_errno);
			rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFSYNCERR, 2, DB_LEN_STR(reg), save_errno);
			assert(FALSE); /* should not come here as the rts_error above should not return */
			return ERR_DBFSYNCERR; /* ensure we do not fall through to the code below as we no longer have the lock */
		}
		jb->need_db_fsync = FALSE;
	}
	free_ptr = jb->free;
	/* The following barrier is to make sure that for the value of "free" that we extract (which may be
	 * slightly stale but that is not a correctness issue) we make sure we dont write out a stale version of
	 * the journal buffer contents. While it is possible that we see journal buffer contents that are more
	 * uptodate than "free", this would only mean writing out a less than optimal number of bytes but again,
	 * not a correctness issue. Secondary effect is that it also enforces a corresponding non-stale value of
	 * freeaddr is read and this is relied upon by asserts below.
	 */
	SHM_READ_MEMORY_BARRIER;
	dsk = jb->dsk;
	dskaddr = jb->dskaddr;
	was_wrapped = (free_ptr < dsk);
	jnl_fs_block_size = jb->fs_block_size;
	if (aligned_write)
		free_ptr = ROUND_DOWN2(free_ptr, jnl_fs_block_size);
	assert(!(jb->size % jnl_fs_block_size));
	/* If free wrapped behind dsk, write only up to the end of the buffer this pass */
	tsz = (free_ptr < dsk ? jb->size : free_ptr) - dsk;
	if ((aligned_write && !was_wrapped && (free_ptr <= dsk)) || (NOJNL == jpc->channel))
		tsz = 0;	/* nothing (aligned) to write, or no journal file descriptor open */
	assert(0 <= tsz);
	assert(dskaddr + tsz <= jb->freeaddr);
	status = SS_NORMAL;
	if (tsz)
	{	/* ensure that dsk and free are never equal and we have left space for JNL_WRT_START_MASK */
		assert(SS_NORMAL == status);
		assert((free_ptr > dsk) || (free_ptr < (dsk & JNL_WRT_START_MASK(jb)))
			|| (dsk != (dsk & JNL_WRT_START_MASK(jb))));
		jb->wrtsize = tsz;
		jb->qiocnt++;
		base = &jb->buff[dsk + jb->buff_off];
		assert((base + tsz) <= (jb->buff + jb->size + jnl_fs_block_size));
		assert(NOJNL != jpc->channel);
		/* If sync_io is turned on, we would have turned on the O_DIRECT flag on some platforms. That will
		 * require us to do aligned writes. Both the source buffer and the size of the write need to be aligned
		 * for this to work on some platforms. The alignment needs to be on a filesystem-block-size granularity.
		 * If sync_io is not turned on, doing aligned writes saves us from the OS doing a read of the block
		 * under the covers in case we write only a part of the filesystem block.
		 * Therefore we do aligned writes no matter what. This means we could be writing some garbage padding
		 * data out after the last valid journal record just to fit in the alignment requirements. But that is
		 * considered okay because as part of writing the EOF record out (for a clean termination), jnl_write
		 * would have 0-padded the journal buffer for us. So a cleanly shutdown journal file will have 0-padding
		 * following the EOF record but an actively used journal file might have garbage padding following the
		 * last valid record. This is considered okay as journal recovery has logic to scan past the garbage and
		 * locate the last valid record in case of a crash before writing the EOF.
		 */
		aligned_dsk = ROUND_DOWN2(dsk, jnl_fs_block_size);
		aligned_dskaddr = ROUND_DOWN2(dskaddr, jnl_fs_block_size);
		aligned_tsz = ROUND_UP2((tsz + (dskaddr - aligned_dskaddr)), jnl_fs_block_size);
		aligned_base = (sm_uc_ptr_t)ROUND_DOWN2((uintszofptr_t)base, jnl_fs_block_size);
		/* Assert that aligned_dsk never backs up to a point BEFORE where the free pointer is */
		assert((aligned_dsk > free_ptr) || (dsk <= free_ptr));
		/* Assert that aligned_dskaddr never backs up to a point inside journal file header territory.
		 * This is because those fields are always updated inside crit and therefore we should
		 * never touch those while we hold only the jnl qio lock.
		 */
		assert(JNL_HDR_LEN <= aligned_dskaddr);
		/* Assert that both ends of the source buffer for the write falls within journal buffer limits */
		assert(aligned_base >= &jb->buff[jb->buff_off]);
		assert(aligned_base + aligned_tsz <= &jb->buff[jb->buff_off + jb->size]);
		JNL_LSEEKWRITE(csa, csa->hdr->jnl_file_name, jpc->channel,
			(off_t)aligned_dskaddr, aligned_base, (size_t)aligned_tsz, jpc->status);
		status = jpc->status;
		if (SS_NORMAL == status)
		{	/* update jnl_buff pointers to reflect the successful write to the journal file */
			assert(dsk <= jb->size);
			assert(jb->io_in_prog_latch.u.parts.latch_pid == process_id);
			jpc->new_dsk = dsk + tsz;
			if (jpc->new_dsk >= jb->size)
			{	/* circular buffer wrap: new write offset starts back at 0 */
				assert(jpc->new_dsk == jb->size);
				jpc->new_dsk = 0;
			}
			jpc->new_dskaddr = dskaddr + tsz;
			assert(jpc->new_dsk == jpc->new_dskaddr % jb->size);
			assert(jb->freeaddr >= jpc->new_dskaddr);
			jpc->dsk_update_inprog = TRUE; /* for secshr_db_clnup to clean it up (when it becomes feasible in Unix) */
			jb->dsk = jpc->new_dsk;
			jb->dskaddr = jpc->new_dskaddr;
			jpc->dsk_update_inprog = FALSE;
			cnl = csa->nl;
			INCR_GVSTATS_COUNTER(csa, cnl, n_jfile_bytes, aligned_tsz);
			INCR_GVSTATS_COUNTER(csa, cnl, n_jfile_writes, 1);
		} else
		{	/* write failed; track the error and translate it into a return code for the caller */
			assert((ENOSPC == status) || (ERR_ENOSPCQIODEFER == status));
			jb->errcnt++;
			if (ENOSPC == status)
				jb->enospc_errcnt++;
			else
				jb->enospc_errcnt = 0;

			if (ERR_ENOSPCQIODEFER != status)
			{
				jnl_send_oper(jpc, ERR_JNLACCESS);
				jpc->status = status;	/* set jpc->status back to original error as jnl_send_oper resets
							 * jpc->status to SS_NORMAL. We need it in callers of this function
							 * (e.g. jnl_write_attempt). */
			}
#			ifdef GTM_FD_TRACE
			if ((EBADF == status) || (ESPIPE == status))
			{	/* likely case of D9I11-002714. check if fd is valid */
				gtm_dbjnl_dupfd_check();
				/* If fd of this journal points to some other database or journal file opened by this process
				 * the above call would have reset jpc->channel. If it did not get reset, then check
				 * if the fd in itself is valid and points back to the journal file. If not reset it to NOJNL.
				 */
				if (NOJNL != jpc->channel)
					gtm_check_fd_is_valid(reg, FALSE, jpc->channel);
				/* If jpc->channel still did not get reset to NOJNL, it means the file descriptor is valid but
				 * not sure why we are getting EBADF/ESPIPE errors. No further recovery attempted at this point.
				 */
			}
#			endif
			if (ERR_ENOSPCQIODEFER == status)
				status = ERR_JNLWRTDEFER;
			else
				status = ERR_JNLACCESS;
		}
	}
	RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
	if ((jnl_closed == csa->hdr->jnl_state) && (NOJNL != jpc->channel))
	{	/* journaling has been turned off on this region; close our now-stale journal fd */
		JNL_FD_CLOSE(jpc->channel, close_res); /* sets jpc->channel to NOJNL */
		jpc->pini_addr = 0;
	}
	jnl_qio_in_prog--;
	assert(0 <= jnl_qio_in_prog);
	return status;
}
|
|
|
|
/* This is a wrapper for jnl_sub_qio_start that tries to divide the writes into optimal chunks.
|
|
* It calls jnl_sub_qio_start() with appropriate arguments in two stages, the first one with
|
|
* optimal "jnl_fs_block_size" boundary and the other suboptimal tail end of the write. The latter
|
|
* call is made only if no other process has finished the jnl write upto the required point
|
|
* during the time this process yields
|
|
*/
|
|
/* This is a wrapper for jnl_sub_qio_start that tries to divide the writes into optimal chunks.
 * It calls jnl_sub_qio_start() with appropriate arguments in two stages, the first one with
 * optimal "jnl_fs_block_size" boundary and the other suboptimal tail end of the write. The latter
 * call is made only if no other process has finished the jnl write upto the required point
 * during the time this process yields.
 * Returns SS_NORMAL if the buffer is already flushed (or another process flushed it, or the
 * journal file was switched while yielding), else whatever jnl_sub_qio_start returns.
 */
uint4 jnl_qio_start(jnl_private_control *jpc)
{
	unsigned int	yield_cnt, status;
	uint4		target_freeaddr, lcl_dskaddr, old_freeaddr;
	jnl_buffer_ptr_t jb;
	sgmnt_addrs	*csa;
	unix_db_info	*udi;
	uint4		jnl_fs_block_size;

	assert(NULL != jpc);
	udi = FILE_INFO(jpc->region);
	csa = &udi->s_addrs;
	jb = jpc->jnl_buff;
	/* this block of code (till yield()) processes the buffer upto an "jnl_fs_block_size" alignment boundary
	 * and the next block of code (after the yield()) processes the tail end of the data (if necessary)
	 */
	lcl_dskaddr = jb->dskaddr;
	target_freeaddr = jb->freeaddr;
	if (lcl_dskaddr >= target_freeaddr)
		return SS_NORMAL;	/* everything up to freeaddr is already on disk; nothing to do */
	/* ROUND_DOWN2 macro is used under the assumption that "jnl_fs_block_size" would be a power of 2 */
	jnl_fs_block_size = jb->fs_block_size;
	if (ROUND_DOWN2(lcl_dskaddr, jnl_fs_block_size) != ROUND_DOWN2(target_freeaddr, jnl_fs_block_size))
	{ /* data crosses/touches an alignment boundary */
		if (SS_NORMAL != (status = jnl_sub_qio_start(jpc, TRUE)))
			return status;
	} /* else, data does not cross/touch an alignment boundary, yield and see if someone else
	   * does the dirty job more efficiently
	   */
	for (yield_cnt = 0; yield_cnt < csa->hdr->yield_lmt; yield_cnt++)
	{ /* yield() until someone has finished your job or no one else is active on the jnl file */
		old_freeaddr = jb->freeaddr;
		rel_quant();
		/* Purpose of this memory barrier is to get a current view of asyncrhonously changed fields
		 * like whether the jnl file was switched, the write position in the journal file and the
		 * write address in the journal buffer for all the remaining statements in this loop because
		 * the rel_quant call above allows any and all of them to change and we aren't under any
		 * locks while in this loop. This is not a correctness issue as we would either eventually
		 * see the updates or it means we are writing what has already been written. It is a performance
		 * issue keeping more current with state changes done by other processes on other processors.
		 */
		SHM_READ_MEMORY_BARRIER;
		if (JNL_FILE_SWITCHED(jpc))
			return SS_NORMAL;	/* our target file is gone; the switch logic handles the rest */
		/* assert(old_freeaddr <= jb->freeaddr) ** Potential race condition with jnl file switch could
		 * make this assert fail so it is removed
		 */
		/* stop yielding if no one made progress during the quantum, or our target is already on disk */
		if (old_freeaddr == jb->freeaddr || target_freeaddr <= jb->dskaddr)
			break;
	}
	status = SS_NORMAL;
	if (target_freeaddr > jb->dskaddr)
		status = jnl_sub_qio_start(jpc, FALSE);	/* write the (suboptimal) unaligned tail ourselves */
	return status;
}
|
|
|
|
static boolean_t jnl_timer;
|
|
void jnl_mm_timer_write(void)
|
|
{ /* While this should work by region and use baton passing to more accurately and efficiently perform its task,
|
|
* it is currently a blunt instrument
|
|
*/
|
|
gd_region *reg, *r_top;
|
|
gd_addr *addr_ptr;
|
|
sgmnt_addrs *csa;
|
|
|
|
for (addr_ptr = get_next_gdr(NULL); NULL != addr_ptr; addr_ptr = get_next_gdr(addr_ptr))
|
|
{ /* since the unix timers don't provide an argument, for now write all regions */
|
|
for (reg = addr_ptr->regions, r_top = reg + addr_ptr->n_regions; reg < r_top; reg++)
|
|
{
|
|
if ((dba_mm == reg->dyn.addr->acc_meth) && reg->open)
|
|
{
|
|
csa = &FILE_INFO(reg)->s_addrs;
|
|
if ((NULL != csa->jnl) && (NOJNL != csa->jnl->channel))
|
|
jnl_qio_start(csa->jnl);
|
|
}
|
|
}
|
|
}
|
|
jnl_timer = FALSE;
|
|
return;
|
|
}
|
|
|
|
/* Start (at most one) timer that periodically flushes journal buffers for MM regions.
 * While this should work by region and use baton passing to more accurately and efficiently
 * perform its task, it is currently a blunt instrument: a single static flag (jnl_timer)
 * guards against scheduling more than one outstanding timer at a time.
 */
void jnl_mm_timer(sgmnt_addrs *csa, gd_region *reg)
{
	assert(reg->open);
	if (jnl_timer)
		return;		/* a timer pop is already pending; jnl_mm_timer_write will clear the flag */
	jnl_timer = TRUE;
	start_timer((TID)jnl_mm_timer, csa->hdr->flush_time[0], &jnl_mm_timer_write, 0, NULL);
	return;
}
|