fis-gtm/sr_port/jnl_write_attempt.c

/****************************************************************
 *								*
 *	Copyright 2001, 2012 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include "gdsroot.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "gdsbgtr.h"
#include "filestruct.h"
#include "iosp.h"
#include "jnl.h"
#include "lockconst.h"
#include "interlock.h"
#include "sleep_cnt.h"
#include "send_msg.h"
#include "wcs_sleep.h"
#include "is_proc_alive.h"
#include "compswap.h"
#include "is_file_identical.h"
#include "have_crit.h"
#include "wbox_test_init.h"
#include "anticipatory_freeze.h"

#ifdef UNIX
#include "repl_msg.h"			/* needed for gtmsource.h */
#include "gtmsource.h"			/* needed for jnlpool_addrs typedef */
#include "gtmmsg.h"
#endif
#include "gtm_c_stack_trace.h"

#ifdef UNIX
GBLREF	jnlpool_addrs	jnlpool;	/* NOTE(review): not named directly below; presumably needed by the instance freeze macros */
#endif
GBLREF	pid_t		process_id;	/* pid of this process; compared against the holder of the jnl io lock */
GBLREF	uint4		image_count;	/* NOTE(review): not referenced directly in the code below (jb->image_count is a field) */

/* Message codes referenced in this file; each one is declared exactly once. */
error_def(ERR_JNLCNTRL);
error_def(ERR_JNLFLUSH);
error_def(ERR_JNLFLUSHNOPROG);
error_def(ERR_JNLPROCSTUCK);
error_def(ERR_JNLWRTDEFER);
error_def(ERR_JNLWRTNOWWRTR);
error_def(ERR_TEXT);

/* CURRENT_WRITER yields the pid recorded as the current journal writer: jb->now_writer on VMS, otherwise the pid
 * stored in the io_in_prog_latch of the journal buffer. Note that the macro relies on a variable named "jb"
 * (a jnl_buffer_ptr_t) being in scope at the point of use.
 */
#ifdef VMS
#  define CURRENT_WRITER jb->now_writer
#else
#  define CURRENT_WRITER jb->io_in_prog_latch.u.parts.latch_pid
#endif

/* Make one attempt at getting the journal buffer written so that jb->dskaddr reaches "threshold".
 *
 * Parameters:
 *	jpc       - private journal control structure; supplies the region and the journal buffer (jpc->jnl_buff).
 *	lcnt      - in/out loop counter owned by the caller. It is used as the argument to wcs_sleep() while waiting
 *		    on another writer and is reset to 1 whenever a different writer is noted, a dead writer's lock is
 *		    released or a stuck writer is reported.
 *	threshold - journal buffer address that jb->dskaddr is expected to reach.
 *
 * Returns:
 *	SS_NORMAL		- jnl_qio_start() reported success in this attempt.
 *	ERR_JNLWRTDEFER		- (the initial value) no write was done by this call, e.g. because the io lock is held
 *				  by another process or because jb->dskaddr had already reached the threshold.
 *	ERR_JNLWRTNOWWRTR	- as returned by jnl_qio_start() (asserted not to happen in Unix).
 *	ERR_JNLPROCSTUCK	- the same live process has held the io lock for more than JNL_MAX_FLUSH_TRIES tries.
 *	ERR_JNLCNTRL		- the journal buffer control fields fail the sanity check done at the end.
 *	any other status	- returned by jnl_qio_start(); passed back to the caller right away.
 *
 * Every path through the loop body ends in a break, return or continue. The only path that iterates again is the
 * "continue" taken after the io lock of a dead writer has been released.
 */
static uint4 jnl_sub_write_attempt(jnl_private_control *jpc, unsigned int *lcnt, uint4 threshold)
{
	sgmnt_addrs		*csa;
	jnl_buffer_ptr_t	jb;
	unsigned int		status;
	boolean_t		was_crit, exact_check;
	/**** Note static/local */
	/* "writer" remembers, across calls, the pid last seen holding the io lock. "loop_image_count" is recorded
	 * along with it but is never read in this function.
	 */
	static uint4		loop_image_count, writer;	/* assumes calls from one loop at a time */
	uint4			new_dskaddr, new_dsk;		/* used only in the VMS specific code below */
	static uint4		stuck_cnt = 0;			/* number of JNLPROCSTUCK reports made by this process */

	/* Some callers of jnl_sub_write_attempt (jnl_flush->jnl_write_attempt, jnl_write->jnl_write_attempt) are in
	 * crit, and some other (jnl_wait->jnl_write_attempt) are not. Callers in crit do not need worry about journal
	 * buffer fields (dskaddr, freeaddr) changing underneath them, but for those not in crit, jnl_sub_write_attempt
	 * might incorrectly return an error status when journal file is switched. Such callers should check for
	 * journal file switched condition and terminate any loops they are in.
	 */
	jb = jpc->jnl_buff;
	status = ERR_JNLWRTDEFER;
	csa = &FILE_INFO(jpc->region)->s_addrs;
	was_crit = csa->now_crit;
	exact_check = was_crit && (threshold == jb->freeaddr);	/* see comment in jnl_write_attempt() for why this is needed */
	while (exact_check ? (jb->dskaddr != threshold) : (jb->dskaddr < threshold))
	{
#ifdef UNIX
		if (jb->io_in_prog_latch.u.parts.latch_pid == process_id)
		{
			/* if error condition occurred while doing jnl_qio_start(), then release the lock before waiting */
			/* note that this is done only in UNIX because Unix does synchronous I/O */
			jb->image_count = 0;
			RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
		}
		/* Attempt the write only if nobody is recorded as holding the io lock */
		if (!jb->io_in_prog_latch.u.parts.latch_pid)
			status = jnl_qio_start(jpc);
#elif defined VMS
		if (lib$ast_in_prog())
		{
			if (!jb->io_in_prog)
			{
				assert(jb->blocked == process_id);
				jnl_start_ast(jpc);
				if (jb->now_writer == process_id)
					status = jb->iosb.cond;
			}
			break;		/* no fancy stuff within an AST */
		} else if (!jb->io_in_prog)
		{	/* Note down jpc->new_dskaddr/new_dsk into local variables so we get a consistent copy of these two
			 * variables for checking them later.
			 */
			new_dskaddr = jpc->new_dskaddr;
			new_dsk = jpc->new_dsk;
			status = jnl_qio_start(jpc);
		}
#else
#error UNSUPPORTED PLATFORM
#endif
		if (SS_NORMAL == status)
		{
#			if defined VMS
			/* Check if JNLCNTRL error was signalled by jnl_qio_start(). Note that it does not explicitly
			 * return this error since it in turn calls an AST routine jnl_start_ast that actually has the
			 * qio lock (and hence can look at dskaddr/dsk without any concurrency issues). But jpc will
			 * have two fields new_dskaddr/new_dsk set to what dskaddr/dsk were right after obtaining the
			 * qio lock but before releasing it in case of a JNLCNTRL error. We use those two values to
			 * recheck if this is a JNLCNTRL error situation and if so return that error from here.
			 * Note that we cannot use fields from jpc since they could be set by an AST that pops right
			 * after we check new_dskaddr below but before we fetch the value of new_dsk. So it is important
			 * to use the local variables which we know are a consistent snapshot of jpc->new_dskaddr/new_dsk.
			 * The only consequence of this approach is that in case there is a dskaddr/dsk inconsistency,
			 * it will be detected by the local variables in the next iteration (not the first time around).
			 */
			if ((new_dskaddr % jb->size) != new_dsk)
			{
				assert(gtm_white_box_test_case_enabled
					&& (WBTEST_JNL_FILE_LOST_DSKADDR == gtm_white_box_test_case_number));
				status = ERR_JNLCNTRL;
			}
#			endif
			break;
		}
		UNIX_ONLY(assert(ERR_JNLWRTNOWWRTR != status);)	/* dont have asynchronous jnl writes in Unix */
		/* Anything other than "wait for the current writer" is handed back to the caller as is */
		if ((ERR_JNLWRTNOWWRTR != status) && (ERR_JNLWRTDEFER != status))
			return status;
		if ((writer != CURRENT_WRITER) || (1 == *lcnt))
		{	/* Different writer than the one noted before (or first try of a wait): note it and restart the count */
			writer = CURRENT_WRITER;
			loop_image_count = jb->image_count;
			*lcnt = 1;	/* !!! this should be detected and limited by the caller !!! */
			break;
		}
		if (*lcnt <= JNL_MAX_FLUSH_TRIES)
		{	/* Same writer as before but still within the allowed number of tries: sleep and let the caller retry */
			wcs_sleep(*lcnt);
			break;
		}
		VMS_ONLY(
			if ((CURRENT_WRITER == process_id) && (jpc->qio_active == TRUE) && (jb->iosb.cond == -2))
		        {	/* this an "impossible" condition where the private flag and the io have lost sync */
				GTMASSERT;	/* this should only occur in VMS; secshr_db_clnup should clear the problem */
			}
		)
		if (writer == CURRENT_WRITER)
		{	/* The same pid has been the writer for more than JNL_MAX_FLUSH_TRIES tries */
			if (!was_crit)
				grab_crit(jpc->region);	/* jnl_write_attempt has an assert about have_crit that this relies on */
			if (VMS_ONLY(0 == writer ||) FALSE == is_proc_alive(writer, jb->image_count))
			{	/* no one home, clear the semaphore; */
				BG_TRACE_PRO_ANY(csa, jnl_blocked_writer_lost);
				jnl_send_oper(jpc, ERR_JNLFLUSH);
				send_msg(VARLSTCNT(3) ERR_JNLPROCSTUCK, 1, CURRENT_WRITER);
				/* The writer is unchanged but no longer alive, so say that (and not that it changed) */
				send_msg(VARLSTCNT(4) ERR_TEXT, 2,
					LEN_AND_LIT("Journal IO writer process no longer exists; releasing its journal IO lock"));
				VMS_ONLY(jb->io_in_prog = 0);
				UNIX_ONLY(COMPSWAP_UNLOCK(&jb->io_in_prog_latch, writer, jb->image_count, LOCK_AVAILABLE, 0));
				if (!was_crit)
					rel_crit(jpc->region);
				*lcnt = 1;
				continue;	/* retry the write right away now that the lock should be available */
			}
			if (!was_crit)
				rel_crit(jpc->region);
			/* this is the interesting case: a process is stuck */
			BG_TRACE_PRO_ANY(csa, jnl_blocked_writer_stuck);
			jpc->status = status;
			jnl_send_oper(jpc, ERR_JNLFLUSH);
			send_msg(VARLSTCNT(3) ERR_JNLPROCSTUCK, 1, CURRENT_WRITER);
			stuck_cnt++;
			GET_C_STACK_FROM_SCRIPT("JNLPROCSTUCK", process_id, CURRENT_WRITER, stuck_cnt);
			*lcnt = 1;	/* ??? is it necessary to limit this, and if so, how ??? */
			status = ERR_JNLPROCSTUCK;
			break;
		}
		/* The writer changed between the check at the top of this iteration and now; let the caller retry */
		break;
	}
	if ((threshold > jb->freeaddr)
		|| (csa->now_crit && ((jb->dskaddr > jb->freeaddr) || (jb->free != (jb->freeaddr % jb->size)))))
	{	/* threshold > jb->freeaddr => somebody decremented jb->freeaddr after we computed threshold, or jnl was switched
		 * jb->free != jb->freeaddr % jb->size => out of design condition
		 * jb->dskaddr > jb->freeaddr => out of design condition, or jnl was switched
		 */
		status = ERR_JNLCNTRL;
	}
	return status;
}

/* Keep invoking jnl_sub_write_attempt() until jb->dskaddr has reached "threshold" (see "exact_check" below for the
 * precise termination condition), or, when not holding crit, until jpc->channel becomes NOJNL.
 *
 * Parameters:
 *	jpc       - private journal control structure of the region whose journal buffer is to be written.
 *	threshold - journal buffer address up to which the data needs to be written.
 *
 * Returns:
 *	SS_NORMAL	- the loop terminated normally, or a journal file switch was detected (JNL_FILE_SWITCHED).
 *	error status	- the status from jnl_sub_write_attempt() in case jnl_file_lost() was invoked here because of it.
 *
 * The function GTMASSERTs (after sending JNLFLUSHNOPROG to the operator log) if no progress is seen after
 * JNL_FLUSH_PROG_TRIES tries or if a stuck writer has been reported JNL_FLUSH_PROG_FACTOR times.
 *
 * Loop counters:
 *	lcnt		- passed by reference to jnl_sub_write_attempt() which can reset it to 1.
 *	prev_lcnt	- value of lcnt before the call; a difference after the call means lcnt got reset.
 *	cnt		- total number of iterations; never reset.
 *	proc_stuck_cnt	- number of JNLPROCSTUCK returns seen since the last sign of progress.
 */
uint4 jnl_write_attempt(jnl_private_control *jpc, uint4 threshold)
{
	jnl_buffer_ptr_t	jb;
	unsigned int		lcnt, prev_lcnt, cnt, proc_stuck_cnt;
	sgmnt_addrs		*csa;
	unsigned int		status;
	boolean_t		was_crit, jnlfile_lost, exact_check;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	jb = jpc->jnl_buff;
	csa = &FILE_INFO(jpc->region)->s_addrs;
	was_crit = csa->now_crit;

	/* If holding crit and input threshold matches jb->freeaddr, then we need to wait in the loop as long as dskaddr
	 * is not EQUAL to threshold. This is because if dskaddr is lesser than threshold we need to wait. If ever it
	 * becomes greater than threshold, it is an out-of-design situation (since dskaddr has effectively become > freeaddr)
	 * and so we need to trigger "jnl_file_lost". That is done in the for loop below once "jnl_sub_write_attempt"
	 * returns ERR_JNLCNTRL for this condition, so it is important to invoke that routine.
	 * Hence the need to do an exact match instead of a < match. If not holding
	 * crit or input threshold does not match jb->freeaddr, then dskaddr becoming GREATER than threshold is a valid
	 * condition so we should do a (dskaddr < threshold), not a (dskaddr != threshold) check in that case.
	 */
	exact_check = was_crit && (threshold == jb->freeaddr);
	assert(!was_crit || threshold <= jb->freeaddr);
	/* Check that we either own crit on the current region or we DONT own crit on ANY region. This is relied upon by
	 * the grab_crit calls (done in jnl_write_attempt and jnl_sub_write_attempt) to ensure no deadlocks are possible.
	 */
	assert(was_crit || (0 == have_crit(CRIT_HAVE_ANY_REG)));
	for (prev_lcnt = lcnt = cnt = 1, proc_stuck_cnt = 0;
		(was_crit || (NOJNL != jpc->channel)) && (exact_check ? jb->dskaddr != threshold : jb->dskaddr < threshold);
		lcnt++, prev_lcnt = lcnt, cnt++)
	{
		status = jnl_sub_write_attempt(jpc, &lcnt, threshold);
		if (JNL_FILE_SWITCHED(jpc))
		{	/* If we are holding crit, the journal file switch could happen in the form of journaling getting
			 * turned OFF (due to disk space issues etc.)
			 */
			jpc->status = SS_NORMAL;
			return SS_NORMAL;
		}
		if (SS_NORMAL == status)
		{
			if (JNL_FLUSH_PROG_TRIES > lcnt)
			{
				proc_stuck_cnt = 0;
				/* In VMS, jnl writes are asynchronous. The above call to "jnl_sub_write_attempt" has returned
				 * SS_NORMAL status. This means the jnl qio lock is not in use by anyone else and is up for grabs.
				 * We would have scheduled a jnl qio write through a sys$dclast call. We have no control of when
				 * the AST routine "jnl_start_ast" will actually get control and start the write. Until then
				 * we dont want to keep reinvoking "jnl_sub_write_attempt" in a hard spin loop. So sleep.
				 * In Unix, writes are synchronous so SS_NORMAL status return implies we have completed a jnl
				 * write and "jb->dskaddr" is closer to "threshold" than it was in the previous iteration.
				 * A sleep at this point will only slow things down unnecessarily. Hence no sleep if Unix.
				 */
				VMS_ONLY(wcs_sleep(lcnt);)
				continue;
			}
			/* JNL_FLUSH_PROG_TRIES successful attempts in a row and the threshold is still not reached */
			jpc->status = SS_NORMAL;
			jnl_send_oper(jpc, ERR_JNLFLUSH);
			send_msg(VARLSTCNT(8) ERR_JNLFLUSHNOPROG, 2, JNL_LEN_STR(csa->hdr),
				 ERR_TEXT, 2, LEN_AND_LIT("Could not flush all the buffered journal data"));
			GTMASSERT; /* too many attempts to flush journal data */
		}
		if ((ERR_JNLCNTRL == status)
			|| (csa->now_crit
				&& (ERR_JNLWRTDEFER != status) && (ERR_JNLWRTNOWWRTR != status) && (ERR_JNLPROCSTUCK != status)))
		{	/* If JNLCNTRL or if holding crit and not waiting for some other writer (or self in VMS)
			 * better turn off journaling and proceed with database update to avoid a database hang.
			 */
			if (was_crit)
				jb->blocked = 0;
			else
				grab_crit(jpc->region);	/* jnl_write_attempt has an assert about have_crit that this relies on */
			jnlfile_lost = FALSE;
			if (jb->free_update_pid)
			{
				FIX_NONZERO_FREE_UPDATE_PID(csa, jb);
			} else
			{
				assert(gtm_white_box_test_case_enabled
					&& (WBTEST_JNL_FILE_LOST_DSKADDR == gtm_white_box_test_case_number));
				if (JNL_ENABLED(csa->hdr))
				{	/* We ignore the return value of jnl_file_lost() since we always want to report the journal
					 * error, whatever its error handling method is.  Also, an operator log will be sent by some
					 * callers (t_end()) only if an error is returned here, and the operator log is wanted in
					 * those cases.
					 */
					jnl_file_lost(jpc, status);
					jnlfile_lost = TRUE;
				}
				/* Else journaling got closed concurrently by another process by invoking "jnl_file_lost"
				 * just before we got crit. Do not invoke "jnl_file_lost" again on the same journal file.
				 * Instead continue and next iteration will detect the journal file has switched and terminate.
				 */
			}
			if (!was_crit)
				rel_crit(jpc->region);
			/* This block always ends the current iteration: retry if jnl_file_lost was not invoked, else return */
			if (!jnlfile_lost)
				continue;
			else
				return status;
		}
#		ifdef UNIX
		if ((ERR_JNLWRTDEFER == status) && IS_REPL_INST_FROZEN)
		{	/* Check if instance freeze is in effect and this db has instance freeze activation enabled.
			 * In that case, we do not want to keep retrying the jnl_qio as that might cause lcnt to increase
			 * and eventually GTMASSERT implying this is an IO issue whereas it is possible some other process
			 * is holding the jnl_qio lock on this region and is not able to write to the journal file for a
			 * long time because the instance is frozen. To avoid false GTMASSERTs, wait for freeze to be
			 * lifted before continuing with normal flow (which is to increment lcnt and keep retrying the
			 * attempt at the jnl_qio lock). Note that this process is guaranteed not to have set the instance
			 * freeze due to a ENOSPC situation as in that case we would never have allowed any interrupt to occur
			 * until we succeed with that write and will clear the freeze before moving on.
			 * Note that the below macro takes care of the "db has instance freeze activation enabled" check too.
			 */
			 WAIT_FOR_REPL_INST_UNFREEZE(csa);
		}
#		endif
		if ((ERR_JNLWRTDEFER != status) && (ERR_JNLWRTNOWWRTR != status) && (ERR_JNLPROCSTUCK != status))
		{	/* If holding crit, a status like this would have been handled by the block above (which invokes
			 * jnl_file_lost and always ends in a "continue" or a "return") so we should never have gotten
			 * here. Assert accordingly. If not holding crit,
			 * wait for some crit holder to invoke jnl_file_lost. Until then keep sleep looping indefinitely.
			 * The sleep in this case is not time-limited because the callers of jnl_write_attempt (particularly
			 * jnl_wait) do not check its return value so they assume success returns from this function. It is
			 * non-trivial to change the interface and code of all callers to handle the error situation so we
			 * instead choose to sleep indefinitely here until some crit process encounters the same error and
			 * triggers jnl_file_lost processing which will terminate the loop due to the JNL_FILE_SWITCHED check.
			 */
			assert(!csa->now_crit);
			wcs_sleep(lcnt);
		} else if (prev_lcnt != lcnt)
		{	/* jnl_sub_write_attempt reset lcnt (it only ever resets it to 1) */
			assert(1 == lcnt);
			if (ERR_JNLWRTDEFER == status)
			{	/* Change of writer */
				if (JNL_FLUSH_PROG_TRIES <= cnt)
				{
					send_msg(VARLSTCNT(8) ERR_JNLFLUSHNOPROG, 2, JNL_LEN_STR(csa->hdr),
						ERR_TEXT, 2, LEN_AND_LIT("No progress even with multiple writers"));
					GTMASSERT;
				}
				proc_stuck_cnt = 0;
			} else if (ERR_JNLPROCSTUCK == status && (JNL_FLUSH_PROG_FACTOR <= ++proc_stuck_cnt))
			{	/* The writer has been reported stuck JNL_FLUSH_PROG_FACTOR times without any progress in between */
				send_msg(VARLSTCNT(8) ERR_JNLFLUSHNOPROG, 2, JNL_LEN_STR(csa->hdr), ERR_TEXT, 2,
					LEN_AND_LIT("Progress prevented by a process stuck flushing journal data"));
				VMS_ONLY(
					if (TREF(gtm_environment_init))
					{
						proc_stuck_cnt = 0;
						continue;
					}
				)

				GTMASSERT;
			}
		}
	}
	return SS_NORMAL;
}