fis-gtm/sr_unix/mutex.c

/****************************************************************
 *								*
 *	Copyright 2001, 2011 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

/* GT.M Mutex Control */

#include "mdef.h"

#include "gtm_time.h"	/* for time() */
#include "gtm_socket.h"
#include "gtm_string.h"
#include "gtm_stdlib.h"
#include "gtm_unistd.h"
#include "gtm_stdio.h"

#include <errno.h>
#include <sys/un.h>
#include <iotcp_select.h>
#if defined(__sparc) || defined(__hpux) || defined(__MVS__) || defined(__linux__) || defined(__CYGWIN__)
#include "gtm_limits.h"
#else
#include <sys/limits.h>
#endif

#include "aswp.h"
#include "gdsroot.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "lockconst.h"
#include "interlock.h"
#include "filestruct.h"
#include "io.h"
#include "jnl.h"
#include "gdsbgtr.h"
#include "mutex.h"
#include "relqueopi.h"
#include "eintr_wrappers.h"
#include "send_msg.h"
#include "is_proc_alive.h"
#include "compswap.h"
#include "gtmsecshr.h"
#include "rel_quant.h"
#include "add_inter.h"
#include "mutex_deadlock_check.h"
#include "gt_timer.h"
#include "heartbeat_timer.h"
#include "gtmio.h"
#ifdef DEBUG
#include "wbox_test_init.h"
#include "repl_msg.h"			/* needed by gtmsource.h */
#include "gtmsource.h"			/* required for jnlpool GBLREF */
#endif

/* Retry limits for the interlocked queue operations (REMQHI/INSQTI) in mutex_sleep and mutex_wakeup.
 * QUEUE_RETRY is the number of back-to-back attempts made in one round; QUANT_RETRY is the number of
 * such rounds, with a rel_quant() between rounds, before the operation is given up as cdb_sc_dbccerr.
 */
#define QUANT_RETRY			10000
#define QUEUE_RETRY			255

#ifdef MUTEX_MSEM_WAKE
#define MUTEX_MAX_HEARTBEAT_WAIT        2 /* so that total wait for both select and msem wait will be the same */
#define MUTEX_LCKALERT_PERIOD		8
#endif

/* The following CAREFUL_* macros invoke the corresponding * macros except in the case csa->hdr is NULL.
 * This is possible if the csa corresponds to the journal pool where there is no notion of a db hdr.
 * In that case, we skip invoking the * macros.
 */
/* Set wc_blocked in the db file header of CSA to VALUE, unless CSA has no file header (csa->hdr is NULL).
 * VALUE is honored as passed (it was previously ignored in favor of a hard-coded TRUE); CSA is
 * parenthesized so that any pointer expression can be supplied.
 */
#define	CAREFUL_SET_TRACEABLE_VAR(CSA, VALUE)					\
{										\
	sgmnt_data_ptr_t	lcl_csd;					\
										\
	lcl_csd = (CSA)->hdr;							\
	assert((NULL != lcl_csd)						\
		|| ((CSA) == &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs));	\
	if (NULL != lcl_csd)							\
		SET_TRACEABLE_VAR(lcl_csd->wc_blocked, (VALUE));		\
}

/* Record trace EVENT against CSA through BG_TRACE_PRO_ANY, but only when CSA has a db file header.
 * A NULL csa->hdr is expected only for the journal pool's csa (asserted); the trace is skipped then.
 */
#define	CAREFUL_BG_TRACE_PRO_ANY(CSA, EVENT)					\
{										\
	sgmnt_data_ptr_t	trace_csd;					\
										\
	trace_csd = (CSA)->hdr;							\
	assert(((CSA) == &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs)	\
		|| (NULL != trace_csd));					\
	if (NULL != trace_csd)							\
	{									\
		BG_TRACE_PRO_ANY(CSA, EVENT);					\
	}									\
}

GBLREF pid_t			process_id;
GBLREF uint4			image_count;
GBLREF int			num_additional_processors;
#ifdef MUTEX_MSEM_WAKE
GBLREF volatile uint4           heartbeat_counter;
#  ifdef POSIX_MSEM
static sem_t			*mutex_wake_msem_ptr = NULL;
#  else
static msemaphore		*mutex_wake_msem_ptr = NULL;
#  endif
static mutex_que_entry_ptr_t	msem_slot;
#else
GBLREF int			mutex_sock_fd;
GBLREF fd_set			mutex_wait_on_descs;
#endif
GBLREF	uint4			mutex_per_process_init_pid;
GBLREF	boolean_t		mu_rndwn_file_dbjnl_flush;
#ifdef DEBUG
GBLREF	jnlpool_addrs		jnlpool;
#endif

DECLARE_MUTEX_TRACE_CNTRS

DECLARE_MUTEX_TEST_SIGNAL_FLAG

/* Per-process state shared between the static routines of this module */
static	boolean_t	woke_self;	/* set by mutex_wakeup: TRUE if the wait queue entry it dequeued was this process's own */
static	boolean_t	woke_none;	/* set by mutex_wakeup: stays TRUE unless an entry was dequeued and put back on the free queue */
static	unsigned short	next_rand[3];	/* nrand48() state; seeded by mutex_seed_init */
static	int		optimistic_attempts;	/* # of times sleeping was skipped because nobody appeared to hold crit;
						 * reset by mutex_lock and mutex_long_sleep */
static	int		mutex_expected_wake_instance = 0;	/* 0 or 1; toggled by mutex_sleep every time this process
								 * queues itself, used to recognize stale wake messages */

static	enum cdb_sc	mutex_wakeup(mutex_struct_ptr_t addr);
void			mutex_salvage(gd_region *reg);

error_def(ERR_MUTEXERR);
error_def(ERR_MUTEXFRCDTERM);
error_def(ERR_MUTEXLCKALERT);
error_def(ERR_TEXT);
error_def(ERR_WCBLOCKED);

/*
 *	General:
 *		Uses compare-and-swap logic to obtain/release a semaphore
 *		in shared memory.
 *
 *	Interface:
 *		void gtm_mutex_init(reg, n, crash)
 *			Initialize mutex structure for region reg with n
 *		queue slots. If crash is TRUE, then this is a "crash"
 *		reinitialization; otherwise, it's a "clean" initialization.
 *
 *		enum cdb_sc mutex_lockw(reg, mutex_spin_parms, seq)
 *			Write access to mutex for region reg
 *
 *		enum cdb_sc mutex_lockwim(reg, mutex_spin_parms, seq)
 *			Write access for region reg; if cannot lock,
 *		immediately return cdb_sc_nolock
 *
 *		enum cdb_sc mutex_unlockw(reg, seq);
 *			Unlock write access for region reg
 *
 *		For routines taking the seq argument, if seq != crash count,
 *		return cdb_sc_critreset.
 *
 *
 *	Mutex structure must be quadword aligned
 *
 *
 *	Mutex structure :
 *
 *		---------------------------------
 *		|	   semaphore		|
 *		---------------------------------
 *		|	  crash count		|
 *		---------------------------------
 *		|       # of que slots		|
 *		--------------------------------
 *		|_ fl waiting process que head _|
 *		|_ bl			       _|
 *		|_ global_latch		       _|
 *		---------------------------------
 *		|_ fl unused slots queue head  _|
 *		|_ bl			       _|
 *		|_ global_latch		       _|
 *		---------------------------------
 *		|_ fl	first queue entry      _|
 *		|_ bl			       _|
 *		|_ pid			       _|
 *		|  super_crit [CCP use only]^   |
 *		---------------------------------
 *		|_ fl	second queue entry     _|
 *		|_ bl			       _|
 *		|_ pid			       _|
 *		|  super_crit [CCP use only]^   |
 *		---------------------------------
 *		:	:	:	:	:
 *		---------------------------------
 *		|_ fl	last queue entry       _|
 *		|_ bl			       _|
 *		|_ pid			       _|
 *		|  super_crit [CCP use only]^   |
 *		---------------------------------
 *
 *		^Note:  only one entry at a time (at the head of the
 *		        waiting process queue) will ever use "super_crit".
 *		        CCP is used in VMS only - 03/11/98
 *              07-31-2002 se: super-crit is not used at all anymore. Comments are left for historical purposes.
 *
 *		Fields may be interspersed with fillers for alignment purposes.
 */

/* Set up the mutex structure at addr with n queue slots.
 *
 * Both the waiting-process queue head and the free queue head are emptied and their latches made available,
 * the n queue entries that follow the free queue head in memory are inserted one by one on the free queue,
 * and the semaphore is marked LOCK_AVAILABLE. In MUTEX_MSEM_WAKE builds each entry's wake msem is initialized
 * in the locked state as part of the same loop.
 *
 * crash	- when FALSE, the crash count is reset to 0 and its latch is made available as well;
 *		  when TRUE, both are left as they are.
 *
 * Issues ERR_MUTEXERR through rts_error if a wake msem cannot be initialized or if INSQTI reports
 * an interlock failure.
 */
static	void	clean_initialize(mutex_struct_ptr_t addr, int n, bool crash)
{
	mutex_que_entry_ptr_t	q_free_entry;
#	if defined(MUTEX_MSEM_WAKE) && !defined(POSIX_MSEM)
	msemaphore		*status;
#	endif

	assert(n > 0);
	addr->queslots = n;
	/* Initialize the waiting process queue to be empty */
	addr->prochead.que.fl = addr->prochead.que.bl = 0;
	SET_LATCH_GLOBAL(&addr->prochead.latch, LOCK_AVAILABLE);
	/* Initialize the free queue to be empty */
	addr->freehead.que.fl = addr->freehead.que.bl = 0;
	SET_LATCH_GLOBAL(&addr->freehead.latch, LOCK_AVAILABLE);
	/* Clear the first free entry */
	q_free_entry = (mutex_que_entry_ptr_t)((sm_uc_ptr_t)&addr->freehead + SIZEOF(mutex_que_head));
	q_free_entry->que.fl = q_free_entry->que.bl = 0;
	q_free_entry->pid = 0;
	q_free_entry->super_crit = (void *)NULL;
	q_free_entry->mutex_wake_instance = 0;
	/* One iteration per queue slot; q_free_entry is advanced by the post-increment inside the INSQTI call below */
	while (n--)
	{
#		ifdef MUTEX_MSEM_WAKE
#		  ifdef POSIX_MSEM
		if (-1 == sem_init(&q_free_entry->mutex_wake_msem, TRUE, 0))  /* Shared lock with no initial resources (locked) */
#		  else
		if ((NULL == (status = msem_init(&q_free_entry->mutex_wake_msem, MSEM_LOCKED))) || ((msemaphore *)-1 == status))
#		  endif
			rts_error(VARLSTCNT(7) ERR_MUTEXERR, 0, ERR_TEXT, 2,
				RTS_ERROR_TEXT("Error with mutex wait memory semaphore initialization"), errno);
#		endif
		/* Initialize fl,bl links to 0 before INSQTI as it (gtm_insqti in relqueopi.c) asserts this */
		DEBUG_ONLY(((que_ent_ptr_t)q_free_entry)->fl = 0;)
		DEBUG_ONLY(((que_ent_ptr_t)q_free_entry)->bl = 0;)
		if (INTERLOCK_FAIL == INSQTI((que_ent_ptr_t)q_free_entry++, (que_head_ptr_t)&addr->freehead))
			rts_error(VARLSTCNT(6) ERR_MUTEXERR, 0, ERR_TEXT, 2,
				RTS_ERROR_TEXT("Interlock instruction failure in mutex initialize"));
	}
	SET_LATCH_GLOBAL(&addr->semaphore, LOCK_AVAILABLE);
	if (!crash)
	{	/* Clean (non-crash) initialization also starts the crash count afresh */
		SET_LATCH(&addr->crashcnt, 0);
		SET_LATCH_GLOBAL(&addr->crashcnt_latch, LOCK_AVAILABLE);
	}
	return;
}

/* "Crash" reinitialization of the mutex structure at addr (n queue slots).
 *
 * Increments the crash count, then follows the forward links of the waiting-process queue, waking every
 * queued process other than this one. As soon as the walk reaches the end of the queue, or finds a link
 * that points outside the queue entries or is misaligned, clean_initialize(addr, n, crash) is invoked
 * and the function returns; that is the only way out of the loop.
 */
static	void	crash_initialize(mutex_struct_ptr_t addr, int n, bool crash)
{
	/*
	 * mutex_wake_proc() is not declared here because its return value
	 * is left unspecified in its definition (see mutex_wake_proc.c)
	 */
	mutex_que_entry_ptr_t	next_entry;

	INCR_CNT(&addr->crashcnt, &addr->crashcnt_latch);
	addr->freehead.que.fl = addr->freehead.que.bl = 0;
	next_entry = (mutex_que_entry_ptr_t)&addr->prochead;
	do
	{
		if (0 == next_entry->que.fl)
		{
			/* Wait queue empty; do a clean initialization */
			clean_initialize(addr, n, crash);
			return;
		}
		/* que.fl is a byte offset relative to the current entry */
		next_entry = (mutex_que_entry_ptr_t)((sm_uc_ptr_t)next_entry + next_entry->que.fl);
		if (next_entry <= (mutex_que_entry_ptr_t)&addr->prochead ||
		    next_entry >= (mutex_que_entry_ptr_t)&addr->prochead + n + 1 ||
		    (0 != ((INTPTR_T)next_entry & (SIZEOF(mutex_que_entry) - 1))))
		{
			/*
			 * next_entry == &addr->prochead => loop is done;
			 * next_entry below queue head => queue is corrupt;
			 * next_entry above queue top => queue is corrupt;
			 * next_entry is not (SIZEOF(queue) entry)-byte
			 * aligned => queue is corrupt ...
			 * ... in all cases do a clean initialization
			 */
			clean_initialize(addr, n, crash);
			return;
		}
		/* Wake up process */
		if (next_entry->pid != process_id)
#			ifdef MUTEX_MSEM_WAKE
			mutex_wake_proc(&next_entry->mutex_wake_msem);
#			else
			mutex_wake_proc((sm_int_ptr_t)&next_entry->pid, next_entry->mutex_wake_instance);
#			endif
	} while (TRUE);
}

/* Wait to be woken up after mutex_sleep has placed this process on the wait queue of the mutex at addr.
 *
 * Before sleeping, if the semaphore looks free and the optimistic attempt limit has not been exceeded,
 * mutex_wakeup is tried instead of sleeping. Otherwise the process waits in a loop:
 *   - MUTEX_MSEM_WAKE builds block on the wake msem chosen by mutex_sleep (MSEM_LOCKW); EINTR is retried
 *     until heartbeat_counter has moved past the limit computed on the first interrupt.
 *   - other builds select() on mutex_sock_fd with a timeout of MUTEX_CONST_TIMEOUT_VAL seconds plus a
 *     random number of microseconds, halving the timeout after each EINTR, and read the wake message.
 * A woken writer (MUTEX_LOCK_WRITE) returns right away; on a timeout mutex_deadlock_check is called and
 * in the remaining cases mutex_wakeup is invoked to wake the process at the head of the wait queue.
 *
 * Returns cdb_sc_normal when the caller should try for the lock again, or cdb_sc_dbccerr if mutex_wakeup
 * reported too many interlock failures.
 */
static	enum cdb_sc mutex_long_sleep(mutex_struct_ptr_t addr, mutex_lock_t mutex_lock_type)
{
	enum cdb_sc		status;
	boolean_t		wakeup_status;
#	ifdef MUTEX_MSEM_WAKE
	uint4                   bad_heartbeat;
#	else
	struct timeval		timeout;
	int			timeout_threshold;
	struct sockaddr_un	mutex_woke_me_proc;
	GTM_SOCKLEN_TYPE	mutex_woke_me_proc_len;
	mutex_wake_msg_t	mutex_wake_msg[2];
	int			sel_stat;
	ssize_t			nbrecvd;
	int			timeout_intr_slpcnt;
	long			timeout_val;
#	endif
#	ifdef DEBUG
	if (gtm_white_box_test_case_enabled
		&& (WBTEST_SENDTO_EPERM == gtm_white_box_test_case_number))
	{
		FPRINTF(stderr, "MUPIP BACKUP is about to start long sleep\n");
	}
#	endif
	/* Optimistic path: nobody seems to hold the lock, so try waking the head of the queue rather than sleeping */
	if (LOCK_AVAILABLE == addr->semaphore.u.parts.latch_pid && ++optimistic_attempts <= MUTEX_MAX_OPTIMISTIC_ATTEMPTS)
	{
		MUTEX_DPRINT2("%d: Nobody in crit (II) wake procs\n", process_id);
		MUTEX_TRACE_CNTR(mutex_trc_mutex_slp_fn_noslp);
		status = mutex_wakeup(addr);
		if ((cdb_sc_normal == status) && (woke_self || woke_none))
			return (cdb_sc_normal);
		else if (cdb_sc_dbccerr == status)
			return (cdb_sc_dbccerr);
	}
	optimistic_attempts = 0;
	do
	{
#		ifdef MUTEX_MSEM_WAKE
		/* My msemaphore is already used by another process.
		 * In other words, I was woken up, but missed my wakeup call.
		 * I should return immediately.
		 */
		if (msem_slot->pid != process_id)
			wakeup_status = TRUE;
		else
		{
			bad_heartbeat = 0;	/* 0 means "no interrupt seen yet", so no heartbeat limit computed yet */
			/*
			 * the check for EINTR below is valid and should not be converted to an EINTR
			 * wrapper macro, because another condition is checked for the while loop.
			 */
			while (!(wakeup_status = (0 == MSEM_LOCKW(mutex_wake_msem_ptr))))
			{
				if (EINTR == errno)
				{
					if (bad_heartbeat)	/* to save memory reference and calc on fast path */
					{
						if (bad_heartbeat < heartbeat_counter)
						{
							MUTEX_DPRINT3("%d: msem sleep done, heartbeat_counter = %d\n",
								     process_id, heartbeat_counter);
							break;
						}
						MUTEX_DPRINT3("%d: msem sleep continue, heartbeat_counter = %d\n",
							      process_id, heartbeat_counter);
					} else
						bad_heartbeat = heartbeat_counter + MUTEX_MAX_HEARTBEAT_WAIT - 1;
					/* -1 since we were interrupted this time */
				} else
					rts_error(VARLSTCNT(7) ERR_MUTEXERR, 0, ERR_TEXT, 2,
						RTS_ERROR_TEXT("Error with mutex wake msem"), errno);

			}
			/* wakeup_status is set to true, if I was able to lock...somebody woke me up;
			 * wakeup_status is set to false, if I timed out and should go to recovery.
			 */
		}
#		else
		do
		{
			timeout.tv_sec = MUTEX_CONST_TIMEOUT_VAL;
			timeout.tv_usec = (gtm_tv_usec_t)(nrand48(next_rand) & ((1U << MUTEX_NUM_WAIT_BITS) - 1)) + 1;
			timeout_val = timeout.tv_sec * ONE_MILLION + timeout.tv_usec;	/* total timeout in microseconds */
			/*
			 * Can add backoff logic here to increase the timeout
			 * as the number of attempts increase
			 */
			timeout_intr_slpcnt = MUTEX_INTR_SLPCNT;
			MUTEX_DPRINT4("%d: Sleeping for %d s %d us\n", process_id, timeout.tv_sec, timeout.tv_usec);
			FD_SET(mutex_sock_fd, &mutex_wait_on_descs);
			MUTEX_TRACE_CNTR(mutex_trc_slp);
			/*
			 * the check for EINTR below is valid and should not be converted to an EINTR
			 * wrapper macro, since it might be a timeout.
			 */
			while (-1 == (sel_stat =
				select(mutex_sock_fd + 1, &mutex_wait_on_descs, (fd_set *)NULL, (fd_set *)NULL, &timeout)))
			{
				if (EINTR == errno)
				{	/* somebody interrupted me, reduce the timeout by half and continue */
					MUTEX_TRACE_CNTR(mutex_trc_slp_intr);
					if (!(timeout_intr_slpcnt--)) /* Assume timed out */
					{
						sel_stat = 0;
						MUTEX_TRACE_CNTR(mutex_trc_intr_tmout);
						break;
					}
				} else
					rts_error(VARLSTCNT(5) ERR_TEXT, 2,
						RTS_ERROR_TEXT("Error with mutex select. Running in degraded mode"), errno);
				timeout_val >>= 1;
				timeout.tv_sec = timeout_val / ONE_MILLION;
				timeout.tv_usec = (gtm_tv_usec_t)(timeout_val % ONE_MILLION);
				MUTEX_DPRINT4("%d: Interrupted select, new timeout %d s %d us\n", process_id, timeout.tv_sec,
					timeout.tv_usec);
				/* the next line deals with the case that an interrupted select has changed mutex_wait_on_descs */
				FD_SET(mutex_sock_fd, &mutex_wait_on_descs);
				MUTEX_TRACE_CNTR(mutex_trc_slp);
			}
			if (1 == sel_stat) /* Somebody woke me up */
			{
				mutex_woke_me_proc_len = SIZEOF(struct sockaddr_un);
				/* The receive buffer has room for two messages so a stale one can be drained along with the new one */
				RECVFROM_SOCK(mutex_sock_fd, (void *)&mutex_wake_msg[0], SIZEOF(mutex_wake_msg), 0,
					(struct sockaddr *)&mutex_woke_me_proc,
					(GTM_SOCKLEN_TYPE *)&mutex_woke_me_proc_len, nbrecvd);
				if (SIZEOF(mutex_wake_msg) == nbrecvd) /* Drained out both old and new wake messages */
				{
					MUTEX_TRACE_CNTR(mutex_trc_slp_wkup);
					MUTEX_TRACE_CNTR(mutex_trc_pgybckd_dlyd_wkup);
					MUTEX_DPRINT3("%d: %d woke me up, drained delayed message too\n", process_id,
						mutex_wake_msg[1].pid);
					wakeup_status = TRUE;
					break;
				}
				/* mutex_sleep toggled mutex_expected_wake_instance after queueing, hence the toggle back here */
				if (BIN_TOGGLE(mutex_expected_wake_instance) == mutex_wake_msg[0].mutex_wake_instance)
				{
					MUTEX_DPRINT3("%d: %d woke me up\n", process_id, mutex_wake_msg[0].pid);
					MUTEX_TRACE_CNTR(mutex_trc_slp_wkup);
					wakeup_status = TRUE;
					break;
				} /* else, old wake msg, ignore */
				MUTEX_DPRINT3("%d: %d sent me delayed wake msg\n", process_id, mutex_wake_msg[0].pid);
				MUTEX_TRACE_CNTR(mutex_trc_xplct_dlyd_wkup);
			} else if (0 == sel_stat) /* Timed out */
			{
				MUTEX_DPRINT2("%d: Sleep done, go wake others\n", process_id);
				MUTEX_TRACE_CNTR(mutex_trc_slp_tmout);
				wakeup_status = FALSE;
				break;
			}
		} while (TRUE);
#		endif
		/*
		 * If I was woken up and am a writer, others are blocking on
		 * me. So, I shall try to get the lock NOW
		 */
		if (wakeup_status)
		{
			if (MUTEX_LOCK_WRITE == mutex_lock_type)
				return (cdb_sc_normal);
		} else
			mutex_deadlock_check(addr);	/* Timed out: See if any deadlocks and fix if detected */
		status = mutex_wakeup(addr); /* Timed out or reader. In case
					      * of reader this causes
					      * accelerated wakeup of readers
					      * in the queue */
		if (cdb_sc_dbccerr == status)
			return (cdb_sc_dbccerr);
		/* else status is cdb_sc_normal */
		if (wakeup_status || woke_self || woke_none)
			return (cdb_sc_normal);
		/*
		 * There are others above me in the queue or I missed my
		 * wakeup call. In the latter case, select or msem_lock will return
		 * immediately and there won't be further sleeps.
		 */
	} while (TRUE);
}

/* Queue this process on the wait queue of csa's mutex and go to sleep until woken up or timed out.
 *
 * If nobody appears to hold the lock and the optimistic attempt limit has not been exceeded, no queueing
 * is done; the head of the wait queue is woken instead (mutex_wakeup) and its status is returned. The limit
 * test uses "<=" so that it agrees with the identical optimistic check made in mutex_long_sleep.
 * Otherwise a slot is removed from the free queue, stamped with this process's pid and wake instance,
 * inserted at the tail of the wait queue, and mutex_long_sleep does the actual waiting.
 *
 * Returns:
 *	the status of mutex_wakeup		- when sleeping was skipped (optimistic path)
 *	the status of mutex_long_sleep		- after this process was queued
 *	cdb_sc_normal				- when no free slot could be found even after
 *						  MUTEX_MAX_WAIT_FOR_PROGRESS_CNTR waits of about a second each
 *	cdb_sc_dbccerr				- after too many interlock failures on either queue
 */
static	enum cdb_sc mutex_sleep(sgmnt_addrs *csa, mutex_lock_t mutex_lock_type)
{
	/* Insert this process at the tail of the wait queue and hibernate */
	mutex_struct_ptr_t	addr;
	mutex_que_entry_ptr_t	free_slot;
	int			redo_cntr;
	int			queue_retry_counter_remq,
			        quant_retry_counter_remq,
				queue_retry_counter_insq,
				quant_retry_counter_insq;
#	ifdef MUTEX_MSEM_WAKE
	int			rc;
#	endif

	addr = csa->critical;
	MUTEX_TRACE_CNTR(mutex_trc_mutex_slp_fn);
	MUTEX_DPRINT2("%d: In Mutex Sleep\n", process_id);
	if (LOCK_AVAILABLE == addr->semaphore.u.parts.latch_pid) /* there is nobody in crit */
	{
		/*
		 * The above condition is an optimistic check to speed
		 * things up by not letting a process sleep.
		 * In an n-way SMP, there is a possibility that n processes
		 * (at least one writer) might run in a lock-step manner
		 * testing the above condition almost at the same time and
		 * deciding that nobody is in crit. This might go on till
		 * at least one of them grabs crit, or lock attempts cross a
		 * threshold (leading to recovery). This is not desired. To
		 * avoid such a scenario, we test the number of times we have
		 * run into this situation and force ourselves to sleep
		 */
		if (++optimistic_attempts <= MUTEX_MAX_OPTIMISTIC_ATTEMPTS)	/* same limit test as in mutex_long_sleep */
		{
			MUTEX_DPRINT2("%d: Nobody in crit (I) wake procs\n", process_id);
			MUTEX_TRACE_CNTR(mutex_trc_mutex_slp_fn_noslp);
			return (mutex_wakeup(addr));
		}
	}
	redo_cntr = 0;
	quant_retry_counter_remq = QUANT_RETRY;
	do
	{
		queue_retry_counter_remq = QUEUE_RETRY;
		do
		{
			free_slot = (mutex_que_entry_ptr_t)REMQHI((que_head_ptr_t)&addr->freehead);
#			ifdef MUTEX_MSEM_WAKE
			msem_slot = free_slot;	/* remembered for mutex_long_sleep, which checks the slot's pid */
#			endif
			if ((mutex_que_entry_ptr_t)NULL != free_slot &&
			    (mutex_que_entry_ptr_t)INTERLOCK_FAIL != free_slot)
			{
				free_slot->pid = process_id;
				free_slot->mutex_wake_instance = mutex_expected_wake_instance;
#				ifdef MUTEX_MSEM_WAKE
				mutex_wake_msem_ptr = &free_slot->mutex_wake_msem;
				/* this loop makes sure that the msemaphore is locked initially
				 * before the process goes to long sleep
				 */
				do
				{
					rc = MSEM_LOCKNW(mutex_wake_msem_ptr);
				} while (-1 == rc && EINTR == errno);
#				endif
				/*
				 * Significance of mutex_wake_instance field :
				 * -----------------------------------------
				 * After queueing itself, a process
				 * might go to sleep (select call in
				 * mutex_long_sleep) awaiting a wakeup message
				 * or a timeout. It is possible that a wakeup
				 * message might arrive after timeout. In this
				 * case, a later attempt at waiting for a
				 * wakeup message will falsely succeed on an
				 * old wakeup message. We use the
				 * mutex_wake_instance field (value 0 or 1)
				 * to distinguish between an old and a new
				 * wakeup message. Since at any given time
				 * there is at most one entry in the queue for
				 * a process, the only values we need for
				 * mutex_wake_instance are 0 and 1.
				 */
				mutex_expected_wake_instance = BIN_TOGGLE(mutex_expected_wake_instance);
				quant_retry_counter_insq = QUANT_RETRY;
				do
				{
					queue_retry_counter_insq = QUEUE_RETRY;
					do
					{
						if (INTERLOCK_FAIL !=
							INSQTI((que_ent_ptr_t)free_slot, (que_head_ptr_t)&addr->prochead))
						{
							MUTEX_DPRINT3("%d: Inserted %d into wait queue\n", process_id,
									free_slot->pid);
							return (mutex_long_sleep(addr, mutex_lock_type));
						}
					} while (--queue_retry_counter_insq);
					if (!(--quant_retry_counter_insq))
						return (cdb_sc_dbccerr); /* Too many failures */
					rel_quant();
				} while (quant_retry_counter_insq);
			} else if ((mutex_que_entry_ptr_t)NULL == free_slot)
			{
				/* Record queue full event in db file header if applicable.
				 * Take care not to do it for jnlpool which has no concept of a db cache.
				 * In that case csa->hdr is NULL so use CAREFUL_BG_TRACE_PRO_ANY macro.
				 */
				CAREFUL_BG_TRACE_PRO_ANY(csa, mutex_queue_full);
				MUTEX_DPRINT2("%d: Free Queue full\n", process_id);
				/* Wait a second, then try again */
				MICROSEC_SLEEP(ONE_MILLION - 1);
				if (++redo_cntr < MUTEX_MAX_WAIT_FOR_PROGRESS_CNTR)
					break;	/* out of the inner retry loop; the outer loop restarts it with fresh counters */
				/*
				 * When I can't find a free slot in the queue
				 * repeatedly, it means that there is no
				 * progress in the system. A recovery attempt
				 * might be warranted in this scenario. The
				 * trick is to return cdb_sc_normal which in
				 * turn causes another spin-loop initiation (or
				 * recovery when implemented).
				 * The objective of mutex_sleep is achieved
				 * (partially) in that sleep is done, though
				 * queueing isn't.
				 */
				return (cdb_sc_normal);
			} else
			{
				/* secondary interlock failed on an attempt to
				 * remove an entry from the free queue */
				redo_cntr = 0;
			}
		} while (--queue_retry_counter_remq);
		if (redo_cntr)
			quant_retry_counter_remq = QUANT_RETRY + 1;	/* queue-full waits do not count against the retry limit */
		else
			rel_quant();
	} while (--quant_retry_counter_remq);

	return (cdb_sc_dbccerr);
}

/* Wake the process at the head of the wait queue of the mutex at addr.
 *
 * The head entry is removed from the wait queue, returned to the free queue and the process recorded in
 * it is woken (through its msem in MUTEX_MSEM_WAKE builds, through mutex_wake_proc with pid and wake
 * instance otherwise). At most one entry is processed per call.
 *
 * Side effects on file-static flags:
 *	woke_self - set TRUE when the dequeued entry belonged to this process, FALSE otherwise
 *	woke_none - starts TRUE and is cleared only once an entry has been dequeued and re-inserted
 *		    on the free queue
 *
 * Returns cdb_sc_normal (including when the wait queue is empty) or cdb_sc_dbccerr when the interlocked
 * queue operations keep failing past the QUEUE_RETRY/QUANT_RETRY limits.
 */
static	enum cdb_sc mutex_wakeup(mutex_struct_ptr_t addr)
{
	mutex_que_entry_ptr_t	free_entry;
	int			queue_retry_counter_remq,
				quant_retry_counter_remq,
				queue_retry_counter_insq,
				quant_retry_counter_insq;
	uint4			wake_this_pid;
	int			wake_instance;

	woke_self = FALSE;
	woke_none = TRUE;
	quant_retry_counter_remq = QUANT_RETRY;
	do
	{
		queue_retry_counter_remq = QUEUE_RETRY;
		do
		{
			free_entry = (mutex_que_entry_ptr_t)REMQHI((que_head_ptr_t)&addr->prochead);
			if ((mutex_que_entry_ptr_t)NULL != free_entry &&
			    (mutex_que_entry_ptr_t)INTERLOCK_FAIL != free_entry)
			{
				quant_retry_counter_insq = QUANT_RETRY;
				/* Copy what is needed for the wake before the entry goes back to the free queue */
				wake_this_pid = free_entry->pid;
				wake_instance = free_entry->mutex_wake_instance;
#				ifdef MUTEX_MSEM_WAKE
				/*
				 * In case of msem wakeup, the msem has to be
				 * unlocked before returning free_entry to
				 * free queue, or else another process might
				 * use the same msem (in free_entry) for its
				 * sleep.
				 */
				if (wake_this_pid != process_id)
					mutex_wake_proc(&free_entry->mutex_wake_msem);
				else
					woke_self = TRUE;
				/* This makes this entry not belong to any process before
				 * inserting it into the free queue.
				 */
				 free_entry->pid = 0;
#				endif
				do
				{
					queue_retry_counter_insq = QUEUE_RETRY;
					do
					{
						if (INTERLOCK_FAIL !=
							INSQTI((que_ent_ptr_t)free_entry, (que_head_ptr_t)&addr->freehead))
						{
							MUTEX_DPRINT3("%d: Waking up %d\n", process_id, wake_this_pid);
							woke_none = FALSE;
							if (wake_this_pid != process_id)
							{
								MUTEX_TRACE_CNTR(mutex_trc_crit_wk);
#								ifndef MUTEX_MSEM_WAKE
								mutex_wake_proc((sm_int_ptr_t)&wake_this_pid, wake_instance);
#								endif
							} else
							{
								/* With
								 * msem wake,
								 * this can
								 * never
								 * happen */
								woke_self = TRUE;
							}
							return (cdb_sc_normal); /* No more wakes */
						}
					} while (--queue_retry_counter_insq);
					if (!(--quant_retry_counter_insq))
					{
						/* Entry could not be returned to the free queue; still deliver the wake before giving up */
#						ifndef MUTEX_MSEM_WAKE
						if (wake_this_pid != process_id)
							mutex_wake_proc((sm_int_ptr_t)&wake_this_pid, wake_instance);
#						endif
						/* Too many failures */
						return (cdb_sc_dbccerr);
					} else
						rel_quant();
				} while (quant_retry_counter_insq);
			} else if ((mutex_que_entry_ptr_t)NULL == free_entry)
			{
				/* Empty wait queue */
				MUTEX_DPRINT2("%d: Empty wait queue\n", process_id);
				return (cdb_sc_normal);
			} /* else secondary interlock failed */
		} while (--queue_retry_counter_remq);
		if (!(--quant_retry_counter_remq))
			return (cdb_sc_dbccerr); /* Too many queue failures */
		else
			rel_quant();
	} while (quant_retry_counter_remq);

	return (cdb_sc_dbccerr); /* This will never get executed, added to make compiler happy */
}


/* Initialize the mutex structure of region reg with n queue slots.
 * crash selects the flavor: TRUE requests a "crash" reinitialization (crash_initialize),
 * FALSE a "clean" one (clean_initialize).
 */
void	gtm_mutex_init(gd_region *reg, int n, bool crash)
{
	mutex_struct_ptr_t	mutex_area;

	mutex_area = FILE_INFO(reg)->s_addrs.critical;
	if (crash)
		crash_initialize(mutex_area, n, crash);
	else
		clean_initialize(mutex_area, n, crash);
}


/* Spin trying to obtain the write lock (GET_SWAPLOCK on the semaphore) of region reg's mutex.
 *
 * The inner loop makes mutex_hard_spin_count back-to-back attempts (only 1 when num_additional_processors
 * is zero); between such rounds the process either sleeps a random number of microseconds
 * (MUTEX_REAL_SLEEP builds) or gives up its time quantum. The outer loop runs mutex_sleep_spin_count rounds.
 *
 * crash_count		- compared against the mutex's crash count before every attempt
 * attempt_recovery	- if non-zero, mutex_salvage(reg) is called once, after the first failed attempt
 * mutex_lock_type	- used here only for tracing/debug output
 *
 * Returns:
 *	cdb_sc_normal		- lock obtained; csa->now_crit has been set to TRUE
 *	cdb_sc_critreset	- crash_count no longer matches the mutex's crash count
 *	cdb_sc_nolock		- all spin rounds were used up without obtaining the lock
 */
static enum cdb_sc write_lock_spin(gd_region *reg,
			           mutex_spin_parms_ptr_t mutex_spin_parms,
				   int crash_count,
				   int attempt_recovery,
				   mutex_lock_t mutex_lock_type)
{
	int			write_sleep_spin_count, write_hard_spin_count;
	sgmnt_addrs		*csa;
	mutex_struct_ptr_t	addr;
#	ifdef MUTEX_REAL_SLEEP
	int			micro_sleep_time;
#	endif

	csa = &FILE_INFO(reg)->s_addrs;
	assert(!csa->now_crit);
	addr = csa->critical;
	/* Both counters start at 0 and are loaded from mutex_spin_parms only after a first failed attempt */
	write_sleep_spin_count = 0;
	write_hard_spin_count = 0;
	do
	{
		do
		{
			if (crash_count != addr->crashcnt)
				return (cdb_sc_critreset);
			if (GET_SWAPLOCK(&addr->semaphore))
			{
				MUTEX_DPRINT3("%d: Write %sACQUIRED\n", process_id,
					      (MUTEX_LOCK_WRITE == mutex_lock_type) ? "" : "IMMEDIATE ");
				MUTEX_TEST_SIGNAL_HERE("WRTLCK NOW CRIT\n", FALSE);
				csa->now_crit = TRUE;
				MUTEX_TEST_SIGNAL_HERE("WRTLCK SUCCESS\n", FALSE);
				return (cdb_sc_normal);
			} else if (attempt_recovery)
			{
				mutex_salvage(reg);
				attempt_recovery = FALSE;	/* salvage is attempted only once per call */
			}
			if (!write_hard_spin_count)	/* save memory reference on fast path */
				write_hard_spin_count = num_additional_processors ? mutex_spin_parms->mutex_hard_spin_count : 1;
		} while (--write_hard_spin_count);
		/* Sleep for a very short duration */
#		ifdef MUTEX_TRACE
		if (MUTEX_LOCK_WRITE == mutex_lock_type)
			MUTEX_TRACE_CNTR(mutex_trc_wt_short_slp);
		else
			MUTEX_TRACE_CNTR(mutex_trc_wtim_short_slp);
#		endif
#		ifdef MUTEX_REAL_SLEEP
		micro_sleep_time = (nrand48(next_rand) & mutex_spin_parms->mutex_spin_sleep_mask) + 1;
		assert(micro_sleep_time < ONE_MILLION);
		assert(FALSE == csa->now_crit);
		MICROSEC_SLEEP(micro_sleep_time);
#		else
		rel_quant();
#		endif
		if (!write_sleep_spin_count)	/* save memory reference on fast path */
			write_sleep_spin_count = mutex_spin_parms->mutex_sleep_spin_count;
	} while (--write_sleep_spin_count);
	MUTEX_DPRINT4("%d: Could not acquire WRITE %sLOCK, held by %d\n", process_id,
		(MUTEX_LOCK_WRITE == mutex_lock_type) ? "" : "IMMEDIATE ", addr->semaphore.u.parts.latch_pid);
	return (cdb_sc_nolock);
}

/* Common driver behind mutex_lockw and mutex_lockwim.
 *
 * For MUTEX_LOCK_WRITE_IMMEDIATE a single write_lock_spin call is made and its status returned as is.
 * For MUTEX_LOCK_WRITE the function alternates between write_lock_spin and mutex_sleep until the lock is
 * obtained, the crash count changes, or mutex_sleep reports cdb_sc_dbccerr. Once the alert condition is
 * reached (MUTEX_LCKALERT_PERIOD heartbeats in MUTEX_MSEM_WAKE builds, max_lock_attempts failed attempts
 * otherwise), ERR_MUTEXLCKALERT is sent if a process is recorded as being in crit, the counters are reset,
 * and the next write_lock_spin call is asked to attempt recovery.
 *
 * Returns cdb_sc_normal, cdb_sc_critreset, cdb_sc_dbccerr, or (immediate flavor only) cdb_sc_nolock.
 */
static enum cdb_sc mutex_lock(gd_region *reg,
			      mutex_spin_parms_ptr_t mutex_spin_parms,
			      int crash_count,
			      int max_lock_attempts,
			      mutex_lock_t mutex_lock_type)
{
	int			lock_attempts;
	sgmnt_addrs		*csa;
	enum cdb_sc		status;
	boolean_t		alert;
#	ifdef MUTEX_MSEM_WAKE
	uint4			alert_heartbeat_counter = 0;
#	endif
	uint4			in_crit_pid;

	/* Check that "mutex_per_process_init" has happened before we try to grab crit and that it was done with our current
	 * pid (i.e. ensure that even in the case where parent did the mutex init with its pid and did a fork, the child process
	 * has done a reinitialization with its pid). The only exception is if we are in "mu_rndwn_file" in which case we
	 * know for sure there is no other pid accessing the database shared memory.
	 */
	assert(mutex_per_process_init_pid == process_id || (0 == mutex_per_process_init_pid) && mu_rndwn_file_dbjnl_flush);
	optimistic_attempts = 0;
	lock_attempts = 0;
	alert = FALSE;
	do
	{
		if (MUTEX_LOCK_WRITE == mutex_lock_type)
		{
			MUTEX_TRACE_CNTR(mutex_trc_w_atmpts);
			/* alert from the previous iteration is handed over as the attempt_recovery flag */
			status = write_lock_spin(reg, mutex_spin_parms, crash_count, alert, mutex_lock_type);
		} else
		{
			assert(MUTEX_LOCK_WRITE_IMMEDIATE == mutex_lock_type);
			return (write_lock_spin(reg, mutex_spin_parms, crash_count, FALSE, mutex_lock_type));
		}
		if (cdb_sc_normal == status || cdb_sc_critreset == status)
			return (status);
		assert(cdb_sc_nolock == status);
#		ifdef MUTEX_MSEM_WAKE
		/* A value of 0 means no alert deadline is currently armed */
		if (0 == alert_heartbeat_counter)
			alert_heartbeat_counter = heartbeat_counter + MUTEX_LCKALERT_PERIOD;
		alert = (heartbeat_counter >= alert_heartbeat_counter);
#		else
		alert = (lock_attempts >= max_lock_attempts);
#		endif
		csa = &FILE_INFO(reg)->s_addrs;
		++lock_attempts;
		if (alert)
		{
			in_crit_pid = csa->nl->in_crit;
			if (in_crit_pid)
				send_msg(VARLSTCNT(5) ERR_MUTEXLCKALERT, 3, DB_LEN_STR(reg), in_crit_pid); /* Alert the admin */
			lock_attempts = 0;
#			ifdef MUTEX_MSEM_WAKE
			alert_heartbeat_counter = 0;
#			endif
		}
		if (cdb_sc_dbccerr == mutex_sleep(csa, mutex_lock_type))
			return (cdb_sc_dbccerr);
	} while (TRUE);
}

/* Obtain write access to the mutex of region reg, sleeping and retrying as needed.
 * Thin wrapper that calls mutex_lock with MUTEX_LOCK_WRITE and MUTEX_MAX_WRITE_LOCK_ATTEMPTS
 * and hands back its status unchanged.
 */
enum cdb_sc mutex_lockw(gd_region *reg, mutex_spin_parms_ptr_t mutex_spin_parms, int crash_count)
{
	enum cdb_sc	lock_status;

	MUTEX_TRACE_CNTR(mutex_trc_lockw);
	lock_status = mutex_lock(reg, mutex_spin_parms, crash_count, MUTEX_MAX_WRITE_LOCK_ATTEMPTS, MUTEX_LOCK_WRITE);
	return lock_status;
}

/* "Immediate" flavor of the write lock for region reg.
 * Thin wrapper that calls mutex_lock with MUTEX_LOCK_WRITE_IMMEDIATE (and a max attempts value of 0)
 * and hands back its status unchanged.
 */
enum cdb_sc mutex_lockwim(gd_region *reg, mutex_spin_parms_ptr_t mutex_spin_parms, int crash_count)
{
	enum cdb_sc	lock_status;

	MUTEX_TRACE_CNTR(mutex_trc_lockwim);
	lock_status = mutex_lock(reg, mutex_spin_parms, crash_count, 0, MUTEX_LOCK_WRITE_IMMEDIATE);
	return lock_status;
}

/* Release write access to the mutex of region reg.
 *
 * crash_count	- must match the mutex's current crash count, else nothing is released
 *
 * Clears csa->now_crit, releases the semaphore and then wakes the process at the head of the
 * wait queue (mutex_wakeup).
 *
 * Returns cdb_sc_critreset if the crash count does not match; otherwise the status of mutex_wakeup.
 */
enum cdb_sc mutex_unlockw(gd_region *reg, int crash_count)
{
	sgmnt_addrs	*csa;

	csa = &FILE_INFO(reg)->s_addrs;
	if (crash_count != csa->critical->crashcnt)
		return (cdb_sc_critreset);
	assert(csa->now_crit);
	MUTEX_TEST_SIGNAL_HERE("WRTUNLCK NOW CRIT\n", FALSE);
	/* now_crit is cleared before the semaphore itself is released */
	csa->now_crit = FALSE;
	assert(csa->critical->semaphore.u.parts.latch_pid == process_id);
	RELEASE_SWAPLOCK(&csa->critical->semaphore);
	MUTEX_DPRINT2("%d: WRITE LOCK RELEASED\n", process_id);
	return (mutex_wakeup(csa->critical));
}

/* Release region reg's mutex semaphore if this process still owns it.
 *
 * This is called after a rel_crit on the same area, so still owning the lock here means csa->now_crit
 * was out of sync with the semaphore. The release is a compare-and-swap keyed on our process_id and
 * image_count, so it does nothing when somebody else (or nobody) holds the lock.
 */
void mutex_cleanup(gd_region *reg)
{
	mutex_struct_ptr_t	crit;

	crit = FILE_INFO(reg)->s_addrs.critical;
	if (!COMPSWAP_UNLOCK(&crit->semaphore, process_id, image_count, LOCK_AVAILABLE, 0))
		return;		/* lock was not ours; nothing to release */
	MUTEX_DPRINT2("%d  mutex_cleanup : released lock\n", process_id);
}

/* Seed next_rand[] (the nrand48 state used for randomized sleep times in this module).
 * The seed is the current time multiplied by process_id; its successive unsigned-short-sized
 * chunks, lowest first, become next_rand[0], next_rand[1] and next_rand[2].
 */
void mutex_seed_init(void)
{
	time_t	seed_bits;
	int	chunk;

	seed_bits = time(NULL) * process_id;
	for (chunk = 0; ; chunk++)
	{
		next_rand[chunk] = (unsigned short)(seed_bits & ((1U << (SIZEOF(unsigned short) * 8)) - 1));
		if (2 == chunk)
			break;		/* all three elements filled */
		seed_bits >>= (SIZEOF(unsigned short) * 8);
	}
}

/* Attempt to recover region reg's mutex when it appears to be held for too long.
 *
 * Looks at the pid recorded in the semaphore and, if there is one, takes one of three actions:
 *   - holder is this very process: release the lock, clear csa->nl->in_crit and request cache recovery;
 *   - holder is no longer alive: send ERR_MUTEXFRCDTERM, clear csa->nl->in_crit, request cache recovery,
 *     release the lock on the dead holder's behalf and clear jnl_buff->blocked if the holder had set it;
 *   - holder is alive: unless disable_sigcont is set, nudge it with continue_proc in case it was stopped.
 * If the mutex was salvaged and the csa has a db file header, the event is traced and ERR_WCBLOCKED is sent.
 * Nothing is done when no pid is recorded in the semaphore.
 */
void mutex_salvage(gd_region *reg)
{
	sgmnt_addrs	*csa;
	int		salvage_status;
	pid_t		holder_pid;
	boolean_t	mutex_salvaged;
	VMS_ONLY(uint4	holder_imgcnt;)
        DCL_THREADGBL_ACCESS;

        SETUP_THREADGBL_ACCESS;

	csa = &FILE_INFO(reg)->s_addrs;
	if (0 != (holder_pid = csa->critical->semaphore.u.parts.latch_pid))
	{
		mutex_salvaged = FALSE;
		VMS_ONLY(holder_imgcnt = csa->critical->semaphore.u.parts.latch_image_count);
		if (holder_pid == process_id VMS_ONLY(&& holder_imgcnt == image_count))
		{	/* We were trying to obtain a lock we already held -- very odd */
			RELEASE_SWAPLOCK(&csa->critical->semaphore);
			csa->nl->in_crit = 0;
			/* Mutex crash repaired, want to do write cache recovery, just in case.
			 * Take care not to do it for jnlpool which has no concept of a db cache.
			 * In that case csa->hdr is NULL so use CAREFUL_SET_TRACEABLE_VAR macro.
			 */
			CAREFUL_SET_TRACEABLE_VAR(csa, TRUE);
			mutex_salvaged = TRUE;
			MUTEX_DPRINT2("%d : mutex salvaged, culprit was our own process\n", process_id);
		} else if (!is_proc_alive(holder_pid, UNIX_ONLY(0) VMS_ONLY(holder_imgcnt)))
		{	/* Release the COMPSWAP lock AFTER setting csa->nl->in_crit to 0 as an assert in
			 * grab_crit (checking that csa->nl->in_crit is 0) relies on this order.
			 */
			send_msg(VARLSTCNT(5) ERR_MUTEXFRCDTERM, 3, holder_pid, REG_LEN_STR(reg));
			csa->nl->in_crit = 0;
			/* Mutex crash repaired, want to do write cache recovery, in case previous holder of crit had set
			 * some cr->in_cw_set to a non-zero value. Not doing cache recovery could cause incorrect
			 * GTMASSERTs in PIN_CACHE_RECORD macro in t_end/tp_tend.
			 * Take care not to do it for jnlpool which has no concept of a db cache.
			 * In that case csa->hdr is NULL so use CAREFUL_SET_TRACEABLE_VAR macro.
			 */
			CAREFUL_SET_TRACEABLE_VAR(csa, TRUE);
			/* Compare-and-swap keyed on the dead holder's pid, so the lock is freed only if it is still recorded as the holder */
			COMPSWAP_UNLOCK(&csa->critical->semaphore, holder_pid, holder_imgcnt, LOCK_AVAILABLE, 0);
			mutex_salvaged = TRUE;
			/* Reset jb->blocked as well if the holder_pid had it set */
			if ((NULL != csa->jnl) && (NULL != csa->jnl->jnl_buff) && (csa->jnl->jnl_buff->blocked == holder_pid))
				csa->jnl->jnl_buff->blocked = 0;
			MUTEX_DPRINT3("%d : mutex salvaged, culprit was %d\n", process_id, holder_pid);
		} else if (FALSE == TREF(disable_sigcont))
		{
			/* The process might have been STOPPED (kill -SIGSTOP). Send SIGCONT and nudge the stopped process forward.
			 * However, skip this call in case of SENDTO_EPERM white-box test, because we do not want the intentionally
			 * stuck process to be awakened prematurely. */
			DEBUG_ONLY(if (!gtm_white_box_test_case_enabled || WBTEST_SENDTO_EPERM != gtm_white_box_test_case_number))
				continue_proc(holder_pid);
		}
		/* Record salvage event in db file header if applicable.
		 * Take care not to do it for jnlpool which has no concept of a db cache.
		 * In that case csa->hdr is NULL so check accordingly.
		 */
		assert((NULL != csa->hdr) || (csa == &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs));
		if (mutex_salvaged && (NULL != csa->hdr))
		{
			BG_TRACE_PRO_ANY(csa, wcb_mutex_salvage); /* no need to use CAREFUL_BG_TRACE_PRO_ANY macro
								   * since we already checked for csa->hdr non-NULL.
								   */
			send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_mutex_salvage"),
				process_id, &csa->ti->curr_tn, DB_LEN_STR(reg));
		}
	}
}

/* Do the per process initialization of mutex stuff. This function should be invoked only once per process. The only
 * exception is the receiver server which could invoke this twice. Once through the receiver server startup command when
 * it does "jnlpool_init" and the second through the child receiver server process initialization. The second initialization
 * is needed to set the mutex structures up to correspond to the child process id (and not the parent pid). The function below
 * has to be coded to ensure that the second call nullifies any effects of the first call.
 */
/* Per-process mutex setup: seeds the random state, starts the heartbeat timer on the first call, and
 * (in builds without MUTEX_MSEM_WAKE) creates this process's wake socket, closing the one left over from
 * an earlier call first. Finishes by recording process_id in mutex_per_process_init_pid.
 */
void	mutex_per_process_init(void)
{
	int4	status;

	assert(process_id != mutex_per_process_init_pid);
	mutex_seed_init();
	if (0 != mutex_per_process_init_pid)
	{	/* Not the first call: the timer has been started already, only the inherited socket needs attention */
#		ifndef MUTEX_MSEM_WAKE
		/* Close socket opened by the first call. But dont delete the socket file as the parent process will do that. */
		assert(FD_INVALID != mutex_sock_fd);
		if (FD_INVALID != mutex_sock_fd)
			CLOSEFILE_RESET(mutex_sock_fd, status);	/* resets "mutex_sock_fd" to FD_INVALID */
#		endif
	} else
	{	/* First call. The heartbeat timer is used
		 * 	1) To periodically check if we have older generation journal files open and if so to close them.
		 *	2) By mutex logic to approximately measure the time spent sleeping while waiting for CRIT or MSEMLOCK.
		 * Linux currently does not support MSEMs. It uses the heartbeat timer only for (1).
		 */
		start_timer((TID)&heartbeat_timer, HEARTBEAT_INTERVAL, heartbeat_timer, 0, NULL);
	}
#	ifndef MUTEX_MSEM_WAKE
	assert(FD_INVALID == mutex_sock_fd);
	mutex_sock_init();
	assert(FD_INVALID != mutex_sock_fd);
#	endif
	mutex_per_process_init_pid = process_id;
}