/**************************************************************** * * * Copyright 2001, 2013 Fidelity Information Services, Inc * * * * This source code contains the intellectual property * * of its copyright holder(s), and is made available * * under a license. If you do not know the terms of * * the license, please stop and do not read further. * * * ****************************************************************/ /* GT.M Mutex Control */ #include "mdef.h" #include "gtm_time.h" /* for time() */ #include "gtm_socket.h" #include "gtm_string.h" #include "gtm_stdlib.h" #include "gtm_unistd.h" #include "gtm_stdio.h" #include #include #include #if defined(__sparc) || defined(__hpux) || defined(__MVS__) || defined(__linux__) || defined(__CYGWIN__) #include "gtm_limits.h" #else #include #endif #include "aswp.h" #include "gdsroot.h" #include "gtm_facility.h" #include "fileinfo.h" #include "gdsbt.h" #include "gdsfhead.h" #include "lockconst.h" #include "interlock.h" #include "filestruct.h" #include "io.h" #include "jnl.h" #include "gdsbgtr.h" #include "mutex.h" #include "relqueopi.h" #include "eintr_wrappers.h" #include "send_msg.h" #include "is_proc_alive.h" #include "compswap.h" #include "gtmsecshr.h" #include "rel_quant.h" #include "add_inter.h" #include "mutex_deadlock_check.h" #include "gt_timer.h" #include "gtmio.h" #include "gtm_c_stack_trace.h" #ifdef DEBUG #include "wbox_test_init.h" #include "repl_msg.h" /* needed by gtmsource.h */ #include "gtmsource.h" /* required for jnlpool GBLREF */ #endif #define QUANT_RETRY 10000 #define QUEUE_RETRY 255 #ifdef MUTEX_MSEM_WAKE #define MUTEX_MAX_HEARTBEAT_WAIT 2 /* so that total wait for both select and msem wait will be the same */ #define MUTEX_LCKALERT_PERIOD 4 #endif /* The following CAREFUL_* macros invoke the corresponding * macros except in the case csa->hdr is NULL. * This is possible if the csa corresponds to the journal pool where there is no notion of a db hdr. * In that case, we skip invoking the * macros. */ #define CAREFUL_SET_TRACEABLE_VAR(CSA, VALUE) \ { \ sgmnt_data_ptr_t lcl_csd; \ \ lcl_csd = CSA->hdr; \ assert((NULL != lcl_csd) \ || (CSA == &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs)); \ if (NULL != lcl_csd) \ SET_TRACEABLE_VAR(CSA->nl->wc_blocked, TRUE); \ } #define CAREFUL_BG_TRACE_PRO_ANY(CSA, EVENT) \ { \ sgmnt_data_ptr_t lcl_csd; \ \ lcl_csd = CSA->hdr; \ assert((NULL != lcl_csd) \ || (CSA == &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs)); \ if (NULL != lcl_csd) \ BG_TRACE_PRO_ANY(CSA, EVENT); \ } GBLREF pid_t process_id; GBLREF int process_exiting; GBLREF uint4 image_count; GBLREF int num_additional_processors; #ifdef MUTEX_MSEM_WAKE GBLREF volatile uint4 heartbeat_counter; # ifdef POSIX_MSEM static sem_t *mutex_wake_msem_ptr = NULL; # else static msemaphore *mutex_wake_msem_ptr = NULL; # endif static mutex_que_entry_ptr_t msem_slot; #else GBLREF int mutex_sock_fd; GBLREF fd_set mutex_wait_on_descs; #endif GBLREF uint4 mutex_per_process_init_pid; #ifdef DEBUG GBLREF jnlpool_addrs jnlpool; GBLREF boolean_t in_mu_rndwn_file; #endif GBLREF jnl_gbls_t jgbl; DECLARE_MUTEX_TRACE_CNTRS DECLARE_MUTEX_TEST_SIGNAL_FLAG static boolean_t woke_self; static boolean_t woke_none; static unsigned short next_rand[3]; static int optimistic_attempts; static int mutex_expected_wake_instance = 0; static enum cdb_sc mutex_wakeup(mutex_struct_ptr_t addr); void mutex_salvage(gd_region *reg); error_def(ERR_MUTEXERR); error_def(ERR_MUTEXFRCDTERM); error_def(ERR_MUTEXLCKALERT); error_def(ERR_ORLBKINPROG); error_def(ERR_TEXT); error_def(ERR_WCBLOCKED); /* * General: * Uses compare-and-swap logic to obtain/release a semaphore * in shared memory. * * Interface: * void gtm_mutex_init(reg, n, crash) * Initialize mutex structure for region reg with n * queue slots. If crash is TRUE, then this is a "crash" * reinitialization; otherwise, it's a "clean" initialization. * * enum cdb_sc mutex_lockw(reg, mutex_spin_parms, seq) * Write access to mutex for region reg * * enum cdb_sc mutex_lockwim(reg, mutex_spin_parms, seq) * Write access for region reg; if cannot lock, * immediately return cdb_sc_nolock * * enum cdb_sc mutex_unlockw(reg, seq); * Unlock write access for region reg * * For routines taking the seq argument, if seq != crash count, * return cdb_sc_critreset. * * * Mutex structure must be quadword aligned * * * Mutex structure : * * --------------------------------- * | semaphore | * --------------------------------- * | crash count | * --------------------------------- * | stuckexec | <-UNIX only * -------------------------------- * | # of que slots | * -------------------------------- * |_ fl waiting process que head _| * |_ bl _| * |_ global_latch _| * --------------------------------- * |_ fl unused slots queue head _| * |_ bl _| * |_ global_latch _| * --------------------------------- * |_ fl first queue entry _| * |_ bl _| * |_ pid _| * | super_crit [CCP use only]^ | * --------------------------------- * |_ fl second queue entry _| * |_ bl _| * |_ pid _| * | super_crit [CCP use only]^ | * --------------------------------- * : : : : : * --------------------------------- * |_ fl last queue entry _| * |_ bl _| * |_ pid _| * | super_crit [CCP use only]^ | * --------------------------------- * * ^Note: only one entry at a time (at the head of the * waiting process queue) will ever use "super_crit". * CCP is used in VMS only - 03/11/98 * 07-31-2002 se: super-crit is not used at all anymore. Comments are left for historical purposes. * * Fields may be interspersed with fillers for alignment purposes. */ static void clean_initialize(mutex_struct_ptr_t addr, int n, bool crash) { mutex_que_entry_ptr_t q_free_entry; # if defined(MUTEX_MSEM_WAKE) && !defined(POSIX_MSEM) msemaphore *status; # endif assert(n > 0); addr->queslots = n; /* Initialize the waiting process queue to be empty */ addr->prochead.que.fl = addr->prochead.que.bl = 0; SET_LATCH_GLOBAL(&addr->prochead.latch, LOCK_AVAILABLE); /* Initialize the free queue to be empty */ addr->freehead.que.fl = addr->freehead.que.bl = 0; SET_LATCH_GLOBAL(&addr->freehead.latch, LOCK_AVAILABLE); /* Clear the first free entry */ q_free_entry = (mutex_que_entry_ptr_t)((sm_uc_ptr_t)&addr->freehead + SIZEOF(mutex_que_head)); q_free_entry->que.fl = q_free_entry->que.bl = 0; q_free_entry->pid = 0; q_free_entry->super_crit = (void *)NULL; q_free_entry->mutex_wake_instance = 0; while (n--) { # ifdef MUTEX_MSEM_WAKE # ifdef POSIX_MSEM if (-1 == sem_init(&q_free_entry->mutex_wake_msem, TRUE, 0)) /* Shared lock with no initial resources (locked) */ # else if ((NULL == (status = msem_init(&q_free_entry->mutex_wake_msem, MSEM_LOCKED))) || ((msemaphore *)-1 == status)) # endif rts_error(VARLSTCNT(7) ERR_MUTEXERR, 0, ERR_TEXT, 2, RTS_ERROR_TEXT("Error with mutex wait memory semaphore initialization"), errno); # endif /* Initialize fl,bl links to 0 before INSQTI as it (gtm_insqti in relqueopi.c) asserts this */ DEBUG_ONLY(((que_ent_ptr_t)q_free_entry)->fl = 0;) DEBUG_ONLY(((que_ent_ptr_t)q_free_entry)->bl = 0;) if (INTERLOCK_FAIL == INSQTI((que_ent_ptr_t)q_free_entry++, (que_head_ptr_t)&addr->freehead)) rts_error(VARLSTCNT(6) ERR_MUTEXERR, 0, ERR_TEXT, 2, RTS_ERROR_TEXT("Interlock instruction failure in mutex initialize")); } SET_LATCH_GLOBAL(&addr->semaphore, LOCK_AVAILABLE); SET_LATCH_GLOBAL((global_latch_t *)&addr->stuckexec, LOCK_AVAILABLE); if (!crash) { SET_LATCH(&addr->crashcnt, 0); SET_LATCH_GLOBAL(&addr->crashcnt_latch, LOCK_AVAILABLE); } return; } static void crash_initialize(mutex_struct_ptr_t addr, int n, bool crash) { /* * mutex_wake_proc() is not declared here because its return value * is left unspecified in its definition (see mutex_wake_proc.c) */ mutex_que_entry_ptr_t next_entry; INCR_CNT(&addr->crashcnt, &addr->crashcnt_latch); addr->freehead.que.fl = addr->freehead.que.bl = 0; next_entry = (mutex_que_entry_ptr_t)&addr->prochead; do { if (0 == next_entry->que.fl) { /* Wait queue empty; do a clean initialization */ clean_initialize(addr, n, crash); return; } next_entry = (mutex_que_entry_ptr_t)((sm_uc_ptr_t)next_entry + next_entry->que.fl); if (next_entry <= (mutex_que_entry_ptr_t)&addr->prochead || next_entry >= (mutex_que_entry_ptr_t)&addr->prochead + n + 1 || (0 != ((INTPTR_T)next_entry & (SIZEOF(mutex_que_entry) - 1)))) { /* * next_entry == &addr->prochead => loop is done; * next_entry below queue head => queue is corrupt; * next_entry above queue top => queue is corrupt; * next_entry is not (SIZEOF(queue) entry)-byte * aligned => queue is corrupt ... * ... in all cases do a clean initialization */ clean_initialize(addr, n, crash); return; } /* Wake up process */ if (next_entry->pid != process_id) # ifdef MUTEX_MSEM_WAKE mutex_wake_proc(&next_entry->mutex_wake_msem); # else mutex_wake_proc((sm_int_ptr_t)&next_entry->pid, next_entry->mutex_wake_instance); # endif } while (TRUE); } static enum cdb_sc mutex_long_sleep(mutex_struct_ptr_t addr, mutex_lock_t mutex_lock_type, sgmnt_addrs *csa) { enum cdb_sc status; boolean_t wakeup_status; # ifdef MUTEX_MSEM_WAKE uint4 bad_heartbeat; # else struct timeval timeout; int timeout_threshold; struct sockaddr_un mutex_woke_me_proc; GTM_SOCKLEN_TYPE mutex_woke_me_proc_len; mutex_wake_msg_t mutex_wake_msg[2]; int sel_stat; ssize_t nbrecvd; int timeout_intr_slpcnt; long timeout_val; # endif # ifdef DEBUG if (gtm_white_box_test_case_enabled && (WBTEST_SENDTO_EPERM == gtm_white_box_test_case_number)) { FPRINTF(stderr, "MUPIP BACKUP is about to start long sleep\n"); } # endif if (LOCK_AVAILABLE == addr->semaphore.u.parts.latch_pid && ++optimistic_attempts <= MUTEX_MAX_OPTIMISTIC_ATTEMPTS) { MUTEX_DPRINT2("%d: Nobody in crit (II) wake procs\n", process_id); MUTEX_TRACE_CNTR(mutex_trc_mutex_slp_fn_noslp); status = mutex_wakeup(addr); if ((cdb_sc_normal == status) && (woke_self || woke_none)) return (cdb_sc_normal); else if (cdb_sc_dbccerr == status) return (cdb_sc_dbccerr); } optimistic_attempts = 0; do { # ifdef MUTEX_MSEM_WAKE /* My msemaphore is already used by another process. * In other words, I was woken up, but missed my wakeup call. * I should return immediately. */ if (msem_slot->pid != process_id) wakeup_status = TRUE; else { bad_heartbeat = 0; /* * the check for EINTR below is valid and should not be converted to an EINTR * wrapper macro, because another condition is checked for the while loop. */ while (!(wakeup_status = (0 == MSEM_LOCKW(mutex_wake_msem_ptr)))) { if (EINTR == errno) { if (bad_heartbeat) /* to save memory reference and calc on fast path */ { if (bad_heartbeat < heartbeat_counter) { MUTEX_DPRINT3("%d: msem sleep done, heartbeat_counter = %d\n", process_id, heartbeat_counter); break; } MUTEX_DPRINT3("%d: msem sleep continue, heartbeat_counter = %d\n", process_id, heartbeat_counter); } else bad_heartbeat = heartbeat_counter + MUTEX_MAX_HEARTBEAT_WAIT - 1; /* -1 since we were interrupted this time */ } else rts_error(VARLSTCNT(7) ERR_MUTEXERR, 0, ERR_TEXT, 2, RTS_ERROR_TEXT("Error with mutex wake msem"), errno); } /* wakeup_status is set to true, if I was able to lock...somebody woke me up; * wakeup_status is set to false, if I timed out and should go to recovery. */ } # else do { timeout.tv_sec = MUTEX_CONST_TIMEOUT_VAL; timeout.tv_usec = (gtm_tv_usec_t)(nrand48(next_rand) & ((1U << MUTEX_NUM_WAIT_BITS) - 1)) + 1; timeout_val = timeout.tv_sec * ONE_MILLION + timeout.tv_usec; /* * Can add backoff logic here to increase the timeout * as the number of attempts increase */ timeout_intr_slpcnt = MUTEX_INTR_SLPCNT; MUTEX_DPRINT4("%d: Sleeping for %d s %d us\n", process_id, timeout.tv_sec, timeout.tv_usec); FD_SET(mutex_sock_fd, &mutex_wait_on_descs); MUTEX_TRACE_CNTR(mutex_trc_slp); /* * the check for EINTR below is valid and should not be converted to an EINTR * wrapper macro, since it might be a timeout. */ while (-1 == (sel_stat = select(mutex_sock_fd + 1, &mutex_wait_on_descs, (fd_set *)NULL, (fd_set *)NULL, &timeout))) { if (EINTR == errno) { /* somebody interrupted me, reduce the timeout by half and continue */ MUTEX_TRACE_CNTR(mutex_trc_slp_intr); if (!(timeout_intr_slpcnt--)) /* Assume timed out */ { sel_stat = 0; MUTEX_TRACE_CNTR(mutex_trc_intr_tmout); break; } } else rts_error(VARLSTCNT(5) ERR_TEXT, 2, RTS_ERROR_TEXT("Error with mutex select. Running in degraded mode"), errno); timeout_val >>= 1; timeout.tv_sec = timeout_val / ONE_MILLION; timeout.tv_usec = (gtm_tv_usec_t)(timeout_val % ONE_MILLION); MUTEX_DPRINT4("%d: Interrupted select, new timeout %d s %d us\n", process_id, timeout.tv_sec, timeout.tv_usec); /* the next line deals with the case that an interrupted select has changed mutex_wait_on_descs */ FD_SET(mutex_sock_fd, &mutex_wait_on_descs); MUTEX_TRACE_CNTR(mutex_trc_slp); } if (1 == sel_stat) /* Somebody woke me up */ { mutex_woke_me_proc_len = SIZEOF(struct sockaddr_un); RECVFROM_SOCK(mutex_sock_fd, (void *)&mutex_wake_msg[0], SIZEOF(mutex_wake_msg), 0, (struct sockaddr *)&mutex_woke_me_proc, (GTM_SOCKLEN_TYPE *)&mutex_woke_me_proc_len, nbrecvd); if (SIZEOF(mutex_wake_msg) == nbrecvd) /* Drained out both old and new wake messages */ { MUTEX_TRACE_CNTR(mutex_trc_slp_wkup); MUTEX_TRACE_CNTR(mutex_trc_pgybckd_dlyd_wkup); MUTEX_DPRINT3("%d: %d woke me up, drained delayed message too\n", process_id, mutex_wake_msg[1].pid); wakeup_status = TRUE; break; } if (BIN_TOGGLE(mutex_expected_wake_instance) == mutex_wake_msg[0].mutex_wake_instance) { MUTEX_DPRINT3("%d: %d woke me up\n", process_id, mutex_wake_msg[0].pid); MUTEX_TRACE_CNTR(mutex_trc_slp_wkup); wakeup_status = TRUE; break; } /* else, old wake msg, ignore */ MUTEX_DPRINT3("%d: %d sent me delayed wake msg\n", process_id, mutex_wake_msg[0].pid); MUTEX_TRACE_CNTR(mutex_trc_xplct_dlyd_wkup); } else if (0 == sel_stat) /* Timed out */ { MUTEX_DPRINT2("%d: Sleep done, go wake others\n", process_id); MUTEX_TRACE_CNTR(mutex_trc_slp_tmout); wakeup_status = FALSE; break; } } while (TRUE); # endif /* * If I was woken up and am a writer, others are blocking on * me. So, I shall try to get the lock NOW */ if (wakeup_status) { if (MUTEX_LOCK_WRITE == mutex_lock_type) return (cdb_sc_normal); } else mutex_deadlock_check(addr, csa); /* Timed out: See if any deadlocks and fix if detected */ status = mutex_wakeup(addr); /* Timed out or reader. In case * of reader this causes * accelerated wakeup of readers * in the queue */ if (cdb_sc_dbccerr == status) return (cdb_sc_dbccerr); /* else status is cdb_sc_normal */ if (wakeup_status || woke_self || woke_none) return (cdb_sc_normal); /* * There are others above me in the queue or I missed my * wakeup call. In the latter case, select or msem_lock will return * immediately and there won't be further sleeps. */ } while (TRUE); } static enum cdb_sc mutex_sleep(sgmnt_addrs *csa, mutex_lock_t mutex_lock_type) { /* Insert this process at the tail of the wait queue and hibernate */ mutex_struct_ptr_t addr; mutex_que_entry_ptr_t free_slot; int redo_cntr; int queue_retry_counter_remq, quant_retry_counter_remq, queue_retry_counter_insq, quant_retry_counter_insq; # ifdef MUTEX_MSEM_WAKE int rc; # endif addr = csa->critical; MUTEX_TRACE_CNTR(mutex_trc_mutex_slp_fn); MUTEX_DPRINT2("%d: In Mutex Sleep\n", process_id); if (LOCK_AVAILABLE == addr->semaphore.u.parts.latch_pid) /* there is nobody in crit */ { /* * The above condition is an optimistic check to speed * things up by not letting a process sleep. * In an n-way SMP, there is a possibility that n processes * (atleast one writer) might run in a lock-step manner * testing the above condition almost at the same time and * deciding that nobody is in crit. This might go on till * atleast one of them grabs crit, or lock attempts cross a * threshold (leading to recovery). This is not desired. To * avoid such a scenario, we test the number of times we have * run into this situation and force ourselves to sleep */ if (++optimistic_attempts < MUTEX_MAX_OPTIMISTIC_ATTEMPTS) { MUTEX_DPRINT2("%d: Nobody in crit (I) wake procs\n", process_id); MUTEX_TRACE_CNTR(mutex_trc_mutex_slp_fn_noslp); return (mutex_wakeup(addr)); } } redo_cntr = 0; quant_retry_counter_remq = QUANT_RETRY; do { queue_retry_counter_remq = QUEUE_RETRY; do { free_slot = (mutex_que_entry_ptr_t)REMQHI((que_head_ptr_t)&addr->freehead); # ifdef MUTEX_MSEM_WAKE msem_slot = free_slot; # endif if (!process_exiting && (NULL != free_slot) && (mutex_que_entry_ptr_t)INTERLOCK_FAIL != free_slot) { free_slot->pid = process_id; free_slot->mutex_wake_instance = mutex_expected_wake_instance; # ifdef MUTEX_MSEM_WAKE mutex_wake_msem_ptr = &free_slot->mutex_wake_msem; /* this loop makes sure that the msemaphore is locked initially * before the process goes to long sleep */ do { rc = MSEM_LOCKNW(mutex_wake_msem_ptr); } while (-1 == rc && EINTR == errno); # endif /* * Significance of mutex_wake_instance field : * ----------------------------------------- * After queueing itself, a process * might go to sleep (select call in * mutex_long_sleep) awaiting a wakeup message * or a timeout. It is possible that a wakeup * message might arrive after timeout. In this * case, a later attempt at waiting for a * wakeup message will falsely succeed on an * old wakeup message. We use the * mutex_wake_instance field (value 0 or 1) * to distinguish between an old and a new * wakeup message. Since at any given time * there is atmost one entry in the queue for * a process, the only values we need for * mutex_wake_instance are 0 and 1. */ mutex_expected_wake_instance = BIN_TOGGLE(mutex_expected_wake_instance); quant_retry_counter_insq = QUANT_RETRY; do { queue_retry_counter_insq = QUEUE_RETRY; do { if (INTERLOCK_FAIL != INSQTI((que_ent_ptr_t)free_slot, (que_head_ptr_t)&addr->prochead)) { MUTEX_DPRINT3("%d: Inserted %d into wait queue\n", process_id, free_slot->pid); return (mutex_long_sleep(addr, mutex_lock_type, csa)); } } while (--queue_retry_counter_insq); if (!(--quant_retry_counter_insq)) return (cdb_sc_dbccerr); /* Too many failures */ rel_quant(); } while (quant_retry_counter_insq); continue; } if ((mutex_que_entry_ptr_t)INTERLOCK_FAIL == free_slot) { /* secondary interlock failed on an attempt to * remove an entry from the free queue */ redo_cntr = 0; continue; } if ((mutex_que_entry_ptr_t)NULL == free_slot) { /* Record queue full event in db file header if applicable. * Take care not to do it for jnlpool which has no concept of a db cache. * In that case csa->hdr is NULL so use CAREFUL_BG_TRACE_PRO_ANY macro. */ CAREFUL_BG_TRACE_PRO_ANY(csa, mutex_queue_full); MUTEX_DPRINT2("%d: Free Queue full\n", process_id); /* * When I can't find a free slot in the queue * repeatedly, it means that there is no * progress in the system. A recovery attempt * might be warranted in this scenario. The * trick is to return cdb_sc_normal which in * turn causes another spin-loop initiation (or * recovery when implemented). * The objective of mutex_sleep is achieved * (partially) in that sleep is done, though * queueing isn't. */ } else assert(process_exiting); /* timers might be off, but this adds CPU load at an awkward time */ MICROSEC_SLEEP(ONE_MILLION - 1); /* Wait a second, then try again */ mutex_deadlock_check(addr, csa); if (++redo_cntr < MUTEX_MAX_WAIT_FOR_PROGRESS_CNTR) break; return (cdb_sc_normal); } while (--queue_retry_counter_remq); if (redo_cntr) quant_retry_counter_remq = QUANT_RETRY + 1; else rel_quant(); } while (--quant_retry_counter_remq); return (cdb_sc_dbccerr); } static enum cdb_sc mutex_wakeup(mutex_struct_ptr_t addr) { mutex_que_entry_ptr_t free_entry; int queue_retry_counter_remq, quant_retry_counter_remq, queue_retry_counter_insq, quant_retry_counter_insq; uint4 wake_this_pid; int wake_instance; woke_self = FALSE; woke_none = TRUE; quant_retry_counter_remq = QUANT_RETRY; do { queue_retry_counter_remq = QUEUE_RETRY; do { free_entry = (mutex_que_entry_ptr_t)REMQHI((que_head_ptr_t)&addr->prochead); if ((mutex_que_entry_ptr_t)NULL != free_entry && (mutex_que_entry_ptr_t)INTERLOCK_FAIL != free_entry) { quant_retry_counter_insq = QUANT_RETRY; wake_this_pid = free_entry->pid; wake_instance = free_entry->mutex_wake_instance; # ifdef MUTEX_MSEM_WAKE /* * In case of msem wakeup, the msem has to be * unlocked before returning free_entry to * free queue, or else another process might * use the same msem (in free_entry) for its * sleep. */ if (wake_this_pid != process_id) mutex_wake_proc(&free_entry->mutex_wake_msem); else woke_self = TRUE; /* This makes this entry not belong to any process before * inserting it into the free queue. */ free_entry->pid = 0; # endif do { queue_retry_counter_insq = QUEUE_RETRY; do { if (INTERLOCK_FAIL != INSQTI((que_ent_ptr_t)free_entry, (que_head_ptr_t)&addr->freehead)) { MUTEX_DPRINT3("%d: Waking up %d\n", process_id, wake_this_pid); woke_none = FALSE; if (wake_this_pid != process_id) { MUTEX_TRACE_CNTR(mutex_trc_crit_wk); # ifndef MUTEX_MSEM_WAKE mutex_wake_proc((sm_int_ptr_t)&wake_this_pid, wake_instance); # endif } else { /* With * msem wake, * this can * never * happen */ woke_self = TRUE; } return (cdb_sc_normal); /* No more wakes */ } } while (--queue_retry_counter_insq); if (!(--quant_retry_counter_insq)) { # ifndef MUTEX_MSEM_WAKE if (wake_this_pid != process_id) mutex_wake_proc((sm_int_ptr_t)&wake_this_pid, wake_instance); # endif /* Too many failures */ return (cdb_sc_dbccerr); } else rel_quant(); } while (quant_retry_counter_insq); } else if ((mutex_que_entry_ptr_t)NULL == free_entry) { /* Empty wait queue */ MUTEX_DPRINT2("%d: Empty wait queue\n", process_id); return (cdb_sc_normal); } /* else secondary interlock failed */ } while (--queue_retry_counter_remq); if (!(--quant_retry_counter_remq)) return (cdb_sc_dbccerr); /* Too many queue failures */ else rel_quant(); } while (quant_retry_counter_remq); return (cdb_sc_dbccerr); /* This will never get executed, added to make compiler happy */ } void gtm_mutex_init(gd_region *reg, int n, bool crash) { if (!crash) clean_initialize((&FILE_INFO(reg)->s_addrs)->critical, n, crash); else crash_initialize((&FILE_INFO(reg)->s_addrs)->critical, n, crash); return; } static enum cdb_sc write_lock_spin(gd_region *reg, mutex_spin_parms_ptr_t mutex_spin_parms, int crash_count, int attempt_recovery, mutex_lock_t mutex_lock_type) { int write_sleep_spin_count, write_hard_spin_count; sgmnt_addrs *csa; mutex_struct_ptr_t addr; # ifdef MUTEX_REAL_SLEEP int micro_sleep_time; # endif csa = &FILE_INFO(reg)->s_addrs; assert(!csa->now_crit); addr = csa->critical; write_sleep_spin_count = 0; write_hard_spin_count = 0; do { do { if (crash_count != addr->crashcnt) return (cdb_sc_critreset); if (GET_SWAPLOCK(&addr->semaphore)) { csa->critical->crit_cycle++; MUTEX_DPRINT3("%d: Write %sACQUIRED\n", process_id, (MUTEX_LOCK_WRITE == mutex_lock_type) ? "" : "IMMEDIATE "); MUTEX_TEST_SIGNAL_HERE("WRTLCK NOW CRIT\n", FALSE); csa->now_crit = TRUE; MUTEX_TEST_SIGNAL_HERE("WRTLCK SUCCESS\n", FALSE); return (cdb_sc_normal); } else if (attempt_recovery) { mutex_salvage(reg); attempt_recovery = FALSE; } if (!write_hard_spin_count) /* save memory reference on fast path */ write_hard_spin_count = num_additional_processors ? mutex_spin_parms->mutex_hard_spin_count : 1; } while (--write_hard_spin_count); /* Sleep for a very short duration */ # ifdef MUTEX_TRACE if (MUTEX_LOCK_WRITE == mutex_lock_type) MUTEX_TRACE_CNTR(mutex_trc_wt_short_slp); else MUTEX_TRACE_CNTR(mutex_trc_wtim_short_slp); # endif # ifdef MUTEX_REAL_SLEEP micro_sleep_time = (nrand48(next_rand) & mutex_spin_parms->mutex_spin_sleep_mask) + 1; assert(micro_sleep_time < ONE_MILLION); assert(FALSE == csa->now_crit); MICROSEC_SLEEP(micro_sleep_time); # else rel_quant(); # endif if (!write_sleep_spin_count) /* save memory reference on fast path */ write_sleep_spin_count = mutex_spin_parms->mutex_sleep_spin_count; } while (--write_sleep_spin_count); MUTEX_DPRINT4("%d: Could not acquire WRITE %sLOCK, held by %d\n", process_id, (MUTEX_LOCK_WRITE == mutex_lock_type) ? "" : "IMMEDIATE ", addr->semaphore.u.parts.latch_pid); return (cdb_sc_nolock); } static enum cdb_sc mutex_lock(gd_region *reg, mutex_spin_parms_ptr_t mutex_spin_parms, int crash_count, mutex_lock_t mutex_lock_type) { boolean_t try_recovery; enum cdb_sc status; int lock_attempts; latch_t local_crit_cycle; pid_t in_crit_pid; sgmnt_addrs *csa; time_t curr_time; uint4 curr_time_uint4, next_alert_uint4; csa = &FILE_INFO(reg)->s_addrs; /* Check that "mutex_per_process_init" has happened before we try to grab crit and that it was done with our current * pid (i.e. ensure that even in the case where parent did the mutex init with its pid and did a fork, the child process * has done a reinitialization with its pid). The only exception is if we are in "mu_rndwn_file" in which case we * know for sure there is no other pid accessing the database shared memory. */ assert((MUTEX_LOCK_WRITE_IMMEDIATE == mutex_lock_type) || (MUTEX_LOCK_WRITE == mutex_lock_type)); assert(mutex_per_process_init_pid == process_id || (0 == mutex_per_process_init_pid) && in_mu_rndwn_file); MUTEX_TRACE_CNTR((MUTEX_LOCK_WRITE == mutex_lock_type) ? mutex_trc_lockw : mutex_trc_lockwim); optimistic_attempts = 0; lock_attempts = 0; local_crit_cycle = 0; /* this keeps us from doing a MUTEXLCKALERT on the first cycle in case the time latch is stale */ try_recovery = jgbl.onlnrlbk; /* salvage lock the first time if we are online rollback thereby reducing unnecessary waits */ do { in_crit_pid = csa->nl->in_crit; lock_attempts++; MUTEX_TRACE_CNTR(mutex_trc_w_atmpts); status = write_lock_spin(reg, mutex_spin_parms, crash_count, try_recovery, mutex_lock_type); if ((cdb_sc_normal == status) || (MUTEX_LOCK_WRITE_IMMEDIATE == mutex_lock_type) || (cdb_sc_critreset == status)) return (status); try_recovery = FALSE; /* only try recovery once per MUTEXLCKALERT */ assert(cdb_sc_nolock == status); time(&curr_time); assert(MAXUINT4 > curr_time); curr_time_uint4 = (uint4)curr_time; next_alert_uint4 = csa->critical->stuckexec.cas_time; if (curr_time_uint4 > next_alert_uint4) { /* We've waited long enough */ if (COMPSWAP_LOCK(&csa->critical->stuckexec.time_latch, next_alert_uint4, 0, (curr_time_uint4 + MUTEXLCKALERT_INTERVAL), 0)) { /* and no one else beat us to it */ MUTEX_DPRINT3("%d: Acquired STUCKEXEC time lock, to trace %d\n", process_id, in_crit_pid); if (process_id == in_crit_pid) { /* This is just a precaution - shouldn't ever happen */ assert(FALSE); csa->now_crit = TRUE; return (cdb_sc_normal); } if (in_crit_pid && (in_crit_pid == csa->nl->in_crit) && is_proc_alive(in_crit_pid, 0)) { /* and we're waiting on some living process */ if (local_crit_cycle == csa->critical->crit_cycle) { /* and things aren't moving */ assert(local_crit_cycle); if (0 == csa->nl->onln_rlbk_pid) { /* not rollback - send_msg after trace less likely to lose process */ GET_C_STACK_FROM_SCRIPT("MUTEXLCKALERT", process_id, in_crit_pid, csa->critical->crit_cycle); send_msg(VARLSTCNT(6) ERR_MUTEXLCKALERT, 4, DB_LEN_STR(reg), in_crit_pid, csa->critical->crit_cycle); try_recovery = TRUE; /* set off a salvage */ continue; /* make sure to act on it soon, likely this process */ } /* If the holding PID belongs to online rollback which holds crit on database and * journal pool for its entire duration, use a different message */ send_msg(VARLSTCNT(5) ERR_ORLBKINPROG, 3, csa->nl->onln_rlbk_pid, DB_LEN_STR(reg)); assert(csa->nl->in_crit == csa->nl->onln_rlbk_pid); } } else { /* nobody home */ local_crit_cycle = csa->critical->crit_cycle; try_recovery = TRUE; /* set off a salvage */ continue; /* make sure to act on it soon, likely this process */ } } else { /* didn't get resource to do the MUTEXLCKALERT and procestuckexec */ MUTEX_DPRINT2("%d: Could not acquire STUCKEXEC time lock", process_id); } } if (0 == local_crit_cycle) local_crit_cycle = csa->critical->crit_cycle; /* sync first time waiter */ if (cdb_sc_dbccerr == mutex_sleep(csa, mutex_lock_type)) return (cdb_sc_dbccerr); } while (TRUE); } /* in UNIX calls to the following two entry points should be replaced by appropriate (perhaps macro) calls to mutex_lock */ enum cdb_sc mutex_lockw(gd_region *reg, mutex_spin_parms_ptr_t mutex_spin_parms, int crash_count) { return (mutex_lock(reg, mutex_spin_parms, crash_count, MUTEX_LOCK_WRITE)); } enum cdb_sc mutex_lockwim(gd_region *reg, mutex_spin_parms_ptr_t mutex_spin_parms, int crash_count) { return (mutex_lock(reg, mutex_spin_parms, crash_count, MUTEX_LOCK_WRITE_IMMEDIATE)); } enum cdb_sc mutex_unlockw(gd_region *reg, int crash_count) { /* Unlock write access to the mutex at addr */ uint4 already_clear; sgmnt_addrs *csa; csa = &FILE_INFO(reg)->s_addrs; if (crash_count != csa->critical->crashcnt) return (cdb_sc_critreset); assert(csa->now_crit); MUTEX_TEST_SIGNAL_HERE("WRTUNLCK NOW CRIT\n", FALSE); csa->now_crit = FALSE; assert(csa->critical->semaphore.u.parts.latch_pid == process_id); RELEASE_SWAPLOCK(&csa->critical->semaphore); MUTEX_DPRINT2("%d: WRITE LOCK RELEASED\n", process_id); return (mutex_wakeup(csa->critical)); } void mutex_cleanup(gd_region *reg) { sgmnt_addrs *csa; /* mutex_cleanup is called after doing a rel_crit on the same area so if we still own the lock it is because csa->now_crit was not in sync with our semaphore. At this point, if we own the lock, go ahead and release it. */ csa = &FILE_INFO(reg)->s_addrs; if (COMPSWAP_UNLOCK(&csa->critical->semaphore, process_id, image_count, LOCK_AVAILABLE, 0)) { MUTEX_DPRINT2("%d mutex_cleanup : released lock\n", process_id); } } void mutex_seed_init(void) { time_t mutex_seed; mutex_seed = time(NULL) * process_id; next_rand[0] = (unsigned short)(mutex_seed & ((1U << (SIZEOF(unsigned short) * 8)) - 1)); mutex_seed >>= (SIZEOF(unsigned short) * 8); next_rand[1] = (unsigned short)(mutex_seed & ((1U << (SIZEOF(unsigned short) * 8)) - 1)); mutex_seed >>= (SIZEOF(unsigned short) * 8); next_rand[2] = (unsigned short)(mutex_seed & ((1U << (SIZEOF(unsigned short) * 8)) - 1)); } void mutex_salvage(gd_region *reg) { sgmnt_addrs *csa; int salvage_status; pid_t holder_pid, onln_rlbk_pid; boolean_t mutex_salvaged; VMS_ONLY(uint4 holder_imgcnt;) DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; csa = &FILE_INFO(reg)->s_addrs; if (0 != (holder_pid = csa->critical->semaphore.u.parts.latch_pid)) { mutex_salvaged = FALSE; VMS_ONLY(holder_imgcnt = csa->critical->semaphore.u.parts.latch_image_count); if (holder_pid == process_id VMS_ONLY(&& holder_imgcnt == image_count)) { /* We were trying to obtain a lock we already held -- very odd */ RELEASE_SWAPLOCK(&csa->critical->semaphore); csa->nl->in_crit = 0; /* Mutex crash repaired, want to do write cache recovery, just in case. * Take care not to do it for jnlpool which has no concept of a db cache. * In that case csa->hdr is NULL so use CAREFUL_SET_TRACEABLE_VAR macro. */ CAREFUL_SET_TRACEABLE_VAR(csa, TRUE); mutex_salvaged = TRUE; MUTEX_DPRINT2("%d : mutex salvaged, culprit was our own process\n", process_id); } else if (!is_proc_alive(holder_pid, UNIX_ONLY(0) VMS_ONLY(holder_imgcnt))) { /* Release the COMPSWAP lock AFTER setting csa->nl->in_crit to 0 as an assert in * grab_crit (checking that csa->nl->in_crit is 0) relies on this order. */ send_msg(VARLSTCNT(5) ERR_MUTEXFRCDTERM, 3, holder_pid, REG_LEN_STR(reg)); csa->nl->in_crit = 0; /* Mutex crash repaired, want to do write cache recovery, in case previous holder of crit had set * some cr->in_cw_set to a non-zero value. Not doing cache recovery could cause incorrect * GTMASSERTs in PIN_CACHE_RECORD macro in t_end/tp_tend. * Take care not to do it for jnlpool which has no concept of a db cache. * In that case csa->hdr is NULL so use CAREFUL_SET_TRACEABLE_VAR macro. */ CAREFUL_SET_TRACEABLE_VAR(csa, TRUE); COMPSWAP_UNLOCK(&csa->critical->semaphore, holder_pid, holder_imgcnt, LOCK_AVAILABLE, 0); mutex_salvaged = TRUE; /* Reset jb->blocked as well if the holder_pid had it set */ if ((NULL != csa->jnl) && (NULL != csa->jnl->jnl_buff) && (csa->jnl->jnl_buff->blocked == holder_pid)) csa->jnl->jnl_buff->blocked = 0; MUTEX_DPRINT3("%d : mutex salvaged, culprit was %d\n", process_id, holder_pid); } else if (!TREF(disable_sigcont)) { /* The process might have been STOPPED (kill -SIGSTOP). Send SIGCONT and nudge the stopped process forward. * However, skip this call in case of SENDTO_EPERM white-box test, because we do not want the intentionally * stuck process to be awakened prematurely. */ DEBUG_ONLY(if (!gtm_white_box_test_case_enabled || WBTEST_SENDTO_EPERM != gtm_white_box_test_case_number)) continue_proc(holder_pid); } /* Record salvage event in db file header if applicable. * Take care not to do it for jnlpool which has no concept of a db cache. * In that case csa->hdr is NULL so check accordingly. */ assert((NULL != csa->hdr) || (csa == &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs)); if (mutex_salvaged && (NULL != csa->hdr)) { BG_TRACE_PRO_ANY(csa, wcb_mutex_salvage); /* no need to use CAREFUL_BG_TRACE_PRO_ANY macro * since we already checked for csa->hdr non-NULL. */ send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_mutex_salvage"), process_id, &csa->ti->curr_tn, DB_LEN_STR(reg)); } } } /* Do the per process initialization of mutex stuff. This function should be invoked only once per process. The only * exception is the receiver server which could invoke this twice. Once through the receiver server startup command when * it does "jnlpool_init" and the second through the child receiver server process initialization. The second initialization * is needed to set the mutex structures up to correspond to the child process id (and not the parent pid). The function below * has to be coded to ensure that the second call nullifies any effects of the first call. */ void mutex_per_process_init(void) { int4 status; assert(process_id != mutex_per_process_init_pid); mutex_seed_init(); # ifndef MUTEX_MSEM_WAKE if (mutex_per_process_init_pid) { /* Close socket opened by the first call. But dont delete the socket file as the parent process will do that. */ assert(FD_INVALID != mutex_sock_fd); if (FD_INVALID != mutex_sock_fd) CLOSEFILE_RESET(mutex_sock_fd, status); /* resets "mutex_sock_fd" to FD_INVALID */ } assert(FD_INVALID == mutex_sock_fd); mutex_sock_init(); assert(FD_INVALID != mutex_sock_fd); # endif mutex_per_process_init_pid = process_id; }