341 lines
14 KiB
C
341 lines
14 KiB
C
|
/***************************************************************
|
||
|
* *
|
||
|
* Copyright 2001, 2011 Fidelity Information Services, Inc *
|
||
|
* *
|
||
|
* This source code contains the intellectual property *
|
||
|
* of its copyright holder(s), and is made available *
|
||
|
* under a license. If you do not know the terms of *
|
||
|
* the license, please stop and do not read further. *
|
||
|
* *
|
||
|
****************************************************************/
|
||
|
|
||
|
#include "mdef.h"
|
||
|
|
||
|
#include <errno.h>
|
||
|
#include "gtm_unistd.h" /* fsync() needs this */
|
||
|
#include "gtm_string.h"
|
||
|
|
||
|
#include "gtmio.h" /* this has to come in before gdsfhead.h, for all "open" to be defined
|
||
|
to "open64", including the open in header files */
|
||
|
#include "aswp.h"
|
||
|
#include "gdsroot.h"
|
||
|
#include "gtm_facility.h"
|
||
|
#include "fileinfo.h"
|
||
|
#include "gdsbt.h"
|
||
|
#include "gdsblk.h"
|
||
|
#include "gdsfhead.h"
|
||
|
#include "filestruct.h"
|
||
|
#include "gt_timer.h"
|
||
|
#include "jnl.h"
|
||
|
#include "lockconst.h"
|
||
|
#include "interlock.h"
|
||
|
#include "iosp.h"
|
||
|
#include "gdsbgtr.h"
|
||
|
#include "is_file_identical.h"
|
||
|
#include "dpgbldir.h"
|
||
|
#include "rel_quant.h"
|
||
|
#include "repl_sp.h" /* for F_CLOSE used by the JNL_FD_CLOSE macro */
|
||
|
#include "memcoherency.h"
|
||
|
#include "gtm_dbjnl_dupfd_check.h"
|
||
|
|
||
|
GBLREF volatile int4 db_fsync_in_prog; /* handed to the DB_FSYNC macro when jnl_sub_qio_start() syncs the database file */
|
||
|
GBLREF volatile int4 jnl_qio_in_prog; /* incremented on entry to the latch-protected part of jnl_sub_qio_start(), decremented on every exit */
|
||
|
GBLREF uint4 process_id; /* compared against the pid recorded in the journal buffer's io_in_prog latch */
|
||
|
|
||
|
error_def(ERR_DBFSYNCERR); /* raised through rts_error() when the database fsync reports an error */
|
||
|
error_def(ERR_JNLACCESS); /* sent to the operator log and returned when the journal file write fails */
|
||
|
error_def(ERR_JNLCNTRL); /* returned when jb->dsk does not equal jb->dskaddr modulo jb->size */
|
||
|
error_def(ERR_JNLRDERR); /* NOTE(review): no direct reference in the code visible in this file */
|
||
|
error_def(ERR_JNLWRTDEFER); /* returned when the io_in_prog latch could not be obtained */
|
||
|
error_def(ERR_JNLWRTNOWWRTR); /* returned when this process already owns the io_in_prog latch */
|
||
|
error_def(ERR_PREMATEOF); /* NOTE(review): no direct reference in the code visible in this file */
|
||
|
|
||
|
uint4 jnl_sub_qio_start(jnl_private_control *jpc, boolean_t aligned_write); /* does one journal buffer write; defined below */
|
||
|
void jnl_mm_timer_write(void); /* handler passed to start_timer() by jnl_mm_timer(); defined below */
|
||
|
|
||
|
/* If the second argument is TRUE, then the jnl write is done only upto the previous aligned boundary.
|
||
|
* else the write is done upto the freeaddr */
|
||
|
|
||
|
uint4 jnl_sub_qio_start(jnl_private_control *jpc, boolean_t aligned_write)
|
||
|
{
|
||
|
boolean_t was_wrapped;
|
||
|
int tsz, close_res;
|
||
|
jnl_buffer_ptr_t jb;
|
||
|
int4 free_ptr;
|
||
|
sgmnt_addrs *csa;
|
||
|
sm_uc_ptr_t base;
|
||
|
unix_db_info *udi;
|
||
|
unsigned int status;
|
||
|
int save_errno;
|
||
|
uint4 aligned_dskaddr, dskaddr;
|
||
|
int4 aligned_dsk, dsk;
|
||
|
int aligned_tsz;
|
||
|
sm_uc_ptr_t aligned_base;
|
||
|
uint4 jnl_fs_block_size;
|
||
|
|
||
|
assert(NULL != jpc);
|
||
|
udi = FILE_INFO(jpc->region);
|
||
|
csa = &udi->s_addrs;
|
||
|
jb = jpc->jnl_buff;
|
||
|
if (jb->io_in_prog_latch.u.parts.latch_pid == process_id) /* We already have the lock? */
|
||
|
return ERR_JNLWRTNOWWRTR; /* timer driven io in progress */
|
||
|
jnl_qio_in_prog++;
|
||
|
if (!GET_SWAPLOCK(&jb->io_in_prog_latch))
|
||
|
{
|
||
|
jnl_qio_in_prog--;
|
||
|
assert(0 <= jnl_qio_in_prog);
|
||
|
return ERR_JNLWRTDEFER;
|
||
|
}
|
||
|
if (jb->dsk != (jb->dskaddr % jb->size))
|
||
|
{
|
||
|
RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
|
||
|
jnl_qio_in_prog--;
|
||
|
assert(0 <= jnl_qio_in_prog);
|
||
|
return ERR_JNLCNTRL;
|
||
|
}
|
||
|
if (!JNL_FILE_SWITCHED(jpc))
|
||
|
jpc->fd_mismatch = FALSE;
|
||
|
else
|
||
|
{ /* journal file has been switched; release io_in_prog lock and return */
|
||
|
jpc->fd_mismatch = TRUE;
|
||
|
RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
|
||
|
jnl_qio_in_prog--;
|
||
|
assert(0 <= jnl_qio_in_prog);
|
||
|
return SS_NORMAL;
|
||
|
}
|
||
|
/* Currently we overload io_in_prog_latch to perform the db fsync too. Anyone trying to do a
|
||
|
* jnl_qio_start will first check if a db_fsync is needed and if so sync that before doing any jnl qio.
|
||
|
* Note that since an epoch record is written when need_db_fsync is set to TRUE, we are guaranteed that
|
||
|
* (dskaddr < freeaddr) which is necessary for the jnl_wait --> jnl_write_attempt mechanism (triggered
|
||
|
* by wcs_flu) to actually initiate a call to jnl_qio_start().
|
||
|
*/
|
||
|
if (jb->need_db_fsync)
|
||
|
{
|
||
|
DB_FSYNC(jpc->region, udi, csa, db_fsync_in_prog, save_errno);
|
||
|
if (0 != save_errno)
|
||
|
{
|
||
|
RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
|
||
|
jnl_qio_in_prog--;
|
||
|
assert(0 <= jnl_qio_in_prog);
|
||
|
rts_error(VARLSTCNT(5) ERR_DBFSYNCERR, 2, DB_LEN_STR(jpc->region), save_errno);
|
||
|
assert(FALSE); /* should not come here as the rts_error above should not return */
|
||
|
return ERR_DBFSYNCERR; /* ensure we do not fall through to the code below as we no longer have the lock */
|
||
|
}
|
||
|
jb->need_db_fsync = FALSE;
|
||
|
}
|
||
|
free_ptr = jb->free;
|
||
|
/* The following barrier is to make sure that for the value of "free" that we extract (which may be
|
||
|
* slightly stale but that is not a correctness issue) we make sure we dont write out a stale version of
|
||
|
* the journal buffer contents. While it is possible that we see journal buffer contents that are more
|
||
|
* uptodate than "free", this would only mean writing out a less than optimal number of bytes but again,
|
||
|
* not a correctness issue. Secondary effect is that it also enforces a corresponding non-stale value of
|
||
|
* freeaddr is read and this is relied upon by asserts below.
|
||
|
*/
|
||
|
SHM_READ_MEMORY_BARRIER;
|
||
|
dsk = jb->dsk;
|
||
|
dskaddr = jb->dskaddr;
|
||
|
was_wrapped = (free_ptr < dsk);
|
||
|
jnl_fs_block_size = jb->fs_block_size;
|
||
|
if (aligned_write)
|
||
|
free_ptr = ROUND_DOWN2(free_ptr, jnl_fs_block_size);
|
||
|
assert(!(jb->size % jnl_fs_block_size));
|
||
|
tsz = (free_ptr < dsk ? jb->size : free_ptr) - dsk;
|
||
|
if ((aligned_write && !was_wrapped && (free_ptr <= dsk)) || (NOJNL == jpc->channel))
|
||
|
tsz = 0;
|
||
|
assert(0 <= tsz);
|
||
|
assert(dskaddr + tsz <= jb->freeaddr);
|
||
|
status = SS_NORMAL;
|
||
|
if (tsz)
|
||
|
{ /* ensure that dsk and free are never equal and we have left space for JNL_WRT_START_MASK */
|
||
|
assert(SS_NORMAL == status);
|
||
|
assert((free_ptr > dsk) || (free_ptr < (dsk & JNL_WRT_START_MASK(jb)))
|
||
|
|| (dsk != (dsk & JNL_WRT_START_MASK(jb))));
|
||
|
jb->wrtsize = tsz;
|
||
|
jb->qiocnt++;
|
||
|
base = &jb->buff[dsk + jb->buff_off];
|
||
|
assert((base + tsz) <= (jb->buff + jb->size + jnl_fs_block_size));
|
||
|
assert(NOJNL != jpc->channel);
|
||
|
/* If sync_io is turned on, we would have turned on the O_DIRECT flag on some platforms. That will
|
||
|
* require us to do aligned writes. Both the source buffer and the size of the write need to be aligned
|
||
|
* for this to work on some platforms. The alignment needs to be on a filesystem-block-size granularity.
|
||
|
* If sync_io is not turned on, doing aligned writes saves us from the OS doing a read of the block
|
||
|
* under the covers in case we write only a part of the filesystem block.
|
||
|
* Therefore we do aligned writes no matter what. This means we could be writing some garbage padding
|
||
|
* data out after the last valid journal record jut to fit in the alignment requirements. But that is
|
||
|
* considered okay because as part of writing the EOF record out (for a clean termination), jnl_write
|
||
|
* would have 0-padded the journal buffer for us. So a cleanly shutdown journal file will have 0-padding
|
||
|
* following the EOF record but an actively used journal file might have garbage padding following the
|
||
|
* last valid record. This is considered okay as journal recovery has logic to scan past the garbage and
|
||
|
* locate the last valid record in case of a crash before writing the EOF.
|
||
|
*/
|
||
|
aligned_dsk = ROUND_DOWN2(dsk, jnl_fs_block_size);
|
||
|
aligned_dskaddr = ROUND_DOWN2(dskaddr, jnl_fs_block_size);
|
||
|
aligned_tsz = ROUND_UP2((tsz + (dskaddr - aligned_dskaddr)), jnl_fs_block_size);
|
||
|
aligned_base = (sm_uc_ptr_t)ROUND_DOWN2((uintszofptr_t)base, jnl_fs_block_size);
|
||
|
/* Assert that aligned_dsk never backs up to a point BEFORE where the free pointer is */
|
||
|
assert((aligned_dsk > free_ptr) || (dsk <= free_ptr));
|
||
|
/* Assert that aligned_dskaddr never backs up to a point inside journal file header territory.
|
||
|
* This is because those fields are always updated inside crit and therefore we should
|
||
|
* never touch those while we hold only the jnl qio lock.
|
||
|
*/
|
||
|
assert(JNL_HDR_LEN <= aligned_dskaddr);
|
||
|
/* Assert that both ends of the source buffer for the write falls within journal buffer limits */
|
||
|
assert(aligned_base >= &jb->buff[jb->buff_off]);
|
||
|
assert(aligned_base + aligned_tsz <= &jb->buff[jb->buff_off + jb->size]);
|
||
|
LSEEKWRITE(jpc->channel, (off_t)aligned_dskaddr, aligned_base, aligned_tsz, jpc->status);
|
||
|
status = jpc->status;
|
||
|
if (SS_NORMAL == status)
|
||
|
{ /* update jnl_buff pointers to reflect the successful write to the journal file */
|
||
|
assert(dsk <= jb->size);
|
||
|
assert(jb->io_in_prog_latch.u.parts.latch_pid == process_id);
|
||
|
jpc->new_dsk = dsk + tsz;
|
||
|
if (jpc->new_dsk >= jb->size)
|
||
|
{
|
||
|
assert(jpc->new_dsk == jb->size);
|
||
|
jpc->new_dsk = 0;
|
||
|
}
|
||
|
jpc->new_dskaddr = dskaddr + tsz;
|
||
|
assert(jpc->new_dsk == jpc->new_dskaddr % jb->size);
|
||
|
assert(jb->freeaddr >= jpc->new_dskaddr);
|
||
|
jpc->dsk_update_inprog = TRUE; /* for secshr_db_clnup to clean it up (when it becomes feasible in Unix) */
|
||
|
jb->dsk = jpc->new_dsk;
|
||
|
jb->dskaddr = jpc->new_dskaddr;
|
||
|
jpc->dsk_update_inprog = FALSE;
|
||
|
} else
|
||
|
{
|
||
|
assert(ENOSPC == status);
|
||
|
jb->errcnt++;
|
||
|
if (ENOSPC == status)
|
||
|
jb->enospc_errcnt++;
|
||
|
else
|
||
|
jb->enospc_errcnt = 0;
|
||
|
jnl_send_oper(jpc, ERR_JNLACCESS);
|
||
|
# ifdef GTM_FD_TRACE
|
||
|
if ((EBADF == status) || (ESPIPE == status))
|
||
|
{ /* likely case of D9I11-002714. check if fd is valid */
|
||
|
gtm_dbjnl_dupfd_check();
|
||
|
/* If fd of this journal points to some other database or journal file opened by this process
|
||
|
* the above call would have reset jpc->channel. If it did not get reset, then check
|
||
|
* if the fd in itself is valid and points back to the journal file. If not reset it to NOJNL.
|
||
|
*/
|
||
|
if (NOJNL != jpc->channel)
|
||
|
gtm_check_fd_is_valid(jpc->region, FALSE, jpc->channel);
|
||
|
/* If jpc->channel still did not get reset to NOJNL, it means the file descriptor is valid but
|
||
|
* not sure why we are getting EBADF/ESPIPE errors. No further recovery attempted at this point.
|
||
|
*/
|
||
|
}
|
||
|
# endif
|
||
|
status = ERR_JNLACCESS;
|
||
|
}
|
||
|
}
|
||
|
RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
|
||
|
if ((jnl_closed == csa->hdr->jnl_state) && (NOJNL != jpc->channel))
|
||
|
{
|
||
|
JNL_FD_CLOSE(jpc->channel, close_res); /* sets jpc->channel to NOJNL */
|
||
|
jpc->pini_addr = 0;
|
||
|
}
|
||
|
jnl_qio_in_prog--;
|
||
|
assert(0 <= jnl_qio_in_prog);
|
||
|
return status;
|
||
|
}
|
||
|
|
||
|
/* This is a wrapper for jnl_sub_qio_start that tries to divide the writes into optimal chunks.
|
||
|
* It calls jnl_sub_qio_start() with appropriate arguments in two stages, the first one with
|
||
|
* optimal "jnl_fs_block_size" boundary and the other suboptimal tail end of the write. The latter
|
||
|
* call is made only if no other process has finished the jnl write upto the required point
|
||
|
* during the time this process yields
|
||
|
*/
|
||
|
uint4 jnl_qio_start(jnl_private_control *jpc)
|
||
|
{
|
||
|
unsigned int yield_cnt, status;
|
||
|
uint4 target_freeaddr, lcl_dskaddr, old_freeaddr;
|
||
|
jnl_buffer_ptr_t jb;
|
||
|
sgmnt_addrs *csa;
|
||
|
unix_db_info *udi;
|
||
|
uint4 jnl_fs_block_size;
|
||
|
|
||
|
assert(NULL != jpc);
|
||
|
udi = FILE_INFO(jpc->region);
|
||
|
csa = &udi->s_addrs;
|
||
|
jb = jpc->jnl_buff;
|
||
|
/* this block of code (till yield()) processes the buffer upto an "jnl_fs_block_size" alignment boundary
|
||
|
* and the next block of code (after the yield()) processes the tail end of the data (if necessary)
|
||
|
*/
|
||
|
lcl_dskaddr = jb->dskaddr;
|
||
|
target_freeaddr = jb->freeaddr;
|
||
|
if (lcl_dskaddr >= target_freeaddr)
|
||
|
return SS_NORMAL;
|
||
|
/* ROUND_DOWN2 macro is used under the assumption that "jnl_fs_block_size" would be a power of 2 */
|
||
|
jnl_fs_block_size = jb->fs_block_size;
|
||
|
if (ROUND_DOWN2(lcl_dskaddr, jnl_fs_block_size) != ROUND_DOWN2(target_freeaddr, jnl_fs_block_size))
|
||
|
{ /* data crosses/touches an alignment boundary */
|
||
|
if (SS_NORMAL != (status = jnl_sub_qio_start(jpc, TRUE)))
|
||
|
return status;
|
||
|
} /* else, data does not cross/touch an alignment boundary, yield and see if someone else
|
||
|
* does the dirty job more efficiently
|
||
|
*/
|
||
|
for (yield_cnt = 0; yield_cnt < csa->hdr->yield_lmt; yield_cnt++)
|
||
|
{ /* yield() until someone has finished your job or no one else is active on the jnl file */
|
||
|
old_freeaddr = jb->freeaddr;
|
||
|
rel_quant();
|
||
|
/* Purpose of this memory barrier is to get a current view of asyncrhonously changed fields
|
||
|
* like whether the jnl file was switched, the write position in the journal file and the
|
||
|
* write address in the journal buffer for all the remaining statements in this loop because
|
||
|
* the rel_quant call above allows any and all of them to change and we aren't under any
|
||
|
* locks while in this loop. This is not a correctness issue as we would either eventually
|
||
|
* see the updates or it means we are writing what has already been written. It is a performance
|
||
|
* issue keeping more current with state changes done by other processes on other processors.
|
||
|
*/
|
||
|
SHM_READ_MEMORY_BARRIER;
|
||
|
if (JNL_FILE_SWITCHED(jpc))
|
||
|
return SS_NORMAL;
|
||
|
/* assert(old_freeaddr <= jb->freeaddr) ** Potential race condition with jnl file switch could
|
||
|
* make this assert fail so it is removed
|
||
|
*/
|
||
|
if (old_freeaddr == jb->freeaddr || target_freeaddr <= jb->dskaddr)
|
||
|
break;
|
||
|
}
|
||
|
status = SS_NORMAL;
|
||
|
if (target_freeaddr > jb->dskaddr)
|
||
|
status = jnl_sub_qio_start(jpc, FALSE);
|
||
|
return status;
|
||
|
}
|
||
|
|
||
|
static boolean_t jnl_timer;
|
||
|
void jnl_mm_timer_write(void)
|
||
|
{ /* While this should work by region and use baton passing to more accurately and efficiently perform its task,
|
||
|
* it is currently a blunt instrument
|
||
|
*/
|
||
|
gd_region *reg, *r_top;
|
||
|
gd_addr *addr_ptr;
|
||
|
sgmnt_addrs *csa;
|
||
|
|
||
|
for (addr_ptr = get_next_gdr(NULL); NULL != addr_ptr; addr_ptr = get_next_gdr(addr_ptr))
|
||
|
{ /* since the unix timers don't provide an argument, for now write all regions */
|
||
|
for (reg = addr_ptr->regions, r_top = reg + addr_ptr->n_regions; reg < r_top; reg++)
|
||
|
{
|
||
|
if ((dba_mm == reg->dyn.addr->acc_meth) && reg->open)
|
||
|
{
|
||
|
csa = &FILE_INFO(reg)->s_addrs;
|
||
|
if ((NULL != csa->jnl) && (NOJNL != csa->jnl->channel))
|
||
|
jnl_qio_start(csa->jnl);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
jnl_timer = FALSE;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
/* Start the timer whose handler is jnl_mm_timer_write(), unless jnl_timer shows one is already outstanding.
 *
 * csa : supplies csa->hdr->flush_time[0], which is passed to start_timer (presumably as the delay -- confirm
 *       against start_timer's prototype).
 * reg : only checked (by assert) to be open.
 *
 * While this should work by region and use baton passing to more accurately and efficiently perform its
 * task, it is currently a blunt instrument.
 */
void jnl_mm_timer(sgmnt_addrs *csa, gd_region *reg)
{
	assert(reg->open);
	if (FALSE != jnl_timer)
		return;		/* a timer is already pending; nothing more to do */
	jnl_timer = TRUE;
	start_timer((TID)jnl_mm_timer, csa->hdr->flush_time[0], &jnl_mm_timer_write, 0, NULL);
}
|