240 lines
11 KiB
C
240 lines
11 KiB
C
/****************************************************************
|
|
* *
|
|
* Copyright 2001, 2013 Fidelity Information Services, Inc *
|
|
* *
|
|
* This source code contains the intellectual property *
|
|
* of its copyright holder(s), and is made available *
|
|
* under a license. If you do not know the terms of *
|
|
* the license, please stop and do not read further. *
|
|
* *
|
|
****************************************************************/
|
|
|
|
#include "mdef.h"
|
|
|
|
#include <sys/types.h>
|
|
#include "gtm_unistd.h"
|
|
#include "gtm_string.h"
|
|
#include <signal.h>
|
|
#include <errno.h>
|
|
#ifdef DEBUG
|
|
#include "gtm_stdio.h"
|
|
#endif
|
|
|
|
#include "gdsroot.h"
|
|
#include "gdsblk.h"
|
|
#include "gtm_facility.h"
|
|
#include "fileinfo.h"
|
|
#include "gdsbt.h"
|
|
#include "gdsfhead.h"
|
|
#include "filestruct.h"
|
|
#include "iosp.h"
|
|
#include "error.h"
|
|
#include "gtmio.h"
|
|
#include "gds_blk_upgrade.h"
|
|
#include "gdsbml.h"
|
|
#ifdef GTM_CRYPT
|
|
#include "gtmcrypt.h"
|
|
#endif
|
|
#include "gdsdbver.h"
|
|
#include "min_max.h"
|
|
#include "gtmimagename.h"
|
|
#include "memcoherency.h"
|
|
#include "gdskill.h"
|
|
#include "gdscc.h"
|
|
#include "jnl.h"
|
|
#include "buddy_list.h" /* needed for tp.h */
|
|
#include "hashtab_int4.h" /* needed for tp.h */
|
|
#include "tp.h"
|
|
|
|
GBLREF gd_region *gv_cur_region;
|
|
GBLREF sgmnt_addrs *cs_addrs;
|
|
GBLREF sgmnt_data_ptr_t cs_data;
|
|
GBLREF volatile int4 fast_lock_count;
|
|
GBLREF boolean_t dse_running;
|
|
GBLREF boolean_t mu_reorg_upgrd_dwngrd_in_prog;
|
|
GBLREF unsigned int t_tries;
|
|
GBLREF uint4 dollar_tlevel;
|
|
GBLREF sgm_info *sgm_info_ptr;
|
|
GBLREF sgmnt_addrs *kip_csa;
|
|
GBLREF jnl_gbls_t jgbl;
|
|
|
|
error_def(ERR_DYNUPGRDFAIL);
|
|
|
|
int4 dsk_read (block_id blk, sm_uc_ptr_t buff, enum db_ver *ondsk_blkver, boolean_t blk_free)
|
|
{
|
|
unix_db_info *udi;
|
|
int4 size, save_errno;
|
|
enum db_ver tmp_ondskblkver;
|
|
sm_uc_ptr_t save_buff = NULL, enc_save_buff;
|
|
boolean_t fully_upgraded, buff_is_modified_after_lseekread;
|
|
int bsiz;
|
|
# ifdef DEBUG
|
|
unsigned int effective_t_tries;
|
|
boolean_t killinprog;
|
|
blk_hdr_ptr_t blk_hdr_val;
|
|
static int in_dsk_read;
|
|
# endif
|
|
/* It is possible that the block that we read in from disk is a V4 format block. The database block scanning routines
|
|
* (gvcst_*search*.c) that might be concurrently running currently assume all global buffers (particularly the block
|
|
* headers) are V5 format. They are not robust enough to handle a V4 format block. Therefore we do not want to
|
|
* risk reading a potential V4 format block directly into the cache and then upgrading it. Instead we read it into
|
|
* a private buffer, upgrade it there and then copy it over to the cache in V5 format. This is the static variable
|
|
* read_reformat_buffer. We could have as well used the global variable "reformat_buffer" for this purpose. But
|
|
* that would then prevent dsk_reads and concurrent dsk_writes from proceeding. We dont want that loss of asynchronocity.
|
|
* Hence we keep them separate. Note that while "reformat_buffer" is used by a lot of routines, "read_reformat_buffer"
|
|
* is used only by this routine and hence is a static instead of a GBLDEF.
|
|
*/
|
|
static sm_uc_ptr_t read_reformat_buffer;
|
|
static int read_reformat_buffer_len;
|
|
# ifdef GTM_CRYPT
|
|
int in_len, gtmcrypt_errno;
|
|
char *in, *out;
|
|
boolean_t is_encrypted;
|
|
# endif
|
|
DCL_THREADGBL_ACCESS;
|
|
|
|
SETUP_THREADGBL_ACCESS;
|
|
/* Note: Even in snapshots, only INTEG requires dsk_read to read FREE blocks. The assert below should be modified
|
|
* if we later introduce a scheme where we can figure out as to who started the snapshots and assert accordingly
|
|
*/
|
|
assert(!blk_free || SNAPSHOTS_IN_PROG(cs_addrs)); /* Only SNAPSHOTS require dsk_read to read a FREE block from the disk */
|
|
assert(0 == in_dsk_read); /* dsk_read should never be nested. the read_reformat_buffer logic below relies on this */
|
|
DEBUG_ONLY(in_dsk_read++;)
|
|
udi = (unix_db_info *)(gv_cur_region->dyn.addr->file_cntl->file_info);
|
|
assert(cs_addrs->hdr == cs_data);
|
|
size = cs_data->blk_size;
|
|
assert (cs_data->acc_meth == dba_bg);
|
|
/* Since cs_data->fully_upgraded is referenced more than once in this module (once explicitly and once in
|
|
* GDS_BLK_UPGRADE_IF_NEEDED macro used below), take a copy of it and use that so all usages see the same value.
|
|
* Not doing this, for example, can cause us to see the database as fully upgraded in the first check causing us
|
|
* not to allocate save_buff (a temporary buffer to hold a V4 format block) at all but later in the macro
|
|
* we might see the database as NOT fully upgraded so we might choose to call the function gds_blk_upgrade which
|
|
* does expect a temporary buffer to have been pre-allocated. It is ok if the value of cs_data->fully_upgraded
|
|
* changes after we took a copy of it since we have a buffer locked for this particular block (at least in BG)
|
|
* so no concurrent process could be changing the format of this block. For MM there might be an issue.
|
|
*/
|
|
fully_upgraded = cs_data->fully_upgraded;
|
|
if (!blk_free && !fully_upgraded) /* No V4->V5 translations required if block is FREE */
|
|
{
|
|
buff_is_modified_after_lseekread = TRUE;
|
|
save_buff = buff;
|
|
if (size > read_reformat_buffer_len)
|
|
{ /* do the same for the reformat_buffer used by dsk_read */
|
|
assert(0 == fast_lock_count); /* this is mainline (non-interrupt) code */
|
|
++fast_lock_count; /* No interrupts in free/malloc across this change */
|
|
if (NULL != read_reformat_buffer)
|
|
free(read_reformat_buffer);
|
|
read_reformat_buffer = malloc(size);
|
|
read_reformat_buffer_len = size;
|
|
--fast_lock_count;
|
|
}
|
|
buff = read_reformat_buffer;
|
|
} else
|
|
buff_is_modified_after_lseekread = FALSE;
|
|
assert(NULL != cs_addrs->nl);
|
|
INCR_GVSTATS_COUNTER(cs_addrs, cs_addrs->nl, n_dsk_read, 1);
|
|
enc_save_buff = buff;
|
|
# ifdef GTM_CRYPT
|
|
is_encrypted = cs_data->is_encrypted;
|
|
if (is_encrypted)
|
|
{
|
|
DBG_ENSURE_PTR_IS_VALID_GLOBUFF(cs_addrs, cs_data, buff);
|
|
enc_save_buff = GDS_ANY_ENCRYPTGLOBUF(buff, cs_addrs);
|
|
DBG_ENSURE_PTR_IS_VALID_ENCTWINGLOBUFF(cs_addrs, cs_data, enc_save_buff);
|
|
}
|
|
# endif
|
|
LSEEKREAD(udi->fd,
|
|
(DISK_BLOCK_SIZE * (cs_data->start_vbn - 1) + (off_t)blk * size),
|
|
enc_save_buff,
|
|
size,
|
|
save_errno);
|
|
assert((0 == save_errno) GTM_TRUNCATE_ONLY(|| (-1 == save_errno)));
|
|
WBTEST_ASSIGN_ONLY(WBTEST_PREAD_SYSCALL_FAIL, save_errno, EIO);
|
|
# ifdef GTM_CRYPT
|
|
if (is_encrypted && (0 == save_errno))
|
|
{
|
|
bsiz = (int)((blk_hdr_ptr_t)enc_save_buff)->bsiz;
|
|
in_len = MIN(cs_data->blk_size, bsiz) - SIZEOF(blk_hdr);
|
|
buff_is_modified_after_lseekread = TRUE;
|
|
/* Do not do encryption/decryption if block is FREE */
|
|
if (!blk_free && (IS_BLK_ENCRYPTED(((blk_hdr_ptr_t)enc_save_buff)->levl, in_len)))
|
|
{ /* Due to concurrency conflicts, we are potentially reading a free block even though blk_free is
|
|
* FALSE. Go ahead and safely "decrypt" such a block, even though it contains no valid contents.
|
|
* We expect GTMCRYPT_DECRYPT to return success even if it is presented with garbage data.
|
|
*/
|
|
ASSERT_ENCRYPTION_INITIALIZED;
|
|
memcpy(buff, enc_save_buff, SIZEOF(blk_hdr));
|
|
in = (char *)(enc_save_buff + SIZEOF(blk_hdr));
|
|
out = (char *)(buff + SIZEOF(blk_hdr));
|
|
GTMCRYPT_DECRYPT(cs_addrs, cs_addrs->encr_key_handle, in, in_len, out, gtmcrypt_errno);
|
|
save_errno = gtmcrypt_errno;
|
|
} else
|
|
memcpy(buff, enc_save_buff, size);
|
|
}
|
|
# endif
|
|
if (!blk_free && (0 == save_errno))
|
|
{ /* See if block needs to be converted to current version. Assuming buffer is at least short aligned */
|
|
assert(0 == (long)buff % 2);
|
|
/* GDSV4 (0) version uses "buff->bver" as a block length so should always be > 0 when M code is running.
|
|
* The only exception is if the block has not been initialized (possible if it is BLK_FREE status in the
|
|
* bitmap). This is possible due to concurrency issues while traversing down the tree. But if we have
|
|
* crit on this region, we should not see these either.
|
|
*/
|
|
assert(!IS_MCODE_RUNNING || !cs_addrs->now_crit || ((blk_hdr_ptr_t)buff)->bver);
|
|
/* Block must be converted to current version (if necessary) for use by internals.
|
|
* By definition, all blocks are converted from/to their on-disk version at the IO point.
|
|
*/
|
|
GDS_BLK_UPGRADE_IF_NEEDED(blk, buff, save_buff, cs_data, &tmp_ondskblkver, save_errno, fully_upgraded);
|
|
DEBUG_DYNGRD_ONLY(
|
|
if (GDSVCURR != tmp_ondskblkver)
|
|
PRINTF("DSK_READ: Block %d being dynamically upgraded on read\n", blk);
|
|
)
|
|
assert((GDSV6 == tmp_ondskblkver) || (NULL != save_buff)); /* never read a V4 block directly into cache */
|
|
if (NULL != ondsk_blkver)
|
|
*ondsk_blkver = tmp_ondskblkver;
|
|
/* a bitmap block should never be short of space for a dynamic upgrade. assert that. */
|
|
assert((NULL == ondsk_blkver) || !IS_BITMAP_BLK(blk) || (ERR_DYNUPGRDFAIL != save_errno));
|
|
/* If we didn't run gds_blk_upgrade which would move the block into the cache, we need to do
|
|
* it ourselves. Note that buff will be cleared by the GDS_BLK_UPGRADE_IF_NEEDED macro if
|
|
* buff and save_buff are different and gds_blk_upgrade was called.
|
|
*/
|
|
if ((NULL != save_buff) && (NULL != buff)) /* Buffer not moved by upgrade, we must move */
|
|
memcpy(save_buff, buff, size);
|
|
}
|
|
if (buff_is_modified_after_lseekread)
|
|
{ /* Normally the disk read (done in LSEEKREAD macro) would do the necessary write memory barrier to make the
|
|
* updated shared memory global buffer contents visible to all other processes as long as they see any later
|
|
* updates done to shared memory by the reader. But in case of a V4 -> V5 upgrade or reading of an encrypted
|
|
* block, the actual disk read would have happened into a different buffer. That would then be used as a
|
|
* source for the upgrade or decryption before placing the final contents in the input global buffer.
|
|
* We now need a write memory barrier before returning from this function to publish this shared memory
|
|
* update to other processes waiting on this read. Note: it is possible in rare cases (e.g. mupip reorg upgrade)
|
|
* that the input buffer is NOT a shared memory buffer in which case the write memory barrier is not necessary
|
|
* but it is not easily possible to identify that and we want to save if checks on the fast path and so do
|
|
* the memory barrier in all cases.
|
|
*/
|
|
SHM_WRITE_MEMORY_BARRIER;
|
|
}
|
|
# ifdef DEBUG
|
|
in_dsk_read--;
|
|
/* Expect t_tries to be 3 if we have crit. Exceptions: gvcst_redo_root_search (where t_tries is temporarily reset
|
|
* for the duration of the redo_root_search and so we should look at the real t_tries in redo_rootsrch_ctxt),
|
|
* gvcst_expand_free_subtree, REORG UPGRADE/DOWNGRADE, DSE (where we grab crit before doing the t_qread irrespective
|
|
* of t_tries), forward recovery (where we grab crit before doing everything)
|
|
*/
|
|
effective_t_tries = UNIX_ONLY( (TREF(in_gvcst_redo_root_search)) ? (TREF(redo_rootsrch_ctxt)).t_tries : ) t_tries;
|
|
effective_t_tries = MAX(effective_t_tries, t_tries);
|
|
killinprog = (NULL != ((dollar_tlevel) ? sgm_info_ptr->kip_csa : kip_csa));
|
|
assert(dse_running || killinprog || jgbl.forw_phase_recovery || mu_reorg_upgrd_dwngrd_in_prog
|
|
|| (cs_addrs->now_crit != (CDB_STAGNATE > effective_t_tries)));
|
|
if (!blk_free && cs_addrs->now_crit && !dse_running && (0 == save_errno))
|
|
{ /* Do basic checks on GDS block that was just read. Do it only if holding crit as we could read
|
|
* uninitialized blocks otherwise. Also DSE might read bad blocks even inside crit so skip checks.
|
|
*/
|
|
blk_hdr_val = (NULL != save_buff) ? (blk_hdr_ptr_t)save_buff : (blk_hdr_ptr_t)buff;
|
|
GDS_BLK_HDR_CHECK(cs_data, blk_hdr_val, fully_upgraded);
|
|
}
|
|
# endif
|
|
return save_errno;
|
|
}
|