fis-gtm/sr_unix/mu_size_impsample.c

282 lines
8.7 KiB
C

/****************************************************************
* *
* Copyright 2012 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "gtm_string.h"
#include "cdb_sc.h"
#include "gdsroot.h"
#include "gdsblk.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "jnl.h"
#include "gdsblkops.h"
#include "gdskill.h"
#include "gdscc.h"
#include "copy.h"
#include "interlock.h"
#include "muextr.h"
#include "mu_reorg.h"
/* Include prototypes */
#include "t_end.h"
#include "t_retry.h"
#include "mupip_size.h"
#include "util.h"
#include "t_begin.h"
#include "op.h"
#include "gvcst_protos.h" /* for gvcst_rtsib,gvcst_search prototype */
#include "gvcst_bmp_mark_free.h"
#include "gvcst_kill_sort.h"
#include "gtmmsg.h"
#include "add_inter.h"
#include "t_abort.h"
#include "sleep_cnt.h"
#include "wcs_sleep.h"
#include "memcoherency.h"
#include "gtm_time.h"
#include "mvalconv.h"
#include "t_qread.h"
#include "longset.h" /* needed for cws_insert.h */
#include "hashtab_int4.h"
#include "cws_insert.h"
#include <math.h>
error_def(ERR_GBLNOEXIST);
error_def(ERR_MUSIZEFAIL);
GBLREF bool mu_ctrlc_occurred;
GBLREF bool mu_ctrly_occurred;
GBLREF gv_namehead *gv_target;
GBLREF unsigned int t_tries;
GBLREF int4 process_id;
GBLREF inctn_opcode_t inctn_opcode;
#define MAX_RECS_PER_BLK 65535
#define MAX_RELIABLE 10000 /* Used to tweak the error estimates */
#define EPS 1e-6
#define SQR(X) ((double)(X) * (double)(X))
#define ROUND(X) ((int)((X) + 0.5)) /* c89 does not have round() and some Solaris machines uses that compiler */
typedef struct
{ /* cumulative stats */
int4 n; /* number of samples */
double W[MAX_BT_DEPTH + 1]; /* Sum of the importance values of samples for each depth level */
double w_mu[MAX_BT_DEPTH + 1]; /* The mean of importance values. It is used to calculate w_variance */
double w_variance[MAX_BT_DEPTH + 1];/* The variance of importance values. It is used to calculate effective sample size */
double mu[MAX_BT_DEPTH + 1]; /* mu[j] := mean of weighted r[j]'s over previous n traversals.
It is the expected number of records at depth j
* Note: mu_n = mu_{n-1} + w_n/W_n*(r_n - M_{n-1})
*/
double S[MAX_BT_DEPTH + 1]; /* S[j] := sum of w_i*(r_i[j] - M[j])^2 over previous traversals.
* Note: S_n = S_{n-1} + w_n*(r_n - M_n)*(r_n - M_{n-1})
* Later, S values are divided by W values to give plugin estimate of the variance.
* Subsequently they are divided by the effective sample size to give the variance
* of the mean
*/
/* Final estimates */
double blktot[MAX_BT_DEPTH + 1]; /* estimated #blocks at each level */
double blkerr[MAX_BT_DEPTH + 1]; /* approximate variance of blktot */
double rectot[MAX_BT_DEPTH + 1]; /* estimated #records at each level */
double B; /* estimated total blocks */
double error; /* approximate error in estimate B */
double R; /* estimated total records */
} stat_t;
STATICFNDCL void clear_vector_impsmpl(double *v);
STATICFNDCL void init_stats_impsmpl(stat_t *stat);
STATICFNDCL void finalize_stats_impsmpl(stat_t *stat);
STATICFNDCL void accum_stats_impsmpl(stat_t *stat, double *r);
STATICFNDEF void clear_vector_impsmpl(double *v)
{
int j;
for (j = 0; j <= MAX_BT_DEPTH; j++)
v[j] = 0;
}
STATICFNDEF void init_stats_impsmpl(stat_t *stat)
{
stat->n = 0;
clear_vector_impsmpl(stat->W);
clear_vector_impsmpl(stat->w_mu);
clear_vector_impsmpl(stat->w_variance);
clear_vector_impsmpl(stat->mu);
clear_vector_impsmpl(stat->S);
clear_vector_impsmpl(stat->blktot);
clear_vector_impsmpl(stat->blkerr);
clear_vector_impsmpl(stat->rectot);
}
/*
* Importance Sampling
*/
int4 mu_size_impsample(mval *gn, int4 M, int4 seed)
{
enum cdb_sc status;
trans_num ret_tn;
int k, h;
boolean_t verify_reads;
boolean_t tn_aborted;
unsigned int lcl_t_tries;
double r[MAX_BT_DEPTH + 1]; /* r[j] is #records in level j block of current traversal */
stat_t rstat, ustat;
DCL_THREADGBL_ACCESS;
SETUP_THREADGBL_ACCESS;
inctn_opcode = inctn_invalid_op;
op_gvname(VARLSTCNT(1) gn);
if (0 == gv_target->root)
{ /* Global does not exist (online rollback). Not an error. */
gtm_putmsg(VARLSTCNT(4) ERR_GBLNOEXIST, 2, gn->str.len, gn->str.addr);
return EXIT_NRM;
}
if (!seed)
seed = (int4)(time(0) * process_id);
srand48(seed);
/* do M random traversals */
init_stats_impsmpl(&rstat);
for (k = 1; k <= M; k++)
{
if (mu_ctrlc_occurred || mu_ctrly_occurred)
return EXIT_ERR;
t_begin(ERR_MUSIZEFAIL, 0);
for (;;)
{
clear_vector_impsmpl(r);
if (cdb_sc_normal != (status = rand_traverse(r)))
{
assert(CDB_STAGNATE > t_tries);
t_retry(status);
continue;
}
gv_target->clue.end = 0;
gv_target->hist.h[0] = gv_target->hist.h[1]; /* No level 0 block to validate */
DEBUG_ONLY(lcl_t_tries = t_tries);
if ((trans_num)0 == (ret_tn = t_end(&gv_target->hist, NULL, TN_NOT_SPECIFIED)))
{
ABORT_TRANS_IF_GBL_EXIST_NOMORE(lcl_t_tries, tn_aborted);
if (tn_aborted)
{ /* Global does not exist (online rollback). Not an error. */
gtm_putmsg(VARLSTCNT(4) ERR_GBLNOEXIST, 2, gn->str.len, gn->str.addr);
return EXIT_NRM;
}
continue;
}
accum_stats_impsmpl(&rstat, r);
break;
}
}
finalize_stats_impsmpl(&rstat);
/* display rstat data
* Showing the error as 2 standard deviations which is a 95% confidence interval for the
* mean number of blocks at each level
*/
util_out_print("Number of generated samples = !UL", FLUSH, rstat.n);
util_out_print("Level Blocks 2 sigma(+/-)", FLUSH);
for (h = MAX_BT_DEPTH; (h >= 0) && (rstat.blktot[h] < EPS); h--);
for ( ; h > 0; h--)
util_out_print("!5UL !15UL !15UL ~ !3UL%", FLUSH, h, (int)ROUND(rstat.blktot[h]),
(int)ROUND(sqrt(rstat.blkerr[h])*2),
(int)ROUND(sqrt(rstat.blkerr[h])*2/rstat.blktot[h]*100.0)
);
util_out_print("!5UL !15UL !15UL ~ !3UL%", FLUSH, h, (int)ROUND(rstat.blktot[h]),
(int)ROUND(sqrt(rstat.blkerr[h])*2),
(int)ROUND(sqrt(rstat.blkerr[h])*2/rstat.blktot[h]*100.0)
);
util_out_print("Total !15UL !15UL ~ !3UL%", FLUSH, (int)ROUND(rstat.B),
(int)ROUND(sqrt(rstat.error)*2),
(int)ROUND(sqrt(rstat.error)*2/rstat.B*100.0)
);
return EXIT_NRM;
}
STATICFNDEF void accum_stats_impsmpl(stat_t *stat, double *r)
{
int l, root_level, n;
double mu0, w_mu0, w[MAX_BT_DEPTH + 1] /* importance */;
++stat->n;
for (l = MAX_BT_DEPTH; (l >= 0) && (r[l] < EPS); l--)
w[l] = 0;
root_level = l;
assert(root_level >= 0);
w[root_level] = 1;
for (l = root_level - 1; l >= 1; l--)
w[l] = w[l + 1] * r[l + 1]; /* TODO consider using log to avoid overflow if it becomes an issue */
w[0] = 0; /* computing #blks (e.g #recs in lvl 1+), not #recs in lvl 0+ */
for (l = 1; l <= root_level; l++)
{
stat->W[l] += w[l];
w_mu0 = stat->w_mu[l];
stat->w_mu[l] += (w[l] - stat->w_mu[l])/stat->n;
stat->w_variance[l] += (w[l] - stat->w_mu[l])*(w[l] - w_mu0);
mu0 = stat->mu[l];
stat->mu[l] += w[l]/stat->W[l]*(r[l] - stat->mu[l]);
stat->S[l] += w[l]*(r[l] - stat->mu[l])*(r[l] - mu0);
}
}
STATICFNDEF void finalize_stats_impsmpl(stat_t *stat)
{
int h;
double ess; /* effective sample size */
for (h = 1; h <= MAX_BT_DEPTH; h++)
if (stat->W[h] > 0)
{
/* ess = n / ( 1 + Var( w/mu(w) ) ).
* This comes from effective sample size for importance sampling in the literature*/
ess = stat->n / ( 1 + (stat->w_variance[h]/stat->n)/SQR(stat->w_mu[h]) );
/* Variance of the mean (mean referes to avg number of records per block) is
* Var(R)/N where N is effective sample size */
stat->S[h] /= stat->W[h];
stat->S[h] /= (ess + 1);
}
stat->W[0] = stat->n; /* for arithmetic below */
for (h = MAX_BT_DEPTH; (h >= 0) && (stat->mu[h] < EPS); h--);
assert(h >= 0); /* stat->mu[0] should remain zero */
stat->blktot[h] = 1;
stat->blkerr[h] = 0;
for (h-- ; h >= 0; h--)
{
stat->blktot[h] = stat->blktot[h + 1] * stat->mu[h + 1];
/* Var(XY) assuming X and Y are independent = E[X]^2*Var(Y) + E[Y]^2*Var(X) + Var(X)*Var(Y) */
stat->blkerr[h] = SQR(stat->mu[h + 1])*stat->blkerr[h + 1] + SQR(stat->blktot[h + 1])*stat->S[h + 1]
+ stat->blkerr[h + 1]*stat->S[h + 1];
}
stat->B = 0;
stat->error = 0;
for (h = 0; h <= MAX_BT_DEPTH; h++)
{
stat->B += stat->blktot[h];
stat->error += stat->blkerr[h];
}
stat->R = 0;
for (h = 0; h <= MAX_BT_DEPTH; h++)
{
stat->rectot[h] = stat->blktot[h] * stat->mu[h];
stat->R += stat->rectot[h];
}
}