HDFS-15293. Relax the condition for accepting a fsimage when receiving a checkpoint. Contributed by Chen Liang
This commit is contained in:
parent
c84e6beada
commit
7bb902bc0d
|
@ -99,6 +99,19 @@ public class ImageServlet extends HttpServlet {
|
|||
"recent.image.check.enabled";
|
||||
public static final boolean RECENT_IMAGE_CHECK_ENABLED_DEFAULT = true;
|
||||
|
||||
/*
|
||||
* Specify a relaxation for the time delta check, the relaxation is to account
|
||||
* for the scenario that there are chances that minor time difference (e.g.
|
||||
* due to image upload delay, or minor machine clock skew) can cause ANN to
|
||||
* reject a fsImage too aggressively.
|
||||
*/
|
||||
private static double recentImageCheckTimePrecision = 0.75;
|
||||
|
||||
@VisibleForTesting
|
||||
static void setRecentImageCheckTimePrecision(double ratio) {
|
||||
recentImageCheckTimePrecision = ratio;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void doGet(final HttpServletRequest request,
|
||||
final HttpServletResponse response) throws ServletException, IOException {
|
||||
|
@ -592,6 +605,9 @@ public class ImageServlet extends HttpServlet {
|
|||
long checkpointPeriod =
|
||||
conf.getTimeDuration(DFS_NAMENODE_CHECKPOINT_PERIOD_KEY,
|
||||
DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT, TimeUnit.SECONDS);
|
||||
checkpointPeriod = Math.round(
|
||||
checkpointPeriod * recentImageCheckTimePrecision);
|
||||
|
||||
long checkpointTxnCount =
|
||||
conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
|
||||
DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
|
||||
|
@ -612,21 +628,24 @@ public class ImageServlet extends HttpServlet {
|
|||
// a new fsImage
|
||||
// 1. most recent image's txid is too far behind
|
||||
// 2. last checkpoint time was too old
|
||||
response.sendError(HttpServletResponse.SC_CONFLICT,
|
||||
"Most recent checkpoint is neither too far behind in "
|
||||
+ "txid, nor too old. New txnid cnt is "
|
||||
+ (txid - lastCheckpointTxid)
|
||||
+ ", expecting at least " + checkpointTxnCount
|
||||
+ " unless too long since last upload.");
|
||||
String message = "Rejecting a fsimage due to small time delta "
|
||||
+ "and txnid delta. Time since previous checkpoint is "
|
||||
+ timeDelta + " expecting at least " + checkpointPeriod
|
||||
+ " txnid delta since previous checkpoint is " +
|
||||
(txid - lastCheckpointTxid) + " expecting at least "
|
||||
+ checkpointTxnCount;
|
||||
LOG.info(message);
|
||||
response.sendError(HttpServletResponse.SC_CONFLICT, message);
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
if (nnImage.getStorage().findImageFile(nnf, txid) != null) {
|
||||
response.sendError(HttpServletResponse.SC_CONFLICT,
|
||||
"Either current namenode has checkpointed or "
|
||||
+ "another checkpointer already uploaded an "
|
||||
+ "checkpoint for txid " + txid);
|
||||
String message = "Either current namenode has checkpointed or "
|
||||
+ "another checkpointer already uploaded an "
|
||||
+ "checkpoint for txid " + txid;
|
||||
LOG.info(message);
|
||||
response.sendError(HttpServletResponse.SC_CONFLICT, message);
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
|
@ -2464,7 +2464,7 @@ public class TestCheckpoint {
|
|||
}
|
||||
|
||||
@Test(timeout = 300000)
|
||||
public void testActiveRejectSmallerDeltaImage() throws Exception {
|
||||
public void testActiveRejectSmallerTxidDeltaImage() throws Exception {
|
||||
MiniDFSCluster cluster = null;
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
// Set the delta txid threshold to 10
|
||||
|
@ -2517,6 +2517,57 @@ public class TestCheckpoint {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that even with txid and time delta threshold, by having time
|
||||
* relaxation, SBN can still upload images to ANN.
|
||||
*
|
||||
* @throws Exception
|
||||
*/
|
||||
@Test
|
||||
public void testActiveImageWithTimeDeltaRelaxation() throws Exception {
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
// Set the delta txid threshold to some arbitrarily large value, so
|
||||
// it does not trigger a checkpoint during this test.
|
||||
conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 1000000);
|
||||
// Set the delta time threshold to some arbitrarily large value, so
|
||||
// it does not trigger a checkpoint during this test.
|
||||
conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 900000);
|
||||
// Set relaxation to 0, means time delta = 0 from previous image is fine,
|
||||
// this will effectively disable reject small delta image
|
||||
ImageServlet.setRecentImageCheckTimePrecision(0);
|
||||
|
||||
SecondaryNameNode secondary = null;
|
||||
|
||||
try (MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
|
||||
.numDataNodes(0).format(true).build()) {
|
||||
// enable small delta rejection
|
||||
NameNode active = cluster.getNameNode();
|
||||
active.httpServer.getHttpServer()
|
||||
.setAttribute(RECENT_IMAGE_CHECK_ENABLED, true);
|
||||
|
||||
secondary = startSecondaryNameNode(conf);
|
||||
|
||||
FileSystem fs = cluster.getFileSystem();
|
||||
assertEquals(0, active.getNamesystem().getFSImage()
|
||||
.getMostRecentCheckpointTxId());
|
||||
|
||||
// create 5 dir.
|
||||
for (int i = 0; i < 5; i++) {
|
||||
fs.mkdirs(new Path("dir-" + i));
|
||||
}
|
||||
|
||||
// Checkpoint 1st
|
||||
secondary.doCheckpoint();
|
||||
// at this point, despite this is a small delta change, w.r.t both
|
||||
// txid and time delta, due to we set relaxation to 0, this image
|
||||
// still gets accepted
|
||||
assertEquals(9, active.getNamesystem().getFSImage()
|
||||
.getMostRecentCheckpointTxId());
|
||||
} finally {
|
||||
cleanup(secondary);
|
||||
}
|
||||
}
|
||||
|
||||
private static void cleanup(SecondaryNameNode snn) {
|
||||
if (snn != null) {
|
||||
try {
|
||||
|
|
Loading…
Reference in New Issue