HDFS-15036. Active NameNode should not silently fail the image transfer. Contributed by Chen Liang.

(cherry picked from commit 65c4660bcd)
(cherry picked from commit d4a6901c42)
This commit is contained in:
Chen Liang 2019-12-12 10:22:05 -08:00
parent 974706c5fd
commit ea0eeb8f1a
3 changed files with 37 additions and 1 deletions

View File

@ -573,7 +573,13 @@ public class ImageServlet extends HttpServlet {
long timeDelta = TimeUnit.MILLISECONDS.toSeconds(
now - lastCheckpointTime);
// Since the goal of the check below is to prevent overly
// frequent upload from Standby, the check should only be done
// for the periodical upload from Standby. For the other
// scenarios such as rollback image and ckpt file, they skip
// this check, see HDFS-15036 for more info.
if (checkRecentImageEnable &&
NameNodeFile.IMAGE.equals(parsedParams.getNameNodeFile()) &&
timeDelta < checkpointPeriod &&
txid - lastCheckpointTxid < checkpointTxnCount) {
// only when at least one of the two conditions is met we accept

View File

@ -292,10 +292,20 @@ public class StandbyCheckpointer {
// TODO should there be some smarts here about retrying nodes that
// are not the active NN?
CheckpointReceiverEntry receiverEntry = checkpointReceivers.get(url);
if (upload.get() == TransferFsImage.TransferResult.SUCCESS) {
TransferFsImage.TransferResult uploadResult = upload.get();
if (uploadResult == TransferFsImage.TransferResult.SUCCESS) {
receiverEntry.setLastUploadTime(monotonicNow());
receiverEntry.setIsPrimary(true);
} else {
// Getting here means image upload is explicitly rejected
// by the other node. This could happen if:
// 1. the other is also a standby, or
// 2. the other is active, but already accepted another
// newer image, or
// 3. the other is active but has a recent enough image.
// All these are valid cases, just log for information.
LOG.info("Image upload rejected by the other NameNode: {}",
uploadResult);
receiverEntry.setIsPrimary(false);
}
} catch (ExecutionException e) {

View File

@ -58,6 +58,7 @@ import org.apache.hadoop.test.GenericTestUtils;
import org.junit.Assert;
import org.junit.Test;
import static org.apache.hadoop.hdfs.server.namenode.ImageServlet.RECENT_IMAGE_CHECK_ENABLED;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNull;
@ -431,7 +432,22 @@ public class TestRollingUpgrade {
testFinalize(3);
}
/**
 * Runs the shared finalize scenario with an nnCount of 2 and the boolean
 * flag set to true.
 *
 * NOTE(review): the two-argument overload names this flag
 * skipImageDeltaCheck but stores it as the RECENT_IMAGE_CHECK_ENABLED
 * attribute, so true appears to turn the recent-image check on -- confirm.
 */
@Test(timeout = 300000)
public void testFinalizeWithDeltaCheck() throws Exception {
  final int nnCount = 2;
  final boolean recentImageCheckFlag = true;
  testFinalize(nnCount, recentImageCheckFlag);
}
/**
 * Runs the shared finalize scenario with an nnCount of 3 and the boolean
 * flag set to true.
 *
 * NOTE(review): the two-argument overload names this flag
 * skipImageDeltaCheck but stores it as the RECENT_IMAGE_CHECK_ENABLED
 * attribute, so true appears to turn the recent-image check on -- confirm.
 */
@Test(timeout = 300000)
public void testFinalizeWithMultipleNNDeltaCheck() throws Exception {
  final int nnCount = 3;
  final boolean recentImageCheckFlag = true;
  testFinalize(nnCount, recentImageCheckFlag);
}
/**
 * Convenience overload: runs the finalize scenario for the given nnCount
 * with the boolean flag of the two-argument overload fixed to false.
 *
 * @param nnCount forwarded unchanged to the two-argument overload
 *                (presumably the number of NameNodes -- confirm)
 * @throws Exception propagated from the two-argument overload
 */
private void testFinalize(int nnCount) throws Exception {
  final boolean recentImageCheckFlag = false;
  testFinalize(nnCount, recentImageCheckFlag);
}
private void testFinalize(int nnCount, boolean skipImageDeltaCheck)
throws Exception {
final Configuration conf = new HdfsConfiguration();
MiniQJMHACluster cluster = null;
final Path foo = new Path("/foo");
@ -450,6 +466,10 @@ public class TestRollingUpgrade {
dfsCluster.restartNameNodes();
dfsCluster.transitionToActive(0);
dfsCluster.getNameNode(0).getHttpServer()
.setAttribute(RECENT_IMAGE_CHECK_ENABLED, skipImageDeltaCheck);
DistributedFileSystem dfs = dfsCluster.getFileSystem(0);
dfs.mkdirs(foo);