HDFS-15036. Active NameNode should not silently fail the image transfer. Contributed by Chen Liang.

This commit is contained in:
Chen Liang 2019-12-12 10:22:05 -08:00
parent c210cede5c
commit 65c4660bcd
3 changed files with 37 additions and 1 deletions

View File

@ -599,7 +599,13 @@ public class ImageServlet extends HttpServlet {
long timeDelta = TimeUnit.MILLISECONDS.toSeconds( long timeDelta = TimeUnit.MILLISECONDS.toSeconds(
now - lastCheckpointTime); now - lastCheckpointTime);
// The check below exists to prevent overly frequent uploads from
// the Standby, so it should only be applied to the periodic image
// upload from the Standby. Other scenarios, such as rollback image
// and ckpt file, skip this check; see HDFS-15036 for more info.
if (checkRecentImageEnable && if (checkRecentImageEnable &&
NameNodeFile.IMAGE.equals(parsedParams.getNameNodeFile()) &&
timeDelta < checkpointPeriod && timeDelta < checkpointPeriod &&
txid - lastCheckpointTxid < checkpointTxnCount) { txid - lastCheckpointTxid < checkpointTxnCount) {
// only when at least one of two conditions are met we accept // only when at least one of two conditions are met we accept

View File

@ -292,10 +292,20 @@ public class StandbyCheckpointer {
// TODO should there be some smarts here about retries nodes that // TODO should there be some smarts here about retries nodes that
// are not the active NN? // are not the active NN?
CheckpointReceiverEntry receiverEntry = checkpointReceivers.get(url); CheckpointReceiverEntry receiverEntry = checkpointReceivers.get(url);
if (upload.get() == TransferFsImage.TransferResult.SUCCESS) { TransferFsImage.TransferResult uploadResult = upload.get();
if (uploadResult == TransferFsImage.TransferResult.SUCCESS) {
receiverEntry.setLastUploadTime(monotonicNow()); receiverEntry.setLastUploadTime(monotonicNow());
receiverEntry.setIsPrimary(true); receiverEntry.setIsPrimary(true);
} else { } else {
// Getting here means image upload is explicitly rejected
// by the other node. This could happen if:
// 1. the other is also a standby, or
// 2. the other is active, but already accepted another
// newer image, or
// 3. the other is active but has a recent enough image.
// All these are valid cases, just log for information.
LOG.info("Image upload rejected by the other NameNode: {}",
uploadResult);
receiverEntry.setIsPrimary(false); receiverEntry.setIsPrimary(false);
} }
} catch (ExecutionException e) { } catch (ExecutionException e) {

View File

@ -58,6 +58,7 @@ import org.apache.hadoop.test.GenericTestUtils;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
import static org.apache.hadoop.hdfs.server.namenode.ImageServlet.RECENT_IMAGE_CHECK_ENABLED;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNull; import static org.junit.Assert.assertNull;
@ -432,7 +433,22 @@ public class TestRollingUpgrade {
testFinalize(3); testFinalize(3);
} }
/**
 * Runs the shared finalize scenario with two NameNodes, passing
 * {@code true} as the boolean flag of {@code testFinalize(int, boolean)}.
 * NOTE(review): that flag appears to be stored as the
 * RECENT_IMAGE_CHECK_ENABLED attribute on NameNode 0's HTTP server, so
 * {@code true} would turn the recent-image check on - confirm against the
 * full method body.
 */
@Test(timeout = 300000)
public void testFinalizeWithDeltaCheck() throws Exception {
  final int nameNodeCount = 2;
  final boolean recentImageCheckFlag = true;
  testFinalize(nameNodeCount, recentImageCheckFlag);
}
/**
 * Runs the shared finalize scenario with three NameNodes, passing
 * {@code true} as the boolean flag of {@code testFinalize(int, boolean)}.
 * NOTE(review): that flag appears to be stored as the
 * RECENT_IMAGE_CHECK_ENABLED attribute on NameNode 0's HTTP server, so
 * {@code true} would turn the recent-image check on - confirm against the
 * full method body.
 */
@Test(timeout = 300000)
public void testFinalizeWithMultipleNNDeltaCheck() throws Exception {
  final int nameNodeCount = 3;
  final boolean recentImageCheckFlag = true;
  testFinalize(nameNodeCount, recentImageCheckFlag);
}
// Convenience overload kept for the existing callers: delegates to
// testFinalize(nnCount, false), i.e. passes false for the boolean flag.
private void testFinalize(int nnCount) throws Exception { private void testFinalize(int nnCount) throws Exception {
testFinalize(nnCount, false);
}
private void testFinalize(int nnCount, boolean skipImageDeltaCheck)
throws Exception {
final Configuration conf = new HdfsConfiguration(); final Configuration conf = new HdfsConfiguration();
MiniQJMHACluster cluster = null; MiniQJMHACluster cluster = null;
final Path foo = new Path("/foo"); final Path foo = new Path("/foo");
@ -451,6 +467,10 @@ public class TestRollingUpgrade {
dfsCluster.restartNameNodes(); dfsCluster.restartNameNodes();
dfsCluster.transitionToActive(0); dfsCluster.transitionToActive(0);
dfsCluster.getNameNode(0).getHttpServer()
.setAttribute(RECENT_IMAGE_CHECK_ENABLED, skipImageDeltaCheck);
DistributedFileSystem dfs = dfsCluster.getFileSystem(0); DistributedFileSystem dfs = dfsCluster.getFileSystem(0);
dfs.mkdirs(foo); dfs.mkdirs(foo);