HDFS-3247. Improve bootstrapStandby behavior when original NN is not active. Contributed by Todd Lipcon.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1324559 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
40497a9d04
commit
f47d418e07
|
@ -236,6 +236,9 @@ Release 2.0.0 - UNRELEASED
|
|||
|
||||
HDFS-3244. Remove dead writable code from hdfs/protocol. (eli)
|
||||
|
||||
HDFS-3247. Improve bootstrapStandby behavior when original NN is not active
|
||||
(todd)
|
||||
|
||||
OPTIMIZATIONS
|
||||
|
||||
HDFS-2477. Optimize computing the diff between a block report and the
|
||||
|
|
|
@ -33,10 +33,14 @@ import org.apache.hadoop.HadoopIllegalArgumentException;
|
|||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.conf.Configurable;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.ha.HAServiceProtocol;
|
||||
import org.apache.hadoop.ha.HAServiceStatus;
|
||||
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
|
||||
import org.apache.hadoop.ha.ServiceFailedException;
|
||||
import org.apache.hadoop.hdfs.DFSUtil;
|
||||
import org.apache.hadoop.hdfs.HAUtil;
|
||||
import org.apache.hadoop.hdfs.HdfsConfiguration;
|
||||
import org.apache.hadoop.hdfs.NameNodeProxies;
|
||||
import org.apache.hadoop.hdfs.NameNodeProxies.ProxyAndInfo;
|
||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
|
||||
import org.apache.hadoop.hdfs.server.namenode.CheckpointSignature;
|
||||
import org.apache.hadoop.hdfs.server.namenode.EditLogInputStream;
|
||||
|
@ -47,8 +51,10 @@ import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
|||
import org.apache.hadoop.hdfs.server.namenode.TransferFsImage;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
import org.apache.hadoop.hdfs.tools.NNHAServiceTarget;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.io.MD5Hash;
|
||||
import org.apache.hadoop.security.AccessControlException;
|
||||
import org.apache.hadoop.security.SecurityUtil;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hadoop.util.Tool;
|
||||
|
@ -80,6 +86,12 @@ public class BootstrapStandby implements Tool, Configurable {
|
|||
private boolean force = false;
|
||||
private boolean interactive = true;
|
||||
|
||||
// Exit/return codes.
|
||||
static final int ERR_CODE_FAILED_CONNECT = 2;
|
||||
static final int ERR_CODE_INVALID_VERSION = 3;
|
||||
static final int ERR_CODE_OTHER_NN_NOT_ACTIVE = 4;
|
||||
static final int ERR_CODE_ALREADY_FORMATTED = 5;
|
||||
static final int ERR_CODE_LOGS_UNAVAILABLE = 6;
|
||||
|
||||
public int run(String[] args) throws Exception {
|
||||
SecurityUtil.initKrb5CipherSuites();
|
||||
|
@ -122,24 +134,43 @@ public class BootstrapStandby implements Tool, Configurable {
|
|||
"[-force] [-nonInteractive]");
|
||||
}
|
||||
|
||||
private int doRun() throws IOException {
|
||||
ProxyAndInfo<NamenodeProtocol> proxyAndInfo = NameNodeProxies.createNonHAProxy(getConf(),
|
||||
private NamenodeProtocol createNNProtocolProxy()
|
||||
throws IOException {
|
||||
return NameNodeProxies.createNonHAProxy(getConf(),
|
||||
otherIpcAddr, NamenodeProtocol.class,
|
||||
UserGroupInformation.getLoginUser(), true);
|
||||
NamenodeProtocol proxy = proxyAndInfo.getProxy();
|
||||
UserGroupInformation.getLoginUser(), true)
|
||||
.getProxy();
|
||||
}
|
||||
|
||||
/**
 * Builds an HAServiceProtocol proxy for the NameNode identified by
 * nsId and otherNNId, using an NNHAServiceTarget constructed from a
 * fresh HdfsConfiguration copy of conf.
 *
 * A new target and proxy are created on every call; nothing is cached here.
 * The value 15000 is passed as the second argument of getProxy
 * (presumably a timeout in milliseconds -- TODO confirm against
 * the getProxy signature).
 *
 * @return the proxy returned by NNHAServiceTarget#getProxy
 * @throws IOException declared by this method; raised by the target/proxy
 *         construction it delegates to
 */
private HAServiceProtocol createHAProtocolProxy()
|
||||
throws IOException {
|
||||
return new NNHAServiceTarget(new HdfsConfiguration(conf),
|
||||
nsId, otherNNId).getProxy(conf, 15000);
|
||||
}
|
||||
|
||||
private int doRun() throws IOException {
|
||||
|
||||
NamenodeProtocol proxy = createNNProtocolProxy();
|
||||
NamespaceInfo nsInfo;
|
||||
try {
|
||||
nsInfo = proxy.versionRequest();
|
||||
checkLayoutVersion(nsInfo);
|
||||
} catch (IOException ioe) {
|
||||
LOG.fatal("Unable to fetch namespace information from active NN at " +
|
||||
otherIpcAddr + ": " + ioe.getMessage());
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Full exception trace", ioe);
|
||||
}
|
||||
return 1;
|
||||
return ERR_CODE_FAILED_CONNECT;
|
||||
}
|
||||
|
||||
if (!checkLayoutVersion(nsInfo)) {
|
||||
LOG.fatal("Layout version on remote node (" +
|
||||
nsInfo.getLayoutVersion() + ") does not match " +
|
||||
"this node's layout version (" + HdfsConstants.LAYOUT_VERSION + ")");
|
||||
return ERR_CODE_INVALID_VERSION;
|
||||
}
|
||||
|
||||
|
||||
System.out.println(
|
||||
"=====================================================\n" +
|
||||
"About to bootstrap Standby ID " + nnId + " from:\n" +
|
||||
|
@ -153,12 +184,35 @@ public class BootstrapStandby implements Tool, Configurable {
|
|||
" Layout version: " + nsInfo.getLayoutVersion() + "\n" +
|
||||
"=====================================================");
|
||||
|
||||
// Ensure the other NN is active - we can't force it to roll edit logs
|
||||
// below if it's not active.
|
||||
if (!isOtherNNActive()) {
|
||||
String err = "NameNode " + nsId + "." + nnId + " at " + otherIpcAddr +
|
||||
" is not currently in ACTIVE state.";
|
||||
if (!interactive) {
|
||||
LOG.fatal(err + " Please transition it to " +
|
||||
"active before attempting to bootstrap a standby node.");
|
||||
return ERR_CODE_OTHER_NN_NOT_ACTIVE;
|
||||
}
|
||||
|
||||
System.err.println(err);
|
||||
if (ToolRunner.confirmPrompt(
|
||||
"Do you want to automatically transition it to active now?")) {
|
||||
transitionOtherNNActive();
|
||||
} else {
|
||||
LOG.fatal("User aborted. Exiting without bootstrapping standby.");
|
||||
return ERR_CODE_OTHER_NN_NOT_ACTIVE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Check with the user before blowing away data.
|
||||
if (!NameNode.confirmFormat(
|
||||
Sets.union(Sets.newHashSet(dirsToFormat),
|
||||
Sets.newHashSet(editUrisToFormat)),
|
||||
force, interactive)) {
|
||||
return 1;
|
||||
return ERR_CODE_ALREADY_FORMATTED;
|
||||
}
|
||||
|
||||
// Force the active to roll its log
|
||||
|
@ -180,7 +234,7 @@ public class BootstrapStandby implements Tool, Configurable {
|
|||
// Ensure that we have enough edits already in the shared directory to
|
||||
// start up from the last checkpoint on the active.
|
||||
if (!checkLogsAvailableForRead(image, imageTxId, rollTxId)) {
|
||||
return 1;
|
||||
return ERR_CODE_LOGS_UNAVAILABLE;
|
||||
}
|
||||
|
||||
image.getStorage().writeTransactionIdFileToStorage(rollTxId);
|
||||
|
@ -193,6 +247,14 @@ public class BootstrapStandby implements Tool, Configurable {
|
|||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Requests that the other NameNode become active by calling
 * transitionToActive() on a proxy obtained from createHAProtocolProxy().
 * Logs an info message before the call and "Successful" after it returns.
 *
 * @throws AccessControlException declared; propagated from the HA proxy call
 * @throws ServiceFailedException declared; propagated from the HA proxy call
 * @throws IOException declared; propagated from proxy creation or the call
 */
private void transitionOtherNNActive()
|
||||
throws AccessControlException, ServiceFailedException, IOException {
|
||||
LOG.info("Transitioning the running namenode to active...");
|
||||
createHAProtocolProxy().transitionToActive();
|
||||
LOG.info("Successful");
|
||||
}
|
||||
|
||||
private boolean checkLogsAvailableForRead(FSImage image, long imageTxId,
|
||||
long rollTxId) {
|
||||
|
||||
|
@ -225,12 +287,14 @@ public class BootstrapStandby implements Tool, Configurable {
|
|||
}
|
||||
}
|
||||
|
||||
private void checkLayoutVersion(NamespaceInfo nsInfo) throws IOException {
|
||||
if (nsInfo.getLayoutVersion() != HdfsConstants.LAYOUT_VERSION) {
|
||||
throw new IOException("Layout version on remote node (" +
|
||||
nsInfo.getLayoutVersion() + ") does not match " +
|
||||
"this node's layout version (" + HdfsConstants.LAYOUT_VERSION + ")");
|
||||
private boolean checkLayoutVersion(NamespaceInfo nsInfo) throws IOException {
|
||||
return (nsInfo.getLayoutVersion() == HdfsConstants.LAYOUT_VERSION);
|
||||
}
|
||||
|
||||
/**
 * Queries the other NameNode's HA service status through a proxy from
 * createHAProtocolProxy() and reports whether it is active.
 *
 * @return true if the reported state is HAServiceState.ACTIVE,
 *         false for any other state
 * @throws AccessControlException declared; propagated from the status call
 * @throws IOException declared; propagated from proxy creation or the call
 */
private boolean isOtherNNActive()
|
||||
throws AccessControlException, IOException {
|
||||
HAServiceStatus status = createHAProtocolProxy().getServiceStatus();
|
||||
// Enum constants are compared by identity; only ACTIVE counts as active.
|
||||
return status.getState() == HAServiceState.ACTIVE;
|
||||
}
|
||||
|
||||
private void parseConfAndFindOtherNN() throws IOException {
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
*/
|
||||
package org.apache.hadoop.hdfs.server.namenode.ha;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
|
@ -40,6 +41,7 @@ import org.junit.After;
|
|||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import com.google.common.base.Suppliers;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
|
@ -170,7 +172,7 @@ public class TestBootstrapStandby {
|
|||
int rc = BootstrapStandby.run(
|
||||
new String[]{"-force"},
|
||||
cluster.getConfiguration(1));
|
||||
assertEquals(1, rc);
|
||||
assertEquals(BootstrapStandby.ERR_CODE_LOGS_UNAVAILABLE, rc);
|
||||
} finally {
|
||||
logs.stopCapturing();
|
||||
}
|
||||
|
@ -184,7 +186,7 @@ public class TestBootstrapStandby {
|
|||
int rc = BootstrapStandby.run(
|
||||
new String[]{"-nonInteractive"},
|
||||
cluster.getConfiguration(1));
|
||||
assertEquals(1, rc);
|
||||
assertEquals(BootstrapStandby.ERR_CODE_ALREADY_FORMATTED, rc);
|
||||
|
||||
// Should pass with -force
|
||||
rc = BootstrapStandby.run(
|
||||
|
@ -193,6 +195,24 @@ public class TestBootstrapStandby {
|
|||
assertEquals(0, rc);
|
||||
}
|
||||
|
||||
/**
 * Exercises BootstrapStandby when NameNode 0 has been transitioned to
 * standby before the bootstrap is attempted.
 *
 * First run, with -nonInteractive: expects the return code
 * ERR_CODE_OTHER_NN_NOT_ACTIVE.
 * Second run, with -force and "yes" supplied on standard input: expects
 * return code 0, and afterwards expects nn0 to no longer be in standby state.
 */
@Test(timeout=30000)
|
||||
public void testOtherNodeNotActive() throws Exception {
|
||||
// Put NN 0 into standby so the bootstrap finds a non-active peer.
cluster.transitionToStandby(0);
|
||||
int rc = BootstrapStandby.run(
|
||||
new String[]{"-nonInteractive"},
|
||||
cluster.getConfiguration(1));
|
||||
assertEquals(BootstrapStandby.ERR_CODE_OTHER_NN_NOT_ACTIVE, rc);
|
||||
|
||||
// Answer "yes" to the prompt about transition to active
|
||||
// NOTE(review): System.in is replaced here and is not restored within
// this method; getBytes() uses the platform default charset.
System.setIn(new ByteArrayInputStream("yes\n".getBytes()));
|
||||
rc = BootstrapStandby.run(
|
||||
new String[]{"-force"},
|
||||
cluster.getConfiguration(1));
|
||||
assertEquals(0, rc);
|
||||
|
||||
// After the second run, NN 0 must have left standby state.
assertFalse(nn0.getNamesystem().isInStandbyState());
|
||||
}
|
||||
|
||||
private void assertNNFilesMatch() throws Exception {
|
||||
List<File> curDirs = Lists.newArrayList();
|
||||
curDirs.addAll(FSImageTestUtil.getNameNodeCurrentDirs(cluster, 0));
|
||||
|
|
Loading…
Reference in New Issue