HDFS-3247. Improve bootstrapStandby behavior when original NN is not active. Contributed by Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1324558 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Todd Lipcon 2012-04-11 05:16:06 +00:00
parent 13bfe4794b
commit 21824d8232
3 changed files with 106 additions and 19 deletions

View File

@ -362,6 +362,9 @@ Release 2.0.0 - UNRELEASED
HDFS-3244. Remove dead writable code from hdfs/protocol. (eli)
HDFS-3247. Improve bootstrapStandby behavior when original NN is not active
(todd)
OPTIMIZATIONS
HDFS-3024. Improve performance of stringification in addStoredBlock (todd)

View File

@ -33,10 +33,14 @@ import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ha.HAServiceProtocol;
import org.apache.hadoop.ha.HAServiceStatus;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.ha.ServiceFailedException;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.NameNodeProxies;
import org.apache.hadoop.hdfs.NameNodeProxies.ProxyAndInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.namenode.CheckpointSignature;
import org.apache.hadoop.hdfs.server.namenode.EditLogInputStream;
@ -47,8 +51,10 @@ import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.TransferFsImage;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.tools.NNHAServiceTarget;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Tool;
@ -65,7 +71,7 @@ import com.google.common.collect.Sets;
*/
@InterfaceAudience.Private
public class BootstrapStandby implements Tool, Configurable {
private static final Log LOG = LogFactory.getLog(BootstrapStandby.class);
private static final Log LOG = LogFactory.getLog(BootstrapStandby.class);
private String nsId;
private String nnId;
private String otherNNId;
@ -79,7 +85,13 @@ public class BootstrapStandby implements Tool, Configurable {
private boolean force = false;
private boolean interactive = true;
// Exit/return codes.
static final int ERR_CODE_FAILED_CONNECT = 2;
static final int ERR_CODE_INVALID_VERSION = 3;
static final int ERR_CODE_OTHER_NN_NOT_ACTIVE = 4;
static final int ERR_CODE_ALREADY_FORMATTED = 5;
static final int ERR_CODE_LOGS_UNAVAILABLE = 6;
public int run(String[] args) throws Exception {
SecurityUtil.initKrb5CipherSuites();
@ -121,24 +133,43 @@ public class BootstrapStandby implements Tool, Configurable {
System.err.println("Usage: " + this.getClass().getSimpleName() +
"[-force] [-nonInteractive]");
}
/**
 * Creates a non-HA {@link NamenodeProtocol} proxy to the NameNode at
 * {@code otherIpcAddr}, using this tool's configuration and the current
 * login user.
 *
 * @return the protocol proxy taken from the created proxy-and-info pair
 * @throws IOException propagated from the login-user lookup or from proxy
 *         creation
 */
private NamenodeProtocol createNNProtocolProxy()
    throws IOException {
  ProxyAndInfo<NamenodeProtocol> proxyAndInfo =
      NameNodeProxies.createNonHAProxy(
          getConf(),
          otherIpcAddr,
          NamenodeProtocol.class,
          UserGroupInformation.getLoginUser(),
          true);
  return proxyAndInfo.getProxy();
}
/**
 * Creates an {@link HAServiceProtocol} proxy to the other NameNode,
 * identified by {@code nsId} and {@code otherNNId}.
 *
 * @return a proxy obtained from an {@link NNHAServiceTarget} for that node
 * @throws IOException propagated from target construction or proxy creation
 */
private HAServiceProtocol createHAProtocolProxy()
    throws IOException {
  NNHAServiceTarget otherTarget =
      new NNHAServiceTarget(new HdfsConfiguration(conf), nsId, otherNNId);
  // NOTE(review): 15000 is presumably the proxy timeout in milliseconds;
  // confirm against NNHAServiceTarget#getProxy.
  return otherTarget.getProxy(conf, 15000);
}
private int doRun() throws IOException {
ProxyAndInfo<NamenodeProtocol> proxyAndInfo = NameNodeProxies.createNonHAProxy(getConf(),
otherIpcAddr, NamenodeProtocol.class,
UserGroupInformation.getLoginUser(), true);
NamenodeProtocol proxy = proxyAndInfo.getProxy();
NamenodeProtocol proxy = createNNProtocolProxy();
NamespaceInfo nsInfo;
try {
nsInfo = proxy.versionRequest();
checkLayoutVersion(nsInfo);
} catch (IOException ioe) {
LOG.fatal("Unable to fetch namespace information from active NN at " +
otherIpcAddr + ": " + ioe.getMessage());
if (LOG.isDebugEnabled()) {
LOG.debug("Full exception trace", ioe);
}
return 1;
return ERR_CODE_FAILED_CONNECT;
}
if (!checkLayoutVersion(nsInfo)) {
LOG.fatal("Layout version on remote node (" +
nsInfo.getLayoutVersion() + ") does not match " +
"this node's layout version (" + HdfsConstants.LAYOUT_VERSION + ")");
return ERR_CODE_INVALID_VERSION;
}
System.out.println(
"=====================================================\n" +
@ -153,12 +184,35 @@ public class BootstrapStandby implements Tool, Configurable {
" Layout version: " + nsInfo.getLayoutVersion() + "\n" +
"=====================================================");
// Ensure the other NN is active - we can't force it to roll edit logs
// below if it's not active.
if (!isOtherNNActive()) {
  // Name the remote NameNode by its own ID (otherNNId), not by the ID of the
  // node being bootstrapped (nnId), so the ID agrees with otherIpcAddr.
  String err = "NameNode " + nsId + "." + otherNNId + " at " + otherIpcAddr +
      " is not currently in ACTIVE state.";
  if (!interactive) {
    // Without a user to prompt, refuse to change the other node's HA state.
    LOG.fatal(err + " Please transition it to " +
        "active before attempting to bootstrap a standby node.");
    return ERR_CODE_OTHER_NN_NOT_ACTIVE;
  }

  System.err.println(err);
  if (ToolRunner.confirmPrompt(
      "Do you want to automatically transition it to active now?")) {
    transitionOtherNNActive();
  } else {
    LOG.fatal("User aborted. Exiting without bootstrapping standby.");
    return ERR_CODE_OTHER_NN_NOT_ACTIVE;
  }
}
// Check with the user before blowing away data.
if (!NameNode.confirmFormat(
Sets.union(Sets.newHashSet(dirsToFormat),
Sets.newHashSet(editUrisToFormat)),
force, interactive)) {
return 1;
return ERR_CODE_ALREADY_FORMATTED;
}
// Force the active to roll its log
@ -180,7 +234,7 @@ public class BootstrapStandby implements Tool, Configurable {
// Ensure that we have enough edits already in the shared directory to
// start up from the last checkpoint on the active.
if (!checkLogsAvailableForRead(image, imageTxId, rollTxId)) {
return 1;
return ERR_CODE_LOGS_UNAVAILABLE;
}
image.getStorage().writeTransactionIdFileToStorage(rollTxId);
@ -193,6 +247,14 @@ public class BootstrapStandby implements Tool, Configurable {
return 0;
}
/**
 * Asks the other NameNode to transition to the active state, logging before
 * the request and after it returns.
 *
 * @throws AccessControlException propagated from the HA proxy call
 * @throws ServiceFailedException propagated from the HA proxy call
 * @throws IOException propagated from proxy creation or the HA proxy call
 */
private void transitionOtherNNActive()
    throws AccessControlException, ServiceFailedException, IOException {
  LOG.info("Transitioning the running namenode to active...");
  // The proxy is created only after the first log line, as in the original.
  HAServiceProtocol otherNN = createHAProtocolProxy();
  otherNN.transitionToActive();
  LOG.info("Successful");
}
private boolean checkLogsAvailableForRead(FSImage image, long imageTxId,
long rollTxId) {
@ -225,12 +287,14 @@ public class BootstrapStandby implements Tool, Configurable {
}
}
private void checkLayoutVersion(NamespaceInfo nsInfo) throws IOException {
if (nsInfo.getLayoutVersion() != HdfsConstants.LAYOUT_VERSION) {
throw new IOException("Layout version on remote node (" +
nsInfo.getLayoutVersion() + ") does not match " +
"this node's layout version (" + HdfsConstants.LAYOUT_VERSION + ")");
}
/**
 * Compares the layout version reported in {@code nsInfo} with this node's
 * {@link HdfsConstants#LAYOUT_VERSION}.
 *
 * @param nsInfo namespace information whose layout version is checked
 * @return {@code true} if the two layout versions are equal
 * @throws IOException declared for callers; not thrown by this body
 */
private boolean checkLayoutVersion(NamespaceInfo nsInfo) throws IOException {
  if (nsInfo.getLayoutVersion() != HdfsConstants.LAYOUT_VERSION) {
    return false;
  }
  return true;
}
/**
 * Queries the other NameNode's HA service status.
 *
 * @return {@code true} if the reported state is {@link HAServiceState#ACTIVE}
 * @throws AccessControlException propagated from the status query
 * @throws IOException propagated from proxy creation or the status query
 */
private boolean isOtherNNActive()
    throws AccessControlException, IOException {
  HAServiceProtocol otherNN = createHAProtocolProxy();
  HAServiceStatus otherStatus = otherNN.getServiceStatus();
  return HAServiceState.ACTIVE == otherStatus.getState();
}
private void parseConfAndFindOtherNN() throws IOException {

View File

@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hdfs.server.namenode.ha;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.net.URI;
@ -40,6 +41,7 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import com.google.common.base.Suppliers;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
@ -170,7 +172,7 @@ public class TestBootstrapStandby {
int rc = BootstrapStandby.run(
new String[]{"-force"},
cluster.getConfiguration(1));
assertEquals(1, rc);
assertEquals(BootstrapStandby.ERR_CODE_LOGS_UNAVAILABLE, rc);
} finally {
logs.stopCapturing();
}
@ -184,7 +186,7 @@ public class TestBootstrapStandby {
int rc = BootstrapStandby.run(
new String[]{"-nonInteractive"},
cluster.getConfiguration(1));
assertEquals(1, rc);
assertEquals(BootstrapStandby.ERR_CODE_ALREADY_FORMATTED, rc);
// Should pass with -force
rc = BootstrapStandby.run(
@ -192,6 +194,24 @@ public class TestBootstrapStandby {
cluster.getConfiguration(1));
assertEquals(0, rc);
}
/**
 * Checks bootstrapStandby when NN 0 has been transitioned to standby: a
 * non-interactive run must return ERR_CODE_OTHER_NN_NOT_ACTIVE, while an
 * interactive run that answers "yes" to the prompt must succeed and leave
 * nn0 out of the standby state.
 */
@Test(timeout=30000)
public void testOtherNodeNotActive() throws Exception {
  cluster.transitionToStandby(0);
  int rc = BootstrapStandby.run(
      new String[]{"-nonInteractive"},
      cluster.getConfiguration(1));
  assertEquals(BootstrapStandby.ERR_CODE_OTHER_NN_NOT_ACTIVE, rc);

  // Answer "yes" to the prompt about transition to active. System.in is
  // JVM-global, so remember the original stream and restore it afterwards;
  // otherwise later tests in the same JVM would read from this exhausted
  // stream.
  final java.io.InputStream originalIn = System.in;
  System.setIn(new ByteArrayInputStream("yes\n".getBytes()));
  try {
    rc = BootstrapStandby.run(
        new String[]{"-force"},
        cluster.getConfiguration(1));
  } finally {
    System.setIn(originalIn);
  }
  assertEquals(0, rc);
  assertFalse(nn0.getNamesystem().isInStandbyState());
}
private void assertNNFilesMatch() throws Exception {
List<File> curDirs = Lists.newArrayList();