HDFS-3026. HA: Handle failure during HA state transition. Contributed by Aaron T. Myers.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1337030 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8def56e51e
commit
5dbd09ee76
|
@ -651,6 +651,8 @@ Release 2.0.0 - UNRELEASED
|
|||
necessarily a BlockInfoUnderConstruction, so do not cast it in
|
||||
FSNamesystem.recoverLeaseInternal(..). (szetszwo)
|
||||
|
||||
HDFS-3026. HA: Handle failure during HA state transition. (atm)
|
||||
|
||||
BREAKDOWN OF HDFS-1623 SUBTASKS
|
||||
|
||||
HDFS-2179. Add fencing framework and mechanisms for NameNode HA. (todd)
|
||||
|
|
|
@ -206,6 +206,7 @@ public class NameNode {
|
|||
private final boolean haEnabled;
|
||||
private final HAContext haContext;
|
||||
protected boolean allowStaleStandbyReads;
|
||||
private Runtime runtime = Runtime.getRuntime();
|
||||
|
||||
|
||||
/** httpServer */
|
||||
|
@ -481,11 +482,16 @@ public class NameNode {
|
|||
}
|
||||
|
||||
private void startTrashEmptier(Configuration conf) throws IOException {
|
||||
long trashInterval
|
||||
= conf.getLong(CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY,
|
||||
CommonConfigurationKeys.FS_TRASH_INTERVAL_DEFAULT);
|
||||
if(trashInterval == 0)
|
||||
long trashInterval = conf.getLong(
|
||||
CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY,
|
||||
CommonConfigurationKeys.FS_TRASH_INTERVAL_DEFAULT);
|
||||
if (trashInterval == 0) {
|
||||
return;
|
||||
} else if (trashInterval < 0) {
|
||||
throw new IOException("Cannot start tresh emptier with negative interval."
|
||||
+ " Set " + CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY + " to a"
|
||||
+ " positive value.");
|
||||
}
|
||||
this.emptier = new Thread(new Trash(conf).getEmptier(), "Trash Emptier");
|
||||
this.emptier.setDaemon(true);
|
||||
this.emptier.start();
|
||||
|
@ -1235,14 +1241,37 @@ public class NameNode {
|
|||
}
|
||||
return state.getServiceState();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public synchronized void setRuntimeForTesting(Runtime runtime) {
|
||||
this.runtime = runtime;
|
||||
}
|
||||
|
||||
/**
|
||||
* Class used as expose {@link NameNode} as context to {@link HAState}
|
||||
* Shutdown the NN immediately in an ungraceful way. Used when it would be
|
||||
* unsafe for the NN to continue operating, e.g. during a failed HA state
|
||||
* transition.
|
||||
*
|
||||
* TODO(HA):
|
||||
* When entering and exiting state, on failing to start services,
|
||||
* appropriate action is needed todo either shutdown the node or recover
|
||||
* from failure.
|
||||
* @param t exception which warrants the shutdown. Printed to the NN log
|
||||
* before exit.
|
||||
* @throws ServiceFailedException thrown only for testing.
|
||||
*/
|
||||
private synchronized void doImmediateShutdown(Throwable t)
|
||||
throws ServiceFailedException {
|
||||
String message = "Error encountered requiring NN shutdown. " +
|
||||
"Shutting down immediately.";
|
||||
try {
|
||||
LOG.fatal(message, t);
|
||||
} catch (Throwable ignored) {
|
||||
// This is unlikely to happen, but there's nothing we can do if it does.
|
||||
}
|
||||
runtime.exit(1);
|
||||
// This code is only reached during testing, when runtime is stubbed out.
|
||||
throw new ServiceFailedException(message, t);
|
||||
}
|
||||
|
||||
/**
|
||||
* Class used to expose {@link NameNode} as context to {@link HAState}
|
||||
*/
|
||||
protected class NameNodeHAContext implements HAContext {
|
||||
@Override
|
||||
|
@ -1257,32 +1286,52 @@ public class NameNode {
|
|||
|
||||
@Override
|
||||
public void startActiveServices() throws IOException {
|
||||
namesystem.startActiveServices();
|
||||
startTrashEmptier(conf);
|
||||
try {
|
||||
namesystem.startActiveServices();
|
||||
startTrashEmptier(conf);
|
||||
} catch (Throwable t) {
|
||||
doImmediateShutdown(t);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stopActiveServices() throws IOException {
|
||||
if (namesystem != null) {
|
||||
namesystem.stopActiveServices();
|
||||
try {
|
||||
if (namesystem != null) {
|
||||
namesystem.stopActiveServices();
|
||||
}
|
||||
stopTrashEmptier();
|
||||
} catch (Throwable t) {
|
||||
doImmediateShutdown(t);
|
||||
}
|
||||
stopTrashEmptier();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startStandbyServices() throws IOException {
|
||||
namesystem.startStandbyServices(conf);
|
||||
try {
|
||||
namesystem.startStandbyServices(conf);
|
||||
} catch (Throwable t) {
|
||||
doImmediateShutdown(t);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void prepareToStopStandbyServices() throws ServiceFailedException {
|
||||
namesystem.prepareToStopStandbyServices();
|
||||
try {
|
||||
namesystem.prepareToStopStandbyServices();
|
||||
} catch (Throwable t) {
|
||||
doImmediateShutdown(t);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stopStandbyServices() throws IOException {
|
||||
if (namesystem != null) {
|
||||
namesystem.stopStandbyServices();
|
||||
try {
|
||||
if (namesystem != null) {
|
||||
namesystem.stopStandbyServices();
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
doImmediateShutdown(t);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.server.namenode.ha;
|
||||
|
||||
import static org.apache.hadoop.test.GenericTestUtils.assertExceptionContains;
|
||||
import static org.junit.Assert.fail;
|
||||
import static org.mockito.Matchers.anyInt;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.times;
|
||||
import static org.mockito.Mockito.verify;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.CommonConfigurationKeys;
|
||||
import org.apache.hadoop.ha.ServiceFailedException;
|
||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Tests to verify the behavior of failing to fully start transition HA states.
|
||||
*/
|
||||
public class TestStateTransitionFailure {
|
||||
|
||||
public static final Log LOG = LogFactory.getLog(TestStateTransitionFailure.class);
|
||||
|
||||
/**
|
||||
* Ensure that a failure to fully transition to the active state causes a
|
||||
* shutdown of the NameNode.
|
||||
*/
|
||||
@Test
|
||||
public void testFailureToTransitionCausesShutdown() throws IOException {
|
||||
MiniDFSCluster cluster = null;
|
||||
try {
|
||||
Configuration conf = new Configuration();
|
||||
// Set an illegal value for the trash emptier interval. This will cause
|
||||
// the NN to fail to transition to the active state.
|
||||
conf.setLong(CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY, -1);
|
||||
cluster = new MiniDFSCluster.Builder(conf)
|
||||
.nnTopology(MiniDFSNNTopology.simpleHATopology())
|
||||
.numDataNodes(0)
|
||||
.build();
|
||||
cluster.waitActive();
|
||||
Runtime mockRuntime = mock(Runtime.class);
|
||||
cluster.getNameNode(0).setRuntimeForTesting(mockRuntime);
|
||||
verify(mockRuntime, times(0)).exit(anyInt());
|
||||
try {
|
||||
cluster.transitionToActive(0);
|
||||
fail("Transitioned to active but should not have been able to.");
|
||||
} catch (ServiceFailedException sfe) {
|
||||
assertExceptionContains("Error encountered requiring NN shutdown. " +
|
||||
"Shutting down immediately.", sfe);
|
||||
LOG.info("got expected exception", sfe);
|
||||
}
|
||||
verify(mockRuntime, times(1)).exit(anyInt());
|
||||
} finally {
|
||||
if (cluster != null) {
|
||||
cluster.shutdown();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue