HDFS-3026. HA: Handle failure during HA state transition. Contributed by Aaron T. Myers.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1337031 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6795b5ad8f
commit
97a8118315
|
@ -532,6 +532,8 @@ Release 2.0.0 - UNRELEASED
|
||||||
necessarily a BlockInfoUnderConstruction, so do not cast it in
|
necessarily a BlockInfoUnderConstruction, so do not cast it in
|
||||||
FSNamesystem.recoverLeaseInternal(..). (szetszwo)
|
FSNamesystem.recoverLeaseInternal(..). (szetszwo)
|
||||||
|
|
||||||
|
HDFS-3026. HA: Handle failure during HA state transition. (atm)
|
||||||
|
|
||||||
BREAKDOWN OF HDFS-1623 SUBTASKS
|
BREAKDOWN OF HDFS-1623 SUBTASKS
|
||||||
|
|
||||||
HDFS-2179. Add fencing framework and mechanisms for NameNode HA. (todd)
|
HDFS-2179. Add fencing framework and mechanisms for NameNode HA. (todd)
|
||||||
|
|
|
@ -206,6 +206,7 @@ public class NameNode {
|
||||||
private final boolean haEnabled;
|
private final boolean haEnabled;
|
||||||
private final HAContext haContext;
|
private final HAContext haContext;
|
||||||
protected boolean allowStaleStandbyReads;
|
protected boolean allowStaleStandbyReads;
|
||||||
|
private Runtime runtime = Runtime.getRuntime();
|
||||||
|
|
||||||
|
|
||||||
/** httpServer */
|
/** httpServer */
|
||||||
|
@ -481,11 +482,16 @@ public class NameNode {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void startTrashEmptier(Configuration conf) throws IOException {
|
private void startTrashEmptier(Configuration conf) throws IOException {
|
||||||
long trashInterval
|
long trashInterval = conf.getLong(
|
||||||
= conf.getLong(CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY,
|
CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY,
|
||||||
CommonConfigurationKeys.FS_TRASH_INTERVAL_DEFAULT);
|
CommonConfigurationKeys.FS_TRASH_INTERVAL_DEFAULT);
|
||||||
if(trashInterval == 0)
|
if (trashInterval == 0) {
|
||||||
return;
|
return;
|
||||||
|
} else if (trashInterval < 0) {
|
||||||
|
throw new IOException("Cannot start trash emptier with negative interval."
|
||||||
|
+ " Set " + CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY + " to a"
|
||||||
|
+ " positive value.");
|
||||||
|
}
|
||||||
this.emptier = new Thread(new Trash(conf).getEmptier(), "Trash Emptier");
|
this.emptier = new Thread(new Trash(conf).getEmptier(), "Trash Emptier");
|
||||||
this.emptier.setDaemon(true);
|
this.emptier.setDaemon(true);
|
||||||
this.emptier.start();
|
this.emptier.start();
|
||||||
|
@ -1236,13 +1242,36 @@ public class NameNode {
|
||||||
return state.getServiceState();
|
return state.getServiceState();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public synchronized void setRuntimeForTesting(Runtime runtime) {
|
||||||
|
this.runtime = runtime;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class used as expose {@link NameNode} as context to {@link HAState}
|
* Shutdown the NN immediately in an ungraceful way. Used when it would be
|
||||||
|
* unsafe for the NN to continue operating, e.g. during a failed HA state
|
||||||
|
* transition.
|
||||||
*
|
*
|
||||||
* TODO(HA):
|
* @param t exception which warrants the shutdown. Printed to the NN log
|
||||||
* When entering and exiting state, on failing to start services,
|
* before exit.
|
||||||
* appropriate action is needed todo either shutdown the node or recover
|
* @throws ServiceFailedException thrown only for testing.
|
||||||
* from failure.
|
*/
|
||||||
|
private synchronized void doImmediateShutdown(Throwable t)
|
||||||
|
throws ServiceFailedException {
|
||||||
|
String message = "Error encountered requiring NN shutdown. " +
|
||||||
|
"Shutting down immediately.";
|
||||||
|
try {
|
||||||
|
LOG.fatal(message, t);
|
||||||
|
} catch (Throwable ignored) {
|
||||||
|
// This is unlikely to happen, but there's nothing we can do if it does.
|
||||||
|
}
|
||||||
|
runtime.exit(1);
|
||||||
|
// This code is only reached during testing, when runtime is stubbed out.
|
||||||
|
throw new ServiceFailedException(message, t);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class used to expose {@link NameNode} as context to {@link HAState}
|
||||||
*/
|
*/
|
||||||
protected class NameNodeHAContext implements HAContext {
|
protected class NameNodeHAContext implements HAContext {
|
||||||
@Override
|
@Override
|
||||||
|
@ -1257,32 +1286,52 @@ public class NameNode {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void startActiveServices() throws IOException {
|
public void startActiveServices() throws IOException {
|
||||||
namesystem.startActiveServices();
|
try {
|
||||||
startTrashEmptier(conf);
|
namesystem.startActiveServices();
|
||||||
|
startTrashEmptier(conf);
|
||||||
|
} catch (Throwable t) {
|
||||||
|
doImmediateShutdown(t);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void stopActiveServices() throws IOException {
|
public void stopActiveServices() throws IOException {
|
||||||
if (namesystem != null) {
|
try {
|
||||||
namesystem.stopActiveServices();
|
if (namesystem != null) {
|
||||||
|
namesystem.stopActiveServices();
|
||||||
|
}
|
||||||
|
stopTrashEmptier();
|
||||||
|
} catch (Throwable t) {
|
||||||
|
doImmediateShutdown(t);
|
||||||
}
|
}
|
||||||
stopTrashEmptier();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void startStandbyServices() throws IOException {
|
public void startStandbyServices() throws IOException {
|
||||||
namesystem.startStandbyServices(conf);
|
try {
|
||||||
|
namesystem.startStandbyServices(conf);
|
||||||
|
} catch (Throwable t) {
|
||||||
|
doImmediateShutdown(t);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void prepareToStopStandbyServices() throws ServiceFailedException {
|
public void prepareToStopStandbyServices() throws ServiceFailedException {
|
||||||
namesystem.prepareToStopStandbyServices();
|
try {
|
||||||
|
namesystem.prepareToStopStandbyServices();
|
||||||
|
} catch (Throwable t) {
|
||||||
|
doImmediateShutdown(t);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void stopStandbyServices() throws IOException {
|
public void stopStandbyServices() throws IOException {
|
||||||
if (namesystem != null) {
|
try {
|
||||||
namesystem.stopStandbyServices();
|
if (namesystem != null) {
|
||||||
|
namesystem.stopStandbyServices();
|
||||||
|
}
|
||||||
|
} catch (Throwable t) {
|
||||||
|
doImmediateShutdown(t);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,80 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.hdfs.server.namenode.ha;
|
||||||
|
|
||||||
|
import static org.apache.hadoop.test.GenericTestUtils.assertExceptionContains;
|
||||||
|
import static org.junit.Assert.fail;
|
||||||
|
import static org.mockito.Matchers.anyInt;
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.mockito.Mockito.times;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.CommonConfigurationKeys;
|
||||||
|
import org.apache.hadoop.ha.ServiceFailedException;
|
||||||
|
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||||
|
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests to verify the behavior of a NameNode that fails to complete an HA state transition.
|
||||||
|
*/
|
||||||
|
public class TestStateTransitionFailure {
|
||||||
|
|
||||||
|
public static final Log LOG = LogFactory.getLog(TestStateTransitionFailure.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ensure that a failure to fully transition to the active state causes a
|
||||||
|
* shutdown of the NameNode.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testFailureToTransitionCausesShutdown() throws IOException {
|
||||||
|
MiniDFSCluster cluster = null;
|
||||||
|
try {
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
// Set an illegal value for the trash emptier interval. This will cause
|
||||||
|
// the NN to fail to transition to the active state.
|
||||||
|
conf.setLong(CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY, -1);
|
||||||
|
cluster = new MiniDFSCluster.Builder(conf)
|
||||||
|
.nnTopology(MiniDFSNNTopology.simpleHATopology())
|
||||||
|
.numDataNodes(0)
|
||||||
|
.build();
|
||||||
|
cluster.waitActive();
|
||||||
|
Runtime mockRuntime = mock(Runtime.class);
|
||||||
|
cluster.getNameNode(0).setRuntimeForTesting(mockRuntime);
|
||||||
|
verify(mockRuntime, times(0)).exit(anyInt());
|
||||||
|
try {
|
||||||
|
cluster.transitionToActive(0);
|
||||||
|
fail("Transitioned to active but should not have been able to.");
|
||||||
|
} catch (ServiceFailedException sfe) {
|
||||||
|
assertExceptionContains("Error encountered requiring NN shutdown. " +
|
||||||
|
"Shutting down immediately.", sfe);
|
||||||
|
LOG.info("got expected exception", sfe);
|
||||||
|
}
|
||||||
|
verify(mockRuntime, times(1)).exit(anyInt());
|
||||||
|
} finally {
|
||||||
|
if (cluster != null) {
|
||||||
|
cluster.shutdown();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue