HDFS-14527. Stop all DataNodes may result in NN terminate. Contributed by He Xiaoqiao.

This commit is contained in:
Inigo Goiri 2019-06-06 10:20:28 -07:00
parent 0b8a2608e0
commit 944adc61b1
3 changed files with 114 additions and 2 deletions

View File

@ -348,7 +348,9 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
} }
// No calculation needed when there is only one rack or picking one node. // No calculation needed when there is only one rack or picking one node.
int numOfRacks = clusterMap.getNumOfRacks(); int numOfRacks = clusterMap.getNumOfRacks();
if (numOfRacks == 1 || totalNumOfReplicas <= 1) { // HDFS-14527 return default when numOfRacks = 0 to avoid
// ArithmeticException when calc maxNodesPerRack at following logic.
if (numOfRacks <= 1 || totalNumOfReplicas <= 1) {
return new int[] {numOfReplicas, totalNumOfReplicas}; return new int[] {numOfReplicas, totalNumOfReplicas};
} }

View File

@ -43,7 +43,9 @@ public class BlockPlacementPolicyRackFaultTolerant extends BlockPlacementPolicyD
} }
// No calculation needed when there is only one rack or picking one node. // No calculation needed when there is only one rack or picking one node.
int numOfRacks = clusterMap.getNumOfRacks(); int numOfRacks = clusterMap.getNumOfRacks();
if (numOfRacks == 1 || totalNumOfReplicas <= 1) { // HDFS-14527 return default when numOfRacks = 0 to avoid
// ArithmeticException when calc maxNodesPerRack at following logic.
if (numOfRacks <= 1 || totalNumOfReplicas <= 1) {
return new int[] {numOfReplicas, totalNumOfReplicas}; return new int[] {numOfReplicas, totalNumOfReplicas};
} }
// If more racks than replicas, put one replica per rack. // If more racks than replicas, put one replica per rack.

View File

@ -0,0 +1,108 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.blockmanagement;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.TestBlockStoragePolicy;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.test.GenericTestUtils.DelayAnswer;
import org.junit.Test;
import java.util.ArrayList;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import static org.apache.hadoop.fs.contract.hdfs.HDFSContract.BLOCK_SIZE;
import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.spy;
/**
* This class tests RedundancyMonitor in BlockManager.
*/
public class TestRedundancyMonitor {
private static final String FILENAME = "/dummyfile.txt";
/**
* RedundancyMonitor invoke choose target out of global lock when
* #computeDatanodeWork. However it may result in NN terminate when choose
* target meet runtime exception(ArithmeticException) since we stop all
* DataNodes during that time.
* Verify that NN should not terminate even stop all datanodes.
*/
@Test
public void testChooseTargetWhenAllDataNodesStop() throws Throwable {
HdfsConfiguration conf = new HdfsConfiguration();
String[] hosts = new String[]{"host1", "host2"};
String[] racks = new String[]{"/d1/r1", "/d1/r1"};
try (MiniDFSCluster miniCluster = new MiniDFSCluster.Builder(conf)
.racks(racks).hosts(hosts).numDataNodes(hosts.length).build()) {
miniCluster.waitActive();
FSNamesystem fsn = miniCluster.getNamesystem();
BlockManager blockManager = fsn.getBlockManager();
BlockPlacementPolicyDefault replicator
= (BlockPlacementPolicyDefault) blockManager
.getBlockPlacementPolicy();
Set<DatanodeDescriptor> dns = blockManager.getDatanodeManager()
.getDatanodes();
DelayAnswer delayer = new DelayAnswer(BlockPlacementPolicyDefault.LOG);
NetworkTopology clusterMap = replicator.clusterMap;
NetworkTopology spyClusterMap = spy(clusterMap);
replicator.clusterMap = spyClusterMap;
doAnswer(delayer).when(spyClusterMap).getNumOfRacks();
ExecutorService pool = Executors.newFixedThreadPool(2);
// Trigger chooseTarget
Future<Void> chooseTargetFuture = pool.submit(() -> {
replicator.chooseTarget(FILENAME, 2, dns.iterator().next(),
new ArrayList<DatanodeStorageInfo>(), false, null, BLOCK_SIZE,
TestBlockStoragePolicy.DEFAULT_STORAGE_POLICY, null);
return null;
});
// Wait until chooseTarget calls NetworkTopology#getNumOfRacks
delayer.waitForCall();
// Remove all DataNodes
Future<Void> stopDatanodesFuture = pool.submit(() -> {
for (DatanodeDescriptor dn : dns) {
spyClusterMap.remove(dn);
}
return null;
});
// Wait stopDatanodesFuture run finish
stopDatanodesFuture.get();
// Allow chooseTarget to proceed
delayer.proceed();
try {
chooseTargetFuture.get();
} catch (ExecutionException ee) {
throw ee.getCause();
}
}
}
}