HDFS-5922. DN heartbeat thread can get stuck in tight loop. (Arpit Agarwal)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1571542 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Arpit Agarwal 2014-02-25 02:16:29 +00:00
parent ad70f26b1f
commit 440c3cd105
4 changed files with 230 additions and 10 deletions

View File

@ -613,6 +613,8 @@ Release 2.4.0 - UNRELEASED
HDFS-5981. PBImageXmlWriter generates malformed XML.
(Haohui Mai via cnauroth)
HDFS-5922. DN heartbeat thread can get stuck in tight loop. (Arpit Agarwal)
Release 2.3.1 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -1108,6 +1108,7 @@ public class DFSOutputStream extends FSOutputSummer
excluded.length > 0 ? excluded : null); excluded.length > 0 ? excluded : null);
block = lb.getBlock(); block = lb.getBlock();
block.setNumBytes(0); block.setNumBytes(0);
bytesSent = 0;
accessToken = lb.getBlockToken(); accessToken = lb.getBlockToken();
nodes = lb.getLocations(); nodes = lb.getLocations();

View File

@ -101,7 +101,10 @@ class BPServiceActor implements Runnable {
private final Map<String, PerStoragePendingIncrementalBR> private final Map<String, PerStoragePendingIncrementalBR>
pendingIncrementalBRperStorage = Maps.newHashMap(); pendingIncrementalBRperStorage = Maps.newHashMap();
private volatile int pendingReceivedRequests = 0; // IBR = Incremental Block Report. If this flag is set then an IBR will be
// sent immediately by the actor thread without waiting for the IBR timer
// to elapse.
private volatile boolean sendImmediateIBR = false;
private volatile boolean shouldServiceRun = true; private volatile boolean shouldServiceRun = true;
private final DataNode dn; private final DataNode dn;
private final DNConf dnConf; private final DNConf dnConf;
@ -283,12 +286,10 @@ class BPServiceActor implements Runnable {
if (perStorageMap.getBlockInfoCount() > 0) { if (perStorageMap.getBlockInfoCount() > 0) {
// Send newly-received and deleted blockids to namenode // Send newly-received and deleted blockids to namenode
ReceivedDeletedBlockInfo[] rdbi = perStorageMap.dequeueBlockInfos(); ReceivedDeletedBlockInfo[] rdbi = perStorageMap.dequeueBlockInfos();
pendingReceivedRequests =
(pendingReceivedRequests > rdbi.length ?
(pendingReceivedRequests - rdbi.length) : 0);
reports.add(new StorageReceivedDeletedBlocks(storageUuid, rdbi)); reports.add(new StorageReceivedDeletedBlocks(storageUuid, rdbi));
} }
} }
sendImmediateIBR = false;
} }
if (reports.size() == 0) { if (reports.size() == 0) {
@ -312,8 +313,8 @@ class BPServiceActor implements Runnable {
// didn't put something newer in the meantime. // didn't put something newer in the meantime.
PerStoragePendingIncrementalBR perStorageMap = PerStoragePendingIncrementalBR perStorageMap =
pendingIncrementalBRperStorage.get(report.getStorageID()); pendingIncrementalBRperStorage.get(report.getStorageID());
pendingReceivedRequests +=
perStorageMap.putMissingBlockInfos(report.getBlocks()); perStorageMap.putMissingBlockInfos(report.getBlocks());
sendImmediateIBR = true;
} }
} }
} }
@ -371,7 +372,7 @@ class BPServiceActor implements Runnable {
ReceivedDeletedBlockInfo bInfo, String storageUuid) { ReceivedDeletedBlockInfo bInfo, String storageUuid) {
synchronized (pendingIncrementalBRperStorage) { synchronized (pendingIncrementalBRperStorage) {
addPendingReplicationBlockInfo(bInfo, storageUuid); addPendingReplicationBlockInfo(bInfo, storageUuid);
pendingReceivedRequests++; sendImmediateIBR = true;
pendingIncrementalBRperStorage.notifyAll(); pendingIncrementalBRperStorage.notifyAll();
} }
} }
@ -433,6 +434,11 @@ class BPServiceActor implements Runnable {
} }
} }
/**
 * Test-only accessor for the immediate-IBR (Incremental Block Report) flag.
 *
 * @return the current value of {@code sendImmediateIBR}; {@code true} means
 *         an IBR has been requested and has not yet been cleared by the
 *         reporting path.
 */
@VisibleForTesting
boolean hasPendingIBR() {
return sendImmediateIBR;
}
/** /**
* Report the list blocks to the Namenode * Report the list blocks to the Namenode
* @return DatanodeCommands returned by the NN. May be null. * @return DatanodeCommands returned by the NN. May be null.
@ -676,8 +682,8 @@ class BPServiceActor implements Runnable {
} }
} }
} }
if (pendingReceivedRequests > 0 if (sendImmediateIBR ||
|| (startTime - lastDeletedReport > dnConf.deleteReportInterval)) { (startTime - lastDeletedReport > dnConf.deleteReportInterval)) {
reportReceivedDeletedBlocks(); reportReceivedDeletedBlocks();
lastDeletedReport = startTime; lastDeletedReport = startTime;
} }
@ -701,7 +707,7 @@ class BPServiceActor implements Runnable {
long waitTime = dnConf.heartBeatInterval - long waitTime = dnConf.heartBeatInterval -
(Time.now() - lastHeartbeat); (Time.now() - lastHeartbeat);
synchronized(pendingIncrementalBRperStorage) { synchronized(pendingIncrementalBRperStorage) {
if (waitTime > 0 && pendingReceivedRequests == 0) { if (waitTime > 0 && !sendImmediateIBR) {
try { try {
pendingIncrementalBRperStorage.wait(waitTime); pendingIncrementalBRperStorage.wait(waitTime);
} catch (InterruptedException ie) { } catch (InterruptedException ie) {

View File

@ -0,0 +1,211 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode;
import static junit.framework.Assert.assertFalse;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyString;
import static org.mockito.Mockito.times;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.*;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo.BlockStatus;
import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
import org.junit.Before;
import org.junit.Test;
import org.mockito.Mockito;
/**
* Verify that incremental block reports are generated in response to
* block additions/deletions.
*/
public class TestIncrementalBlockReports {
  public static final Log LOG =
      LogFactory.getLog(TestIncrementalBlockReports.class);

  private static final short DN_COUNT = 1;
  private static final long DUMMY_BLOCK_ID = 5678;
  private static final long DUMMY_BLOCK_LENGTH = 1024 * 1024;
  private static final long DUMMY_BLOCK_GENSTAMP = 1000;

  /**
   * How long each test pauses to let the actor thread act on an injected
   * block; IBRs are produced asynchronously, so the tests cannot observe
   * them synchronously.
   */
  private static final long IBR_SETTLE_MILLIS = 2000;

  private MiniDFSCluster cluster = null;
  private DistributedFileSystem fs;
  private Configuration conf;
  private NameNode singletonNn;
  private DataNode singletonDn;
  private BPOfferService bpos;      // BPOS used for block injection.
  private BPServiceActor actor;     // BPSA used for block injection.
  private String storageUuid;       // Storage used for block injection.

  /**
   * Bring up a single-DataNode mini cluster and cache the handles
   * (NameNode, DataNode, first BPOfferService, its first actor and the
   * first volume's storage ID) that the tests inject blocks through.
   *
   * @throws IOException if the cluster or its file system cannot be created
   */
  @Before
  public void startCluster() throws IOException {
    conf = new HdfsConfiguration();
    cluster = new MiniDFSCluster.Builder(conf)
        .numDataNodes(DN_COUNT)
        .build();
    fs = cluster.getFileSystem();
    singletonNn = cluster.getNameNode();
    singletonDn = cluster.getDataNodes().get(0);
    bpos = singletonDn.getAllBpOs()[0];
    actor = bpos.getBPServiceActors().get(0);
    storageUuid =
        singletonDn.getFSDataset().getVolumes().get(0).getStorageID();
  }

  /**
   * Build a report entry for the fixed dummy block with the given status
   * and no deletion hint.
   */
  private static ReceivedDeletedBlockInfo dummyBlockInfo(BlockStatus status) {
    final Block dummy = new Block(
        DUMMY_BLOCK_ID, DUMMY_BLOCK_LENGTH, DUMMY_BLOCK_GENSTAMP);
    return new ReceivedDeletedBlockInfo(dummy, status, null);
  }

  /**
   * Hand the actor a fake 'received' entry for the dummy block via
   * {@code notifyNamenodeBlockImmediately}.
   */
  private void injectBlockReceived() {
    actor.notifyNamenodeBlockImmediately(
        dummyBlockInfo(BlockStatus.RECEIVED_BLOCK), storageUuid);
  }

  /**
   * Hand the actor a fake 'deleted' entry for the dummy block via
   * {@code notifyNamenodeDeletedBlock}.
   */
  private void injectBlockDeleted() {
    actor.notifyNamenodeDeletedBlock(
        dummyBlockInfo(BlockStatus.DELETED_BLOCK), storageUuid);
  }

  /**
   * Install a spy on the DataNode-to-NameNode protocol translator.
   *
   * @return spy object that can be used for Mockito verification.
   */
  DatanodeProtocolClientSideTranslatorPB spyOnDnCallsToNn() {
    return DataNodeTestUtils.spyOnBposToNN(singletonDn, singletonNn);
  }

  /**
   * Pause the test thread so that the asynchronously generated IBR (if any)
   * has a chance to be sent.
   */
  private static void awaitAsyncIbr() throws InterruptedException {
    Thread.sleep(IBR_SETTLE_MILLIS);
  }

  /**
   * Assert that {@code blockReceivedAndDeleted} has been invoked on the spy
   * exactly {@code expectedCalls} times, with any arguments.
   */
  private static void verifyIbrCalls(
      DatanodeProtocolClientSideTranslatorPB spy, int expectedCalls)
      throws IOException {
    Mockito.verify(spy, times(expectedCalls)).blockReceivedAndDeleted(
        any(DatanodeRegistration.class),
        anyString(),
        any(StorageReceivedDeletedBlocks[].class));
  }

  /** Stop the mini cluster and drop the reference to it. */
  private void shutdownCluster() {
    cluster.shutdown();
    cluster = null;
  }

  /**
   * A block received by the DN must produce an IBR right away.
   *
   * @throws InterruptedException if the settling sleep is interrupted
   * @throws IOException declared by the spied protocol call
   */
  @Test(timeout = 60000)
  public void testReportBlockReceived()
      throws InterruptedException, IOException {
    try {
      final DatanodeProtocolClientSideTranslatorPB spy = spyOnDnCallsToNn();
      injectBlockReceived();

      // The IBR is sent from another thread; give it a moment.
      awaitAsyncIbr();

      // Exactly one IBR must have gone out for the received block.
      verifyIbrCalls(spy, 1);
    } finally {
      shutdownCluster();
    }
  }

  /**
   * A block deleted on the DN must not produce an IBR immediately; it is
   * reported later, once a subsequent report is triggered.
   *
   * @throws InterruptedException if a settling sleep is interrupted
   * @throws IOException if triggering a block report fails
   */
  @Test(timeout = 60000)
  public void testReportBlockDeleted()
      throws InterruptedException, IOException {
    try {
      // Start from a freshly reset IBR timer.
      DataNodeTestUtils.triggerBlockReport(singletonDn);

      // Only calls made after this point are observed.
      final DatanodeProtocolClientSideTranslatorPB spy = spyOnDnCallsToNn();
      injectBlockDeleted();

      awaitAsyncIbr();

      // Deletions wait for the IBR timer, so nothing may have been sent yet.
      verifyIbrCalls(spy, 0);

      // Triggering a block report also triggers an IBR.
      DataNodeTestUtils.triggerBlockReport(singletonDn);
      awaitAsyncIbr();

      // Now the deleted block must have been reported, exactly once.
      verifyIbrCalls(spy, 1);
    } finally {
      shutdownCluster();
    }
  }

  /**
   * Queue a received-block entry, then overwrite it with a second one.
   * Exactly one IBR must be sent and no IBR may remain pending afterwards.
   * This is the regression test for HDFS-5922.
   *
   * @throws InterruptedException if the settling sleep is interrupted
   * @throws IOException declared by the spied protocol call
   */
  @Test(timeout = 60000)
  public void testReplaceReceivedBlock()
      throws InterruptedException, IOException {
    try {
      final DatanodeProtocolClientSideTranslatorPB spy = spyOnDnCallsToNn();

      injectBlockReceived();
      injectBlockReceived();  // Replaces the entry queued just above.

      awaitAsyncIbr();

      // Both injections collapse into a single report.
      verifyIbrCalls(spy, 1);

      // The actor must not believe another IBR is still owed.
      assertFalse(actor.hasPendingIBR());
    } finally {
      shutdownCluster();
    }
  }
}