HDFS-5922. DN heartbeat thread can get stuck in tight loop. (Arpit Agarwal)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1571542 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ad70f26b1f
commit
440c3cd105
|
@ -613,6 +613,8 @@ Release 2.4.0 - UNRELEASED
|
||||||
HDFS-5981. PBImageXmlWriter generates malformed XML.
|
HDFS-5981. PBImageXmlWriter generates malformed XML.
|
||||||
(Haohui Mai via cnauroth)
|
(Haohui Mai via cnauroth)
|
||||||
|
|
||||||
|
HDFS-5922. DN heartbeat thread can get stuck in tight loop. (Arpit Agarwal)
|
||||||
|
|
||||||
Release 2.3.1 - UNRELEASED
|
Release 2.3.1 - UNRELEASED
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -1108,6 +1108,7 @@ public class DFSOutputStream extends FSOutputSummer
|
||||||
excluded.length > 0 ? excluded : null);
|
excluded.length > 0 ? excluded : null);
|
||||||
block = lb.getBlock();
|
block = lb.getBlock();
|
||||||
block.setNumBytes(0);
|
block.setNumBytes(0);
|
||||||
|
bytesSent = 0;
|
||||||
accessToken = lb.getBlockToken();
|
accessToken = lb.getBlockToken();
|
||||||
nodes = lb.getLocations();
|
nodes = lb.getLocations();
|
||||||
|
|
||||||
|
|
|
@ -101,7 +101,10 @@ class BPServiceActor implements Runnable {
|
||||||
private final Map<String, PerStoragePendingIncrementalBR>
|
private final Map<String, PerStoragePendingIncrementalBR>
|
||||||
pendingIncrementalBRperStorage = Maps.newHashMap();
|
pendingIncrementalBRperStorage = Maps.newHashMap();
|
||||||
|
|
||||||
private volatile int pendingReceivedRequests = 0;
|
// IBR = Incremental Block Report. If this flag is set then an IBR will be
|
||||||
|
// sent immediately by the actor thread without waiting for the IBR timer
|
||||||
|
// to elapse.
|
||||||
|
private volatile boolean sendImmediateIBR = false;
|
||||||
private volatile boolean shouldServiceRun = true;
|
private volatile boolean shouldServiceRun = true;
|
||||||
private final DataNode dn;
|
private final DataNode dn;
|
||||||
private final DNConf dnConf;
|
private final DNConf dnConf;
|
||||||
|
@ -283,12 +286,10 @@ class BPServiceActor implements Runnable {
|
||||||
if (perStorageMap.getBlockInfoCount() > 0) {
|
if (perStorageMap.getBlockInfoCount() > 0) {
|
||||||
// Send newly-received and deleted blockids to namenode
|
// Send newly-received and deleted blockids to namenode
|
||||||
ReceivedDeletedBlockInfo[] rdbi = perStorageMap.dequeueBlockInfos();
|
ReceivedDeletedBlockInfo[] rdbi = perStorageMap.dequeueBlockInfos();
|
||||||
pendingReceivedRequests =
|
|
||||||
(pendingReceivedRequests > rdbi.length ?
|
|
||||||
(pendingReceivedRequests - rdbi.length) : 0);
|
|
||||||
reports.add(new StorageReceivedDeletedBlocks(storageUuid, rdbi));
|
reports.add(new StorageReceivedDeletedBlocks(storageUuid, rdbi));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
sendImmediateIBR = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (reports.size() == 0) {
|
if (reports.size() == 0) {
|
||||||
|
@ -312,8 +313,8 @@ class BPServiceActor implements Runnable {
|
||||||
// didn't put something newer in the meantime.
|
// didn't put something newer in the meantime.
|
||||||
PerStoragePendingIncrementalBR perStorageMap =
|
PerStoragePendingIncrementalBR perStorageMap =
|
||||||
pendingIncrementalBRperStorage.get(report.getStorageID());
|
pendingIncrementalBRperStorage.get(report.getStorageID());
|
||||||
pendingReceivedRequests +=
|
|
||||||
perStorageMap.putMissingBlockInfos(report.getBlocks());
|
perStorageMap.putMissingBlockInfos(report.getBlocks());
|
||||||
|
sendImmediateIBR = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -371,7 +372,7 @@ class BPServiceActor implements Runnable {
|
||||||
ReceivedDeletedBlockInfo bInfo, String storageUuid) {
|
ReceivedDeletedBlockInfo bInfo, String storageUuid) {
|
||||||
synchronized (pendingIncrementalBRperStorage) {
|
synchronized (pendingIncrementalBRperStorage) {
|
||||||
addPendingReplicationBlockInfo(bInfo, storageUuid);
|
addPendingReplicationBlockInfo(bInfo, storageUuid);
|
||||||
pendingReceivedRequests++;
|
sendImmediateIBR = true;
|
||||||
pendingIncrementalBRperStorage.notifyAll();
|
pendingIncrementalBRperStorage.notifyAll();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -433,6 +434,11 @@ class BPServiceActor implements Runnable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
boolean hasPendingIBR() {
|
||||||
|
return sendImmediateIBR;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Report the list blocks to the Namenode
|
* Report the list blocks to the Namenode
|
||||||
* @return DatanodeCommands returned by the NN. May be null.
|
* @return DatanodeCommands returned by the NN. May be null.
|
||||||
|
@ -676,8 +682,8 @@ class BPServiceActor implements Runnable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (pendingReceivedRequests > 0
|
if (sendImmediateIBR ||
|
||||||
|| (startTime - lastDeletedReport > dnConf.deleteReportInterval)) {
|
(startTime - lastDeletedReport > dnConf.deleteReportInterval)) {
|
||||||
reportReceivedDeletedBlocks();
|
reportReceivedDeletedBlocks();
|
||||||
lastDeletedReport = startTime;
|
lastDeletedReport = startTime;
|
||||||
}
|
}
|
||||||
|
@ -701,7 +707,7 @@ class BPServiceActor implements Runnable {
|
||||||
long waitTime = dnConf.heartBeatInterval -
|
long waitTime = dnConf.heartBeatInterval -
|
||||||
(Time.now() - lastHeartbeat);
|
(Time.now() - lastHeartbeat);
|
||||||
synchronized(pendingIncrementalBRperStorage) {
|
synchronized(pendingIncrementalBRperStorage) {
|
||||||
if (waitTime > 0 && pendingReceivedRequests == 0) {
|
if (waitTime > 0 && !sendImmediateIBR) {
|
||||||
try {
|
try {
|
||||||
pendingIncrementalBRperStorage.wait(waitTime);
|
pendingIncrementalBRperStorage.wait(waitTime);
|
||||||
} catch (InterruptedException ie) {
|
} catch (InterruptedException ie) {
|
||||||
|
|
|
@ -0,0 +1,211 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.hdfs.server.datanode;
|
||||||
|
|
||||||
|
import static junit.framework.Assert.assertFalse;
|
||||||
|
import static org.mockito.Matchers.any;
|
||||||
|
import static org.mockito.Matchers.anyString;
|
||||||
|
import static org.mockito.Mockito.times;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.hdfs.*;
|
||||||
|
import org.apache.hadoop.hdfs.protocol.Block;
|
||||||
|
import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||||
|
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
|
||||||
|
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
|
||||||
|
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo.BlockStatus;
|
||||||
|
|
||||||
|
import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.mockito.Mockito;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verify that incremental block reports are generated in response to
|
||||||
|
* block additions/deletions.
|
||||||
|
*/
|
||||||
|
public class TestIncrementalBlockReports {
|
||||||
|
public static final Log LOG = LogFactory.getLog(TestIncrementalBlockReports.class);
|
||||||
|
|
||||||
|
private static final short DN_COUNT = 1;
|
||||||
|
private static final long DUMMY_BLOCK_ID = 5678;
|
||||||
|
private static final long DUMMY_BLOCK_LENGTH = 1024 * 1024;
|
||||||
|
private static final long DUMMY_BLOCK_GENSTAMP = 1000;
|
||||||
|
|
||||||
|
private MiniDFSCluster cluster = null;
|
||||||
|
private DistributedFileSystem fs;
|
||||||
|
private Configuration conf;
|
||||||
|
private NameNode singletonNn;
|
||||||
|
private DataNode singletonDn;
|
||||||
|
private BPOfferService bpos; // BPOS to use for block injection.
|
||||||
|
private BPServiceActor actor; // BPSA to use for block injection.
|
||||||
|
private String storageUuid; // DatanodeStorage to use for block injection.
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void startCluster() throws IOException {
|
||||||
|
conf = new HdfsConfiguration();
|
||||||
|
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(DN_COUNT).build();
|
||||||
|
fs = cluster.getFileSystem();
|
||||||
|
singletonNn = cluster.getNameNode();
|
||||||
|
singletonDn = cluster.getDataNodes().get(0);
|
||||||
|
bpos = singletonDn.getAllBpOs()[0];
|
||||||
|
actor = bpos.getBPServiceActors().get(0);
|
||||||
|
storageUuid = singletonDn.getFSDataset().getVolumes().get(0).getStorageID();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Block getDummyBlock() {
|
||||||
|
return new Block(DUMMY_BLOCK_ID, DUMMY_BLOCK_LENGTH, DUMMY_BLOCK_GENSTAMP);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Inject a fake 'received' block into the BPServiceActor state.
|
||||||
|
*/
|
||||||
|
private void injectBlockReceived() {
|
||||||
|
ReceivedDeletedBlockInfo rdbi = new ReceivedDeletedBlockInfo(
|
||||||
|
getDummyBlock(), BlockStatus.RECEIVED_BLOCK, null);
|
||||||
|
actor.notifyNamenodeBlockImmediately(rdbi, storageUuid);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Inject a fake 'deleted' block into the BPServiceActor state.
|
||||||
|
*/
|
||||||
|
private void injectBlockDeleted() {
|
||||||
|
ReceivedDeletedBlockInfo rdbi = new ReceivedDeletedBlockInfo(
|
||||||
|
getDummyBlock(), BlockStatus.DELETED_BLOCK, null);
|
||||||
|
actor.notifyNamenodeDeletedBlock(rdbi, storageUuid);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Spy on calls from the DN to the NN.
|
||||||
|
* @return spy object that can be used for Mockito verification.
|
||||||
|
*/
|
||||||
|
DatanodeProtocolClientSideTranslatorPB spyOnDnCallsToNn() {
|
||||||
|
return DataNodeTestUtils.spyOnBposToNN(singletonDn, singletonNn);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ensure that an IBR is generated immediately for a block received by
|
||||||
|
* the DN.
|
||||||
|
*
|
||||||
|
* @throws InterruptedException
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
@Test (timeout=60000)
|
||||||
|
public void testReportBlockReceived() throws InterruptedException, IOException {
|
||||||
|
try {
|
||||||
|
DatanodeProtocolClientSideTranslatorPB nnSpy = spyOnDnCallsToNn();
|
||||||
|
injectBlockReceived();
|
||||||
|
|
||||||
|
// Sleep for a very short time, this is necessary since the IBR is
|
||||||
|
// generated asynchronously.
|
||||||
|
Thread.sleep(2000);
|
||||||
|
|
||||||
|
// Ensure that the received block was reported immediately.
|
||||||
|
Mockito.verify(nnSpy, times(1)).blockReceivedAndDeleted(
|
||||||
|
any(DatanodeRegistration.class),
|
||||||
|
anyString(),
|
||||||
|
any(StorageReceivedDeletedBlocks[].class));
|
||||||
|
} finally {
|
||||||
|
cluster.shutdown();
|
||||||
|
cluster = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ensure that a delayed IBR is generated for a block deleted on the DN.
|
||||||
|
*
|
||||||
|
* @throws InterruptedException
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
@Test (timeout=60000)
|
||||||
|
public void testReportBlockDeleted() throws InterruptedException, IOException {
|
||||||
|
try {
|
||||||
|
// Trigger a block report to reset the IBR timer.
|
||||||
|
DataNodeTestUtils.triggerBlockReport(singletonDn);
|
||||||
|
|
||||||
|
// Spy on calls from the DN to the NN
|
||||||
|
DatanodeProtocolClientSideTranslatorPB nnSpy = spyOnDnCallsToNn();
|
||||||
|
injectBlockDeleted();
|
||||||
|
|
||||||
|
// Sleep for a very short time since IBR is generated
|
||||||
|
// asynchronously.
|
||||||
|
Thread.sleep(2000);
|
||||||
|
|
||||||
|
// Ensure that no block report was generated immediately.
|
||||||
|
// Deleted blocks are reported when the IBR timer elapses.
|
||||||
|
Mockito.verify(nnSpy, times(0)).blockReceivedAndDeleted(
|
||||||
|
any(DatanodeRegistration.class),
|
||||||
|
anyString(),
|
||||||
|
any(StorageReceivedDeletedBlocks[].class));
|
||||||
|
|
||||||
|
// Trigger a block report, this also triggers an IBR.
|
||||||
|
DataNodeTestUtils.triggerBlockReport(singletonDn);
|
||||||
|
Thread.sleep(2000);
|
||||||
|
|
||||||
|
// Ensure that the deleted block is reported.
|
||||||
|
Mockito.verify(nnSpy, times(1)).blockReceivedAndDeleted(
|
||||||
|
any(DatanodeRegistration.class),
|
||||||
|
anyString(),
|
||||||
|
any(StorageReceivedDeletedBlocks[].class));
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
cluster.shutdown();
|
||||||
|
cluster = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add a received block entry and then replace it. Ensure that a single
|
||||||
|
* IBR is generated and that pending receive request state is cleared.
|
||||||
|
* This test case verifies the failure in HDFS-5922.
|
||||||
|
*
|
||||||
|
* @throws InterruptedException
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
@Test (timeout=60000)
|
||||||
|
public void testReplaceReceivedBlock() throws InterruptedException, IOException {
|
||||||
|
try {
|
||||||
|
// Spy on calls from the DN to the NN
|
||||||
|
DatanodeProtocolClientSideTranslatorPB nnSpy = spyOnDnCallsToNn();
|
||||||
|
injectBlockReceived();
|
||||||
|
injectBlockReceived(); // Overwrite the existing entry.
|
||||||
|
|
||||||
|
// Sleep for a very short time since IBR is generated
|
||||||
|
// asynchronously.
|
||||||
|
Thread.sleep(2000);
|
||||||
|
|
||||||
|
// Ensure that the received block is reported.
|
||||||
|
Mockito.verify(nnSpy, times(1)).blockReceivedAndDeleted(
|
||||||
|
any(DatanodeRegistration.class),
|
||||||
|
anyString(),
|
||||||
|
any(StorageReceivedDeletedBlocks[].class));
|
||||||
|
|
||||||
|
// Ensure that no more IBRs are pending.
|
||||||
|
assertFalse(actor.hasPendingIBR());
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
cluster.shutdown();
|
||||||
|
cluster = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue