HDFS-9445. Datanode may deadlock while handling a bad volume. Contributed by Walter Su.
parent 0381556a42
commit 236a6ba13e
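The deadlock this commit fixes comes from a hold-and-call pattern: removeVolumes() was a synchronized method, so it held the FsDatasetImpl monitor while calling invalidate() for every replica on the failed volume. invalidate() in turn calls into the DataNode's short-circuit registry and cache manager, which take locks of their own; a thread holding one of those locks while waiting for the dataset monitor completes the cycle. The patch restructures removeVolumes() to collect the affected replicas under the monitor and invalidate them only after releasing it. The sketch below illustrates the two shapes with hypothetical names; it is not Hadoop code, just the pattern under discussion.

    import java.util.ArrayList;
    import java.util.List;

    public class TwoPhaseRemovalSketch {
      interface InvalidateCallback {
        void invalidate(String replica); // may take other locks internally
      }

      private final List<String> replicaMap = new ArrayList<>();

      // Deadlock-prone shape (before the fix): the callback runs inside
      // synchronized(this). If the callback's own lock is held by a thread
      // that is waiting for this monitor, the two threads block forever.
      synchronized void removeHoldingLock(InvalidateCallback cb) {
        for (String replica : new ArrayList<>(replicaMap)) {
          replicaMap.remove(replica);
          cb.invalidate(replica);
        }
      }

      // Fixed shape (after the fix): mutate shared state under the monitor,
      // remember what to report, then invoke the callback with no lock held.
      void removeThenInvalidate(InvalidateCallback cb) {
        List<String> toInvalidate;
        synchronized (this) {
          toInvalidate = new ArrayList<>(replicaMap);
          replicaMap.clear();
        }
        for (String replica : toInvalidate) {
          cb.invalidate(replica);
        }
      }
    }

The trade-off is that invalidation is no longer atomic with the map update, which is acceptable here because the replicas have already been unhooked from volumeMap before the callbacks run.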
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -24,6 +24,9 @@ Release 2.6.4 - UNRELEASED
 
   BUG FIXES
 
+    HDFS-9445. Datanode may deadlock while handling a bad volume.
+    (Walter Su via Kihwal)
+
 Release 2.6.3 - 2015-12-17
 
   INCOMPATIBLE CHANGES
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java

@@ -385,51 +385,70 @@ public void addVolume(final StorageLocation location,
   /**
    * Removes a collection of volumes from FsDataset.
    * @param volumes the root directories of the volumes.
    *
    * DataNode should call this function before calling
    * {@link DataStorage#removeVolumes(java.util.Collection)}.
    */
   @Override
-  public synchronized void removeVolumes(Collection<StorageLocation> volumes) {
+  public void removeVolumes(Collection<StorageLocation> volumes) {
     Set<String> volumeSet = new HashSet<String>();
     for (StorageLocation sl : volumes) {
       volumeSet.add(sl.getFile().getAbsolutePath());
     }
-    for (int idx = 0; idx < dataStorage.getNumStorageDirs(); idx++) {
-      Storage.StorageDirectory sd = dataStorage.getStorageDir(idx);
-      String volume = sd.getRoot().getAbsolutePath();
-      if (volumeSet.contains(volume)) {
-        LOG.info("Removing " + volume + " from FsDataset.");
-
-        // Disable the volume from the service.
-        asyncDiskService.removeVolume(sd.getCurrentDir());
-        this.volumes.removeVolume(sd.getRoot());
-
-        // Removed all replica information for the blocks on the volume. Unlike
-        // updating the volumeMap in addVolume(), this operation does not scan
-        // disks.
-        for (String bpid : volumeMap.getBlockPoolList()) {
-          List<Block> blocks = new ArrayList<Block>();
-          for (Iterator<ReplicaInfo> it = volumeMap.replicas(bpid).iterator();
-              it.hasNext(); ) {
-            ReplicaInfo block = it.next();
-            String absBasePath =
-                new File(block.getVolume().getBasePath()).getAbsolutePath();
-            if (absBasePath.equals(volume)) {
-              invalidate(bpid, block);
-              blocks.add(block);
-              it.remove();
+
+    Map<String, List<ReplicaInfo>> blkToInvalidate =
+        new HashMap<String, List<ReplicaInfo>>();
+    List<String> storageToRemove = new ArrayList<String>();
+    synchronized (this) {
+      for (int idx = 0; idx < dataStorage.getNumStorageDirs(); idx++) {
+        Storage.StorageDirectory sd = dataStorage.getStorageDir(idx);
+        String volume = sd.getRoot().getAbsolutePath();
+        if (volumeSet.contains(volume)) {
+          LOG.info("Removing " + volume + " from FsDataset.");
+
+          // Disable the volume from the service.
+          asyncDiskService.removeVolume(sd.getCurrentDir());
+          this.volumes.removeVolume(sd.getRoot());
+
+          // Removed all replica information for the blocks on the volume.
+          // Unlike updating the volumeMap in addVolume(), this operation does
+          // not scan disks.
+          for (String bpid : volumeMap.getBlockPoolList()) {
+            List<ReplicaInfo> blocks = new ArrayList<ReplicaInfo>();
+            for (Iterator<ReplicaInfo> it = volumeMap.replicas(bpid).iterator();
+                it.hasNext(); ) {
+              ReplicaInfo block = it.next();
+              String absBasePath =
+                  new File(block.getVolume().getBasePath()).getAbsolutePath();
+              if (absBasePath.equals(volume)) {
+                blocks.add(block);
+                it.remove();
+              }
             }
+            blkToInvalidate.put(bpid, blocks);
+            // Delete blocks from the block scanner in batch.
+            datanode.getBlockScanner().deleteBlocks(bpid,
+                blocks.toArray(new Block[blocks.size()]));
           }
-          // Delete blocks from the block scanner in batch.
-          datanode.getBlockScanner().deleteBlocks(bpid,
-              blocks.toArray(new Block[blocks.size()]));
-        }
 
-        storageMap.remove(sd.getStorageUuid());
+          storageToRemove.add(sd.getStorageUuid());
+        }
       }
     }
-    setupAsyncLazyPersistThreads();
+
+    // Call this outside the lock.
+    for (Map.Entry<String, List<ReplicaInfo>> entry :
+        blkToInvalidate.entrySet()) {
+      String bpid = entry.getKey();
+      List<ReplicaInfo> blocks = entry.getValue();
+      for (ReplicaInfo block : blocks) {
+        invalidate(bpid, block);
+      }
+    }
+
+    synchronized (this) {
+      for (String storageUuid : storageToRemove) {
+        storageMap.remove(storageUuid);
+      }
+    }
+    setupAsyncLazyPersistThreads();
   }
 
   private StorageType getStorageTypeFromLocations(
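Net effect of the hunk above: removeVolumes() is no longer synchronized as a whole. It takes the monitor once to detach the volume, prune volumeMap, and record the pending work in blkToInvalidate and storageToRemove; it then calls invalidate(bpid, block) for the collected replicas with no lock held ("Call this outside the lock"); and it takes the monitor a second time to drop the storage entries before setupAsyncLazyPersistThreads(). The next hunk, in the same file, adjusts invalidate() itself.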
@@ -1639,15 +1658,11 @@ public void invalidate(String bpid, Block invalidBlks[]) throws IOException {
   public void invalidate(String bpid, ReplicaInfo block) {
     // If a DFSClient has the replica in its cache of short-circuit file
     // descriptors (and the client is using ShortCircuitShm), invalidate it.
-    // The short-circuit registry is null in the unit tests, because the
-    // datanode is mock object.
-    if (datanode.getShortCircuitRegistry() != null) {
-      datanode.getShortCircuitRegistry().processBlockInvalidation(
-          new ExtendedBlockId(block.getBlockId(), bpid));
+    datanode.getShortCircuitRegistry().processBlockInvalidation(
+        new ExtendedBlockId(block.getBlockId(), bpid));
 
-      // If the block is cached, start uncaching it.
-      cacheManager.uncacheBlock(bpid, block.getBlockId());
-    }
+    // If the block is cached, start uncaching it.
+    cacheManager.uncacheBlock(bpid, block.getBlockId());
 
     datanode.notifyNamenodeDeletedBlock(new ExtendedBlock(bpid, block),
         block.getStorageUuid());
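This hunk removes the test-only null guard around the short-circuit registry: processBlockInvalidation() and uncacheBlock() now run unconditionally. Note that the old code also skipped uncacheBlock() whenever the registry was null, since the call sat inside the guard. The guard existed because unit tests mock the DataNode; the test change below supplies a real ShortCircuitRegistry instead, so the guard is no longer needed.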
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java

@@ -33,6 +33,7 @@
 import org.apache.hadoop.hdfs.server.datanode.DataStorage;
 import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica;
 import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;
+import org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry;
 import org.apache.hadoop.hdfs.server.datanode.StorageLocation;
 import org.apache.hadoop.hdfs.server.datanode.fsdataset.RoundRobinVolumeChoosingPolicy;
 import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;

@@ -127,6 +128,9 @@ public void setUp() throws IOException {
     when(datanode.getConf()).thenReturn(conf);
     when(datanode.getDnConf()).thenReturn(dnConf);
     when(datanode.getBlockScanner()).thenReturn(scanner);
+    final ShortCircuitRegistry shortCircuitRegistry =
+        new ShortCircuitRegistry(conf);
+    when(datanode.getShortCircuitRegistry()).thenReturn(shortCircuitRegistry);
 
     createStorageDirs(storage, conf, NUM_INIT_VOLUMES);
     dataset = new FsDatasetImpl(datanode, storage, conf);
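For reference, the new setUp() lines condense to the following standalone helper. The class name, method name, and imports are assumptions for illustration, not part of the patch; only the three mock/registry lines mirror what the test now does.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.server.datanode.DataNode;
    import org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry;

    import static org.mockito.Mockito.mock;
    import static org.mockito.Mockito.when;

    // Hypothetical helper: builds a Mockito-mocked DataNode whose
    // getShortCircuitRegistry() returns a real registry, so
    // FsDatasetImpl.invalidate() no longer needs a null check in tests.
    class MockDataNodeFactory {
      static DataNode withRealShortCircuitRegistry(Configuration conf)
          throws Exception {
        DataNode datanode = mock(DataNode.class);
        // ShortCircuitRegistry(conf) can throw if its shared-memory setup
        // fails; the test lets that propagate as a setup error.
        ShortCircuitRegistry registry = new ShortCircuitRegistry(conf);
        when(datanode.getShortCircuitRegistry()).thenReturn(registry);
        return datanode;
      }
    }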