Incorrect locking in FsVolumeList#checkDirs can hang datanodes (Noah Lorang via Colin P. McCabe)

This commit is contained in:
Colin Patrick Mccabe 2014-12-09 10:55:17 -08:00
parent be86237c09
commit d8352b9b2b
2 changed files with 31 additions and 28 deletions

View File

@ -574,6 +574,9 @@ Release 2.6.1 - UNRELEASED
HDFS-4882. Prevent the Namenode's LeaseManager from looping forever in
checkLeases (Ravi Prakash via Colin P. McCabe)
HDFS-7489. Incorrect locking in FsVolumeList#checkDirs can hang datanodes
(Noah Lorang via Colin P. McCabe)
Release 2.6.0 - 2014-11-18
INCOMPATIBLE CHANGES

View File

@ -36,6 +36,7 @@ class FsVolumeList {
* This list is replaced on modification holding "this" lock.
*/
volatile List<FsVolumeImpl> volumes = null;
private Object checkDirsMutex = new Object();
private final VolumeChoosingPolicy<FsVolumeImpl> blockChooser;
private volatile int numFailedVolumes;
@ -167,11 +168,12 @@ public void run() {
* Calls {@link FsVolumeImpl#checkDirs()} on each volume, removing any
* volumes from the active list that result in a DiskErrorException.
*
* This method is synchronized to allow only one instance of checkDirs()
* call
* Uses checkDirsMutex to allow only one checkDirs() call to run at a time.
*
* @return list of all the removed volumes.
*/
synchronized List<FsVolumeImpl> checkDirs() {
List<FsVolumeImpl> checkDirs() {
synchronized(checkDirsMutex) {
ArrayList<FsVolumeImpl> removedVols = null;
// Make a copy of volumes for performing modification
@ -187,21 +189,19 @@ synchronized List<FsVolumeImpl> checkDirs() {
removedVols = new ArrayList<FsVolumeImpl>(1);
}
removedVols.add(fsv);
fsv.shutdown();
i.remove(); // Remove the volume
removeVolume(fsv.getBasePath());
numFailedVolumes++;
}
}
if (removedVols != null && removedVols.size() > 0) {
// Replace volume list
volumes = Collections.unmodifiableList(volumeList);
FsDatasetImpl.LOG.warn("Completed checkDirs. Removed " + removedVols.size()
+ " volumes. Current volumes: " + this);
}
return removedVols;
}
}
@Override
public String toString() {