Incorrect locking in FsVolumeList#checkDirs can hang datanodes (Noah Lorang via Colin P. McCabe)

This commit is contained in:
Colin Patrick Mccabe 2014-12-09 10:55:17 -08:00
parent be86237c09
commit d8352b9b2b
2 changed files with 31 additions and 28 deletions

View File

@ -574,6 +574,9 @@ Release 2.6.1 - UNRELEASED
HDFS-4882. Prevent the Namenode's LeaseManager from looping forever in
checkLeases (Ravi Prakash via Colin P. McCabe)
HDFS-7489. Incorrect locking in FsVolumeList#checkDirs can hang datanodes
(Noah Lorang via Colin P. McCabe)
Release 2.6.0 - 2014-11-18
INCOMPATIBLE CHANGES

View File

@ -36,6 +36,7 @@ class FsVolumeList {
* This list is replaced on modification holding "this" lock.
*/
volatile List<FsVolumeImpl> volumes = null;
private Object checkDirsMutex = new Object();
private final VolumeChoosingPolicy<FsVolumeImpl> blockChooser;
private volatile int numFailedVolumes;
@ -167,40 +168,39 @@ public void run() {
* Calls {@link FsVolumeImpl#checkDirs()} on each volume, removing any
* volumes from the active list that result in a DiskErrorException.
*
* This method is synchronized to allow only one instance of checkDirs()
* call
* Use checkDirsMutext to allow only one instance of checkDirs() call
*
* @return list of all the removed volumes.
*/
synchronized List<FsVolumeImpl> checkDirs() {
ArrayList<FsVolumeImpl> removedVols = null;
// Make a copy of volumes for performing modification
final List<FsVolumeImpl> volumeList = new ArrayList<FsVolumeImpl>(volumes);
List<FsVolumeImpl> checkDirs() {
synchronized(checkDirsMutex) {
ArrayList<FsVolumeImpl> removedVols = null;
// Make a copy of volumes for performing modification
final List<FsVolumeImpl> volumeList = new ArrayList<FsVolumeImpl>(volumes);
for(Iterator<FsVolumeImpl> i = volumeList.iterator(); i.hasNext(); ) {
final FsVolumeImpl fsv = i.next();
try {
fsv.checkDirs();
} catch (DiskErrorException e) {
FsDatasetImpl.LOG.warn("Removing failed volume " + fsv + ": ",e);
if (removedVols == null) {
removedVols = new ArrayList<FsVolumeImpl>(1);
for(Iterator<FsVolumeImpl> i = volumeList.iterator(); i.hasNext(); ) {
final FsVolumeImpl fsv = i.next();
try {
fsv.checkDirs();
} catch (DiskErrorException e) {
FsDatasetImpl.LOG.warn("Removing failed volume " + fsv + ": ",e);
if (removedVols == null) {
removedVols = new ArrayList<FsVolumeImpl>(1);
}
removedVols.add(fsv);
removeVolume(fsv.getBasePath());
numFailedVolumes++;
}
removedVols.add(fsv);
fsv.shutdown();
i.remove(); // Remove the volume
numFailedVolumes++;
}
}
if (removedVols != null && removedVols.size() > 0) {
// Replace volume list
volumes = Collections.unmodifiableList(volumeList);
FsDatasetImpl.LOG.warn("Completed checkDirs. Removed " + removedVols.size()
+ " volumes. Current volumes: " + this);
}
if (removedVols != null && removedVols.size() > 0) {
FsDatasetImpl.LOG.warn("Completed checkDirs. Removed " + removedVols.size()
+ " volumes. Current volumes: " + this);
}
return removedVols;
return removedVols;
}
}
@Override