Incorrect locking in FsVolumeList#checkDirs can hang datanodes (Noah Lorang via Colin P. McCabe)

This commit is contained in:
Colin Patrick Mccabe 2014-12-09 10:55:17 -08:00
parent be86237c09
commit d8352b9b2b
2 changed files with 31 additions and 28 deletions

View File

@ -574,6 +574,9 @@ Release 2.6.1 - UNRELEASED
HDFS-4882. Prevent the Namenode's LeaseManager from looping forever in HDFS-4882. Prevent the Namenode's LeaseManager from looping forever in
checkLeases (Ravi Prakash via Colin P. McCabe) checkLeases (Ravi Prakash via Colin P. McCabe)
HDFS-7489. Incorrect locking in FsVolumeList#checkDirs can hang datanodes
(Noah Lorang via Colin P. McCabe)
Release 2.6.0 - 2014-11-18 Release 2.6.0 - 2014-11-18
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -36,6 +36,7 @@ class FsVolumeList {
* This list is replaced on modification holding "this" lock. * This list is replaced on modification holding "this" lock.
*/ */
volatile List<FsVolumeImpl> volumes = null; volatile List<FsVolumeImpl> volumes = null;
private Object checkDirsMutex = new Object();
private final VolumeChoosingPolicy<FsVolumeImpl> blockChooser; private final VolumeChoosingPolicy<FsVolumeImpl> blockChooser;
private volatile int numFailedVolumes; private volatile int numFailedVolumes;
@ -167,40 +168,39 @@ class FsVolumeList {
* Calls {@link FsVolumeImpl#checkDirs()} on each volume, removing any * Calls {@link FsVolumeImpl#checkDirs()} on each volume, removing any
* volumes from the active list that result in a DiskErrorException. * volumes from the active list that result in a DiskErrorException.
* *
* This method is synchronized to allow only one instance of checkDirs() * Use checkDirsMutext to allow only one instance of checkDirs() call
* call *
* @return list of all the removed volumes. * @return list of all the removed volumes.
*/ */
synchronized List<FsVolumeImpl> checkDirs() { List<FsVolumeImpl> checkDirs() {
ArrayList<FsVolumeImpl> removedVols = null; synchronized(checkDirsMutex) {
ArrayList<FsVolumeImpl> removedVols = null;
// Make a copy of volumes for performing modification
final List<FsVolumeImpl> volumeList = new ArrayList<FsVolumeImpl>(volumes); // Make a copy of volumes for performing modification
final List<FsVolumeImpl> volumeList = new ArrayList<FsVolumeImpl>(volumes);
for(Iterator<FsVolumeImpl> i = volumeList.iterator(); i.hasNext(); ) { for(Iterator<FsVolumeImpl> i = volumeList.iterator(); i.hasNext(); ) {
final FsVolumeImpl fsv = i.next(); final FsVolumeImpl fsv = i.next();
try { try {
fsv.checkDirs(); fsv.checkDirs();
} catch (DiskErrorException e) { } catch (DiskErrorException e) {
FsDatasetImpl.LOG.warn("Removing failed volume " + fsv + ": ",e); FsDatasetImpl.LOG.warn("Removing failed volume " + fsv + ": ",e);
if (removedVols == null) { if (removedVols == null) {
removedVols = new ArrayList<FsVolumeImpl>(1); removedVols = new ArrayList<FsVolumeImpl>(1);
}
removedVols.add(fsv);
removeVolume(fsv.getBasePath());
numFailedVolumes++;
} }
removedVols.add(fsv);
fsv.shutdown();
i.remove(); // Remove the volume
numFailedVolumes++;
} }
}
if (removedVols != null && removedVols.size() > 0) {
if (removedVols != null && removedVols.size() > 0) { FsDatasetImpl.LOG.warn("Completed checkDirs. Removed " + removedVols.size()
// Replace volume list + " volumes. Current volumes: " + this);
volumes = Collections.unmodifiableList(volumeList); }
FsDatasetImpl.LOG.warn("Completed checkDirs. Removed " + removedVols.size()
+ " volumes. Current volumes: " + this);
}
return removedVols; return removedVols;
}
} }
@Override @Override