HBASE-14317 Stuck FSHLog: bad disk (HDFS-8960) and can't roll WAL; addendum2 -- found a fix testing the branch-1 patch

2015-09-04 00:40:32 -07:00 · 2015-09-04 00:40:32 -07:00 · ec4d719f19
parent 54717a6314
commit ec4d719f19
1 changed files with 13 additions and 2 deletions
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
@ -878,8 +878,19 @@ public class FSHLog implements WAL {
        // Let the writer thread go regardless, whether error or not.
        if (zigzagLatch != null) {
          zigzagLatch.releaseSafePoint();
-          // It will be null if we failed our wait on safe point above.
-          if (syncFuture != null) blockOnSync(syncFuture);
+          // syncFuture will be null if we failed our wait on safe point above. Otherwise, if
+          // latch was obtained successfully, the sync we threw in either trigger the latch or it
+          // got stamped with an exception because the WAL was damaged and we could not sync. Now
+          // the write pipeline has been opened up again by releasing the safe point, process the
+          // syncFuture we got above. This is probably a noop but it may be stale exception from
+          // when old WAL was in place. Catch it if so.
+          if (syncFuture != null) {
+            try {
+              blockOnSync(syncFuture);
+            } catch (IOException ioe) {
+              if (LOG.isTraceEnabled()) LOG.trace("Stale sync exception", ioe);
+            }
+          }
        }
      } finally {
        scope.close();