diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java index 342b826072b..99666df44d2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java @@ -60,8 +60,10 @@ import org.apache.hadoop.hbase.replication.ReplicationQueues; import org.apache.hadoop.hbase.replication.SystemTableWALEntryFilter; import org.apache.hadoop.hbase.replication.WALEntryFilter; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.CancelableProgressable; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.util.LeaseNotRecoveredException; import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.wal.DefaultWALProvider; import org.apache.hadoop.hbase.wal.WAL; @@ -450,9 +452,9 @@ public class ReplicationSource extends Thread * @param p path to split * @return start time */ - private long getTS(Path p) { - String[] parts = p.getName().split("\\."); - return Long.parseLong(parts[parts.length-1]); + private static long getTS(Path p) { + int tsIndex = p.getName().lastIndexOf('.') + 1; + return Long.parseLong(p.getName().substring(tsIndex)); } } @@ -791,7 +793,6 @@ public class ReplicationSource extends Thread * @return true if we should continue with that file, false if we are over with it */ protected boolean openReader(int sleepMultiplier) { - try { try { if (LOG.isTraceEnabled()) { @@ -872,6 +873,11 @@ public class ReplicationSource extends Thread // TODO What happens the log is missing in both places? } } + } catch (LeaseNotRecoveredException lnre) { + // HBASE-15019 the WAL was not closed due to some hiccup. + LOG.warn(peerClusterZnode + " Try to recover the WAL lease " + currentPath, lnre); + recoverLease(conf, currentPath); + this.reader = null; } catch (IOException ioe) { if (ioe instanceof EOFException && isCurrentLogEmpty()) return true; LOG.warn(peerClusterZnode + " Got: ", ioe); @@ -881,7 +887,7 @@ public class ReplicationSource extends Thread // which throws a NPE if we open a file before any data node has the most recent block // Just sleep and retry. Will require re-reading compressed WALs for compressionContext. LOG.warn("Got NPE opening reader, will retry."); - } else if (sleepMultiplier == maxRetriesMultiplier) { + } else if (sleepMultiplier >= maxRetriesMultiplier) { // TODO Need a better way to determine if a file is really gone but // TODO without scanning all logs dir LOG.warn("Waited too long for this file, considering dumping"); @@ -891,6 +897,22 @@ public class ReplicationSource extends Thread return true; } + private void recoverLease(final Configuration conf, final Path path) { + try { + final FileSystem dfs = FSUtils.getCurrentFileSystem(conf); + FSUtils fsUtils = FSUtils.getInstance(dfs, conf); + fsUtils.recoverFileLease(dfs, path, conf, new CancelableProgressable() { + @Override + public boolean progress() { + LOG.debug("recover WAL lease: " + path); + return isWorkerActive(); + } + }); + } catch (IOException e) { + LOG.warn("unable to recover lease for WAL: " + path, e); + } + } + /* * Checks whether the current log file is empty, and it is not a recovered queue. This is to * handle scenario when in an idle cluster, there is no entry in the current log and we keep on diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/LeaseNotRecoveredException.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/LeaseNotRecoveredException.java new file mode 100644 index 00000000000..ca769b8bcc4 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/LeaseNotRecoveredException.java @@ -0,0 +1,47 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.util; + +import org.apache.hadoop.hbase.HBaseIOException; +import org.apache.hadoop.hbase.classification.InterfaceAudience; +import org.apache.hadoop.hbase.classification.InterfaceStability; + +/** + * Thrown when the lease was expected to be recovered, + * but the file can't be opened. + */ +@InterfaceAudience.Public +@InterfaceStability.Stable +public class LeaseNotRecoveredException extends HBaseIOException { + public LeaseNotRecoveredException() { + super(); + } + + public LeaseNotRecoveredException(String message) { + super(message); + } + + public LeaseNotRecoveredException(String message, Throwable cause) { + super(message, cause); + } + + public LeaseNotRecoveredException(Throwable cause) { + super(cause); + } +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java index 67f42c3a30e..7dd3e7d1c68 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java @@ -40,6 +40,7 @@ import org.apache.hadoop.hbase.wal.WAL.Reader; import org.apache.hadoop.hbase.wal.WALProvider.Writer; import org.apache.hadoop.hbase.util.CancelableProgressable; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.util.LeaseNotRecoveredException; // imports for things that haven't moved from regionserver.wal yet. import org.apache.hadoop.hbase.regionserver.wal.MetricsWAL; @@ -335,8 +336,10 @@ public class WALFactory { throw iioe; } } + throw new LeaseNotRecoveredException(e); + } else { + throw e; } - throw e; } } } catch (IOException ie) {