HBASE-15019 Replication stuck when HDFS is restarted.
Signed-off-by: Sean Busbey <busbey@cloudera.com>
This commit is contained in:
parent
486f7612be
commit
67c2fc7cd6
|
@ -60,8 +60,10 @@ import org.apache.hadoop.hbase.replication.ReplicationQueues;
|
|||
import org.apache.hadoop.hbase.replication.SystemTableWALEntryFilter;
|
||||
import org.apache.hadoop.hbase.replication.WALEntryFilter;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.CancelableProgressable;
|
||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||
import org.apache.hadoop.hbase.util.FSUtils;
|
||||
import org.apache.hadoop.hbase.util.LeaseNotRecoveredException;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
import org.apache.hadoop.hbase.wal.DefaultWALProvider;
|
||||
import org.apache.hadoop.hbase.wal.WAL;
|
||||
|
@ -450,9 +452,9 @@ public class ReplicationSource extends Thread
|
|||
* @param p path to split
|
||||
* @return start time
|
||||
*/
|
||||
private long getTS(Path p) {
|
||||
String[] parts = p.getName().split("\\.");
|
||||
return Long.parseLong(parts[parts.length-1]);
|
||||
private static long getTS(Path p) {
|
||||
int tsIndex = p.getName().lastIndexOf('.') + 1;
|
||||
return Long.parseLong(p.getName().substring(tsIndex));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -791,7 +793,6 @@ public class ReplicationSource extends Thread
|
|||
* @return true if we should continue with that file, false if we are over with it
|
||||
*/
|
||||
protected boolean openReader(int sleepMultiplier) {
|
||||
|
||||
try {
|
||||
try {
|
||||
if (LOG.isTraceEnabled()) {
|
||||
|
@ -872,6 +873,11 @@ public class ReplicationSource extends Thread
|
|||
// TODO What happens the log is missing in both places?
|
||||
}
|
||||
}
|
||||
} catch (LeaseNotRecoveredException lnre) {
|
||||
// HBASE-15019 the WAL was not closed due to some hiccup.
|
||||
LOG.warn(peerClusterZnode + " Try to recover the WAL lease " + currentPath, lnre);
|
||||
recoverLease(conf, currentPath);
|
||||
this.reader = null;
|
||||
} catch (IOException ioe) {
|
||||
if (ioe instanceof EOFException && isCurrentLogEmpty()) return true;
|
||||
LOG.warn(peerClusterZnode + " Got: ", ioe);
|
||||
|
@ -881,7 +887,7 @@ public class ReplicationSource extends Thread
|
|||
// which throws a NPE if we open a file before any data node has the most recent block
|
||||
// Just sleep and retry. Will require re-reading compressed WALs for compressionContext.
|
||||
LOG.warn("Got NPE opening reader, will retry.");
|
||||
} else if (sleepMultiplier == maxRetriesMultiplier) {
|
||||
} else if (sleepMultiplier >= maxRetriesMultiplier) {
|
||||
// TODO Need a better way to determine if a file is really gone but
|
||||
// TODO without scanning all logs dir
|
||||
LOG.warn("Waited too long for this file, considering dumping");
|
||||
|
@ -891,6 +897,22 @@ public class ReplicationSource extends Thread
|
|||
return true;
|
||||
}
|
||||
|
||||
private void recoverLease(final Configuration conf, final Path path) {
|
||||
try {
|
||||
final FileSystem dfs = FSUtils.getCurrentFileSystem(conf);
|
||||
FSUtils fsUtils = FSUtils.getInstance(dfs, conf);
|
||||
fsUtils.recoverFileLease(dfs, path, conf, new CancelableProgressable() {
|
||||
@Override
|
||||
public boolean progress() {
|
||||
LOG.debug("recover WAL lease: " + path);
|
||||
return isWorkerActive();
|
||||
}
|
||||
});
|
||||
} catch (IOException e) {
|
||||
LOG.warn("unable to recover lease for WAL: " + path, e);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks whether the current log file is empty, and it is not a recovered queue. This is to
|
||||
* handle scenario when in an idle cluster, there is no entry in the current log and we keep on
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
/**
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.util;
|
||||
|
||||
import org.apache.hadoop.hbase.HBaseIOException;
|
||||
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.hbase.classification.InterfaceStability;
|
||||
|
||||
/**
|
||||
* Thrown when the lease was expected to be recovered,
|
||||
* but the file can't be opened.
|
||||
*/
|
||||
@InterfaceAudience.Public
|
||||
@InterfaceStability.Stable
|
||||
public class LeaseNotRecoveredException extends HBaseIOException {
|
||||
public LeaseNotRecoveredException() {
|
||||
super();
|
||||
}
|
||||
|
||||
public LeaseNotRecoveredException(String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
public LeaseNotRecoveredException(String message, Throwable cause) {
|
||||
super(message, cause);
|
||||
}
|
||||
|
||||
public LeaseNotRecoveredException(Throwable cause) {
|
||||
super(cause);
|
||||
}
|
||||
}
|
|
@ -40,6 +40,7 @@ import org.apache.hadoop.hbase.wal.WAL.Reader;
|
|||
import org.apache.hadoop.hbase.wal.WALProvider.Writer;
|
||||
import org.apache.hadoop.hbase.util.CancelableProgressable;
|
||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||
import org.apache.hadoop.hbase.util.LeaseNotRecoveredException;
|
||||
|
||||
// imports for things that haven't moved from regionserver.wal yet.
|
||||
import org.apache.hadoop.hbase.regionserver.wal.MetricsWAL;
|
||||
|
@ -335,8 +336,10 @@ public class WALFactory {
|
|||
throw iioe;
|
||||
}
|
||||
}
|
||||
throw new LeaseNotRecoveredException(e);
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
} catch (IOException ie) {
|
||||
|
|
Loading…
Reference in New Issue