From c1a9e8973a10d9ae484a3ec30964f2c99e74b0d5 Mon Sep 17 00:00:00 2001 From: XinSun Date: Sat, 20 Feb 2021 10:20:54 +0800 Subject: [PATCH] HBASE-25562 ReplicationSourceWALReader log and handle exception immediately without retrying (#2943) Signed-off-by: Wellington Chevreuil Signed-off-by: stack Signed-off-by: shahrs87 --- .../ReplicationSourceWALReader.java | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java index 52ac144aa14..f52a83a6ff7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java @@ -150,14 +150,13 @@ class ReplicationSourceWALReader extends Thread { } } } catch (IOException e) { // stream related - if (sleepMultiplier < maxRetriesMultiplier) { - LOG.debug("Failed to read stream of replication entries: " + e); - sleepMultiplier++; - } else { - LOG.error("Failed to read stream of replication entries", e); - handleEofException(e); + if (!handleEofException(e)) { + LOG.warn("Failed to read stream of replication entries", e); + if (sleepMultiplier < maxRetriesMultiplier) { + sleepMultiplier ++; + } + Threads.sleep(sleepForRetries * sleepMultiplier); } - Threads.sleep(sleepForRetries * sleepMultiplier); } catch (InterruptedException e) { LOG.trace("Interrupted while sleeping between WAL reads"); Thread.currentThread().interrupt(); @@ -244,10 +243,13 @@ class ReplicationSourceWALReader extends Thread { } } - // if we get an EOF due to a zero-length log, and there are other logs in queue - // (highly likely we've closed the current log), we've hit the max retries, and autorecovery is - // enabled, then dump the log - private void handleEofException(IOException e) { + /** + * if we get an EOF due to a zero-length log, and there are other logs in queue + * (highly likely we've closed the current log), and autorecovery is + * enabled, then dump the log + * @return true only the IOE can be handled + */ + private boolean handleEofException(IOException e) { PriorityBlockingQueue queue = logQueue.getQueue(walGroupId); // Dump the log even if logQueue size is 1 if the source is from recovered Source // since we don't add current log to recovered source queue so it is safe to remove. @@ -255,14 +257,16 @@ class ReplicationSourceWALReader extends Thread { (source.isRecovered() || queue.size() > 1) && this.eofAutoRecovery) { try { if (fs.getFileStatus(queue.peek()).getLen() == 0) { - LOG.warn("Forcing removal of 0 length log in queue: " + queue.peek()); + LOG.warn("Forcing removal of 0 length log in queue: {}", queue.peek()); logQueue.remove(walGroupId); currentPosition = 0; + return true; } } catch (IOException ioe) { - LOG.warn("Couldn't get file length information about log " + queue.peek()); + LOG.warn("Couldn't get file length information about log {}", queue.peek()); } } + return false; } public Path getCurrentPath() {