From 9a7293c1740e895476166e63e2cb9a385a672a9f Mon Sep 17 00:00:00 2001 From: Sergey Soldatov Date: Fri, 17 May 2019 19:31:34 -0700 Subject: [PATCH] HBASE-22289 WAL-based log splitting resubmit threshold may result in a task being stuck forever --- .../ZkSplitLogWorkerCoordination.java | 4 +- .../hbase/regionserver/SplitLogWorker.java | 2 + .../handler/WALSplitterHandler.java | 45 +++++++++++-------- 3 files changed, 32 insertions(+), 19 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZkSplitLogWorkerCoordination.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZkSplitLogWorkerCoordination.java index ff555f22add..7efb4a574b3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZkSplitLogWorkerCoordination.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZkSplitLogWorkerCoordination.java @@ -572,7 +572,9 @@ public class ZkSplitLogWorkerCoordination extends ZKListener implements try { if (ZKUtil.setData(watcher, task, slt.toByteArray(), taskZKVersion)) { LOG.info("successfully transitioned task " + task + " to final state " + slt); - ctr.increment(); + if (ctr != null) { + ctr.increment(); + } return; } LOG.warn("failed to transistion task " + task + " to end state " + slt diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java index a1c20306b38..d4549deca62 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java @@ -96,6 +96,8 @@ public class SplitLogWorker implements Runnable { // TODO have to correctly figure out when log splitting has been // interrupted or has encountered a transient error and when it has // encountered a bad non-retry-able persistent error. + // Note: this can actually get the master stuck (HBASE-22289) so treat preempted as error. + // splitLogFile does return false for legitimate retriable errors. try { if (!WALSplitter.splitLogFile(walDir, fs.getFileStatus(new Path(walDir, filename)), fs, conf, p, sequenceIdChecker, diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java index 49ab574ec52..f1bbfe797ff 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java @@ -62,38 +62,47 @@ public class WALSplitterHandler extends EventHandler { this.splitTaskExecutor = splitTaskExecutor; } + + @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="SF_SWITCH_FALLTHROUGH", + justification="Intentional") @Override public void process() throws IOException { long startTime = System.currentTimeMillis(); Status status = null; try { status = this.splitTaskExecutor.exec(splitTaskDetails.getWALFile(), reporter); + boolean wasCounterIncremented = false; switch (status) { case DONE: coordination.endTask(new SplitLogTask.Done(this.serverName), SplitLogCounters.tot_wkr_task_done, splitTaskDetails); break; case PREEMPTED: - SplitLogCounters.tot_wkr_preempt_task.increment(); - LOG.warn("task execution preempted " + splitTaskDetails.getWALFile()); - break; + SplitLogCounters.tot_wkr_preempt_task.increment(); + wasCounterIncremented = true; + // Preempted state can currently be returned either when task is preempted, or when + // there's a particular kind of error (e.g. some ZK/HDFS errors, in my observation). + // In the latter case, master-side split task will get stuck if we don't update the + // status. Treat preemption as error to be on the safe side. + LOG.warn("task execution preempted; treating as error " + splitTaskDetails.getWALFile()); + //$FALL-THROUGH$ case ERR: - if (server != null && !server.isStopped()) { - coordination.endTask(new SplitLogTask.Err(this.serverName), - SplitLogCounters.tot_wkr_task_err, splitTaskDetails); - break; - } - // if the RS is exiting then there is probably a tons of stuff - // that can go wrong. Resign instead of signaling error. - //$FALL-THROUGH$ + if (server != null && !server.isStopped()) { + coordination.endTask(new SplitLogTask.Err(this.serverName), wasCounterIncremented + ? null : SplitLogCounters.tot_wkr_task_err, splitTaskDetails); + break; + } + // if the RS is exiting then there is probably a tons of stuff + // that can go wrong. Resign instead of signaling error. + //$FALL-THROUGH$ case RESIGNED: - if (server != null && server.isStopped()) { - LOG.info("task execution interrupted because worker is exiting " - + splitTaskDetails.toString()); - } - coordination.endTask(new SplitLogTask.Resigned(this.serverName), - SplitLogCounters.tot_wkr_task_resigned, splitTaskDetails); - break; + if (server != null && server.isStopped()) { + LOG.info("task execution interrupted because worker is exiting " + + splitTaskDetails.toString()); + } + coordination.endTask(new SplitLogTask.Resigned(this.serverName), wasCounterIncremented + ? null : SplitLogCounters.tot_wkr_task_resigned, splitTaskDetails); + break; } } finally { LOG.info("Worker " + serverName + " done with task " + splitTaskDetails.toString() + " in "