From 392bce03f6e60def4f624ad0087685bf8f9c9c95 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Fri, 20 Mar 2020 15:25:06 -0700 Subject: [PATCH] HBASE-23984 [Flakey Tests] TestMasterAbortAndRSGotKilled fails in teardown (#1311) hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Change parameter name and add javadoc to make it more clear what the param actually is. hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java Move postOpenDeployTasks so if it fails to talk to the Master -- which can happen on cluster shutdown -- then we will do cleanup of state; without this the RS can get stuck and won't go down. hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java Add handleException so CRH looks more like UnassignRegionHandler and AssignRegionHandler around exception handling. Add a bit of doc on why CRH. hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java Right shift most of the body of process so can add in a finally that cleans up rs.getRegionsInTransitionInRS is on exception (otherwise outstanding entries can stop a RS going down on cluster shutdown) Signed-off-by: Nick Dimiduk Signed-off-by: Duo Zhang --- .../hadoop/hbase/regionserver/HRegion.java | 14 +++-- .../hbase/regionserver/HRegionServer.java | 12 +++- .../handler/AssignRegionHandler.java | 8 ++- .../handler/CloseRegionHandler.java | 61 +++++++++---------- .../handler/UnassignRegionHandler.java | 17 +++--- 5 files changed, 64 insertions(+), 48 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java index 72eb3515b13..c58a5917cfb 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java @@ -944,7 +944,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi " initialization."); } if (LOG.isDebugEnabled()) { - LOG.debug("Region open journal:\n" + status.prettyPrintJournal()); + LOG.debug("Region open journal for {}:\n{}", this.getRegionInfo().getEncodedName(), + status.prettyPrintJournal()); } status.cleanup(); } @@ -1553,7 +1554,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } } finally { if (LOG.isDebugEnabled()) { - LOG.debug("Region close journal:\n" + status.prettyPrintJournal()); + LOG.debug("Region close journal for {}:\n{}", this.getRegionInfo().getEncodedName(), + status.prettyPrintJournal()); } status.cleanup(); } @@ -1755,7 +1757,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi Closeables.close(this.metricsRegionWrapper, true); } status.markComplete("Closed"); - LOG.info("Closed " + this); + LOG.info("Closed {}", this); return result; } finally { lock.writeLock().unlock(); @@ -2266,7 +2268,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } finally { if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction); if (status != null) { - LOG.debug("Compaction status journal:\n\t" + status.prettyPrintJournal()); + LOG.debug("Compaction status journal for {}:\n\t{}", this.getRegionInfo().getEncodedName(), + status.prettyPrintJournal()); status.cleanup(); } } @@ -2413,7 +2416,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } } finally { lock.readLock().unlock(); - LOG.debug("Flush status journal:\n\t" + status.prettyPrintJournal()); + LOG.debug("Flush status journal for {}:\n\t{}", this.getRegionInfo().getEncodedName(), + status.prettyPrintJournal()); status.cleanup(); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index fc0b69345ca..0de2c1755b7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -53,6 +53,7 @@ import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.stream.Collectors; import javax.management.MalformedObjectNameException; import javax.servlet.http.HttpServlet; import org.apache.commons.lang3.RandomUtils; @@ -1457,6 +1458,9 @@ public class HRegionServer extends HasThread implements " because some regions failed closing"); } break; + } else { + LOG.debug("Waiting on {}", this.regionsInTransitionInRS.keySet().stream(). + map(e -> Bytes.toString(e)).collect(Collectors.joining(", "))); } if (sleep(200)) { interrupted = true; @@ -3187,10 +3191,12 @@ public class HRegionServer extends HasThread implements * * @param encodedName Region to close * @param abort True if we are aborting + * @param destination Where the Region is being moved too... maybe null if unknown. * @return True if closed a region. * @throws NotServingRegionException if the region is not online */ - protected boolean closeRegion(String encodedName, final boolean abort, final ServerName sn) + protected boolean closeRegion(String encodedName, final boolean abort, + final ServerName destination) throws NotServingRegionException { //Check for permissions to close. HRegion actualRegion = this.getRegion(encodedName); @@ -3216,7 +3222,7 @@ public class HRegionServer extends HasThread implements // We're going to try to do a standard close then. LOG.warn("The opening for region " + encodedName + " was done before we could cancel it." + " Doing a standard close now"); - return closeRegion(encodedName, abort, sn); + return closeRegion(encodedName, abort, destination); } // Let's get the region from the online region list again actualRegion = this.getRegion(encodedName); @@ -3247,7 +3253,7 @@ public class HRegionServer extends HasThread implements if (hri.isMetaRegion()) { crh = new CloseMetaHandler(this, this, hri, abort); } else { - crh = new CloseRegionHandler(this, this, hri, abort, sn); + crh = new CloseRegionHandler(this, this, hri, abort, destination); } this.executorService.submit(crh); return true; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java index 76e8f802712..3474bf916be 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -34,7 +34,6 @@ import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** @@ -136,6 +135,8 @@ public class AssignRegionHandler extends EventHandler { cleanUpAndReportFailure(e); return; } + // From here on out, this is PONR. We can not revert back. The only way to address an + // exception from here on out is to abort the region server. rs.postOpenDeployTasks(new PostOpenDeployContext(region, openProcId, masterSystemTime)); rs.addRegion(region); LOG.info("Opened {}", regionName); @@ -156,6 +157,9 @@ public class AssignRegionHandler extends EventHandler { protected void handleException(Throwable t) { LOG.warn("Fatal error occurred while opening region {}, aborting...", regionInfo.getRegionNameAsString(), t); + // Clear any reference in getServer().getRegionsInTransitionInRS() otherwise can hold up + // regionserver abort on cluster shutdown. HBASE-23984. + getServer().getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes()); getServer().abort( "Failed to open region " + regionInfo.getRegionNameAsString() + " and can not recover", t); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java index d4ea004cb23..6f430429635 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java @@ -1,4 +1,4 @@ -/** +/* * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file @@ -19,7 +19,6 @@ package org.apache.hadoop.hbase.regionserver.handler; import java.io.IOException; - import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.ServerName; @@ -38,9 +37,13 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProto /** * Handles closing of a region on a region server. *

- * Now for regular close region request, we will use {@link UnassignRegionHandler} instead. But when - * shutting down the region server, will also close regions and the related methods still use this - * class so we keep it here. + * In normal operation, we use {@link UnassignRegionHandler} closing Regions but when shutting down + * the region server and closing out Regions, we use this handler instead; it does not expect to + * be able to communicate the close back to the Master. + *

Expects that the close *has* been registered in the hosting RegionServer before + * submitting this Handler; i.e. rss.getRegionsInTransitionInRS().putIfAbsent( + * this.regionInfo.getEncodedNameAsBytes(), Boolean.FALSE); has been called first. + * In here when done, we do the deregister.

* @see UnassignRegionHandler */ @InterfaceAudience.Private @@ -62,11 +65,7 @@ public class CloseRegionHandler extends EventHandler { /** * This method used internally by the RegionServer to close out regions. - * @param server - * @param rsServices - * @param regionInfo * @param abort If the regionserver is aborting. - * @param destination */ public CloseRegionHandler(final Server server, final RegionServerServices rsServices, @@ -92,13 +91,13 @@ public class CloseRegionHandler extends EventHandler { } @Override - public void process() { + public void process() throws IOException { + String name = regionInfo.getEncodedName(); + LOG.trace("Processing close of {}", name); + String encodedRegionName = regionInfo.getEncodedName(); + // Check that this region is being served here + HRegion region = (HRegion)rsServices.getRegion(encodedRegionName); try { - String name = regionInfo.getEncodedName(); - LOG.trace("Processing close of {}", name); - String encodedRegionName = regionInfo.getEncodedName(); - // Check that this region is being served here - HRegion region = (HRegion)rsServices.getRegion(encodedRegionName); if (region == null) { LOG.warn("Received CLOSE for region {} but currently not serving - ignoring", name); // TODO: do better than a simple warning @@ -106,20 +105,11 @@ public class CloseRegionHandler extends EventHandler { } // Close the region - try { - if (region.close(abort) == null) { - // This region got closed. Most likely due to a split. - // The split message will clean up the master state. - LOG.warn("Can't close region {}, was already closed during close()", name); - return; - } - } catch (IOException ioe) { - // An IOException here indicates that we couldn't successfully flush the - // memstore before closing. So, we need to abort the server and allow - // the master to split our logs in order to recover the data. - server.abort("Unrecoverable exception while closing region " + - regionInfo.getRegionNameAsString() + ", still finishing close", ioe); - throw new RuntimeException(ioe); + if (region.close(abort) == null) { + // This region has already been closed. Should not happen (A unit test makes this + // happen as a side effect, TestRegionObserverInterface.testPreWALAppendNotCalledOnMetaEdit) + LOG.warn("Can't close {}; already closed", name); + return; } this.rsServices.removeRegion(region, destination); @@ -127,10 +117,19 @@ public class CloseRegionHandler extends EventHandler { HConstants.NO_SEQNUM, Procedure.NO_PROC_ID, -1, regionInfo)); // Done! Region is closed on this RS - LOG.debug("Closed " + region.getRegionInfo().getRegionNameAsString()); - } finally { this.rsServices.getRegionsInTransitionInRS(). remove(this.regionInfo.getEncodedNameAsBytes(), Boolean.FALSE); + LOG.debug("Closed {}" + region.getRegionInfo().getRegionNameAsString()); + } finally { + // Clear any reference in getServer().getRegionsInTransitionInRS() on success or failure, + // since a reference was added before this CRH was invoked. If we don't clear it, it can + // hold up regionserver abort on cluster shutdown. HBASE-23984. + this.rsServices.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes()); } } + + @Override protected void handleException(Throwable t) { + server.abort("Unrecoverable exception while closing " + + this.regionInfo.getRegionNameAsString(), t); + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java index 0fc8edc47a0..8b275d0e6ed 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java @@ -33,7 +33,6 @@ import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** @@ -106,10 +105,10 @@ public class UnassignRegionHandler extends EventHandler { LOG.info("Close {}", regionName); if (region.getCoprocessorHost() != null) { // XXX: The behavior is a bit broken. At master side there is no FAILED_CLOSE state, so if - // there are exception thrown from the CP, we can not report the error to master, and if here - // we just return without calling reportRegionStateTransition, the TRSP at master side will - // hang there for ever. So here if the CP throws an exception out, the only way is to abort - // the RS... + // there are exception thrown from the CP, we can not report the error to master, and if + // here we just return without calling reportRegionStateTransition, the TRSP at master side + // will hang there for ever. So here if the CP throws an exception out, the only way is to + // abort the RS... region.getCoprocessorHost().preClose(abort); } if (region.close(abort) == null) { @@ -120,8 +119,9 @@ public class UnassignRegionHandler extends EventHandler { return; } rs.removeRegion(region, destination); - if (!rs.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.CLOSED, - HConstants.NO_SEQNUM, closeProcId, -1, region.getRegionInfo()))) { + if (!rs.reportRegionStateTransition( + new RegionStateTransitionContext(TransitionCode.CLOSED, HConstants.NO_SEQNUM, closeProcId, + -1, region.getRegionInfo()))) { throw new IOException("Failed to report close to master: " + regionName); } // Cache the close region procedure id after report region transition succeed. @@ -133,6 +133,9 @@ public class UnassignRegionHandler extends EventHandler { @Override protected void handleException(Throwable t) { LOG.warn("Fatal error occurred while closing region {}, aborting...", encodedName, t); + // Clear any reference in getServer().getRegionsInTransitionInRS() otherwise can hold up + // regionserver abort on cluster shutdown. HBASE-23984. + getServer().getRegionsInTransitionInRS().remove(Bytes.toBytes(this.encodedName)); getServer().abort("Failed to close region " + encodedName + " and can not recover", t); }