HBASE-23984 [Flakey Tests] TestMasterAbortAndRSGotKilled fails in teardown (#1311)
hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Change parameter name and add javadoc to make it more clear what the param actually is. hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java Move postOpenDeployTasks so if it fails to talk to the Master -- which can happen on cluster shutdown -- then we will do cleanup of state; without this the RS can get stuck and won't go down. hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java Add handleException so CRH looks more like UnassignRegionHandler and AssignRegionHandler around exception handling. Add a bit of doc on why CRH. hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java Right shift most of the body of process so can add in a finally that cleans up rs.getRegionsInTransitionInRS is on exception (otherwise outstanding entries can stop a RS going down on cluster shutdown) Signed-off-by: Nick Dimiduk <ndimiduk@apache.org> Signed-off-by: Duo Zhang <zhangduo@apache.org>
This commit is contained in:
parent
8320f73c8c
commit
392bce03f6
|
@ -944,7 +944,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
" initialization.");
|
" initialization.");
|
||||||
}
|
}
|
||||||
if (LOG.isDebugEnabled()) {
|
if (LOG.isDebugEnabled()) {
|
||||||
LOG.debug("Region open journal:\n" + status.prettyPrintJournal());
|
LOG.debug("Region open journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
|
||||||
|
status.prettyPrintJournal());
|
||||||
}
|
}
|
||||||
status.cleanup();
|
status.cleanup();
|
||||||
}
|
}
|
||||||
|
@ -1553,7 +1554,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
if (LOG.isDebugEnabled()) {
|
if (LOG.isDebugEnabled()) {
|
||||||
LOG.debug("Region close journal:\n" + status.prettyPrintJournal());
|
LOG.debug("Region close journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
|
||||||
|
status.prettyPrintJournal());
|
||||||
}
|
}
|
||||||
status.cleanup();
|
status.cleanup();
|
||||||
}
|
}
|
||||||
|
@ -1755,7 +1757,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
Closeables.close(this.metricsRegionWrapper, true);
|
Closeables.close(this.metricsRegionWrapper, true);
|
||||||
}
|
}
|
||||||
status.markComplete("Closed");
|
status.markComplete("Closed");
|
||||||
LOG.info("Closed " + this);
|
LOG.info("Closed {}", this);
|
||||||
return result;
|
return result;
|
||||||
} finally {
|
} finally {
|
||||||
lock.writeLock().unlock();
|
lock.writeLock().unlock();
|
||||||
|
@ -2266,7 +2268,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
} finally {
|
} finally {
|
||||||
if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
|
if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
|
||||||
if (status != null) {
|
if (status != null) {
|
||||||
LOG.debug("Compaction status journal:\n\t" + status.prettyPrintJournal());
|
LOG.debug("Compaction status journal for {}:\n\t{}", this.getRegionInfo().getEncodedName(),
|
||||||
|
status.prettyPrintJournal());
|
||||||
status.cleanup();
|
status.cleanup();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2413,7 +2416,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
lock.readLock().unlock();
|
lock.readLock().unlock();
|
||||||
LOG.debug("Flush status journal:\n\t" + status.prettyPrintJournal());
|
LOG.debug("Flush status journal for {}:\n\t{}", this.getRegionInfo().getEncodedName(),
|
||||||
|
status.prettyPrintJournal());
|
||||||
status.cleanup();
|
status.cleanup();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -53,6 +53,7 @@ import java.util.concurrent.ConcurrentSkipListMap;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import javax.management.MalformedObjectNameException;
|
import javax.management.MalformedObjectNameException;
|
||||||
import javax.servlet.http.HttpServlet;
|
import javax.servlet.http.HttpServlet;
|
||||||
import org.apache.commons.lang3.RandomUtils;
|
import org.apache.commons.lang3.RandomUtils;
|
||||||
|
@ -1457,6 +1458,9 @@ public class HRegionServer extends HasThread implements
|
||||||
" because some regions failed closing");
|
" because some regions failed closing");
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
} else {
|
||||||
|
LOG.debug("Waiting on {}", this.regionsInTransitionInRS.keySet().stream().
|
||||||
|
map(e -> Bytes.toString(e)).collect(Collectors.joining(", ")));
|
||||||
}
|
}
|
||||||
if (sleep(200)) {
|
if (sleep(200)) {
|
||||||
interrupted = true;
|
interrupted = true;
|
||||||
|
@ -3187,10 +3191,12 @@ public class HRegionServer extends HasThread implements
|
||||||
*
|
*
|
||||||
* @param encodedName Region to close
|
* @param encodedName Region to close
|
||||||
* @param abort True if we are aborting
|
* @param abort True if we are aborting
|
||||||
|
* @param destination Where the Region is being moved too... maybe null if unknown.
|
||||||
* @return True if closed a region.
|
* @return True if closed a region.
|
||||||
* @throws NotServingRegionException if the region is not online
|
* @throws NotServingRegionException if the region is not online
|
||||||
*/
|
*/
|
||||||
protected boolean closeRegion(String encodedName, final boolean abort, final ServerName sn)
|
protected boolean closeRegion(String encodedName, final boolean abort,
|
||||||
|
final ServerName destination)
|
||||||
throws NotServingRegionException {
|
throws NotServingRegionException {
|
||||||
//Check for permissions to close.
|
//Check for permissions to close.
|
||||||
HRegion actualRegion = this.getRegion(encodedName);
|
HRegion actualRegion = this.getRegion(encodedName);
|
||||||
|
@ -3216,7 +3222,7 @@ public class HRegionServer extends HasThread implements
|
||||||
// We're going to try to do a standard close then.
|
// We're going to try to do a standard close then.
|
||||||
LOG.warn("The opening for region " + encodedName + " was done before we could cancel it." +
|
LOG.warn("The opening for region " + encodedName + " was done before we could cancel it." +
|
||||||
" Doing a standard close now");
|
" Doing a standard close now");
|
||||||
return closeRegion(encodedName, abort, sn);
|
return closeRegion(encodedName, abort, destination);
|
||||||
}
|
}
|
||||||
// Let's get the region from the online region list again
|
// Let's get the region from the online region list again
|
||||||
actualRegion = this.getRegion(encodedName);
|
actualRegion = this.getRegion(encodedName);
|
||||||
|
@ -3247,7 +3253,7 @@ public class HRegionServer extends HasThread implements
|
||||||
if (hri.isMetaRegion()) {
|
if (hri.isMetaRegion()) {
|
||||||
crh = new CloseMetaHandler(this, this, hri, abort);
|
crh = new CloseMetaHandler(this, this, hri, abort);
|
||||||
} else {
|
} else {
|
||||||
crh = new CloseRegionHandler(this, this, hri, abort, sn);
|
crh = new CloseRegionHandler(this, this, hri, abort, destination);
|
||||||
}
|
}
|
||||||
this.executorService.submit(crh);
|
this.executorService.submit(crh);
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/**
|
/*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
* or more contributor license agreements. See the NOTICE file
|
* or more contributor license agreements. See the NOTICE file
|
||||||
* distributed with this work for additional information
|
* distributed with this work for additional information
|
||||||
|
@ -34,7 +34,6 @@ import org.apache.hadoop.hbase.util.RetryCounter;
|
||||||
import org.apache.yetus.audience.InterfaceAudience;
|
import org.apache.yetus.audience.InterfaceAudience;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -136,6 +135,8 @@ public class AssignRegionHandler extends EventHandler {
|
||||||
cleanUpAndReportFailure(e);
|
cleanUpAndReportFailure(e);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
// From here on out, this is PONR. We can not revert back. The only way to address an
|
||||||
|
// exception from here on out is to abort the region server.
|
||||||
rs.postOpenDeployTasks(new PostOpenDeployContext(region, openProcId, masterSystemTime));
|
rs.postOpenDeployTasks(new PostOpenDeployContext(region, openProcId, masterSystemTime));
|
||||||
rs.addRegion(region);
|
rs.addRegion(region);
|
||||||
LOG.info("Opened {}", regionName);
|
LOG.info("Opened {}", regionName);
|
||||||
|
@ -156,6 +157,9 @@ public class AssignRegionHandler extends EventHandler {
|
||||||
protected void handleException(Throwable t) {
|
protected void handleException(Throwable t) {
|
||||||
LOG.warn("Fatal error occurred while opening region {}, aborting...",
|
LOG.warn("Fatal error occurred while opening region {}, aborting...",
|
||||||
regionInfo.getRegionNameAsString(), t);
|
regionInfo.getRegionNameAsString(), t);
|
||||||
|
// Clear any reference in getServer().getRegionsInTransitionInRS() otherwise can hold up
|
||||||
|
// regionserver abort on cluster shutdown. HBASE-23984.
|
||||||
|
getServer().getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes());
|
||||||
getServer().abort(
|
getServer().abort(
|
||||||
"Failed to open region " + regionInfo.getRegionNameAsString() + " and can not recover", t);
|
"Failed to open region " + regionInfo.getRegionNameAsString() + " and can not recover", t);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/**
|
/*
|
||||||
*
|
*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
* or more contributor license agreements. See the NOTICE file
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
@ -19,7 +19,6 @@
|
||||||
package org.apache.hadoop.hbase.regionserver.handler;
|
package org.apache.hadoop.hbase.regionserver.handler;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.hadoop.hbase.HConstants;
|
import org.apache.hadoop.hbase.HConstants;
|
||||||
import org.apache.hadoop.hbase.Server;
|
import org.apache.hadoop.hbase.Server;
|
||||||
import org.apache.hadoop.hbase.ServerName;
|
import org.apache.hadoop.hbase.ServerName;
|
||||||
|
@ -38,9 +37,13 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProto
|
||||||
/**
|
/**
|
||||||
* Handles closing of a region on a region server.
|
* Handles closing of a region on a region server.
|
||||||
* <p/>
|
* <p/>
|
||||||
* Now for regular close region request, we will use {@link UnassignRegionHandler} instead. But when
|
* In normal operation, we use {@link UnassignRegionHandler} closing Regions but when shutting down
|
||||||
* shutting down the region server, will also close regions and the related methods still use this
|
* the region server and closing out Regions, we use this handler instead; it does not expect to
|
||||||
* class so we keep it here.
|
* be able to communicate the close back to the Master.
|
||||||
|
* <p>Expects that the close *has* been registered in the hosting RegionServer before
|
||||||
|
* submitting this Handler; i.e. <code>rss.getRegionsInTransitionInRS().putIfAbsent(
|
||||||
|
* this.regionInfo.getEncodedNameAsBytes(), Boolean.FALSE);</code> has been called first.
|
||||||
|
* In here when done, we do the deregister.</p>
|
||||||
* @see UnassignRegionHandler
|
* @see UnassignRegionHandler
|
||||||
*/
|
*/
|
||||||
@InterfaceAudience.Private
|
@InterfaceAudience.Private
|
||||||
|
@ -62,11 +65,7 @@ public class CloseRegionHandler extends EventHandler {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This method used internally by the RegionServer to close out regions.
|
* This method used internally by the RegionServer to close out regions.
|
||||||
* @param server
|
|
||||||
* @param rsServices
|
|
||||||
* @param regionInfo
|
|
||||||
* @param abort If the regionserver is aborting.
|
* @param abort If the regionserver is aborting.
|
||||||
* @param destination
|
|
||||||
*/
|
*/
|
||||||
public CloseRegionHandler(final Server server,
|
public CloseRegionHandler(final Server server,
|
||||||
final RegionServerServices rsServices,
|
final RegionServerServices rsServices,
|
||||||
|
@ -92,13 +91,13 @@ public class CloseRegionHandler extends EventHandler {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process() {
|
public void process() throws IOException {
|
||||||
|
String name = regionInfo.getEncodedName();
|
||||||
|
LOG.trace("Processing close of {}", name);
|
||||||
|
String encodedRegionName = regionInfo.getEncodedName();
|
||||||
|
// Check that this region is being served here
|
||||||
|
HRegion region = (HRegion)rsServices.getRegion(encodedRegionName);
|
||||||
try {
|
try {
|
||||||
String name = regionInfo.getEncodedName();
|
|
||||||
LOG.trace("Processing close of {}", name);
|
|
||||||
String encodedRegionName = regionInfo.getEncodedName();
|
|
||||||
// Check that this region is being served here
|
|
||||||
HRegion region = (HRegion)rsServices.getRegion(encodedRegionName);
|
|
||||||
if (region == null) {
|
if (region == null) {
|
||||||
LOG.warn("Received CLOSE for region {} but currently not serving - ignoring", name);
|
LOG.warn("Received CLOSE for region {} but currently not serving - ignoring", name);
|
||||||
// TODO: do better than a simple warning
|
// TODO: do better than a simple warning
|
||||||
|
@ -106,20 +105,11 @@ public class CloseRegionHandler extends EventHandler {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close the region
|
// Close the region
|
||||||
try {
|
if (region.close(abort) == null) {
|
||||||
if (region.close(abort) == null) {
|
// This region has already been closed. Should not happen (A unit test makes this
|
||||||
// This region got closed. Most likely due to a split.
|
// happen as a side effect, TestRegionObserverInterface.testPreWALAppendNotCalledOnMetaEdit)
|
||||||
// The split message will clean up the master state.
|
LOG.warn("Can't close {}; already closed", name);
|
||||||
LOG.warn("Can't close region {}, was already closed during close()", name);
|
return;
|
||||||
return;
|
|
||||||
}
|
|
||||||
} catch (IOException ioe) {
|
|
||||||
// An IOException here indicates that we couldn't successfully flush the
|
|
||||||
// memstore before closing. So, we need to abort the server and allow
|
|
||||||
// the master to split our logs in order to recover the data.
|
|
||||||
server.abort("Unrecoverable exception while closing region " +
|
|
||||||
regionInfo.getRegionNameAsString() + ", still finishing close", ioe);
|
|
||||||
throw new RuntimeException(ioe);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
this.rsServices.removeRegion(region, destination);
|
this.rsServices.removeRegion(region, destination);
|
||||||
|
@ -127,10 +117,19 @@ public class CloseRegionHandler extends EventHandler {
|
||||||
HConstants.NO_SEQNUM, Procedure.NO_PROC_ID, -1, regionInfo));
|
HConstants.NO_SEQNUM, Procedure.NO_PROC_ID, -1, regionInfo));
|
||||||
|
|
||||||
// Done! Region is closed on this RS
|
// Done! Region is closed on this RS
|
||||||
LOG.debug("Closed " + region.getRegionInfo().getRegionNameAsString());
|
|
||||||
} finally {
|
|
||||||
this.rsServices.getRegionsInTransitionInRS().
|
this.rsServices.getRegionsInTransitionInRS().
|
||||||
remove(this.regionInfo.getEncodedNameAsBytes(), Boolean.FALSE);
|
remove(this.regionInfo.getEncodedNameAsBytes(), Boolean.FALSE);
|
||||||
|
LOG.debug("Closed {}" + region.getRegionInfo().getRegionNameAsString());
|
||||||
|
} finally {
|
||||||
|
// Clear any reference in getServer().getRegionsInTransitionInRS() on success or failure,
|
||||||
|
// since a reference was added before this CRH was invoked. If we don't clear it, it can
|
||||||
|
// hold up regionserver abort on cluster shutdown. HBASE-23984.
|
||||||
|
this.rsServices.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override protected void handleException(Throwable t) {
|
||||||
|
server.abort("Unrecoverable exception while closing " +
|
||||||
|
this.regionInfo.getRegionNameAsString(), t);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,7 +33,6 @@ import org.apache.hadoop.hbase.util.RetryCounter;
|
||||||
import org.apache.yetus.audience.InterfaceAudience;
|
import org.apache.yetus.audience.InterfaceAudience;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -106,10 +105,10 @@ public class UnassignRegionHandler extends EventHandler {
|
||||||
LOG.info("Close {}", regionName);
|
LOG.info("Close {}", regionName);
|
||||||
if (region.getCoprocessorHost() != null) {
|
if (region.getCoprocessorHost() != null) {
|
||||||
// XXX: The behavior is a bit broken. At master side there is no FAILED_CLOSE state, so if
|
// XXX: The behavior is a bit broken. At master side there is no FAILED_CLOSE state, so if
|
||||||
// there are exception thrown from the CP, we can not report the error to master, and if here
|
// there are exception thrown from the CP, we can not report the error to master, and if
|
||||||
// we just return without calling reportRegionStateTransition, the TRSP at master side will
|
// here we just return without calling reportRegionStateTransition, the TRSP at master side
|
||||||
// hang there for ever. So here if the CP throws an exception out, the only way is to abort
|
// will hang there for ever. So here if the CP throws an exception out, the only way is to
|
||||||
// the RS...
|
// abort the RS...
|
||||||
region.getCoprocessorHost().preClose(abort);
|
region.getCoprocessorHost().preClose(abort);
|
||||||
}
|
}
|
||||||
if (region.close(abort) == null) {
|
if (region.close(abort) == null) {
|
||||||
|
@ -120,8 +119,9 @@ public class UnassignRegionHandler extends EventHandler {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
rs.removeRegion(region, destination);
|
rs.removeRegion(region, destination);
|
||||||
if (!rs.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.CLOSED,
|
if (!rs.reportRegionStateTransition(
|
||||||
HConstants.NO_SEQNUM, closeProcId, -1, region.getRegionInfo()))) {
|
new RegionStateTransitionContext(TransitionCode.CLOSED, HConstants.NO_SEQNUM, closeProcId,
|
||||||
|
-1, region.getRegionInfo()))) {
|
||||||
throw new IOException("Failed to report close to master: " + regionName);
|
throw new IOException("Failed to report close to master: " + regionName);
|
||||||
}
|
}
|
||||||
// Cache the close region procedure id after report region transition succeed.
|
// Cache the close region procedure id after report region transition succeed.
|
||||||
|
@ -133,6 +133,9 @@ public class UnassignRegionHandler extends EventHandler {
|
||||||
@Override
|
@Override
|
||||||
protected void handleException(Throwable t) {
|
protected void handleException(Throwable t) {
|
||||||
LOG.warn("Fatal error occurred while closing region {}, aborting...", encodedName, t);
|
LOG.warn("Fatal error occurred while closing region {}, aborting...", encodedName, t);
|
||||||
|
// Clear any reference in getServer().getRegionsInTransitionInRS() otherwise can hold up
|
||||||
|
// regionserver abort on cluster shutdown. HBASE-23984.
|
||||||
|
getServer().getRegionsInTransitionInRS().remove(Bytes.toBytes(this.encodedName));
|
||||||
getServer().abort("Failed to close region " + encodedName + " and can not recover", t);
|
getServer().abort("Failed to close region " + encodedName + " and can not recover", t);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue