diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java index e5105a50452..3a75d33dd26 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java @@ -264,9 +264,31 @@ public class ProcedureExecutor { private final CopyOnWriteArrayList listeners = new CopyOnWriteArrayList<>(); private Configuration conf; + + /** + * Created in the {@link #start(int, boolean)} method. Destroyed in {@link #join()} (FIX! Doing + * resource handling rather than observing in a #join is unexpected). + * Overridden when we do the ProcedureTestingUtility.testRecoveryAndDoubleExecution trickery + * (Should be ok). + */ private ThreadGroup threadGroup; + + /** + * Created in the {@link #start(int, boolean)} method. Terminated in {@link #join()} (FIX! Doing + * resource handling rather than observing in a #join is unexpected). + * Overridden when we do the ProcedureTestingUtility.testRecoveryAndDoubleExecution trickery + * (Should be ok). + */ private CopyOnWriteArrayList workerThreads; + + /** + * Created in the {@link #start(int, boolean)} method. Terminated in {@link #join()} (FIX! Doing + * resource handling rather than observing in a #join is unexpected). + * Overridden when we do the ProcedureTestingUtility.testRecoveryAndDoubleExecution trickery + * (Should be ok). + */ private TimeoutExecutorThread timeoutExecutor; + private int corePoolSize; private int maxPoolSize; @@ -299,6 +321,7 @@ public class ProcedureExecutor { this.conf = conf; this.checkOwnerSet = conf.getBoolean(CHECK_OWNER_SET_CONF_KEY, DEFAULT_CHECK_OWNER_SET); refreshConfiguration(conf); + } private void load(final boolean abortOnCorruption) throws IOException { @@ -510,11 +533,8 @@ public class ProcedureExecutor { LOG.info("Starting {} core workers (bigger of cpus/4 or 16) with max (burst) worker count={}", corePoolSize, maxPoolSize); - // Create the Thread Group for the executors - threadGroup = new ThreadGroup("PEWorkerGroup"); - - // Create the timeout executor - timeoutExecutor = new TimeoutExecutorThread(this, threadGroup); + this.threadGroup = new ThreadGroup("PEWorkerGroup"); + this.timeoutExecutor = new TimeoutExecutorThread(this, threadGroup); // Create the workers workerId.set(0); @@ -576,22 +596,21 @@ public class ProcedureExecutor { // stop the timeout executor timeoutExecutor.awaitTermination(); - timeoutExecutor = null; // stop the worker threads for (WorkerThread worker: workerThreads) { worker.awaitTermination(); } - workerThreads = null; // Destroy the Thread Group for the executors + // TODO: Fix. #join is not place to destroy resources. try { threadGroup.destroy(); } catch (IllegalThreadStateException e) { - LOG.error("ThreadGroup " + threadGroup + " contains running threads; " + e.getMessage()); - threadGroup.list(); - } finally { - threadGroup = null; + LOG.error("ThreadGroup {} contains running threads; {}: See STDOUT", + this.threadGroup, e.getMessage()); + // This dumps list of threads on STDOUT. + this.threadGroup.list(); } // reset the in-memory state for testing diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java index 87f6fa4e25f..4c9d0e3c33d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java @@ -33,6 +33,7 @@ import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.client.TableDescriptorBuilder; +import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.test.MetricsAssertHelper; import org.apache.hadoop.hbase.testclassification.MasterTests; @@ -91,6 +92,8 @@ public class TestAssignmentManagerMetrics { // set a small interval for updating rit metrics conf.setInt(AssignmentManager.RIT_CHORE_INTERVAL_MSEC_CONF_KEY, MSG_INTERVAL); + // keep rs online so it can report the failed opens. + conf.setBoolean(CoprocessorHost.ABORT_ON_ERROR_KEY, false); TEST_UTIL.startMiniCluster(1); CLUSTER = TEST_UTIL.getHBaseCluster(); MASTER = CLUSTER.getMaster(); @@ -148,6 +151,9 @@ public class TestAssignmentManagerMetrics { } // Sleep 3 seconds, wait for doMetrics chore catching up + // the rit count consists of rit and failed opens. see RegionInTransitionStat#update + // Waiting for the completion of rit makes the assert stable. + TEST_UTIL.waitUntilNoRegionsInTransition(); Thread.sleep(MSG_INTERVAL * 3); METRICS_HELPER.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_NAME, 1, amSource); METRICS_HELPER.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_OVER_THRESHOLD_NAME, 1,