HBASE-19828 Flakey TestRegionsOnMasterOptions.testRegionsOnAllServers

Rename the PE Worker threads.

Send an interrupt if worker taking a long time to go down
(it may be RPC'ing out to a dead server, retrying so
interrupt). Also join on the ProcedureExecutor shutting down.
This will make problems shutting down more obvious.

Disable TestRegionsOnMasterOptions. Master carrying Regions is broke.
This commit is contained in:
Michael Stack 2018-01-19 16:02:26 -08:00
parent 8f713fcc19
commit 86ecc963e4
4 changed files with 15 additions and 7 deletions

View File

@ -510,10 +510,10 @@ public class ProcedureExecutor<TEnvironment> {
// We have numThreads executor + one timer thread used for timing out
// procedures and triggering periodic procedures.
this.corePoolSize = numThreads;
LOG.info("Starting ProcedureExecutor Worker threads (ProcExecWrkr)=" + corePoolSize);
LOG.info("Starting ProcedureExecutor Worker threads (ProcedureExecutorWorker)=" + corePoolSize);
// Create the Thread Group for the executors
threadGroup = new ThreadGroup("ProcExecThrdGrp");
threadGroup = new ThreadGroup("ProcedureExecutorWorkerGroup");
// Create the timeout executor
timeoutExecutor = new TimeoutExecutorThread(threadGroup);
@ -592,7 +592,7 @@ public class ProcedureExecutor<TEnvironment> {
try {
threadGroup.destroy();
} catch (IllegalThreadStateException e) {
LOG.error("Thread group " + threadGroup + " contains running threads");
LOG.error("ThreadGroup " + threadGroup + " contains running threads; " + e.getMessage());
threadGroup.list();
} finally {
threadGroup = null;
@ -1709,7 +1709,7 @@ public class ProcedureExecutor<TEnvironment> {
private Procedure activeProcedure;
public WorkerThread(final ThreadGroup group) {
super(group, "ProcExecWrkr-" + workerId.incrementAndGet());
super(group, "ProcedureExecutorWorker-" + workerId.incrementAndGet());
setDaemon(true);
}
@ -1752,7 +1752,7 @@ public class ProcedureExecutor<TEnvironment> {
} catch (Throwable t) {
LOG.warn("Worker terminating UNNATURALLY " + this.activeProcedure, t);
} finally {
LOG.debug("Worker terminated.");
LOG.trace("Worker terminated.");
}
workerThreads.remove(this);
}
@ -1904,9 +1904,12 @@ public class ProcedureExecutor<TEnvironment> {
for (int i = 0; isAlive(); ++i) {
sendStopSignal();
join(250);
// Log every two seconds; send interrupt too.
if (i > 0 && (i % 8) == 0) {
LOG.warn("Waiting termination of thread " + getName() + ", " +
StringUtils.humanTimeDiff(EnvironmentEdgeManager.currentTime() - startTime));
StringUtils.humanTimeDiff(EnvironmentEdgeManager.currentTime() - startTime) +
"; sending interrupt");
interrupt();
}
}
} catch (InterruptedException e) {

View File

@ -277,7 +277,8 @@ public class ActiveMasterManager extends ZKListener {
ZNodeClearer.deleteMyEphemeralNodeOnDisk();
}
} catch (KeeperException e) {
LOG.error(this.watcher.prefix("Error deleting our own master address node"), e);
LOG.debug(this.watcher.prefix("Failed delete of our master address node; " +
e.getMessage()));
}
}
}

View File

@ -1245,6 +1245,7 @@ public class HMaster extends HRegionServer implements MasterServices {
configurationManager.deregisterObserver(procedureExecutor.getEnvironment());
procedureExecutor.getEnvironment().getRemoteDispatcher().stop();
procedureExecutor.stop();
procedureExecutor.join();
procedureExecutor = null;
}

View File

@ -50,7 +50,10 @@ import static org.junit.Assert.assertTrue;
* Test options for regions on master; none, system, or any (i.e. master is like any other
* regionserver). Checks how regions are deployed when each of the options are enabled.
* It then does kill combinations to make sure the distribution is more than just for startup.
* NOTE: Regions on Master does not work well. See HBASE-19828. Until addressed, disabling this
* test.
*/
@Ignore
@Category({MediumTests.class})
public class TestRegionsOnMasterOptions {
private static final Logger LOG = LoggerFactory.getLogger(TestRegionsOnMasterOptions.class);