HBASE-3037 When new master joins running cluster does "Received report from unknown server -- telling it to STOP_REGIONSERVER..."
M src/main/java/org/apache/hadoop/hbase/master/ServerManager.java On regionServerReport, if we got a report from an 'unknown' regionserver, we used to tell it to stop itself. Now, if the server is 'unknown' AND this master did not start the cluster (i.e. it is joining an already running cluster), we treat the report as a regionServerStartup and register the incoming server rather than telling it to shut down. M src/main/java/org/apache/hadoop/hbase/master/HMaster.java Pass the freshClusterStartup flag to ServerManager. Add more executors for opening and closing. On cluster startup the master shouldn't be a bottleneck clearing the server opens. Expose the run-balancer method so we can make it available in HBaseAdmin. M src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Minor formatting and javadoc. M src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java Upped rpc version number because of new balancer addition (and because we didn't do it when we put in new master). M src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java Added balance method. M src/main/resources/hbase-default.xml Change how often we check in from every 3 seconds to every 5 seconds. git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1001140 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e58ebc4830
commit
1339f395d6
|
@ -540,6 +540,8 @@ Release 0.21.0 - Unreleased
|
||||||
HBASE-3028 No basescanner means no GC'ing of split, offlined parent regions
|
HBASE-3028 No basescanner means no GC'ing of split, offlined parent regions
|
||||||
HBASE-2989 [replication] RSM won't cleanup after locking if 0 peers
|
HBASE-2989 [replication] RSM won't cleanup after locking if 0 peers
|
||||||
HBASE-2992 [replication] MalformedObjectNameException in ReplicationMetrics
|
HBASE-2992 [replication] MalformedObjectNameException in ReplicationMetrics
|
||||||
|
HBASE-3037 When new master joins running cluster does "Received report from
|
||||||
|
unknown server -- telling it to STOP_REGIONSERVER.
|
||||||
|
|
||||||
IMPROVEMENTS
|
IMPROVEMENTS
|
||||||
HBASE-1760 Cleanup TODOs in HTable
|
HBASE-1760 Cleanup TODOs in HTable
|
||||||
|
|
|
@ -879,14 +879,24 @@ public class HBaseAdmin implements Abortable {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Turn the load balancer on or off.
|
||||||
* @param b If true, enable balancer. If false, disable balancer.
|
* @param b If true, enable balancer. If false, disable balancer.
|
||||||
* @return Previous balancer value
|
* @return Previous balancer value
|
||||||
* @throws ZooKeeperConnectionException
|
|
||||||
* @throws MasterNotRunningException
|
|
||||||
*/
|
*/
|
||||||
public boolean balance(final boolean b)
|
public boolean balanceSwitch(final boolean b)
|
||||||
throws MasterNotRunningException, ZooKeeperConnectionException {
|
throws MasterNotRunningException, ZooKeeperConnectionException {
|
||||||
return getMaster().balance(b);
|
return getMaster().balanceSwitch(b);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Invoke the balancer. Will run the balancer and if regions to move, it will
|
||||||
|
* go ahead and do the reassignments. Can NOT run for various reasons. Check
|
||||||
|
* logs.
|
||||||
|
* @return True if balancer ran, false otherwise.
|
||||||
|
*/
|
||||||
|
public boolean balancer()
|
||||||
|
throws MasterNotRunningException, ZooKeeperConnectionException {
|
||||||
|
return getMaster().balance();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -77,7 +77,8 @@ public interface HBaseRPCProtocolVersion extends VersionedProtocol {
|
||||||
* <li>Version 23: HBASE-2066, multi-put.</li>
|
* <li>Version 23: HBASE-2066, multi-put.</li>
|
||||||
* <li>Version 24: HBASE-2473, create table with regions.</li>
|
* <li>Version 24: HBASE-2473, create table with regions.</li>
|
||||||
* <li>Version 25: Added openRegion and Stoppable/Abortable to API.</li>
|
* <li>Version 25: Added openRegion and Stoppable/Abortable to API.</li>
|
||||||
|
* <li>Version 26: New master.</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
public static final long versionID = 25L;
|
public static final long versionID = 26L;
|
||||||
}
|
}
|
||||||
|
|
|
@ -144,8 +144,17 @@ public interface HMasterInterface extends HBaseRPCProtocolVersion {
|
||||||
throws UnknownRegionException;
|
throws UnknownRegionException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Run the balancer. Will run the balancer and if regions to move, it will
|
||||||
|
* go ahead and do the reassignments. Can NOT run for various reasons. Check
|
||||||
|
* logs.
|
||||||
|
* @return True if balancer ran, false otherwise.
|
||||||
|
*/
|
||||||
|
public boolean balance();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Turn the load balancer on or off.
|
||||||
* @param b If true, enable balancer. If false, disable balancer.
|
* @param b If true, enable balancer. If false, disable balancer.
|
||||||
* @return Previous balancer value
|
* @return Previous balancer value
|
||||||
*/
|
*/
|
||||||
public boolean balance(final boolean b);
|
public boolean balanceSwitch(final boolean b);
|
||||||
}
|
}
|
|
@ -177,13 +177,13 @@ public class AssignmentManager extends ZooKeeperListener {
|
||||||
// Check existing regions in transition
|
// Check existing regions in transition
|
||||||
List<String> nodes = ZKUtil.listChildrenAndWatchForNewChildren(watcher,
|
List<String> nodes = ZKUtil.listChildrenAndWatchForNewChildren(watcher,
|
||||||
watcher.assignmentZNode);
|
watcher.assignmentZNode);
|
||||||
if(nodes.isEmpty()) {
|
if (nodes.isEmpty()) {
|
||||||
LOG.info("No regions in transition in ZK to process on failover");
|
LOG.info("No regions in transition in ZK to process on failover");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
LOG.info("Failed-over master needs to process " + nodes.size() +
|
LOG.info("Failed-over master needs to process " + nodes.size() +
|
||||||
" regions in transition");
|
" regions in transition");
|
||||||
for(String regionName : nodes) {
|
for (String regionName: nodes) {
|
||||||
RegionTransitionData data = ZKAssign.getData(watcher, regionName);
|
RegionTransitionData data = ZKAssign.getData(watcher, regionName);
|
||||||
HRegionInfo regionInfo =
|
HRegionInfo regionInfo =
|
||||||
MetaReader.getRegion(catalogTracker, data.getRegionName()).getFirst();
|
MetaReader.getRegion(catalogTracker, data.getRegionName()).getFirst();
|
||||||
|
@ -738,10 +738,10 @@ public class AssignmentManager extends ZooKeeperListener {
|
||||||
private void rebuildUserRegions() throws IOException {
|
private void rebuildUserRegions() throws IOException {
|
||||||
Map<HRegionInfo,HServerAddress> allRegions =
|
Map<HRegionInfo,HServerAddress> allRegions =
|
||||||
MetaReader.fullScan(catalogTracker);
|
MetaReader.fullScan(catalogTracker);
|
||||||
for(Map.Entry<HRegionInfo,HServerAddress> region : allRegions.entrySet()) {
|
for (Map.Entry<HRegionInfo,HServerAddress> region : allRegions.entrySet()) {
|
||||||
HServerAddress regionLocation = region.getValue();
|
HServerAddress regionLocation = region.getValue();
|
||||||
HRegionInfo regionInfo = region.getKey();
|
HRegionInfo regionInfo = region.getKey();
|
||||||
if(regionLocation == null) {
|
if (regionLocation == null) {
|
||||||
regions.put(regionInfo, null);
|
regions.put(regionInfo, null);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -1020,6 +1020,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
||||||
synchronized (this.regions) {
|
synchronized (this.regions) {
|
||||||
checkRegion(hsi, parent, true);
|
checkRegion(hsi, parent, true);
|
||||||
checkRegion(hsi, a, false);
|
checkRegion(hsi, a, false);
|
||||||
|
checkRegion(hsi, b, false);
|
||||||
this.regions.put(a, hsi);
|
this.regions.put(a, hsi);
|
||||||
this.regions.put(b, hsi);
|
this.regions.put(b, hsi);
|
||||||
removeFromServers(hsi, parent, true);
|
removeFromServers(hsi, parent, true);
|
||||||
|
@ -1031,10 +1032,10 @@ public class AssignmentManager extends ZooKeeperListener {
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Caller must hold locks on regions Map.
|
* Caller must hold locks on this.regions Map.
|
||||||
* @param hsi
|
* @param hsi
|
||||||
* @param hri
|
* @param hri
|
||||||
* @param expected
|
* @param expected True if we expect <code>hri</code> to be in this.regions.
|
||||||
*/
|
*/
|
||||||
private void checkRegion(final HServerInfo hsi, final HRegionInfo hri,
|
private void checkRegion(final HServerInfo hsi, final HRegionInfo hri,
|
||||||
final boolean expected) {
|
final boolean expected) {
|
||||||
|
|
|
@ -160,7 +160,8 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
||||||
|
|
||||||
private LoadBalancer balancer = new LoadBalancer();
|
private LoadBalancer balancer = new LoadBalancer();
|
||||||
private Thread balancerChore;
|
private Thread balancerChore;
|
||||||
private volatile boolean balance = true;
|
// If 'true', the balancer is 'on'. If 'false', the balancer will not run.
|
||||||
|
private volatile boolean balanceSwitch = true;
|
||||||
|
|
||||||
private Thread catalogJanitorChore;
|
private Thread catalogJanitorChore;
|
||||||
|
|
||||||
|
@ -241,7 +242,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
||||||
this.connection = HConnectionManager.getConnection(conf);
|
this.connection = HConnectionManager.getConnection(conf);
|
||||||
this.executorService = new ExecutorService(getServerName());
|
this.executorService = new ExecutorService(getServerName());
|
||||||
|
|
||||||
this.serverManager = new ServerManager(this, this);
|
this.serverManager = new ServerManager(this, this, this.freshClusterStartup);
|
||||||
|
|
||||||
this.catalogTracker = new CatalogTracker(this.zooKeeper, this.connection,
|
this.catalogTracker = new CatalogTracker(this.zooKeeper, this.connection,
|
||||||
this, conf.getInt("hbase.master.catalog.timeout", Integer.MAX_VALUE));
|
this, conf.getInt("hbase.master.catalog.timeout", Integer.MAX_VALUE));
|
||||||
|
@ -304,7 +305,6 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
||||||
this.serverManager.waitForRegionServers();
|
this.serverManager.waitForRegionServers();
|
||||||
|
|
||||||
// Start assignment of user regions, startup or failure
|
// Start assignment of user regions, startup or failure
|
||||||
if (!this.stopped) {
|
|
||||||
if (this.freshClusterStartup) {
|
if (this.freshClusterStartup) {
|
||||||
clusterStarterInitializations(this.fileSystemManager,
|
clusterStarterInitializations(this.fileSystemManager,
|
||||||
this.serverManager, this.catalogTracker, this.assignmentManager);
|
this.serverManager, this.catalogTracker, this.assignmentManager);
|
||||||
|
@ -313,13 +313,13 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
||||||
// rebuild in-memory state.
|
// rebuild in-memory state.
|
||||||
this.assignmentManager.processFailover();
|
this.assignmentManager.processFailover();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Start balancer and meta catalog janitor after meta and regions have
|
// Start balancer and meta catalog janitor after meta and regions have
|
||||||
// been assigned.
|
// been assigned.
|
||||||
this.balancerChore = getAndStartBalancerChore(this);
|
this.balancerChore = getAndStartBalancerChore(this);
|
||||||
this.catalogJanitorChore =
|
this.catalogJanitorChore =
|
||||||
Threads.setDaemonThreadRunning(new CatalogJanitor(this, this));
|
Threads.setDaemonThreadRunning(new CatalogJanitor(this, this));
|
||||||
|
|
||||||
// Check if we should stop every second.
|
// Check if we should stop every second.
|
||||||
Sleeper sleeper = new Sleeper(1000, this);
|
Sleeper sleeper = new Sleeper(1000, this);
|
||||||
while (!this.stopped) sleeper.sleep();
|
while (!this.stopped) sleeper.sleep();
|
||||||
|
@ -442,9 +442,9 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
||||||
try {
|
try {
|
||||||
// Start the executor service pools
|
// Start the executor service pools
|
||||||
this.executorService.startExecutorService(ExecutorType.MASTER_OPEN_REGION,
|
this.executorService.startExecutorService(ExecutorType.MASTER_OPEN_REGION,
|
||||||
conf.getInt("hbase.master.executor.openregion.threads", 5));
|
conf.getInt("hbase.master.executor.openregion.threads", 10));
|
||||||
this.executorService.startExecutorService(ExecutorType.MASTER_CLOSE_REGION,
|
this.executorService.startExecutorService(ExecutorType.MASTER_CLOSE_REGION,
|
||||||
conf.getInt("hbase.master.executor.closeregion.threads", 5));
|
conf.getInt("hbase.master.executor.closeregion.threads", 10));
|
||||||
this.executorService.startExecutorService(ExecutorType.MASTER_SERVER_OPERATIONS,
|
this.executorService.startExecutorService(ExecutorType.MASTER_SERVER_OPERATIONS,
|
||||||
conf.getInt("hbase.master.executor.serverops.threads", 5));
|
conf.getInt("hbase.master.executor.serverops.threads", 5));
|
||||||
this.executorService.startExecutorService(ExecutorType.MASTER_TABLE_OPERATIONS,
|
this.executorService.startExecutorService(ExecutorType.MASTER_TABLE_OPERATIONS,
|
||||||
|
@ -496,9 +496,8 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Thread getAndStartBalancerChore(final HMaster master) {
|
private static Thread getAndStartBalancerChore(final HMaster master) {
|
||||||
String name = master.getServerName() + "-balancerChore";
|
String name = master.getServerName() + "-BalancerChore";
|
||||||
int period = master.getConfiguration().
|
int period = master.getConfiguration().getInt("hbase.balancer.period", 300000);
|
||||||
getInt("hbase.balancer.period", 3000000);
|
|
||||||
// Start up the load balancer chore
|
// Start up the load balancer chore
|
||||||
Chore chore = new Chore(name, period, master) {
|
Chore chore = new Chore(name, period, master) {
|
||||||
@Override
|
@Override
|
||||||
|
@ -566,13 +565,10 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
||||||
return !isStopped();
|
return !isStopped();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
@Override
|
||||||
* Run the balancer.
|
|
||||||
* @return True if balancer ran, false otherwise.
|
|
||||||
*/
|
|
||||||
public boolean balance() {
|
public boolean balance() {
|
||||||
// If balance not true, don't run balancer.
|
// If balance not true, don't run balancer.
|
||||||
if (!this.balance) return false;
|
if (!this.balanceSwitch) return false;
|
||||||
synchronized (this.balancer) {
|
synchronized (this.balancer) {
|
||||||
// Only allow one balance run at at time.
|
// Only allow one balance run at at time.
|
||||||
if (this.assignmentManager.isRegionsInTransition()) {
|
if (this.assignmentManager.isRegionsInTransition()) {
|
||||||
|
@ -606,9 +602,9 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean balance(final boolean b) {
|
public boolean balanceSwitch(final boolean b) {
|
||||||
boolean oldValue = this.balance;
|
boolean oldValue = this.balanceSwitch;
|
||||||
this.balance = b;
|
this.balanceSwitch = b;
|
||||||
LOG.info("Balance=" + b);
|
LOG.info("Balance=" + b);
|
||||||
return oldValue;
|
return oldValue;
|
||||||
}
|
}
|
||||||
|
|
|
@ -84,6 +84,7 @@ public class ServerManager {
|
||||||
|
|
||||||
private final Server master;
|
private final Server master;
|
||||||
private final MasterServices services;
|
private final MasterServices services;
|
||||||
|
private final boolean freshClusterStartup;
|
||||||
|
|
||||||
private final ServerMonitor serverMonitorThread;
|
private final ServerMonitor serverMonitorThread;
|
||||||
|
|
||||||
|
@ -119,10 +120,14 @@ public class ServerManager {
|
||||||
* Constructor.
|
* Constructor.
|
||||||
* @param master
|
* @param master
|
||||||
* @param services
|
* @param services
|
||||||
|
* @param freshClusterStartup True if we are original master on a fresh
|
||||||
|
* cluster startup else if false, we are joining an already running cluster.
|
||||||
*/
|
*/
|
||||||
public ServerManager(final Server master, final MasterServices services) {
|
public ServerManager(final Server master, final MasterServices services,
|
||||||
|
final boolean freshClusterStartup) {
|
||||||
this.master = master;
|
this.master = master;
|
||||||
this.services = services;
|
this.services = services;
|
||||||
|
this.freshClusterStartup = freshClusterStartup;
|
||||||
Configuration c = master.getConfiguration();
|
Configuration c = master.getConfiguration();
|
||||||
int monitorInterval = c.getInt("hbase.master.monitor.interval", 60 * 1000);
|
int monitorInterval = c.getInt("hbase.master.monitor.interval", 60 * 1000);
|
||||||
this.metrics = new MasterMetrics(master.getServerName());
|
this.metrics = new MasterMetrics(master.getServerName());
|
||||||
|
@ -249,10 +254,30 @@ public class ServerManager {
|
||||||
// If we don't know this server, tell it shutdown.
|
// If we don't know this server, tell it shutdown.
|
||||||
HServerInfo storedInfo = this.onlineServers.get(info.getServerName());
|
HServerInfo storedInfo = this.onlineServers.get(info.getServerName());
|
||||||
if (storedInfo == null) {
|
if (storedInfo == null) {
|
||||||
LOG.warn("Received report from unknown server -- telling it " +
|
if (!this.freshClusterStartup) {
|
||||||
"to " + HMsg.Type.STOP_REGIONSERVER + ": " + info.getServerName());
|
// If we are joining an existing cluster, then soon as we come up we'll
|
||||||
|
// be getting reports from already running regionservers.
|
||||||
|
LOG.info("Registering new server: " + info.getServerName());
|
||||||
|
// recordNewServer is what happens at the end of reportServerStartup.
|
||||||
|
// The only thing we are skipping is passing back to the regionserver
|
||||||
|
// the HServerInfo to use. Here we presume a master has already done
|
||||||
|
// that so we'll press on with whatever it gave us for HSI.
|
||||||
|
recordNewServer(info);
|
||||||
|
// If msgs, put off their processing but this is not enough because
|
||||||
|
// its possible that the next time the server reports in, we'll still
|
||||||
|
// not be up and serving. For example, if a split, we'll need the
|
||||||
|
// regions and servers setup in the master before the below
|
||||||
|
// handleSplitReport will work. TODO: FIx!!
|
||||||
|
if (msgs.length > 0) throw new PleaseHoldException("FIX! Putting off " +
|
||||||
|
"message processing because not yet rwady but possible we won't be " +
|
||||||
|
"ready next on next report");
|
||||||
|
} else {
|
||||||
|
LOG.warn("Received report from unknown server, a server calling " +
|
||||||
|
" regionServerReport w/o having first called regionServerStartup; " +
|
||||||
|
"telling it " + HMsg.Type.STOP_REGIONSERVER + ": " + info.getServerName());
|
||||||
return HMsg.STOP_REGIONSERVER_ARRAY;
|
return HMsg.STOP_REGIONSERVER_ARRAY;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Check startcodes
|
// Check startcodes
|
||||||
if (raceThatShouldNotHappenAnymore(storedInfo, info)) {
|
if (raceThatShouldNotHappenAnymore(storedInfo, info)) {
|
||||||
|
|
|
@ -157,10 +157,9 @@
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>hbase.regionserver.msginterval</name>
|
<name>hbase.regionserver.msginterval</name>
|
||||||
<value>3000</value>
|
<value>5000</value>
|
||||||
<description>Interval between messages from the RegionServer to HMaster
|
<description>Interval between messages from the RegionServer to HMaster
|
||||||
in milliseconds. Use a high value for clusters with more than 100
|
in milliseconds.
|
||||||
nodes. Default is 3 seconds.
|
|
||||||
</description>
|
</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
|
|
Loading…
Reference in New Issue