HBASE-3039 Stuck in regionsInTransition because rebalance came in at same time as a split

M src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
  Add region name to warning log message (without it the message is no good).
M src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
  Add source of the split message; otherwise we need to deduce where it came
  from by looking elsewhere.
M src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
  Updated log messages to include region and, where appropriate, source
  server name; debugging is hard without them.
  Changed regionOnline and regionOffline to check for unexpected
  states and log warnings rather than proceed regardless.
  Added fix for concurrent balance+split; the split message now
  updates regionsInTransition where previously it did not.
  Remove checkRegion method.  It's a reimplementation of
  what regionOnline and regionOffline do, only less comprehensive
  about what gets updated (this.regions and this.servers rather
  than this.regions, this.servers, and regionsInTransition).
  That it was less comprehensive is the root of this bug.
M src/main/java/org/apache/hadoop/hbase/master/HMaster.java
  Make the message about why we are not running the balancer richer
  (print out how many regions are in transition and more of the
  regionsInTransition list).
M src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java
  Javadoc and minor formatting.


git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1001911 13f79535-47bb-0310-9956-ffa450edef68
Michael Stack 2010-09-27 21:02:04 +00:00
parent 5efa0ba9c9
commit 10a84f3b88
6 changed files with 62 additions and 92 deletions

CHANGES.txt

@@ -542,6 +542,8 @@ Release 0.21.0 - Unreleased
HBASE-2992 [replication] MalformedObjectNameException in ReplicationMetrics
HBASE-3037 When new master joins running cluster does "Received report from
unknown server -- telling it to STOP_REGIONSERVER.
HBASE-3039 Stuck in regionsInTransition because rebalance came in at same
time as a split
IMPROVEMENTS
HBASE-1760 Cleanup TODOs in HTable

src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java

@@ -68,7 +68,7 @@ public class RegionTransitionData implements Writable {
* {@link EventType#M2ZK_REGION_OFFLINE}.
*
* @param eventType type of event
* @param regionName name of region
* @param regionName name of region as per {@link HRegionInfo#getRegionName()}
*/
public RegionTransitionData(EventType eventType, byte [] regionName) {
this(eventType, regionName, null);
@@ -85,7 +85,7 @@ public class RegionTransitionData implements Writable {
* and {@link EventType#RS2ZK_REGION_OPENED}.
*
* @param eventType type of event
* @param regionName name of region
* @param regionName name of region as per {@link HRegionInfo#getRegionName()}
* @param serverName name of server setting data
*/
public RegionTransitionData(EventType eventType, byte [] regionName,
@@ -117,7 +117,7 @@ public class RegionTransitionData implements Writable {
* Gets the name of the region being transitioned.
*
* <p>Region name is required so this never returns null.
* @return region name
* @return region name, the result of a call to {@link HRegionInfo#getRegionName()}
*/
public byte [] getRegionName() {
return regionName;

src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java

@ -60,8 +60,8 @@ import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZKTableDisable;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZKUtil.NodeAndData;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
import org.apache.hadoop.hbase.zookeeper.ZKUtil.NodeAndData;
import org.apache.hadoop.io.Writable;
import org.apache.zookeeper.KeeperException;
@@ -234,7 +234,7 @@ public class AssignmentManager extends ZooKeeperListener {
* yet).
* @param data
*/
private void handleRegion(RegionTransitionData data) {
private void handleRegion(final RegionTransitionData data) {
synchronized(regionsInTransition) {
// Verify this is a known server
if (!serverManager.isServerOnline(data.getServerName()) &&
@@ -248,7 +248,7 @@ public class AssignmentManager extends ZooKeeperListener {
LOG.debug("Handling transition=" + data.getEventType() +
", server=" + data.getServerName() + ", region=" + prettyPrintedRegionName);
RegionState regionState = regionsInTransition.get(encodedName);
switch(data.getEventType()) {
switch (data.getEventType()) {
case M_ZK_REGION_OFFLINE:
// Nothing to do.
break;
@@ -422,12 +422,21 @@ public class AssignmentManager extends ZooKeeperListener {
* @param serverInfo
*/
public void regionOnline(HRegionInfo regionInfo, HServerInfo serverInfo) {
synchronized(regionsInTransition) {
regionsInTransition.remove(regionInfo.getEncodedName());
regionsInTransition.notifyAll();
synchronized (this.regionsInTransition) {
RegionState rs =
this.regionsInTransition.remove(regionInfo.getEncodedName());
if (rs != null) {
this.regionsInTransition.notifyAll();
LOG.warn("Asked online a region that was already in " +
"regionsInTransition: " + rs);
}
synchronized(regions) {
regions.put(regionInfo, serverInfo);
}
synchronized (this.regions) {
// Add check
HServerInfo hsi = this.regions.get(regionInfo);
if (hsi != null) LOG.warn("Overwriting " + regionInfo.getEncodedName() +
" on " + hsi);
this.regions.put(regionInfo, serverInfo);
addToServers(serverInfo, regionInfo);
}
}
@@ -440,15 +449,20 @@ public class AssignmentManager extends ZooKeeperListener {
* @param regionInfo
* @param serverInfo
*/
public void regionOffline(HRegionInfo regionInfo) {
synchronized(regionsInTransition) {
regionsInTransition.remove(regionInfo.getEncodedName());
regionsInTransition.notifyAll();
public void regionOffline(final HRegionInfo regionInfo) {
synchronized(this.regionsInTransition) {
if (this.regionsInTransition.remove(regionInfo.getEncodedName()) != null) {
this.regionsInTransition.notifyAll();
}
synchronized(regions) {
HServerInfo serverInfo = regions.remove(regionInfo);
List<HRegionInfo> serverRegions = servers.get(serverInfo);
}
synchronized(this.regions) {
HServerInfo serverInfo = this.regions.remove(regionInfo);
if (serverInfo != null) {
List<HRegionInfo> serverRegions = this.servers.get(serverInfo);
serverRegions.remove(regionInfo);
} else {
LOG.warn("Asked offline a region that was not online: " + regionInfo);
}
}
}
@@ -580,17 +594,17 @@ public class AssignmentManager extends ZooKeeperListener {
region.getRegionNameAsString() + " (offlining)");
// Check if this region is currently assigned
if (!regions.containsKey(region)) {
LOG.debug("Attempted to unassign region " + region.getRegionNameAsString() +
" but it is not " +
LOG.debug("Attempted to unassign region " +
region.getRegionNameAsString() + " but it is not " +
"currently assigned anywhere");
return;
}
String encodedName = region.getEncodedName();
// Grab the state of this region and synchronize on it
RegionState state;
synchronized(regionsInTransition) {
synchronized (regionsInTransition) {
state = regionsInTransition.get(encodedName);
if(state == null) {
if (state == null) {
state = new RegionState(region, RegionState.State.PENDING_CLOSE);
regionsInTransition.put(encodedName, state);
} else {
@@ -923,7 +937,7 @@ public class AssignmentManager extends ZooKeeperListener {
HRegionInfo regionInfo = regionState.getRegion();
LOG.info("Regions in transition timed out: " + regionState);
// Expired! Do a retry.
switch(regionState.getState()) {
switch (regionState.getState()) {
case OFFLINE:
case CLOSED:
LOG.info("Region has been OFFLINE or CLOSED for too long, " +
@@ -933,17 +947,19 @@ public class AssignmentManager extends ZooKeeperListener {
case PENDING_OPEN:
case OPENING:
LOG.info("Region has been PENDING_OPEN or OPENING for too " +
"long, reassigning " + regionInfo.getRegionNameAsString());
"long, reassigning region=" +
regionInfo.getRegionNameAsString());
assign(regionState.getRegion());
break;
case OPEN:
LOG.warn("Long-running region in OPEN state? This should " +
"not happen");
"not happen; region=" + regionInfo.getRegionNameAsString());
break;
case PENDING_CLOSE:
case CLOSING:
LOG.info("Region has been PENDING_CLOSE or CLOSING for too " +
"long, resending close rpc");
"long, running unassign again on region=" +
regionInfo.getRegionNameAsString());
unassign(regionInfo);
break;
}
@@ -1017,61 +1033,9 @@ public class AssignmentManager extends ZooKeeperListener {
*/
public void handleSplitReport(final HServerInfo hsi, final HRegionInfo parent,
final HRegionInfo a, final HRegionInfo b) {
synchronized (this.regions) {
checkRegion(hsi, parent, true);
checkRegion(hsi, a, false);
checkRegion(hsi, b, false);
this.regions.put(a, hsi);
this.regions.put(b, hsi);
removeFromServers(hsi, parent, true);
removeFromServers(hsi, a, false);
removeFromServers(hsi, b, false);
addToServers(hsi, a);
addToServers(hsi, b);
}
}
/*
* Caller must hold locks on this.regions Map.
* @param hsi
* @param hri
* @param expected True if we expect <code>hri</code> to be in this.regions.
*/
private void checkRegion(final HServerInfo hsi, final HRegionInfo hri,
final boolean expected) {
HServerInfo serverInfo = regions.remove(hri);
if (expected) {
if (serverInfo == null) {
LOG.info("Region not on a server: " + hri.getRegionNameAsString());
}
} else {
if (serverInfo != null) {
LOG.warn("Region present on " + hsi + "; unexpected");
}
}
}
/*
* Caller must hold locks on servers Map.
* @param hsi
* @param hri
* @param expected
*/
private void removeFromServers(final HServerInfo hsi, final HRegionInfo hri,
final boolean expected) {
List<HRegionInfo> serverRegions = this.servers.get(hsi);
boolean removed = serverRegions.remove(hri);
if (expected) {
if (!removed) {
LOG.warn(hri.getRegionNameAsString() + " not found on " + hsi +
"; unexpected");
}
} else {
if (removed) {
LOG.warn(hri.getRegionNameAsString() + " found on " + hsi +
"; unexpected");
}
}
regionOffline(parent);
regionOnline(a, hsi);
regionOnline(b, hsi);
}
/**

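To make the shape of the fix easier to see, here is a simplified, self-contained sketch. The class, method signatures, and map types are illustrative only, not the real AssignmentManager (which keys regionsInTransition by encoded region name and stores HRegionInfo, HServerInfo, and RegionState). The point is that the split report is now funneled through regionOffline/regionOnline, both of which also clear regionsInTransition, so a parent region that a concurrent rebalance had just marked PENDING_CLOSE no longer gets stuck there.

// Simplified sketch only: illustrative types and names, not the real classes.
import java.util.HashMap;
import java.util.Map;

class SplitVersusBalanceSketch {
  // region name -> transition state (e.g. PENDING_CLOSE queued by a rebalance)
  private final Map<String, String> regionsInTransition = new HashMap<String, String>();
  // region name -> server currently hosting it
  private final Map<String, String> regions = new HashMap<String, String>();

  void regionOffline(String regionName) {
    synchronized (regionsInTransition) {
      // Key point of the fix: the split path now clears any in-flight transition,
      // such as the PENDING_CLOSE a racing balance just queued for the parent.
      regionsInTransition.remove(regionName);
    }
    synchronized (regions) {
      if (regions.remove(regionName) == null) {
        System.out.println("Asked offline a region that was not online: " + regionName);
      }
    }
  }

  void regionOnline(String regionName, String serverName) {
    synchronized (regionsInTransition) {
      regionsInTransition.remove(regionName);
    }
    synchronized (regions) {
      regions.put(regionName, serverName);
    }
  }

  // Before the fix, the split report poked the region maps directly via
  // checkRegion/removeFromServers and never touched regionsInTransition.
  void handleSplitReport(String serverName, String parent, String a, String b) {
    regionOffline(parent);
    regionOnline(a, serverName);
    regionOnline(b, serverName);
  }
}

With the transition cleared, the master's isRegionsInTransition() check stops seeing the defunct parent and balancing can proceed.
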
src/main/java/org/apache/hadoop/hbase/master/HMaster.java

@@ -572,9 +572,11 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
synchronized (this.balancer) {
// Only allow one balance run at at time.
if (this.assignmentManager.isRegionsInTransition()) {
LOG.debug("Not running balancer because regions in transition: " +
LOG.debug("Not running balancer because " +
this.assignmentManager.getRegionsInTransition().size() +
" region(s) in transition: " +
org.apache.commons.lang.StringUtils.
abbreviate(this.assignmentManager.getRegionsInTransition().toString(), 64));
abbreviate(this.assignmentManager.getRegionsInTransition().toString(), 256));
return false;
}
if (!this.serverManager.getDeadServers().isEmpty()) {

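A side note on the richer balancer message above: it leans on commons-lang's StringUtils.abbreviate, and the cap on the printed regionsInTransition list goes from 64 to 256 characters. A minimal standalone illustration of the truncation (the sample string is invented, not real master output):

// Minimal illustration only; the input string is made up.
import org.apache.commons.lang.StringUtils;

public class AbbreviateDemo {
  public static void main(String[] args) {
    String regionsInTransition =
        "{region-a=PENDING_CLOSE, region-b=OPENING, region-c=CLOSING}";
    // Fits within 256 chars, so it is printed unchanged.
    System.out.println(StringUtils.abbreviate(regionsInTransition, 256));
    // Longer than 16 chars, so it is cut to 13 chars plus "...".
    System.out.println(StringUtils.abbreviate(regionsInTransition, 16));
  }
}
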
src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

@@ -285,7 +285,7 @@ public class ServerManager {
}
for (HMsg msg: msgs) {
LOG.info("Received " + msg);
LOG.info("Received " + msg + " from " + serverInfo.getServerName());
switch (msg.getType()) {
case REGION_SPLIT:
this.services.getAssignmentManager().handleSplitReport(serverInfo,
@@ -527,8 +527,9 @@ public class ServerManager {
throws NotServingRegionException {
HRegionInterface hri = getServerConnection(server);
if(hri == null) {
LOG.warn("Attempting to send CLOSE RPC to server " + server.getServerName()
+ " failed because no RPC connection found to this server");
LOG.warn("Attempting to send CLOSE RPC to server " +
server.getServerName() + " failed because no RPC connection found " +
"to this server");
return;
}
hri.closeRegion(region);

src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

@@ -1924,7 +1924,8 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
// TODO: Need to check if this is being served here but currently undergoing
// a split (so master needs to retry close after split is complete)
if (!onlineRegions.containsKey(region.getEncodedName())) {
LOG.warn("Received close for region we are not serving");
LOG.warn("Received close for region we are not serving; " +
region.getEncodedName());
throw new NotServingRegionException("Received close for "
+ region.getRegionNameAsString() + " but we are not serving it");
}