HBASE-3039 Stuck in regionsInTransition because rebalance came in at same time as a split
M src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Add region name to warning log message (w/o it message is no good). M src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Add src of split message else need to deduce where it came from by looking elsewhere. M src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Updated log messages to include region and, where appropriate, source server name; debugging is hard w/o them. Changed regionOnline and regionOffline to check for unexpected states and log warnings rather than proceed regardless. Added in fix for concurrent balance+split; split message now updates regionsInTransition where previously it did not. Removed checkRegion method. It's a reimplementation of what regionOnline and regionOffline do, only less comprehensive as regards what gets updated (this.regions + this.servers rather than this.regions, this.servers and regionsInTransition). That they were less comprehensive is the root of this bug. M src/main/java/org/apache/hadoop/hbase/master/HMaster.java Make the message about why we are not running balancer richer (print out how many regions in transition and more of the regionsInTransition list). M src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java Javadoc and minor formatting. git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1001911 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5efa0ba9c9
commit
10a84f3b88
|
@ -542,6 +542,8 @@ Release 0.21.0 - Unreleased
|
|||
HBASE-2992 [replication] MalformedObjectNameException in ReplicationMetrics
|
||||
HBASE-3037 When new master joins running cluster does "Received report from
|
||||
unknown server -- telling it to STOP_REGIONSERVER.
|
||||
HBASE-3039 Stuck in regionsInTransition because rebalance came in at same
|
||||
time as a split
|
||||
|
||||
IMPROVEMENTS
|
||||
HBASE-1760 Cleanup TODOs in HTable
|
||||
|
|
|
@ -68,7 +68,7 @@ public class RegionTransitionData implements Writable {
|
|||
* {@link EventType#M2ZK_REGION_OFFLINE}.
|
||||
*
|
||||
* @param eventType type of event
|
||||
* @param regionName name of region
|
||||
* @param regionName name of region as per {@link HRegionInfo#getRegionName()}
|
||||
*/
|
||||
public RegionTransitionData(EventType eventType, byte [] regionName) {
|
||||
this(eventType, regionName, null);
|
||||
|
@ -85,7 +85,7 @@ public class RegionTransitionData implements Writable {
|
|||
* and {@link EventType#RS2ZK_REGION_OPENED}.
|
||||
*
|
||||
* @param eventType type of event
|
||||
* @param regionName name of region
|
||||
* @param regionName name of region as per {@link HRegionInfo#getRegionName()}
|
||||
* @param serverName name of server setting data
|
||||
*/
|
||||
public RegionTransitionData(EventType eventType, byte [] regionName,
|
||||
|
@ -117,7 +117,7 @@ public class RegionTransitionData implements Writable {
|
|||
* Gets the name of the region being transitioned.
|
||||
*
|
||||
* <p>Region name is required so this never returns null.
|
||||
* @return region name
|
||||
* @return region name, the result of a call to {@link HRegionInfo#getRegionName()}
|
||||
*/
|
||||
public byte [] getRegionName() {
|
||||
return regionName;
|
||||
|
|
|
@ -60,8 +60,8 @@ import org.apache.hadoop.hbase.util.Threads;
|
|||
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKTableDisable;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKUtil.NodeAndData;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKUtil.NodeAndData;
|
||||
import org.apache.hadoop.io.Writable;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
|
||||
|
@ -234,7 +234,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
* yet).
|
||||
* @param data
|
||||
*/
|
||||
private void handleRegion(RegionTransitionData data) {
|
||||
private void handleRegion(final RegionTransitionData data) {
|
||||
synchronized(regionsInTransition) {
|
||||
// Verify this is a known server
|
||||
if (!serverManager.isServerOnline(data.getServerName()) &&
|
||||
|
@ -248,7 +248,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
LOG.debug("Handling transition=" + data.getEventType() +
|
||||
", server=" + data.getServerName() + ", region=" + prettyPrintedRegionName);
|
||||
RegionState regionState = regionsInTransition.get(encodedName);
|
||||
switch(data.getEventType()) {
|
||||
switch (data.getEventType()) {
|
||||
case M_ZK_REGION_OFFLINE:
|
||||
// Nothing to do.
|
||||
break;
|
||||
|
@ -422,12 +422,21 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
* @param serverInfo
|
||||
*/
|
||||
public void regionOnline(HRegionInfo regionInfo, HServerInfo serverInfo) {
|
||||
synchronized(regionsInTransition) {
|
||||
regionsInTransition.remove(regionInfo.getEncodedName());
|
||||
regionsInTransition.notifyAll();
|
||||
synchronized (this.regionsInTransition) {
|
||||
RegionState rs =
|
||||
this.regionsInTransition.remove(regionInfo.getEncodedName());
|
||||
if (rs != null) {
|
||||
this.regionsInTransition.notifyAll();
|
||||
LOG.warn("Asked online a region that was already in " +
|
||||
"regionsInTransition: " + rs);
|
||||
}
|
||||
synchronized(regions) {
|
||||
regions.put(regionInfo, serverInfo);
|
||||
}
|
||||
synchronized (this.regions) {
|
||||
// Add check
|
||||
HServerInfo hsi = this.regions.get(regionInfo);
|
||||
if (hsi != null) LOG.warn("Overwriting " + regionInfo.getEncodedName() +
|
||||
" on " + hsi);
|
||||
this.regions.put(regionInfo, serverInfo);
|
||||
addToServers(serverInfo, regionInfo);
|
||||
}
|
||||
}
|
||||
|
@ -440,15 +449,20 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
* @param regionInfo
|
||||
* @param serverInfo
|
||||
*/
|
||||
public void regionOffline(HRegionInfo regionInfo) {
|
||||
synchronized(regionsInTransition) {
|
||||
regionsInTransition.remove(regionInfo.getEncodedName());
|
||||
regionsInTransition.notifyAll();
|
||||
public void regionOffline(final HRegionInfo regionInfo) {
|
||||
synchronized(this.regionsInTransition) {
|
||||
if (this.regionsInTransition.remove(regionInfo.getEncodedName()) != null) {
|
||||
this.regionsInTransition.notifyAll();
|
||||
}
|
||||
synchronized(regions) {
|
||||
HServerInfo serverInfo = regions.remove(regionInfo);
|
||||
List<HRegionInfo> serverRegions = servers.get(serverInfo);
|
||||
}
|
||||
synchronized(this.regions) {
|
||||
HServerInfo serverInfo = this.regions.remove(regionInfo);
|
||||
if (serverInfo != null) {
|
||||
List<HRegionInfo> serverRegions = this.servers.get(serverInfo);
|
||||
serverRegions.remove(regionInfo);
|
||||
} else {
|
||||
LOG.warn("Asked offline a region that was not online: " + regionInfo);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -580,17 +594,17 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
region.getRegionNameAsString() + " (offlining)");
|
||||
// Check if this region is currently assigned
|
||||
if (!regions.containsKey(region)) {
|
||||
LOG.debug("Attempted to unassign region " + region.getRegionNameAsString() +
|
||||
" but it is not " +
|
||||
LOG.debug("Attempted to unassign region " +
|
||||
region.getRegionNameAsString() + " but it is not " +
|
||||
"currently assigned anywhere");
|
||||
return;
|
||||
}
|
||||
String encodedName = region.getEncodedName();
|
||||
// Grab the state of this region and synchronize on it
|
||||
RegionState state;
|
||||
synchronized(regionsInTransition) {
|
||||
synchronized (regionsInTransition) {
|
||||
state = regionsInTransition.get(encodedName);
|
||||
if(state == null) {
|
||||
if (state == null) {
|
||||
state = new RegionState(region, RegionState.State.PENDING_CLOSE);
|
||||
regionsInTransition.put(encodedName, state);
|
||||
} else {
|
||||
|
@ -923,7 +937,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
HRegionInfo regionInfo = regionState.getRegion();
|
||||
LOG.info("Regions in transition timed out: " + regionState);
|
||||
// Expired! Do a retry.
|
||||
switch(regionState.getState()) {
|
||||
switch (regionState.getState()) {
|
||||
case OFFLINE:
|
||||
case CLOSED:
|
||||
LOG.info("Region has been OFFLINE or CLOSED for too long, " +
|
||||
|
@ -933,17 +947,19 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
case PENDING_OPEN:
|
||||
case OPENING:
|
||||
LOG.info("Region has been PENDING_OPEN or OPENING for too " +
|
||||
"long, reassigning " + regionInfo.getRegionNameAsString());
|
||||
"long, reassigning region=" +
|
||||
regionInfo.getRegionNameAsString());
|
||||
assign(regionState.getRegion());
|
||||
break;
|
||||
case OPEN:
|
||||
LOG.warn("Long-running region in OPEN state? This should " +
|
||||
"not happen");
|
||||
"not happen; region=" + regionInfo.getRegionNameAsString());
|
||||
break;
|
||||
case PENDING_CLOSE:
|
||||
case CLOSING:
|
||||
LOG.info("Region has been PENDING_CLOSE or CLOSING for too " +
|
||||
"long, resending close rpc");
|
||||
"long, running unassign again on region=" +
|
||||
regionInfo.getRegionNameAsString());
|
||||
unassign(regionInfo);
|
||||
break;
|
||||
}
|
||||
|
@ -1017,61 +1033,9 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
*/
|
||||
public void handleSplitReport(final HServerInfo hsi, final HRegionInfo parent,
|
||||
final HRegionInfo a, final HRegionInfo b) {
|
||||
synchronized (this.regions) {
|
||||
checkRegion(hsi, parent, true);
|
||||
checkRegion(hsi, a, false);
|
||||
checkRegion(hsi, b, false);
|
||||
this.regions.put(a, hsi);
|
||||
this.regions.put(b, hsi);
|
||||
removeFromServers(hsi, parent, true);
|
||||
removeFromServers(hsi, a, false);
|
||||
removeFromServers(hsi, b, false);
|
||||
addToServers(hsi, a);
|
||||
addToServers(hsi, b);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Caller must hold locks on this.regions Map.
|
||||
* @param hsi
|
||||
* @param hri
|
||||
* @param expected True if we expect <code>hri</code> to be in this.regions.
|
||||
*/
|
||||
private void checkRegion(final HServerInfo hsi, final HRegionInfo hri,
|
||||
final boolean expected) {
|
||||
HServerInfo serverInfo = regions.remove(hri);
|
||||
if (expected) {
|
||||
if (serverInfo == null) {
|
||||
LOG.info("Region not on a server: " + hri.getRegionNameAsString());
|
||||
}
|
||||
} else {
|
||||
if (serverInfo != null) {
|
||||
LOG.warn("Region present on " + hsi + "; unexpected");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Caller must hold locks on servers Map.
|
||||
* @param hsi
|
||||
* @param hri
|
||||
* @param expected
|
||||
*/
|
||||
private void removeFromServers(final HServerInfo hsi, final HRegionInfo hri,
|
||||
final boolean expected) {
|
||||
List<HRegionInfo> serverRegions = this.servers.get(hsi);
|
||||
boolean removed = serverRegions.remove(hri);
|
||||
if (expected) {
|
||||
if (!removed) {
|
||||
LOG.warn(hri.getRegionNameAsString() + " not found on " + hsi +
|
||||
"; unexpected");
|
||||
}
|
||||
} else {
|
||||
if (removed) {
|
||||
LOG.warn(hri.getRegionNameAsString() + " found on " + hsi +
|
||||
"; unexpected");
|
||||
}
|
||||
}
|
||||
regionOffline(parent);
|
||||
regionOnline(a, hsi);
|
||||
regionOnline(b, hsi);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -572,9 +572,11 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
|||
synchronized (this.balancer) {
|
||||
// Only allow one balance run at at time.
|
||||
if (this.assignmentManager.isRegionsInTransition()) {
|
||||
LOG.debug("Not running balancer because regions in transition: " +
|
||||
LOG.debug("Not running balancer because " +
|
||||
this.assignmentManager.getRegionsInTransition().size() +
|
||||
" region(s) in transition: " +
|
||||
org.apache.commons.lang.StringUtils.
|
||||
abbreviate(this.assignmentManager.getRegionsInTransition().toString(), 64));
|
||||
abbreviate(this.assignmentManager.getRegionsInTransition().toString(), 256));
|
||||
return false;
|
||||
}
|
||||
if (!this.serverManager.getDeadServers().isEmpty()) {
|
||||
|
|
|
@ -285,7 +285,7 @@ public class ServerManager {
|
|||
}
|
||||
|
||||
for (HMsg msg: msgs) {
|
||||
LOG.info("Received " + msg);
|
||||
LOG.info("Received " + msg + " from " + serverInfo.getServerName());
|
||||
switch (msg.getType()) {
|
||||
case REGION_SPLIT:
|
||||
this.services.getAssignmentManager().handleSplitReport(serverInfo,
|
||||
|
@ -527,8 +527,9 @@ public class ServerManager {
|
|||
throws NotServingRegionException {
|
||||
HRegionInterface hri = getServerConnection(server);
|
||||
if(hri == null) {
|
||||
LOG.warn("Attempting to send CLOSE RPC to server " + server.getServerName()
|
||||
+ " failed because no RPC connection found to this server");
|
||||
LOG.warn("Attempting to send CLOSE RPC to server " +
|
||||
server.getServerName() + " failed because no RPC connection found " +
|
||||
"to this server");
|
||||
return;
|
||||
}
|
||||
hri.closeRegion(region);
|
||||
|
|
|
@ -1924,7 +1924,8 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
|
|||
// TODO: Need to check if this is being served here but currently undergoing
|
||||
// a split (so master needs to retry close after split is complete)
|
||||
if (!onlineRegions.containsKey(region.getEncodedName())) {
|
||||
LOG.warn("Received close for region we are not serving");
|
||||
LOG.warn("Received close for region we are not serving; " +
|
||||
region.getEncodedName());
|
||||
throw new NotServingRegionException("Received close for "
|
||||
+ region.getRegionNameAsString() + " but we are not serving it");
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue