From 10a84f3b886871c5a3a3a2b074a8bbffd1ea1602 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Mon, 27 Sep 2010 21:02:04 +0000 Subject: [PATCH] HBASE-3039 Stuck in regionsInTransition because rebalance came in at same time as a split M src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Add region name to warning log message (w/o it message is no good). M src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Add src of split message else need to deduce where it came from by looking elsewhere. M src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Updated log messages to include region and where appropritate source server name; debug hard w/o Changed regionOnline and regionOffline to check for unexpected states and log warnings rather than proceed regardless. Added in fix for concurrent balance+split; split message now updates regionsintransition where previous it did not. Remove checkRegion method. Its a reimplementation of what regionOnline and regionOffline do only less comprehensive regards what gets updated (this.regions + this.servers rather than this.regions, this.servers and regionsInTransition) That they were less comprehensive is root of this bug. M src/main/java/org/apache/hadoop/hbase/master/HMaster.java Make the message about why we are not running balancer richer (print out how many reigons in transition and more of the regionsintrnasition list). M src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java Javadoc and minor formatting. git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1001911 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 2 + .../hbase/executor/RegionTransitionData.java | 6 +- .../hbase/master/AssignmentManager.java | 130 +++++++----------- .../apache/hadoop/hbase/master/HMaster.java | 6 +- .../hadoop/hbase/master/ServerManager.java | 7 +- .../hbase/regionserver/HRegionServer.java | 3 +- 6 files changed, 62 insertions(+), 92 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 52905d6c34b..76696e5676a 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -542,6 +542,8 @@ Release 0.21.0 - Unreleased HBASE-2992 [replication] MalformedObjectNameException in ReplicationMetrics HBASE-3037 When new master joins running cluster does "Received report from unknown server -- telling it to STOP_REGIONSERVER. + HBASE-3039 Stuck in regionsInTransition because rebalance came in at same + time as a split IMPROVEMENTS HBASE-1760 Cleanup TODOs in HTable diff --git a/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java b/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java index 4e33ebad22d..b0f18e46965 100644 --- a/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java +++ b/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java @@ -68,7 +68,7 @@ public class RegionTransitionData implements Writable { * {@link EventType#M2ZK_REGION_OFFLINE}. * * @param eventType type of event - * @param regionName name of region + * @param regionName name of region as per {@link HRegionInfo#getRegionName()} */ public RegionTransitionData(EventType eventType, byte [] regionName) { this(eventType, regionName, null); @@ -85,7 +85,7 @@ public class RegionTransitionData implements Writable { * and {@link EventType#RS2ZK_REGION_OPENED}. * * @param eventType type of event - * @param regionName name of region + * @param regionName name of region as per {@link HRegionInfo#getRegionName()} * @param serverName name of server setting data */ public RegionTransitionData(EventType eventType, byte [] regionName, @@ -117,7 +117,7 @@ public class RegionTransitionData implements Writable { * Gets the name of the region being transitioned. * *

Region name is required so this never returns null. - * @return region name + * @return region name, the result of a call to {@link HRegionInfo#getRegionName()} */ public byte [] getRegionName() { return regionName; diff --git a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 0e7ae5a7c1c..efdd339faac 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -60,8 +60,8 @@ import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.zookeeper.ZKAssign; import org.apache.hadoop.hbase.zookeeper.ZKTableDisable; import org.apache.hadoop.hbase.zookeeper.ZKUtil; -import org.apache.hadoop.hbase.zookeeper.ZKUtil.NodeAndData; import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener; +import org.apache.hadoop.hbase.zookeeper.ZKUtil.NodeAndData; import org.apache.hadoop.io.Writable; import org.apache.zookeeper.KeeperException; @@ -234,7 +234,7 @@ public class AssignmentManager extends ZooKeeperListener { * yet). * @param data */ - private void handleRegion(RegionTransitionData data) { + private void handleRegion(final RegionTransitionData data) { synchronized(regionsInTransition) { // Verify this is a known server if (!serverManager.isServerOnline(data.getServerName()) && @@ -248,7 +248,7 @@ public class AssignmentManager extends ZooKeeperListener { LOG.debug("Handling transition=" + data.getEventType() + ", server=" + data.getServerName() + ", region=" + prettyPrintedRegionName); RegionState regionState = regionsInTransition.get(encodedName); - switch(data.getEventType()) { + switch (data.getEventType()) { case M_ZK_REGION_OFFLINE: // Nothing to do. break; @@ -259,9 +259,9 @@ public class AssignmentManager extends ZooKeeperListener { if (regionState == null || (!regionState.isPendingClose() && !regionState.isClosing())) { LOG.warn("Received CLOSING for region " + prettyPrintedRegionName + - " from server " + data.getServerName() + " but region was in " + - " the state " + regionState + " and not " + - "in expected PENDING_CLOSE or CLOSING states"); + " from server " + data.getServerName() + " but region was in " + + " the state " + regionState + " and not " + + "in expected PENDING_CLOSE or CLOSING states"); return; } // Transition to CLOSING (or update stamp if already CLOSING) @@ -422,12 +422,21 @@ public class AssignmentManager extends ZooKeeperListener { * @param serverInfo */ public void regionOnline(HRegionInfo regionInfo, HServerInfo serverInfo) { - synchronized(regionsInTransition) { - regionsInTransition.remove(regionInfo.getEncodedName()); - regionsInTransition.notifyAll(); + synchronized (this.regionsInTransition) { + RegionState rs = + this.regionsInTransition.remove(regionInfo.getEncodedName()); + if (rs != null) { + this.regionsInTransition.notifyAll(); + LOG.warn("Asked online a region that was already in " + + "regionsInTransition: " + rs); + } } - synchronized(regions) { - regions.put(regionInfo, serverInfo); + synchronized (this.regions) { + // Add check + HServerInfo hsi = this.regions.get(regionInfo); + if (hsi != null) LOG.warn("Overwriting " + regionInfo.getEncodedName() + + " on " + hsi); + this.regions.put(regionInfo, serverInfo); addToServers(serverInfo, regionInfo); } } @@ -440,15 +449,20 @@ public class AssignmentManager extends ZooKeeperListener { * @param regionInfo * @param serverInfo */ - public void regionOffline(HRegionInfo regionInfo) { - synchronized(regionsInTransition) { - regionsInTransition.remove(regionInfo.getEncodedName()); - regionsInTransition.notifyAll(); + public void regionOffline(final HRegionInfo regionInfo) { + synchronized(this.regionsInTransition) { + if (this.regionsInTransition.remove(regionInfo.getEncodedName()) != null) { + this.regionsInTransition.notifyAll(); + } } - synchronized(regions) { - HServerInfo serverInfo = regions.remove(regionInfo); - List serverRegions = servers.get(serverInfo); - serverRegions.remove(regionInfo); + synchronized(this.regions) { + HServerInfo serverInfo = this.regions.remove(regionInfo); + if (serverInfo != null) { + List serverRegions = this.servers.get(serverInfo); + serverRegions.remove(regionInfo); + } else { + LOG.warn("Asked offline a region that was not online: " + regionInfo); + } } } @@ -580,17 +594,17 @@ public class AssignmentManager extends ZooKeeperListener { region.getRegionNameAsString() + " (offlining)"); // Check if this region is currently assigned if (!regions.containsKey(region)) { - LOG.debug("Attempted to unassign region " + region.getRegionNameAsString() + - " but it is not " + + LOG.debug("Attempted to unassign region " + + region.getRegionNameAsString() + " but it is not " + "currently assigned anywhere"); return; } String encodedName = region.getEncodedName(); // Grab the state of this region and synchronize on it RegionState state; - synchronized(regionsInTransition) { + synchronized (regionsInTransition) { state = regionsInTransition.get(encodedName); - if(state == null) { + if (state == null) { state = new RegionState(region, RegionState.State.PENDING_CLOSE); regionsInTransition.put(encodedName, state); } else { @@ -923,27 +937,29 @@ public class AssignmentManager extends ZooKeeperListener { HRegionInfo regionInfo = regionState.getRegion(); LOG.info("Regions in transition timed out: " + regionState); // Expired! Do a retry. - switch(regionState.getState()) { + switch (regionState.getState()) { case OFFLINE: case CLOSED: LOG.info("Region has been OFFLINE or CLOSED for too long, " + - "reassigning " + regionInfo.getRegionNameAsString()); + "reassigning " + regionInfo.getRegionNameAsString()); assign(regionState.getRegion()); break; case PENDING_OPEN: case OPENING: LOG.info("Region has been PENDING_OPEN or OPENING for too " + - "long, reassigning " + regionInfo.getRegionNameAsString()); + "long, reassigning region=" + + regionInfo.getRegionNameAsString()); assign(regionState.getRegion()); break; case OPEN: LOG.warn("Long-running region in OPEN state? This should " + - "not happen"); + "not happen; region=" + regionInfo.getRegionNameAsString()); break; case PENDING_CLOSE: case CLOSING: LOG.info("Region has been PENDING_CLOSE or CLOSING for too " + - "long, resending close rpc"); + "long, running unassign again on region=" + + regionInfo.getRegionNameAsString()); unassign(regionInfo); break; } @@ -1017,61 +1033,9 @@ public class AssignmentManager extends ZooKeeperListener { */ public void handleSplitReport(final HServerInfo hsi, final HRegionInfo parent, final HRegionInfo a, final HRegionInfo b) { - synchronized (this.regions) { - checkRegion(hsi, parent, true); - checkRegion(hsi, a, false); - checkRegion(hsi, b, false); - this.regions.put(a, hsi); - this.regions.put(b, hsi); - removeFromServers(hsi, parent, true); - removeFromServers(hsi, a, false); - removeFromServers(hsi, b, false); - addToServers(hsi, a); - addToServers(hsi, b); - } - } - - /* - * Caller must hold locks on this.regions Map. - * @param hsi - * @param hri - * @param expected True if we expect hri to be in this.regions. - */ - private void checkRegion(final HServerInfo hsi, final HRegionInfo hri, - final boolean expected) { - HServerInfo serverInfo = regions.remove(hri); - if (expected) { - if (serverInfo == null) { - LOG.info("Region not on a server: " + hri.getRegionNameAsString()); - } - } else { - if (serverInfo != null) { - LOG.warn("Region present on " + hsi + "; unexpected"); - } - } - } - - /* - * Caller must hold locks on servers Map. - * @param hsi - * @param hri - * @param expected - */ - private void removeFromServers(final HServerInfo hsi, final HRegionInfo hri, - final boolean expected) { - List serverRegions = this.servers.get(hsi); - boolean removed = serverRegions.remove(hri); - if (expected) { - if (!removed) { - LOG.warn(hri.getRegionNameAsString() + " not found on " + hsi + - "; unexpected"); - } - } else { - if (removed) { - LOG.warn(hri.getRegionNameAsString() + " found on " + hsi + - "; unexpected"); - } - } + regionOffline(parent); + regionOnline(a, hsi); + regionOnline(b, hsi); } /** diff --git a/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 8a55108b375..6374b5014e9 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -572,9 +572,11 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server { synchronized (this.balancer) { // Only allow one balance run at at time. if (this.assignmentManager.isRegionsInTransition()) { - LOG.debug("Not running balancer because regions in transition: " + + LOG.debug("Not running balancer because " + + this.assignmentManager.getRegionsInTransition().size() + + " region(s) in transition: " + org.apache.commons.lang.StringUtils. - abbreviate(this.assignmentManager.getRegionsInTransition().toString(), 64)); + abbreviate(this.assignmentManager.getRegionsInTransition().toString(), 256)); return false; } if (!this.serverManager.getDeadServers().isEmpty()) { diff --git a/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index dcb573623e4..7445ebd2bbc 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -285,7 +285,7 @@ public class ServerManager { } for (HMsg msg: msgs) { - LOG.info("Received " + msg); + LOG.info("Received " + msg + " from " + serverInfo.getServerName()); switch (msg.getType()) { case REGION_SPLIT: this.services.getAssignmentManager().handleSplitReport(serverInfo, @@ -527,8 +527,9 @@ public class ServerManager { throws NotServingRegionException { HRegionInterface hri = getServerConnection(server); if(hri == null) { - LOG.warn("Attempting to send CLOSE RPC to server " + server.getServerName() - + " failed because no RPC connection found to this server"); + LOG.warn("Attempting to send CLOSE RPC to server " + + server.getServerName() + " failed because no RPC connection found " + + "to this server"); return; } hri.closeRegion(region); diff --git a/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index e9d77514595..e498ed1f467 100644 --- a/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -1924,7 +1924,8 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, // TODO: Need to check if this is being served here but currently undergoing // a split (so master needs to retry close after split is complete) if (!onlineRegions.containsKey(region.getEncodedName())) { - LOG.warn("Received close for region we are not serving"); + LOG.warn("Received close for region we are not serving; " + + region.getEncodedName()); throw new NotServingRegionException("Received close for " + region.getRegionNameAsString() + " but we are not serving it"); }