diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java index 0e41146d3a0..05d31504dd4 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java @@ -320,6 +320,7 @@ public class MetaTableAccessor { * is stored in the name, so the returned object should only be used for the fields * in the regionName. */ + // This should be moved to RegionInfo? TODO. public static RegionInfo parseRegionInfoFromRegionName(byte[] regionName) throws IOException { byte[][] fields = RegionInfo.parseRegionName(regionName); long regionId = Long.parseLong(Bytes.toString(fields[2])); @@ -1308,9 +1309,7 @@ public class MetaTableAccessor { * Generates and returns a Put containing the region into for the catalog table */ public static Put makePutFromRegionInfo(RegionInfo regionInfo, long ts) throws IOException { - Put put = new Put(regionInfo.getRegionName(), ts); - addRegionInfo(put, regionInfo); - return put; + return addRegionInfo(new Put(regionInfo.getRegionName(), ts), regionInfo); } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java index d7e97ec9db6..4b792f9d867 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java @@ -735,7 +735,7 @@ public class CatalogJanitor extends ScheduledChore { continue; } RegionState rs = this.services.getAssignmentManager().getRegionStates().getRegionState(ri); - if (rs.isClosedOrAbnormallyClosed()) { + if (rs == null || rs.isClosedOrAbnormallyClosed()) { // If closed against an 'Unknown Server', that is should be fine. continue; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 880cf8035df..dc48959a230 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -1828,7 +1828,7 @@ public class HMaster extends HRegionServer implements MasterServices { } catch (HBaseIOException hioe) { //should ignore failed plans here, avoiding the whole balance plans be aborted //later calls of balance() can fetch up the failed and skipped plans - LOG.warn("Failed balance plan: {}, just skip it", plan, hioe); + LOG.warn("Failed balance plan {}, skipping...", plan, hioe); } //rpCount records balance plans processed, does not care if a plan succeeds rpCount++; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index ac88c5eeffa..70dc6626137 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -672,6 +672,7 @@ public class ServerManager { /** * Contacts a region server and waits up to timeout ms * to close the region. This bypasses the active hmaster. + * Pass -1 as timeout if you do not want to wait on result. */ public static void closeRegionSilentlyAndWait(ClusterConnection connection, ServerName server, RegionInfo region, long timeout) throws IOException, InterruptedException { @@ -682,6 +683,9 @@ public class ServerManager { } catch (IOException e) { LOG.warn("Exception when closing region: " + region.getRegionNameAsString(), e); } + if (timeout < 0) { + return; + } long expiration = timeout + System.currentTimeMillis(); while (System.currentTimeMillis() < expiration) { controller.reset(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 06aef4a83ec..6d2312ed5d3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -36,6 +36,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.HBaseIOException; import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.PleaseHoldException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; @@ -629,7 +630,8 @@ public class AssignmentManager { ServerName targetServer) throws HBaseIOException { RegionStateNode regionNode = this.regionStates.getRegionStateNode(regionInfo); if (regionNode == null) { - throw new UnknownRegionException("No RegionState found for " + regionInfo.getEncodedName()); + throw new UnknownRegionException("No RegionStateNode found for " + + regionInfo.getEncodedName() + "(Closed/Deleted?)"); } TransitRegionStateProcedure proc; regionNode.lock(); @@ -944,7 +946,7 @@ public class AssignmentManager { if (regionNode == null) { // the table/region is gone. maybe a delete, split, merge throw new UnexpectedStateException(String.format( - "Server %s was trying to transition region %s to %s. but the region was removed.", + "Server %s was trying to transition region %s to %s. but Region is not known.", serverName, regionInfo, state)); } LOG.trace("Update region transition serverName={} region={} regionState={}", serverName, @@ -966,7 +968,8 @@ public class AssignmentManager { state.equals(TransitionCode.CLOSED)) { LOG.info("RegionServer {} {}", state, regionNode.getRegionInfo().getEncodedName()); } else { - LOG.warn("No matching procedure found for {} transition to {}", regionNode, state); + LOG.warn("No matching procedure found for {} transition on {} to {}", + serverName, regionNode, state); } } } finally { @@ -1092,7 +1095,27 @@ public class AssignmentManager { checkOnlineRegionsReport(serverNode, regionNames); } - // just check and output possible inconsistency, without actually doing anything + /** + * Close regionName on sn silently and immediately without + * using a Procedure or going via hbase:meta. For case where a RegionServer's hosting + * of a Region is not aligned w/ the Master's accounting of Region state. This is for + * cleaning up an error in accounting. + */ + private void closeRegionSilently(ServerName sn, byte [] regionName) { + try { + RegionInfo ri = MetaTableAccessor.parseRegionInfoFromRegionName(regionName); + // Pass -1 for timeout. Means do not wait. + ServerManager.closeRegionSilentlyAndWait(this.master.getClusterConnection(), sn, ri, -1); + } catch (Exception e) { + LOG.error("Failed trying to close {} on {}", Bytes.toStringBinary(regionName), sn, e); + } + } + + /** + * Check that what the RegionServer reports aligns with the Master's image. + * If disagreement, we will tell the RegionServer to expediently close + * a Region we do not think it should have. + */ private void checkOnlineRegionsReport(ServerStateNode serverNode, Set regionNames) { ServerName serverName = serverNode.getServerName(); for (byte[] regionName : regionNames) { @@ -1101,10 +1124,13 @@ public class AssignmentManager { } RegionStateNode regionNode = regionStates.getRegionStateNodeFromName(regionName); if (regionNode == null) { - LOG.warn("No region state node for {}, it should already be on {}", - Bytes.toStringBinary(regionName), serverName); + String regionNameAsStr = Bytes.toStringBinary(regionName); + LOG.warn("No RegionStateNode for {} but reported as up on {}; closing...", + regionNameAsStr, serverName); + closeRegionSilently(serverNode.getServerName(), regionName); continue; } + final long lag = 1000; regionNode.lock(); try { long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate(); @@ -1112,17 +1138,19 @@ public class AssignmentManager { // This is possible as a region server has just closed a region but the region server // report is generated before the closing, but arrive after the closing. Make sure there // is some elapsed time so less false alarms. - if (!regionNode.getRegionLocation().equals(serverName) && diff > 1000) { - LOG.warn("{} reported OPEN on server={} but state has otherwise", regionNode, - serverName); + if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) { + LOG.warn("Reporting {} server does not match {} (time since last " + + "update={}ms); closing...", + serverName, regionNode, diff); + closeRegionSilently(serverNode.getServerName(), regionName); } } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) { // So, we can get report that a region is CLOSED or SPLIT because a heartbeat // came in at about same time as a region transition. Make sure there is some // elapsed time so less false alarms. - if (diff > 1000) { - LOG.warn("{} reported an unexpected OPEN on {}; time since last update={}ms", - regionNode, serverName, diff); + if (diff > lag) { + LOG.warn("Reporting {} state does not match {} (time since last update={}ms)", + serverName, regionNode, diff); } } } finally { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java index 04f42ee9aae..ba731a657e9 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java @@ -278,7 +278,7 @@ public class RegionStateNode implements Comparable { public String toShortString() { // rit= is the current Region-In-Transition State -- see State enum. - return String.format("rit=%s, location=%s", getState(), getRegionLocation()); + return String.format("state=%s, location=%s", getState(), getRegionLocation()); } public String toDescriptiveString() { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java index d3874a6e241..8c393650cfc 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java @@ -96,6 +96,9 @@ public class RegionStates { public RegionStates() { } + /** + * Called on stop of AssignmentManager. + */ public void clear() { regionsMap.clear(); regionInTransition.clear(); @@ -741,12 +744,15 @@ public class RegionStates { return node; } + /** + * Called by SCP at end of successful processing. + */ public void removeServer(final ServerName serverName) { serverMap.remove(serverName); } /** - * @return Pertinent ServerStateNode or NULL if none found. + * @return Pertinent ServerStateNode or NULL if none found (Do not make modifications). */ @VisibleForTesting public ServerStateNode getServerNode(final ServerName serverName) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java index 3cb41eb04f8..f63de8bb1fd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -248,7 +248,8 @@ public class TransitRegionStateProcedure if (retries > env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) { // Throw exception to backoff and retry when failed open too many times - throw new HBaseIOException("Failed to open region"); + throw new HBaseIOException("Failed confirm OPEN of " + regionNode + + " (remote log may yield more detail on why)."); } else { // Here we do not throw exception because we want to the region to be online ASAP return Flow.HAS_MORE_STATE; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index d8a05381548..c97d18ffdad 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -3171,7 +3171,7 @@ public class HRegionServer extends HasThread implements /** * Close asynchronously a region, can be called from the master or internally by the regionserver - * when stopping. If called from the master, the region will update the znode status. + * when stopping. If called from the master, the region will update the status. * *

* If an opening was in progress, this method will cancel it, but will not start a new close. The @@ -3201,6 +3201,7 @@ public class HRegionServer extends HasThread implements } } + // previous can come back 'null' if not in map. final Boolean previous = this.regionsInTransitionInRS.putIfAbsent(Bytes.toBytes(encodedName), Boolean.FALSE); @@ -3222,6 +3223,8 @@ public class HRegionServer extends HasThread implements throw new NotServingRegionException("The region " + encodedName + " was opening but not yet served. Opening is cancelled."); } + } else if (previous == null) { + LOG.info("Received CLOSE for {}", encodedName); } else if (Boolean.FALSE.equals(previous)) { LOG.info("Received CLOSE for the region: " + encodedName + ", which we are already trying to CLOSE, but not completed yet");