diff --git a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon index e2ae09d0a18..06b320faa07 100644 --- a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon +++ b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon @@ -25,6 +25,8 @@ org.apache.hadoop.hbase.HBaseConfiguration; org.apache.hadoop.hbase.HConstants; java.util.HashSet; java.util.SortedSet; +java.util.Map; +java.util.concurrent.atomic.AtomicInteger; <%args> AssignmentManager assignmentManager; @@ -32,7 +34,9 @@ int limit = 100; <%java SortedSet rit = assignmentManager - .getRegionStates().getRegionsInTransitionOrderedByTimestamp(); %> + .getRegionStates().getRegionsInTransitionOrderedByTimestamp(); + Map failedRegionTracker = assignmentManager.getFailedOpenTracker(); + %> <%if !rit.isEmpty() %> <%java> @@ -83,7 +87,7 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage);
- + <%if ritsOverThreshold.contains(rs.getRegion().getEncodedName()) %> @@ -93,9 +97,21 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage); <%else> + <%java> + String retryStatus = "0"; + String name = rs.getRegion().getEncodedName(); + RegionState state = assignmentManager.getState(name); + AtomicInteger numOpenRetries = failedRegionTracker.get(name); + if (numOpenRetries != null ) { + retryStatus = Integer.toString(numOpenRetries.get()); + } else if (state.getState() == RegionState.State.FAILED_OPEN) { + retryStatus = "Failed"; + } + + <%java recordItr++; %> <%if (recordItr % ritsPerPage) == 0 %> diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 2ffe466851c..ffdbac87582 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -36,6 +36,7 @@ import java.util.TreeMap; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; @@ -87,6 +88,7 @@ import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.KeyLocker; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.util.PairOfSameType; +import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.hadoop.hbase.zookeeper.MetaTableLocator; @@ -149,8 +151,8 @@ public class AssignmentManager { private final ExecutorService executorService; - // Thread pool executor service. TODO, consolidate with executorService? private java.util.concurrent.ExecutorService threadPoolExecutorService; + private ScheduledThreadPoolExecutor scheduledThreadPoolExecutor; private final RegionStates regionStates; @@ -202,6 +204,8 @@ public class AssignmentManager { private RegionStateListener regionStateListener; + private RetryCounter.BackoffPolicy backoffPolicy; + private RetryCounter.RetryConfig retryConfig; /** * Constructs a new assignment manager. * @@ -240,8 +244,13 @@ public class AssignmentManager { "hbase.meta.assignment.retry.sleeptime", 1000l); this.balancer = balancer; int maxThreads = conf.getInt("hbase.assignment.threads.max", 30); + this.threadPoolExecutorService = Threads.getBoundedCachedThreadPool( - maxThreads, 60L, TimeUnit.SECONDS, Threads.newDaemonThreadFactory("AM.")); + maxThreads, 60L, TimeUnit.SECONDS, Threads.newDaemonThreadFactory("AM.")); + + this.scheduledThreadPoolExecutor = new ScheduledThreadPoolExecutor(1, + Threads.newDaemonThreadFactory("AM.Scheduler")); + this.regionStates = new RegionStates( server, tableStateManager, serverManager, regionStateStore); @@ -254,6 +263,23 @@ public class AssignmentManager { this.metricsAssignmentManager = new MetricsAssignmentManager(); this.tableLockManager = tableLockManager; + + // Configurations for retrying opening a region on receiving a FAILED_OPEN + this.retryConfig = new RetryCounter.RetryConfig(); + this.retryConfig.setSleepInterval(conf.getLong("hbase.assignment.retry.sleep.initial", 0l)); + // Set the max time limit to the initial sleep interval so we use a constant time sleep strategy + // if the user does not set a max sleep limit + this.retryConfig.setMaxSleepTime(conf.getLong("hbase.assignment.retry.sleep.max", + retryConfig.getSleepInterval())); + this.backoffPolicy = getBackoffPolicy(); + } + + /** + * Returns the backoff policy used for Failed Region Open retries + * @return the backoff policy used for Failed Region Open retries + */ + RetryCounter.BackoffPolicy getBackoffPolicy() { + return new RetryCounter.ExponentialBackoffPolicyWithLimit(); } MetricsAssignmentManager getAssignmentManagerMetrics() { @@ -2028,6 +2054,11 @@ public class AssignmentManager { threadPoolExecutorService.submit(new AssignCallable(this, regionInfo)); } + void invokeAssignLater(HRegionInfo regionInfo, long sleepMillis) { + scheduledThreadPoolExecutor.schedule(new DelayedAssignCallable( + new AssignCallable(this, regionInfo)), sleepMillis, TimeUnit.MILLISECONDS); + } + void invokeUnAssign(HRegionInfo regionInfo) { threadPoolExecutorService.submit(new UnAssignCallable(this, regionInfo)); } @@ -2233,7 +2264,10 @@ public class AssignmentManager { } catch (HBaseIOException e) { LOG.warn("Failed to get region plan", e); } - invokeAssign(hri); + // Have the current thread sleep a bit before resubmitting the RPC request + long sleepTime = backoffPolicy.getBackoffTime(retryConfig, + failedOpenTracker.get(encodedName).get()); + invokeAssignLater(hri, sleepTime); } } // Null means no error @@ -2732,6 +2766,10 @@ public class AssignmentManager { return replicasToClose; } + public Map getFailedOpenTracker() {return failedOpenTracker;} + + public RegionState getState(String encodedName) {return regionStates.getRegionState(encodedName);} + /** * A region is offline. The new state should be the specified one, * if not null. If the specified state is null, the new state is Offline. @@ -2934,4 +2972,16 @@ public class AssignmentManager { void setRegionStateListener(RegionStateListener listener) { this.regionStateListener = listener; } + + private class DelayedAssignCallable implements Runnable { + Callable callable; + public DelayedAssignCallable(Callable callable) { + this.callable = callable; + } + + @Override + public void run() { + threadPoolExecutorService.submit(callable); + } + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java index b95b894c309..ba08a059817 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java @@ -67,10 +67,16 @@ public class RegionStates { public final static RegionStateStampComparator REGION_STATE_COMPARATOR = new RegionStateStampComparator(); + + // This comparator sorts the RegionStates by time stamp then Region name. + // Comparing by timestamp alone can lead us to discard different RegionStates that happen + // to share a timestamp. private static class RegionStateStampComparator implements Comparator { @Override public int compare(RegionState l, RegionState r) { - return Long.compare(l.getStamp(), r.getStamp()); + return Long.compare(l.getStamp(), r.getStamp()) == 0 ? + Bytes.compareTo(l.getRegion().getRegionName(), r.getRegion().getRegionName()) : + Long.compare(l.getStamp(), r.getStamp()); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterStatusServlet.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterStatusServlet.java index 02f01c448a5..f975d9d4d0a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterStatusServlet.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterStatusServlet.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master; import static org.junit.Assert.*; +import static org.mockito.Matchers.any; import java.io.IOException; import java.io.StringWriter; @@ -26,8 +27,6 @@ import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.TreeSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; @@ -178,6 +177,11 @@ public class TestMasterStatusServlet { Mockito.doReturn(rs).when(am).getRegionStates(); Mockito.doReturn(regionsInTransition).when(rs).getRegionsInTransition(); Mockito.doReturn(regionsInTransition).when(rs).getRegionsInTransitionOrderedByTimestamp(); + Mockito.when( + am.getState( + any(String.class) + ) + ).thenReturn(new RegionState(null, null)); // Render to a string StringWriter sw = new StringWriter();
RegionStateRIT time (ms)
StateRIT time (ms) Retries
<% rs.getRegion().getEncodedName() %> <% HRegionInfo.getDescriptiveNameFromRegionStateForDisplay(rs, conf) %> <% (currentTime - rs.getStamp()) %> <% retryStatus %>