HBASE-25973 Balancer should explain progress in a better way in log - backport branch-2 (#3485)

Signed-off-by: stack <stack@apache.org>
This commit is contained in:
clarax 2021-07-16 15:20:14 -07:00 committed by GitHub
parent dd2ae3605d
commit 6ab6d6f231
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 79 additions and 68 deletions

View File

@ -1858,6 +1858,8 @@ public class HMaster extends HRegionServer implements MasterServices {
} }
} }
} }
LOG.info("Balancer is going into sleep until next period in {}ms", getConfiguration()
.getInt(HConstants.HBASE_BALANCER_PERIOD, HConstants.DEFAULT_HBASE_BALANCER_PERIOD));
return successRegionPlans; return successRegionPlans;
} }

View File

@ -74,6 +74,8 @@ import org.apache.hbase.thirdparty.com.google.common.collect.Sets;
* *
*/ */
@InterfaceAudience.Private @InterfaceAudience.Private
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="IS2_INCONSISTENT_SYNC",
justification="Complaint is about isByTable not being synchronized; we don't modify often")
public abstract class BaseLoadBalancer implements LoadBalancer { public abstract class BaseLoadBalancer implements LoadBalancer {
public static final String BALANCER_DECISION_BUFFER_ENABLED = public static final String BALANCER_DECISION_BUFFER_ENABLED =

View File

@ -149,7 +149,8 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
private List<CandidateGenerator> candidateGenerators; private List<CandidateGenerator> candidateGenerators;
private List<CostFunction> costFunctions; // FindBugs: Wants this protected; IS2_INCONSISTENT_SYNC private List<CostFunction> costFunctions; // FindBugs: Wants this protected; IS2_INCONSISTENT_SYNC
// Saves the currently configured sum of multipliers. Defaults to 1 for cases that carry high cost
private float sumMultiplier = 1.0f;
// to save and report costs to JMX // to save and report costs to JMX
private double curOverallCost = 0d; private double curOverallCost = 0d;
private double[] tempFunctionCosts; private double[] tempFunctionCosts;
@ -206,7 +207,6 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
} }
regionReplicaHostCostFunction = new RegionReplicaHostCostFunction(conf); regionReplicaHostCostFunction = new RegionReplicaHostCostFunction(conf);
regionReplicaRackCostFunction = new RegionReplicaRackCostFunction(conf); regionReplicaRackCostFunction = new RegionReplicaRackCostFunction(conf);
costFunctions = new ArrayList<>(); costFunctions = new ArrayList<>();
addCostFunction(new RegionCountSkewCostFunction(conf)); addCostFunction(new RegionCountSkewCostFunction(conf));
addCostFunction(new PrimaryRegionCountSkewCostFunction(conf)); addCostFunction(new PrimaryRegionCountSkewCostFunction(conf));
@ -327,63 +327,65 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
protected boolean needsBalance(TableName tableName, Cluster cluster) { protected boolean needsBalance(TableName tableName, Cluster cluster) {
ClusterLoadState cs = new ClusterLoadState(cluster.clusterState); ClusterLoadState cs = new ClusterLoadState(cluster.clusterState);
if (cs.getNumServers() < MIN_SERVER_BALANCE) { if (cs.getNumServers() < MIN_SERVER_BALANCE) {
if (LOG.isDebugEnabled()) { LOG.info("Not running balancer because only " + cs.getNumServers() +
LOG.debug("Not running balancer because only " + cs.getNumServers() " active regionserver(s)");
+ " active regionserver(s)"); sendRejectionReasonToRingBuffer("The number of RegionServers " + cs.getNumServers() +
} " < MIN_SERVER_BALANCE(" + MIN_SERVER_BALANCE + ")", null);
if (this.isBalancerRejectionRecording) {
sendRejectionReasonToRingBuffer("The number of RegionServers " +
cs.getNumServers() + " < MIN_SERVER_BALANCE(" + MIN_SERVER_BALANCE + ")", null);
}
return false; return false;
} }
if (areSomeRegionReplicasColocated(cluster)) { if (areSomeRegionReplicasColocated(cluster)) {
LOG.info("Running balancer because at least one server hosts replicas of the same region.");
return true; return true;
} }
if (idleRegionServerExist(cluster)){ if (idleRegionServerExist(cluster)){
LOG.info("Running balancer because cluster has idle server(s).");
return true; return true;
} }
sumMultiplier = 0.0f;
double total = 0.0; double total = 0.0;
float sumMultiplier = 0.0f;
for (CostFunction c : costFunctions) { for (CostFunction c : costFunctions) {
float multiplier = c.getMultiplier(); float multiplier = c.getMultiplier();
if (multiplier <= 0) { double cost = c.cost();
LOG.trace("{} not needed because multiplier is <= 0", c.getClass().getSimpleName());
continue;
}
if (!c.isNeeded()) { if (!c.isNeeded()) {
LOG.trace("{} not needed", c.getClass().getSimpleName()); LOG.trace("{} not needed", c.getClass().getSimpleName());
continue; continue;
} }
total += cost * multiplier;
sumMultiplier += multiplier; sumMultiplier += multiplier;
total += c.cost() * multiplier; }
if (sumMultiplier <= 0) {
LOG.error("At least one cost function needs a multiplier > 0. For example, set "
+ "hbase.master.balancer.stochastic.regionCountCost to a positive value or default");
return false;
} }
boolean balanced = total <= 0 || sumMultiplier <= 0 || boolean balanced = (total / sumMultiplier < minCostNeedBalance);
(sumMultiplier > 0 && (total / sumMultiplier) < minCostNeedBalance); if (balanced) {
if(balanced && isBalancerRejectionRecording){ if (isBalancerRejectionRecording) {
String reason = ""; String reason = "";
if (total <= 0) { if (total <= 0) {
reason = "(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern) = " + total + " <= 0"; reason =
} else if (sumMultiplier <= 0) { "(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern) = " + total + " <= 0";
reason = "sumMultiplier = " + sumMultiplier + " <= 0"; } else if (sumMultiplier <= 0) {
} else if ((total / sumMultiplier) < minCostNeedBalance) { reason = "sumMultiplier = " + sumMultiplier + " <= 0";
reason = } else if ((total / sumMultiplier) < minCostNeedBalance) {
"[(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern)]/sumMultiplier = " + (total reason =
/ sumMultiplier) + " <= minCostNeedBalance(" + minCostNeedBalance + ")"; "[(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern)]/sumMultiplier = " + (
} total / sumMultiplier) + " <= minCostNeedBalance(" + minCostNeedBalance + ")";
sendRejectionReasonToRingBuffer(reason, costFunctions); }
} sendRejectionReasonToRingBuffer(reason, costFunctions);
if (LOG.isDebugEnabled()) {
LOG.debug("{} {}; total cost={}, sum multiplier={}; cost/multiplier to need a balance is {}",
balanced ? "Skipping load balancing because balanced" : "We need to load balance",
isByTable ? String.format("table (%s)", tableName) : "cluster",
total, sumMultiplier, minCostNeedBalance);
if (LOG.isTraceEnabled()) {
LOG.trace("Balance decision detailed function costs={}", functionCost());
} }
LOG.info("{} - skipping load balancing because weighted average imbalance={} <= " +
"threshold({}). If you want more aggressive balancing, either lower "
+ "hbase.master.balancer.stochastic.minCostNeedBalance from {} or increase the relative"
+ " multiplier(s) of the specific cost function(s). functionCost={}",
isByTable ? "Table specific (" + tableName + ")" : "Cluster wide", total / sumMultiplier,
minCostNeedBalance, minCostNeedBalance, functionCost());
} else {
LOG.info("{} - Calculating plan. may take up to {}ms to complete.",
isByTable ? "Table specific (" + tableName + ")" : "Cluster wide", maxRunningTime);
} }
return !balanced; return !balanced;
} }
@ -419,8 +421,8 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
// Allow turning this feature off if the locality cost is not going to // Allow turning this feature off if the locality cost is not going to
// be used in any computations. // be used in any computations.
RegionLocationFinder finder = null; RegionLocationFinder finder = null;
if ((this.localityCost != null && this.localityCost.getMultiplier() > 0) if ((this.localityCost != null && this.localityCost.getMultiplier() > 0) || (
|| (this.rackLocalityCost != null && this.rackLocalityCost.getMultiplier() > 0)) { this.rackLocalityCost != null && this.rackLocalityCost.getMultiplier() > 0)) {
finder = this.regionFinder; finder = this.regionFinder;
} }
@ -446,21 +448,22 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
long computedMaxSteps; long computedMaxSteps;
if (runMaxSteps) { if (runMaxSteps) {
computedMaxSteps = Math.max(this.maxSteps, computedMaxSteps = Math.max(this.maxSteps,
((long)cluster.numRegions * (long)this.stepsPerRegion * (long)cluster.numServers)); ((long) cluster.numRegions * (long) this.stepsPerRegion * (long) cluster.numServers));
} else { } else {
long calculatedMaxSteps = (long)cluster.numRegions * (long)this.stepsPerRegion * long calculatedMaxSteps =
(long)cluster.numServers; (long) cluster.numRegions * (long) this.stepsPerRegion * (long) cluster.numServers;
computedMaxSteps = Math.min(this.maxSteps, calculatedMaxSteps); computedMaxSteps = Math.min(this.maxSteps, calculatedMaxSteps);
if (calculatedMaxSteps > maxSteps) { if (calculatedMaxSteps > maxSteps) {
LOG.warn("calculatedMaxSteps:{} for loadbalancer's stochastic walk is larger than " LOG.warn("calculatedMaxSteps:{} for loadbalancer's stochastic walk is larger than " +
+ "maxSteps:{}. Hence load balancing may not work well. Setting parameter " "maxSteps:{}. Hence load balancing may not work well. Setting parameter " +
+ "\"hbase.master.balancer.stochastic.runMaxSteps\" to true can overcome this issue." "\"hbase.master.balancer.stochastic.runMaxSteps\" to true can overcome this issue." +
+ "(This config change does not require service restart)", calculatedMaxSteps, "(This config change does not require service restart)", calculatedMaxSteps,
maxSteps); maxSteps);
} }
} }
LOG.info("start StochasticLoadBalancer.balancer, initCost=" + currentCost + ", functionCost=" LOG.info("Start StochasticLoadBalancer.balancer, initial weighted average imbalance={}," +
+ functionCost() + " computedMaxSteps: " + computedMaxSteps); " functionCost={} computedMaxSteps={}",
currentCost / sumMultiplier, functionCost(), computedMaxSteps);
final String initFunctionTotalCosts = totalCostsPerFunc(); final String initFunctionTotalCosts = totalCostsPerFunc();
// Perform a stochastic walk to see if we can get a good fit. // Perform a stochastic walk to see if we can get a good fit.
@ -493,8 +496,7 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
updateCostsWithAction(cluster, undoAction); updateCostsWithAction(cluster, undoAction);
} }
if (EnvironmentEdgeManager.currentTime() - startTime > if (EnvironmentEdgeManager.currentTime() - startTime > maxRunningTime) {
maxRunningTime) {
break; break;
} }
} }
@ -506,17 +508,18 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
updateStochasticCosts(tableName, curOverallCost, curFunctionCosts); updateStochasticCosts(tableName, curOverallCost, curFunctionCosts);
if (initCost > currentCost) { if (initCost > currentCost) {
plans = createRegionPlans(cluster); plans = createRegionPlans(cluster);
LOG.info("Finished computing new load balance plan. Computation took {}" + LOG.info("Finished computing new moving plan. Computation took {} ms" +
" to try {} different iterations. Found a solution that moves " + " to try {} different iterations. Found a solution that moves " +
"{} regions; Going from a computed cost of {}" + "{} regions; Going from a computed imbalance of {}" + " to a new imbalance of {}. ",
" to a new cost of {}", java.time.Duration.ofMillis(endTime - startTime), endTime - startTime, step, plans.size(), initCost / sumMultiplier,
step, plans.size(), initCost, currentCost); currentCost / sumMultiplier);
sendRegionPlansToRingBuffer(plans, currentCost, initCost, initFunctionTotalCosts, step); sendRegionPlansToRingBuffer(plans, currentCost, initCost, initFunctionTotalCosts, step);
return plans; return plans;
} }
LOG.info("Could not find a better load balance plan. Tried {} different configurations in " + LOG.info("Could not find a better moving plan. Tried {} different configurations in "
"{}, and did not find anything with a computed cost less than {}", step, + "{} ms, and did not find anything with an imbalance score less than {}", step,
java.time.Duration.ofMillis(endTime - startTime), initCost); endTime - startTime, initCost / sumMultiplier);
return null; return null;
} }
@ -527,8 +530,7 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
.setReason(reason); .setReason(reason);
if (costFunctions != null) { if (costFunctions != null) {
for (CostFunction c : costFunctions) { for (CostFunction c : costFunctions) {
float multiplier = c.getMultiplier(); if (!c.isNeeded()) {
if (multiplier <= 0 || !c.isNeeded()) {
continue; continue;
} }
builder.addCostFuncInfo(c.getClass().getName(), c.cost(), c.getMultiplier()); builder.addCostFuncInfo(c.getClass().getName(), c.cost(), c.getMultiplier());
@ -587,7 +589,8 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
} }
private void addCostFunction(CostFunction costFunction) { private void addCostFunction(CostFunction costFunction) {
if (costFunction.getMultiplier() > 0) { float multiplier = costFunction.getMultiplier();
if (multiplier > 0) {
costFunctions.add(costFunction); costFunctions.add(costFunction);
} }
} }
@ -598,9 +601,13 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
builder.append(c.getClass().getSimpleName()); builder.append(c.getClass().getSimpleName());
builder.append(" : ("); builder.append(" : (");
if (c.isNeeded()) { if (c.isNeeded()) {
builder.append(c.getMultiplier()); builder.append("multiplier=" + c.getMultiplier());
builder.append(", "); builder.append(", ");
builder.append(c.cost()); double cost = c.cost();
builder.append("imbalance=" + cost);
if (cost < minCostNeedBalance) {
builder.append(", balanced");
}
} else { } else {
builder.append("not needed"); builder.append("not needed");
} }
@ -612,7 +619,7 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
private String totalCostsPerFunc() { private String totalCostsPerFunc() {
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
for (CostFunction c : costFunctions) { for (CostFunction c : costFunctions) {
if (c.getMultiplier() <= 0 || !c.isNeeded()) { if (!c.isNeeded()) {
continue; continue;
} }
double cost = c.getMultiplier() * c.cost(); double cost = c.getMultiplier() * c.cost();
@ -696,7 +703,7 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
allowedOnPath = ".*(/src/test/.*|StochasticLoadBalancer).java") allowedOnPath = ".*(/src/test/.*|StochasticLoadBalancer).java")
void updateCostsWithAction(Cluster cluster, Action action) { void updateCostsWithAction(Cluster cluster, Action action) {
for (CostFunction c : costFunctions) { for (CostFunction c : costFunctions) {
if (c.getMultiplier() > 0 && c.isNeeded()) { if (c.isNeeded()) {
c.postAction(action); c.postAction(action);
} }
} }
@ -735,7 +742,7 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
CostFunction c = costFunctions.get(i); CostFunction c = costFunctions.get(i);
this.tempFunctionCosts[i] = 0.0; this.tempFunctionCosts[i] = 0.0;
if (c.getMultiplier() <= 0 || !c.isNeeded()) { if (!c.isNeeded()) {
continue; continue;
} }