HBASE-23596 HBCKServerCrashProcedure can double assign
Signed-off-by: Duo Zhang <zhangduo@apache.org> Signed-off-by: Lijin Bin <binlijin@apache.org> Signed-off-by: Viraj Jasani <vjasani@apache.org> Change its behavior so it will only look in hbase:meta if the call to the super class turns up zero references. Only then will it search hbase:meta for references to 'Unknown Servers'. Normal operation where we read Master context is usual and sufficient. The scan of hbase:meta is only for case where Master state has been corrupted and we need to clear out 'Unknown Servers'.
This commit is contained in:
parent
280b94434c
commit
db7fb063d1
|
@ -1389,7 +1389,7 @@ public class MetaTableAccessor {
|
|||
}
|
||||
}
|
||||
|
||||
private static void addRegionStateToPut(Put put, RegionState.State state) throws IOException {
|
||||
private static Put addRegionStateToPut(Put put, RegionState.State state) throws IOException {
|
||||
put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
|
||||
.setRow(put.getRow())
|
||||
.setFamily(HConstants.CATALOG_FAMILY)
|
||||
|
@ -1398,6 +1398,17 @@ public class MetaTableAccessor {
|
|||
.setType(Cell.Type.Put)
|
||||
.setValue(Bytes.toBytes(state.name()))
|
||||
.build());
|
||||
return put;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update state column in hbase:meta.
|
||||
*/
|
||||
public static void updateRegionState(Connection connection, RegionInfo ri,
|
||||
RegionState.State state) throws IOException {
|
||||
Put put = new Put(RegionReplicaUtil.getRegionInfoForDefaultReplica(ri).getRegionName());
|
||||
MetaTableAccessor.putsToMetaTable(connection,
|
||||
Collections.singletonList(addRegionStateToPut(put, state)));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -51,7 +51,6 @@ import org.apache.zookeeper.KeeperException;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
|
||||
|
||||
/**
|
||||
|
@ -132,7 +131,7 @@ public class RegionStateStore {
|
|||
if (regionInfo == null) continue;
|
||||
|
||||
final int replicaId = regionInfo.getReplicaId();
|
||||
final State state = getRegionState(result, replicaId, regionInfo);
|
||||
final State state = getRegionState(result, regionInfo);
|
||||
|
||||
final ServerName lastHost = hrl.getServerName();
|
||||
final ServerName regionLocation = getRegionServer(result, replicaId);
|
||||
|
@ -343,12 +342,11 @@ public class RegionStateStore {
|
|||
|
||||
/**
|
||||
* Pull the region state from a catalog table {@link Result}.
|
||||
* @param r Result to pull the region state from
|
||||
* @return the region state, or null if unknown.
|
||||
*/
|
||||
@VisibleForTesting
|
||||
public static State getRegionState(final Result r, int replicaId, RegionInfo regionInfo) {
|
||||
Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY, getStateColumn(replicaId));
|
||||
public static State getRegionState(final Result r, RegionInfo regionInfo) {
|
||||
Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY,
|
||||
getStateColumn(regionInfo.getReplicaId()));
|
||||
if (cell == null || cell.getValueLength() == 0) {
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -20,21 +20,31 @@ package org.apache.hadoop.hbase.master.procedure;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.hadoop.hbase.HRegionLocation;
|
||||
import org.apache.hadoop.hbase.MetaTableAccessor;
|
||||
import org.apache.hadoop.hbase.RegionLocations;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.client.Connection;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.hadoop.hbase.client.Result;
|
||||
import org.apache.hadoop.hbase.master.RegionState;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* A SCP that differs from default only in how it gets the list of
|
||||
* Regions hosted on the crashed-server; it also reads hbase:meta directly rather
|
||||
* than rely solely on Master memory for list of Regions that were on crashed server.
|
||||
* This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's
|
||||
* scheduleRecoveries). It is for the case where meta has references to 'Unknown Servers',
|
||||
* Acts like the super class in all cases except when no Regions found in the
|
||||
* current Master in-memory context. In this latter case, when the call to
|
||||
* super#getRegionsOnCrashedServer returns nothing, this SCP will scan
|
||||
* hbase:meta for references to the passed ServerName. If any found, we'll
|
||||
* clean them up.
|
||||
*
|
||||
* <p>This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's
|
||||
* scheduleRecoveries); the super class is used during normal recovery operations.
|
||||
* It is for the case where meta has references to 'Unknown Servers',
|
||||
* servers that are in hbase:meta but not in live-server or dead-server lists; i.e. Master
|
||||
* and hbase:meta content have deviated. It should never happen in normal running
|
||||
* cluster but if we do drop accounting of servers, we need a means of fix-up.
|
||||
|
@ -65,31 +75,97 @@ public class HBCKServerCrashProcedure extends ServerCrashProcedure {
|
|||
public HBCKServerCrashProcedure() {}
|
||||
|
||||
/**
|
||||
* Adds Regions found by super method any found scanning hbase:meta.
|
||||
* If no Regions found in Master context, then we will search hbase:meta for references
|
||||
* to the passed server. Operator may have passed ServerName because they have found
|
||||
* references to 'Unknown Servers'. They are using HBCKSCP to clear them out.
|
||||
*/
|
||||
@Override
|
||||
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH_EXCEPTION",
|
||||
justification="FindBugs seems confused on ps in below.")
|
||||
List<RegionInfo> getRegionsOnCrashedServer(MasterProcedureEnv env) {
|
||||
// Super can return immutable emptyList.
|
||||
// Super will return an immutable list (empty if nothing on this server).
|
||||
List<RegionInfo> ris = super.getRegionsOnCrashedServer(env);
|
||||
List<Pair<RegionInfo, ServerName>> ps = null;
|
||||
try {
|
||||
ps = MetaTableAccessor.getTableRegionsAndLocations(env.getMasterServices().getConnection(),
|
||||
null, false);
|
||||
} catch (IOException ioe) {
|
||||
LOG.warn("Failed get of all regions; continuing", ioe);
|
||||
}
|
||||
if (ps == null || ps.isEmpty()) {
|
||||
LOG.warn("No regions found in hbase:meta");
|
||||
if (!ris.isEmpty()) {
|
||||
return ris;
|
||||
}
|
||||
List<RegionInfo> aggregate = ris == null || ris.isEmpty()?
|
||||
new ArrayList<>(): new ArrayList<>(ris);
|
||||
int before = aggregate.size();
|
||||
ps.stream().filter(p -> p.getSecond() != null && p.getSecond().equals(getServerName())).
|
||||
forEach(p -> aggregate.add(p.getFirst()));
|
||||
LOG.info("Found {} mentions of {} in hbase:meta", aggregate.size() - before, getServerName());
|
||||
return aggregate;
|
||||
// Nothing in in-master context. Check for Unknown Server! in hbase:meta.
|
||||
// If super list is empty, then allow that an operator scheduled an SCP because they are trying
|
||||
// to purge 'Unknown Servers' -- servers that are neither online nor in dead servers
|
||||
// list but that ARE in hbase:meta and so showing as unknown in places like 'HBCK Report'.
|
||||
// This mis-accounting does not happen in normal circumstance but may arise in-extremis
|
||||
// when cluster has been damaged in operation.
|
||||
UnknownServerVisitor visitor =
|
||||
new UnknownServerVisitor(env.getMasterServices().getConnection(), getServerName());
|
||||
try {
|
||||
MetaTableAccessor.scanMetaForTableRegions(env.getMasterServices().getConnection(),
|
||||
visitor, null);
|
||||
} catch (IOException ioe) {
|
||||
LOG.warn("Failed scan of hbase:meta for 'Unknown Servers'", ioe);
|
||||
return ris;
|
||||
}
|
||||
LOG.info("Found {} mentions of {} in hbase:meta of OPEN/OPENING Regions: {}",
|
||||
visitor.getReassigns().size(), getServerName(),
|
||||
visitor.getReassigns().stream().map(RegionInfo::getEncodedName).
|
||||
collect(Collectors.joining(",")));
|
||||
return visitor.getReassigns();
|
||||
}
|
||||
|
||||
/**
|
||||
* Visitor for hbase:meta that 'fixes' Unknown Server issues. Collects
|
||||
* a List of Regions to reassign as 'result'.
|
||||
*/
|
||||
private static class UnknownServerVisitor implements MetaTableAccessor.Visitor {
|
||||
private final List<RegionInfo> reassigns = new ArrayList<>();
|
||||
private final ServerName unknownServerName;
|
||||
private final Connection connection;
|
||||
|
||||
private UnknownServerVisitor(Connection connection, ServerName unknownServerName) {
|
||||
this.connection = connection;
|
||||
this.unknownServerName = unknownServerName;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean visit(Result result) throws IOException {
|
||||
RegionLocations rls = MetaTableAccessor.getRegionLocations(result);
|
||||
if (rls == null) {
|
||||
return true;
|
||||
}
|
||||
for (HRegionLocation hrl: rls.getRegionLocations()) {
|
||||
if (hrl == null) {
|
||||
continue;
|
||||
}
|
||||
if (hrl.getRegion() == null) {
|
||||
continue;
|
||||
}
|
||||
if (hrl.getServerName() == null) {
|
||||
continue;
|
||||
}
|
||||
if (!hrl.getServerName().equals(this.unknownServerName)) {
|
||||
continue;
|
||||
}
|
||||
RegionState.State state = RegionStateStore.getRegionState(result, hrl.getRegion());
|
||||
RegionState rs = new RegionState(hrl.getRegion(), state, hrl.getServerName());
|
||||
if (rs.isClosing()) {
|
||||
// Move region to CLOSED in hbase:meta.
|
||||
LOG.info("Moving {} from CLOSING to CLOSED in hbase:meta",
|
||||
hrl.getRegion().getRegionNameAsString());
|
||||
try {
|
||||
MetaTableAccessor.updateRegionState(this.connection, hrl.getRegion(),
|
||||
RegionState.State.CLOSED);
|
||||
} catch (IOException ioe) {
|
||||
LOG.warn("Failed moving {} from CLOSING to CLOSED", ioe);
|
||||
}
|
||||
} else if (rs.isOpening() || rs.isOpened()) {
|
||||
this.reassigns.add(hrl.getRegion());
|
||||
} else {
|
||||
LOG.info("Passing {}", rs);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private List<RegionInfo> getReassigns() {
|
||||
return this.reassigns;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3605,8 +3605,7 @@ public class HBaseTestingUtility extends HBaseZKTestingUtility {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
if (RegionStateStore.getRegionState(r,
|
||||
info.getReplicaId(), info) != RegionState.State.OPEN) {
|
||||
if (RegionStateStore.getRegionState(r, info) != RegionState.State.OPEN) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -112,6 +112,7 @@ public class TestHBCKSCP extends TestSCPBase {
|
|||
master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
|
||||
master.getServerManager().getDeadServers().finish(rsServerName);
|
||||
master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
|
||||
master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
|
||||
// Kill the server. Nothing should happen since an 'Unknown Server' as far
|
||||
// as the Master is concerned; i.e. no SCP.
|
||||
LOG.info("Killing {}", rsServerName);
|
||||
|
|
Loading…
Reference in New Issue