diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java index d9dc779d3e4..0e41146d3a0 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java @@ -1448,7 +1448,7 @@ public class MetaTableAccessor { } } - private static void addRegionStateToPut(Put put, RegionState.State state) throws IOException { + private static Put addRegionStateToPut(Put put, RegionState.State state) throws IOException { put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY) .setRow(put.getRow()) .setFamily(HConstants.CATALOG_FAMILY) @@ -1457,6 +1457,17 @@ public class MetaTableAccessor { .setType(Cell.Type.Put) .setValue(Bytes.toBytes(state.name())) .build()); + return put; + } + + /** + * Update state column in hbase:meta. + */ + public static void updateRegionState(Connection connection, RegionInfo ri, + RegionState.State state) throws IOException { + Put put = new Put(RegionReplicaUtil.getRegionInfoForDefaultReplica(ri).getRegionName()); + MetaTableAccessor.putsToMetaTable(connection, + Collections.singletonList(addRegionStateToPut(put, state))); } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java index 14bf91167f1..7145d38b6d2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java @@ -51,7 +51,6 @@ import org.apache.zookeeper.KeeperException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting; import org.apache.hbase.thirdparty.com.google.common.base.Preconditions; /** @@ -132,7 +131,7 @@ public class RegionStateStore { if (regionInfo == null) continue; final int replicaId = regionInfo.getReplicaId(); - final State state = getRegionState(result, replicaId, regionInfo); + final State state = getRegionState(result, regionInfo); final ServerName lastHost = hrl.getServerName(); final ServerName regionLocation = getRegionServer(result, replicaId); @@ -343,12 +342,11 @@ public class RegionStateStore { /** * Pull the region state from a catalog table {@link Result}. - * @param r Result to pull the region state from * @return the region state, or null if unknown. */ - @VisibleForTesting - public static State getRegionState(final Result r, int replicaId, RegionInfo regionInfo) { - Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY, getStateColumn(replicaId)); + public static State getRegionState(final Result r, RegionInfo regionInfo) { + Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY, + getStateColumn(regionInfo.getReplicaId())); if (cell == null || cell.getValueLength() == 0) { return null; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java index 874dcde1986..eec820cc150 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java @@ -20,21 +20,31 @@ package org.apache.hadoop.hbase.master.procedure; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; +import org.apache.hadoop.hbase.HRegionLocation; import org.apache.hadoop.hbase.MetaTableAccessor; +import org.apache.hadoop.hbase.RegionLocations; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.RegionInfo; -import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.master.assignment.RegionStateStore; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * A SCP that differs from default only in how it gets the list of - * Regions hosted on the crashed-server; it also reads hbase:meta directly rather - * than rely solely on Master memory for list of Regions that were on crashed server. - * This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's - * scheduleRecoveries). It is for the case where meta has references to 'Unknown Servers', + * Acts like the super class in all cases except when no Regions found in the + * current Master in-memory context. In this latter case, when the call to + * super#getRegionsOnCrashedServer returns nothing, this SCP will scan + * hbase:meta for references to the passed ServerName. If any found, we'll + * clean them up. + * + *

This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's + * scheduleRecoveries); the super class is used during normal recovery operations. + * It is for the case where meta has references to 'Unknown Servers', * servers that are in hbase:meta but not in live-server or dead-server lists; i.e. Master * and hbase:meta content have deviated. It should never happen in normal running * cluster but if we do drop accounting of servers, we need a means of fix-up. @@ -65,31 +75,97 @@ public class HBCKServerCrashProcedure extends ServerCrashProcedure { public HBCKServerCrashProcedure() {} /** - * Adds Regions found by super method any found scanning hbase:meta. + * If no Regions found in Master context, then we will search hbase:meta for references + * to the passed server. Operator may have passed ServerName because they have found + * references to 'Unknown Servers'. They are using HBCKSCP to clear them out. */ @Override @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH_EXCEPTION", justification="FindBugs seems confused on ps in below.") List getRegionsOnCrashedServer(MasterProcedureEnv env) { - // Super can return immutable emptyList. + // Super will return an immutable list (empty if nothing on this server). List ris = super.getRegionsOnCrashedServer(env); - List> ps = null; - try { - ps = MetaTableAccessor.getTableRegionsAndLocations(env.getMasterServices().getConnection(), - null, false); - } catch (IOException ioe) { - LOG.warn("Failed get of all regions; continuing", ioe); - } - if (ps == null || ps.isEmpty()) { - LOG.warn("No regions found in hbase:meta"); + if (!ris.isEmpty()) { return ris; } - List aggregate = ris == null || ris.isEmpty()? - new ArrayList<>(): new ArrayList<>(ris); - int before = aggregate.size(); - ps.stream().filter(p -> p.getSecond() != null && p.getSecond().equals(getServerName())). - forEach(p -> aggregate.add(p.getFirst())); - LOG.info("Found {} mentions of {} in hbase:meta", aggregate.size() - before, getServerName()); - return aggregate; + // Nothing in in-master context. Check for Unknown Server! in hbase:meta. + // If super list is empty, then allow that an operator scheduled an SCP because they are trying + // to purge 'Unknown Servers' -- servers that are neither online nor in dead servers + // list but that ARE in hbase:meta and so showing as unknown in places like 'HBCK Report'. + // This mis-accounting does not happen in normal circumstance but may arise in-extremis + // when cluster has been damaged in operation. + UnknownServerVisitor visitor = + new UnknownServerVisitor(env.getMasterServices().getConnection(), getServerName()); + try { + MetaTableAccessor.scanMetaForTableRegions(env.getMasterServices().getConnection(), + visitor, null); + } catch (IOException ioe) { + LOG.warn("Failed scan of hbase:meta for 'Unknown Servers'", ioe); + return ris; + } + LOG.info("Found {} mentions of {} in hbase:meta of OPEN/OPENING Regions: {}", + visitor.getReassigns().size(), getServerName(), + visitor.getReassigns().stream().map(RegionInfo::getEncodedName). + collect(Collectors.joining(","))); + return visitor.getReassigns(); + } + + /** + * Visitor for hbase:meta that 'fixes' Unknown Server issues. Collects + * a List of Regions to reassign as 'result'. + */ + private static class UnknownServerVisitor implements MetaTableAccessor.Visitor { + private final List reassigns = new ArrayList<>(); + private final ServerName unknownServerName; + private final Connection connection; + + private UnknownServerVisitor(Connection connection, ServerName unknownServerName) { + this.connection = connection; + this.unknownServerName = unknownServerName; + } + + @Override + public boolean visit(Result result) throws IOException { + RegionLocations rls = MetaTableAccessor.getRegionLocations(result); + if (rls == null) { + return true; + } + for (HRegionLocation hrl: rls.getRegionLocations()) { + if (hrl == null) { + continue; + } + if (hrl.getRegion() == null) { + continue; + } + if (hrl.getServerName() == null) { + continue; + } + if (!hrl.getServerName().equals(this.unknownServerName)) { + continue; + } + RegionState.State state = RegionStateStore.getRegionState(result, hrl.getRegion()); + RegionState rs = new RegionState(hrl.getRegion(), state, hrl.getServerName()); + if (rs.isClosing()) { + // Move region to CLOSED in hbase:meta. + LOG.info("Moving {} from CLOSING to CLOSED in hbase:meta", + hrl.getRegion().getRegionNameAsString()); + try { + MetaTableAccessor.updateRegionState(this.connection, hrl.getRegion(), + RegionState.State.CLOSED); + } catch (IOException ioe) { + LOG.warn("Failed moving {} from CLOSING to CLOSED", ioe); + } + } else if (rs.isOpening() || rs.isOpened()) { + this.reassigns.add(hrl.getRegion()); + } else { + LOG.info("Passing {}", rs); + } + } + return true; + } + + private List getReassigns() { + return this.reassigns; + } } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java index 994ab1dd4ea..e69612a6b5f 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java @@ -3653,8 +3653,7 @@ public class HBaseTestingUtility extends HBaseZKTestingUtility { return false; } } - if (RegionStateStore.getRegionState(r, - info.getReplicaId(), info) != RegionState.State.OPEN) { + if (RegionStateStore.getRegionState(r, info) != RegionState.State.OPEN) { return false; } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java index 9129cc583b3..a74106c1f1e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java @@ -112,6 +112,7 @@ public class TestHBCKSCP extends TestSCPBase { master.getServerManager().moveFromOnlineToDeadServers(rsServerName); master.getServerManager().getDeadServers().finish(rsServerName); master.getServerManager().getDeadServers().removeDeadServer(rsServerName); + master.getAssignmentManager().getRegionStates().removeServer(rsServerName); // Kill the server. Nothing should happen since an 'Unknown Server' as far // as the Master is concerned; i.e. no SCP. LOG.info("Killing {}", rsServerName);