HBASE-25130 [branch-1] Masters in-memory serverHoldings map is not cleared during hbck repair (#3402)

Signed-off-by: Andrew Purtell <apurtell@apache.org>
commit 395eb0c8e0 (parent b2f8ec993e)
Author: Victor
Date: 2021-06-28 10:00:46 -07:00 (committed by GitHub)
4 changed files with 99 additions and 9 deletions
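
The fix threads a new force flag from the master's offlineRegion RPC down through AssignmentManager into RegionStates, so that an hbck-initiated offline also clears the region from the master's in-memory serverHoldings map and queues its read replicas for close. A minimal sketch of the resulting call chain; the real signatures are in the hunks below:

    // Sketch of the call chain wired up by this commit (simplified).
    // 1. MasterRpcServices.offlineRegion(): an offline request now forces cleanup.
    master.assignmentManager.regionOffline(hri, true);
    // 2. AssignmentManager.regionOffline(HRegionInfo, boolean) delegates with a null state:
    //      regionOffline(hri, null, /* force = */ true);
    // 3. RegionStates.regionOffline(HRegionInfo, State, boolean): with force == true the
    //    region is dropped from serverHoldings even though it is neither SPLIT/MERGED nor
    //    on a disabled table, so it will not be re-opened on another region server.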

AssignmentManager.java

@@ -1661,6 +1661,19 @@ public class AssignmentManager extends ZooKeeperListener {
     regionOffline(regionInfo, null);
   }
 
+  /**
+   * Marks the region as offline, optionally also removing it from the replicas
+   * and from the master's in-memory server holding map.
+   * <p>
+   * @param regionInfo region info
+   * @param force set to true to force removal of this region from the replicas and from the
+   *   master's in-memory server holding map, so that the region will not be re-opened on any
+   *   other region server. The only use case for now is hbck.
+   */
+  public void regionOffline(final HRegionInfo regionInfo, boolean force) {
+    regionOffline(regionInfo, null, force);
+  }
+
   public void offlineDisabledRegion(HRegionInfo regionInfo) {
     if (useZKForAssignment) {
       // Disabling so should not be reassigned, just delete the CLOSED node
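
Existing callers of the one-argument regionOffline keep their behavior; only a caller that passes force = true gets the extra cleanup. A hypothetical call site, for illustration only:

    // Illustrative only; hri is a region that hbck has decided to purge.
    assignmentManager.regionOffline(hri, true);   // also clears serverHoldings and replicas
    assignmentManager.regionOffline(hri, false);  // equivalent to the old regionOffline(hri)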
@@ -4551,13 +4564,20 @@ public class AssignmentManager extends ZooKeeperListener {
 
   public Map<String, AtomicInteger> getFailedOpenTracker() {return failedOpenTracker;}
 
+  private void regionOffline(final HRegionInfo regionInfo, final State state) {
+    regionOffline(regionInfo, state, false);
+  }
+
   /**
    * A region is offline. The new state should be the specified one,
    * if not null. If the specified state is null, the new state is Offline.
    * The specified state can be Split/Merged/Offline/null only.
+   *
+   * If the offline is initiated by an admin rpc call, we force the region offline.
    */
-  private void regionOffline(final HRegionInfo regionInfo, final State state) {
-    regionStates.regionOffline(regionInfo, state);
+  private void regionOffline(final HRegionInfo regionInfo, final State state,
+      final boolean force) {
+    regionStates.regionOffline(regionInfo, state, force);
     removeClosedRegion(regionInfo);
     // remove the region plan as well just in case.
     clearRegionPlan(regionInfo);
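
The retained two-argument private overload keeps existing internal call sites unchanged; only the new three-argument form touches RegionStates. The delegation chain after this hunk, as a sketch:

    // Overload delegation inside AssignmentManager (sketch):
    //   regionOffline(hri)               -> regionOffline(hri, null)          (public, unchanged)
    //   regionOffline(hri, force)        -> regionOffline(hri, null, force)   (public, new)
    //   regionOffline(hri, state)        -> regionOffline(hri, state, false)  (private, new default)
    //   regionOffline(hri, state, force) -> regionStates.regionOffline(hri, state, force)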
@@ -4566,7 +4586,7 @@ public class AssignmentManager extends ZooKeeperListener {
     // Tell our listeners that a region was closed
     sendRegionClosedNotification(regionInfo);
     // also note that all the replicas of the primary should be closed
-    if (state != null && state.equals(State.SPLIT)) {
+    if (force || (state != null && state.equals(State.SPLIT))) {
       Collection<HRegionInfo> c = new ArrayList<HRegionInfo>(1);
       c.add(regionInfo);
       Map<ServerName, List<HRegionInfo>> map = regionStates.getRegionAssignments(c);
@@ -4575,7 +4595,7 @@ public class AssignmentManager extends ZooKeeperListener {
         replicasToClose.addAll(list);
       }
     }
-    else if (state != null && state.equals(State.MERGED)) {
+    else if (force || (state != null && state.equals(State.MERGED))) {
       Collection<HRegionInfo> c = new ArrayList<HRegionInfo>(1);
       c.add(regionInfo);
       Map<ServerName, List<HRegionInfo>> map = regionStates.getRegionAssignments(c);
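
With force set, the replica bookkeeping that previously ran only for SPLIT or MERGED regions now runs for a forced offline as well, so read replicas of the purged region also land in replicasToClose. Condensed into a single predicate (a sketch; the real code keeps the two branches):

    boolean closeReplicas = force
        || State.SPLIT.equals(state)
        || State.MERGED.equals(state);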

MasterRpcServices.java

@@ -1401,7 +1401,7 @@ public class MasterRpcServices extends RSRpcServices
         master.cpHost.preRegionOffline(hri);
       }
       LOG.info(master.getClientIdAuditPrefix() + " offline " + hri.getRegionNameAsString());
-      master.assignmentManager.regionOffline(hri);
+      master.assignmentManager.regionOffline(hri, true);
       if (master.cpHost != null) {
         master.cpHost.postRegionOffline(hri);
       }
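
offlineRegion backs Admin.offline, which hbck calls when it removes a problem region during repair, so the forced cleanup now happens for every admin-initiated offline. A minimal client-side sketch that exercises this path, assuming conf and hri are in scope (classes from org.apache.hadoop.hbase.client):

    try (Connection conn = ConnectionFactory.createConnection(conf);
         Admin admin = conn.getAdmin()) {
      admin.offline(hri.getRegionName()); // handled by MasterRpcServices.offlineRegion above
    }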

RegionStates.java

@@ -669,7 +669,7 @@ public class RegionStates {
    * A region is offline, won't be in transition any more.
    */
   public void regionOffline(final HRegionInfo hri) {
-    regionOffline(hri, null);
+    regionOffline(hri, null, false);
   }
 
   /**
@@ -678,7 +678,7 @@ public class RegionStates {
    * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
    */
   public void regionOffline(
-      final HRegionInfo hri, final State expectedState) {
+      final HRegionInfo hri, final State expectedState, final boolean force) {
     Preconditions.checkArgument(expectedState == null
       || RegionState.isUnassignable(expectedState),
       "Offlined region should not be " + expectedState);
@@ -713,9 +713,9 @@ public class RegionStates {
       regionsInTransition.remove(encodedName);
       ServerName oldServerName = regionAssignments.remove(hri);
       if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
-        if (newState == State.MERGED || newState == State.SPLIT
+        if (force || (newState == State.MERGED || newState == State.SPLIT
           || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
-            TableState.State.DISABLED, TableState.State.DISABLING)) {
+            TableState.State.DISABLED, TableState.State.DISABLING))) {
           // Offline the region only if it's merged/split, or the table is disabled/disabling.
           // Otherwise, offline it from this server only when it is online on a different server.
           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);

TestHBaseFsck.java

@@ -882,6 +882,76 @@ public class TestHBaseFsck {
       assertNoErrors(hbck2);
       assertEquals(0, hbck2.getOverlapGroups(table).size());
       assertEquals(ROWKEYS.length, countRows());
+
+      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+      long totalRegions = cluster.countServedRegions();
+
+      // stop a region server and run fsck again
+      cluster.stopRegionServer(server);
+      cluster.waitForRegionServerToStop(server, 60);
+
+      // wait for all regions to come back online
+      while (cluster.countServedRegions() < totalRegions) {
+        Thread.sleep(100);
+      }
+
+      // check again after stopping a region server
+      HBaseFsck hbck3 = doFsck(conf, false);
+      assertNoErrors(hbck3);
     } finally {
       cleanupTable(table);
     }
   }
 
+  /**
+   * This creates and fixes a bad table with a region that overlaps other regions.
+   */
+  @Test(timeout=180000)
+  public void testOverlapRegions() throws Exception {
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    TableName table =
+        TableName.valueOf("tableOverlapRegions");
+    HRegionInfo hri;
+    ServerName server;
+    try {
+      setupTable(table);
+      assertNoErrors(doFsck(conf, false));
+      assertEquals(ROWKEYS.length, countRows());
+
+      // Now let's mess it up, by adding a region which overlaps with others
+      hri = createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B2"));
+      TEST_UTIL.assignRegion(hri);
+      server = regionStates.getRegionServerOfRegion(hri);
+      TEST_UTIL.assertRegionOnServer(hri, server, REGION_ONLINE_TIMEOUT);
+
+      HBaseFsck hbck = doFsck(conf, false);
+      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
+          ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
+      assertEquals(3, hbck.getOverlapGroups(table).size());
+      assertEquals(ROWKEYS.length, countRows());
+
+      // fix the overlapping regions
+      doFsck(conf, true);
+
+      // check that the overlaps are gone and there is no data loss
+      HBaseFsck hbck2 = doFsck(conf, false);
+      assertNoErrors(hbck2);
+      assertEquals(0, hbck2.getOverlapGroups(table).size());
+      assertEquals(ROWKEYS.length, countRows());
+
+      long totalRegions = cluster.countServedRegions();
+
+      // stop a region server and run fsck again
+      cluster.stopRegionServer(server);
+      cluster.waitForRegionServerToStop(server, 60);
+
+      // wait for all regions to come back online
+      while (cluster.countServedRegions() < totalRegions) {
+        Thread.sleep(100);
+      }
+
+      HBaseFsck hbck3 = doFsck(conf, false);
+      assertNoErrors(hbck3);
+    } finally {
+      cleanupTable(table);
+    }
+  }
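
Both tests finish the same way: stop the server that hosted the bogus region, wait for the remaining regions to be reassigned, and assert a clean hbck run; without the fix, the stale serverHoldings entry could resurface the overlap at exactly this point. To run just the new test locally, the usual surefire invocation should work (module path assumed for a branch-1 checkout):

    mvn test -pl hbase-server -Dtest=TestHBaseFsck#testOverlapRegions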