HBASE-25130 - Fix master in-memory server holding map after hbck repair (#3402)
HBASE-25130 [branch-1] Master's in-memory serverHoldings map is not cleared during hbck repair. Signed-off-by: Andrew Purtell <apurtell@apache.org>
This commit is contained in:
parent
b2f8ec993e
commit
395eb0c8e0
|
@ -1661,6 +1661,19 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
regionOffline(regionInfo, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Marks the region as offline. The {@code force} flag controls whether it is also removed
|
||||
* from the replicas and the master's in-memory server holding map.
|
||||
* <p>
|
||||
* @param regionInfo - region info.
|
||||
* @param force - set to true to force this region to be removed from the replicas and the
|
||||
* master's in-memory server holding map, so that this region is not re-opened on any other
|
||||
* region server. The only use case for now is hbck.
|
||||
*/
|
||||
public void regionOffline(final HRegionInfo regionInfo, boolean force) {
|
||||
regionOffline(regionInfo, null, force);
|
||||
}
|
||||
|
||||
public void offlineDisabledRegion(HRegionInfo regionInfo) {
|
||||
if (useZKForAssignment) {
|
||||
// Disabling so should not be reassigned, just delete the CLOSED node
|
||||
|
@ -4551,13 +4564,20 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
|
||||
public Map<String, AtomicInteger> getFailedOpenTracker() {return failedOpenTracker;}
|
||||
|
||||
private void regionOffline(final HRegionInfo regionInfo, final State state) {
|
||||
regionOffline(regionInfo, state, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* A region is offline. The new state should be the specified one,
|
||||
* if not null. If the specified state is null, the new state is Offline.
|
||||
* The specified state can be Split/Merged/Offline/null only.
|
||||
*
|
||||
* If the region offline is initiated by an RPC call from an admin, we force it offline.
|
||||
*/
|
||||
private void regionOffline(final HRegionInfo regionInfo, final State state) {
|
||||
regionStates.regionOffline(regionInfo, state);
|
||||
private void regionOffline(final HRegionInfo regionInfo, final State state,
|
||||
final boolean force) {
|
||||
regionStates.regionOffline(regionInfo, state, force);
|
||||
removeClosedRegion(regionInfo);
|
||||
// remove the region plan as well just in case.
|
||||
clearRegionPlan(regionInfo);
|
||||
|
@ -4566,7 +4586,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
// Tell our listeners that a region was closed
|
||||
sendRegionClosedNotification(regionInfo);
|
||||
// also note that all the replicas of the primary should be closed
|
||||
if (state != null && state.equals(State.SPLIT)) {
|
||||
if (force || (state != null && state.equals(State.SPLIT))) {
|
||||
Collection<HRegionInfo> c = new ArrayList<HRegionInfo>(1);
|
||||
c.add(regionInfo);
|
||||
Map<ServerName, List<HRegionInfo>> map = regionStates.getRegionAssignments(c);
|
||||
|
@ -4575,7 +4595,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
replicasToClose.addAll(list);
|
||||
}
|
||||
}
|
||||
else if (state != null && state.equals(State.MERGED)) {
|
||||
else if (force || (state != null && state.equals(State.MERGED))) {
|
||||
Collection<HRegionInfo> c = new ArrayList<HRegionInfo>(1);
|
||||
c.add(regionInfo);
|
||||
Map<ServerName, List<HRegionInfo>> map = regionStates.getRegionAssignments(c);
|
||||
|
|
|
@ -1401,7 +1401,7 @@ public class MasterRpcServices extends RSRpcServices
|
|||
master.cpHost.preRegionOffline(hri);
|
||||
}
|
||||
LOG.info(master.getClientIdAuditPrefix() + " offline " + hri.getRegionNameAsString());
|
||||
master.assignmentManager.regionOffline(hri);
|
||||
master.assignmentManager.regionOffline(hri, true);
|
||||
if (master.cpHost != null) {
|
||||
master.cpHost.postRegionOffline(hri);
|
||||
}
|
||||
|
|
|
@ -669,7 +669,7 @@ public class RegionStates {
|
|||
* A region is offline, won't be in transition any more.
|
||||
*/
|
||||
public void regionOffline(final HRegionInfo hri) {
|
||||
regionOffline(hri, null);
|
||||
regionOffline(hri, null, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -678,7 +678,7 @@ public class RegionStates {
|
|||
* Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
|
||||
*/
|
||||
public void regionOffline(
|
||||
final HRegionInfo hri, final State expectedState) {
|
||||
final HRegionInfo hri, final State expectedState, final boolean force) {
|
||||
Preconditions.checkArgument(expectedState == null
|
||||
|| RegionState.isUnassignable(expectedState),
|
||||
"Offlined region should not be " + expectedState);
|
||||
|
@ -713,9 +713,9 @@ public class RegionStates {
|
|||
regionsInTransition.remove(encodedName);
|
||||
ServerName oldServerName = regionAssignments.remove(hri);
|
||||
if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
|
||||
if (newState == State.MERGED || newState == State.SPLIT
|
||||
if (force || (newState == State.MERGED || newState == State.SPLIT
|
||||
|| hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
|
||||
TableState.State.DISABLED, TableState.State.DISABLING)) {
|
||||
TableState.State.DISABLED, TableState.State.DISABLING))) {
|
||||
// Offline the region only if forced, it's merged/split/meta, or the table is disabled/disabling.
|
||||
// Otherwise, offline it from this server only when it is online on a different server.
|
||||
LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
|
||||
|
|
|
@ -882,6 +882,76 @@ public class TestHBaseFsck {
|
|||
assertNoErrors(hbck2);
|
||||
assertEquals(0, hbck2.getOverlapGroups(table).size());
|
||||
assertEquals(ROWKEYS.length, countRows());
|
||||
|
||||
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
|
||||
long totalRegions = cluster.countServedRegions();
|
||||
|
||||
// stop a region server and run fsck again
|
||||
cluster.stopRegionServer(server);
|
||||
cluster.waitForRegionServerToStop(server, 60);
|
||||
|
||||
// wait for all regions to come online.
|
||||
while (cluster.countServedRegions() < totalRegions) {
|
||||
Thread.sleep(100);
|
||||
}
|
||||
|
||||
// check again after stopping a region server.
|
||||
HBaseFsck hbck3 = doFsck(conf,false);
|
||||
assertNoErrors(hbck3);
|
||||
} finally {
|
||||
cleanupTable(table);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This creates and fixes a bad table that has overlapping regions.
|
||||
*/
|
||||
@Test(timeout=180000)
|
||||
public void testOverlapRegions() throws Exception {
|
||||
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
|
||||
TableName table =
|
||||
TableName.valueOf("tableOverlapRegions");
|
||||
HRegionInfo hri;
|
||||
ServerName server;
|
||||
try {
|
||||
setupTable(table);
|
||||
assertNoErrors(doFsck(conf, false));
|
||||
assertEquals(ROWKEYS.length, countRows());
|
||||
|
||||
// Now let's mess it up, by adding a region which overlaps with others
|
||||
hri = createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B2"));
|
||||
TEST_UTIL.assignRegion(hri);
|
||||
server = regionStates.getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnServer(hri, server, REGION_ONLINE_TIMEOUT);
|
||||
|
||||
HBaseFsck hbck = doFsck(conf, false);
|
||||
assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
|
||||
ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
|
||||
assertEquals(3, hbck.getOverlapGroups(table).size());
|
||||
assertEquals(ROWKEYS.length, countRows());
|
||||
|
||||
// fix the overlap regions.
|
||||
doFsck(conf, true);
|
||||
|
||||
// check that the overlap regions are gone and there is no data loss
|
||||
HBaseFsck hbck2 = doFsck(conf,false);
|
||||
assertNoErrors(hbck2);
|
||||
assertEquals(0, hbck2.getOverlapGroups(table).size());
|
||||
assertEquals(ROWKEYS.length, countRows());
|
||||
|
||||
long totalRegions = cluster.countServedRegions();
|
||||
|
||||
// stop a region server and run fsck again
|
||||
cluster.stopRegionServer(server);
|
||||
cluster.waitForRegionServerToStop(server, 60);
|
||||
|
||||
// wait for all regions to come online.
|
||||
while (cluster.countServedRegions() < totalRegions) {
|
||||
Thread.sleep(100);
|
||||
}
|
||||
|
||||
HBaseFsck hbck3 = doFsck(conf,false);
|
||||
assertNoErrors(hbck3);
|
||||
} finally {
|
||||
cleanupTable(table);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue