HBASE-23307 Add running of ReplicationBarrierCleaner to hbck2 fixMeta invocation (#859)

Signed-off-by: Lijin Bin <binlijin@apache.org>
Author: Michael Stack, 2019-11-22 08:26:45 -08:00 (committed by stack)
Parent: 3b0c276aa3
Commit: 8e52339cb8
7 changed files with 41 additions and 17 deletions
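For context on the "hbck2 fixMeta invocation" in the title: HBCK2's fixMeta command drives a Master-side repair of hbase:meta, and with this commit that repair also runs the ReplicationBarrierCleaner. Below is a minimal client-side sketch of triggering that path; it assumes an HBase 2.x client where Connection#getHbck() and Hbck#fixMeta() are available, and the class name FixMetaExample is purely illustrative.

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Hbck;

public class FixMetaExample {
  public static void main(String[] args) throws Exception {
    // Ask the active Master to repair hbase:meta (holes/overlaps). With this commit the
    // Master also runs the ReplicationBarrierCleaner at the end of the fix, which may
    // clear out stale rep_barrier rows left behind in a damaged meta table.
    try (Connection connection = ConnectionFactory.createConnection(HBaseConfiguration.create());
        Hbck hbck = connection.getHbck()) {
      hbck.fixMeta();
    }
  }
}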

HMaster.java

@@ -3869,4 +3869,11 @@ public class HMaster extends HRegionServer implements MasterServices {
     return cachedClusterId.getFromCacheOrFetch();
   }
+  @Override
+  public void runReplicationBarrierCleaner() {
+    ReplicationBarrierCleaner rbc = this.replicationBarrierCleaner;
+    if (rbc != null) {
+      rbc.chore();
+    }
+  }
 }

HbckChore.java

@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements. See the NOTICE file
  * distributed with this work for additional information
@@ -122,7 +122,6 @@ public class HbckChore extends ScheduledChore {
       LOG.warn("hbckChore is either disabled or is already running. Can't run the chore");
       return;
     }
-    running = true;
     regionInfoMap.clear();
     disabledTableRegions.clear();
     splitParentRegions.clear();
@@ -130,6 +129,8 @@ public class HbckChore extends ScheduledChore {
     orphanRegionsOnFS.clear();
     inconsistentRegions.clear();
     checkingStartTimestamp = EnvironmentEdgeManager.currentTime();
+    running = true;
+    try {
     loadRegionsFromInMemoryState();
     loadRegionsFromRSReport();
     try {
@@ -138,6 +139,9 @@ public class HbckChore extends ScheduledChore {
       LOG.warn("Failed to load the regions from filesystem", e);
     }
     saveCheckResultToSnapshot();
+    } catch (Throwable t) {
+      LOG.warn("Unexpected", t);
+    }
     running = false;
   }
@@ -262,6 +266,10 @@ public class HbckChore extends ScheduledChore {
       List<Path> regionDirs = FSUtils.getRegionDirs(fs, tableDir);
       for (Path regionDir : regionDirs) {
         String encodedRegionName = regionDir.getName();
+        if (encodedRegionName == null) {
+          LOG.warn("Failed get of encoded name from {}", regionDir);
+          continue;
+        }
         HbckRegionInfo hri = regionInfoMap.get(encodedRegionName);
         if (hri == null) {
           orphanRegionsOnFS.put(encodedRegionName, regionDir);
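The HbckChore hunks above reorder the run guard: the running flag is now raised only after the per-run collections have been cleared, and the load/report steps are wrapped in a catch of Throwable so an unexpected failure can no longer skip the running = false reset at the end of the run. A condensed, hypothetical sketch of the resulting structure (only the running flag mirrors a real field; the helper methods stand in for the clear/load/save calls shown in the diff):

public class GuardedChoreRunSketch {
  private volatile boolean running = false;

  public void runOnce() {
    if (running) {
      return; // an earlier run is still in progress
    }
    clearPerRunState();   // stands in for regionInfoMap.clear(), orphanRegionsOnFS.clear(), ...
    running = true;
    try {
      loadAndCheck();     // stands in for loadRegionsFromInMemoryState(), loadRegionsFromRSReport()
      saveResult();       // stands in for saveCheckResultToSnapshot()
    } catch (Throwable t) {
      // Catching Throwable here means the reset below always runs, even on unexpected errors.
      System.err.println("Unexpected: " + t);
    }
    running = false;
  }

  private void clearPerRunState() { }
  private void loadAndCheck() { }
  private void saveResult() { }

  public static void main(String[] args) {
    new GuardedChoreRunSketch().runOnce();
  }
}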

MasterServices.java

@@ -537,4 +537,8 @@ public interface MasterServices extends Server {
    */
   List<RegionPlan> executeRegionPlansWithThrottling(List<RegionPlan> plans);
 
+  /**
+   * Run the ReplicationBarrierChore.
+   */
+  void runReplicationBarrierCleaner();
 }

MetaFixer.java

@@ -77,6 +77,9 @@ class MetaFixer {
     }
     fixHoles(report);
     fixOverlaps(report);
+    // Run the ReplicationBarrierCleaner here; it may clear out rep_barrier rows which
+    // can help cleaning up damaged hbase:meta.
+    this.masterServices.runReplicationBarrierCleaner();
   }
 
   /**

ReplicationBarrierCleaner.java

@@ -48,7 +48,6 @@ import org.slf4j.LoggerFactory;
  */
 @InterfaceAudience.Private
 public class ReplicationBarrierCleaner extends ScheduledChore {
-
   private static final Logger LOG = LoggerFactory.getLogger(ReplicationBarrierCleaner.class);
 
   private static final String REPLICATION_BARRIER_CLEANER_INTERVAL =
@@ -71,7 +70,9 @@ public class ReplicationBarrierCleaner extends ScheduledChore {
   }
 
   @Override
-  protected void chore() {
+  // Public so can be run out of MasterRpcServices. Synchronized so only one
+  // running instance at a time.
+  public synchronized void chore() {
     long totalRows = 0;
     long cleanedRows = 0;
     long deletedRows = 0;
@@ -168,11 +169,9 @@ public class ReplicationBarrierCleaner extends ScheduledChore {
       LOG.warn("Failed to clean up replication barrier", e);
     }
     if (totalRows > 0) {
-      LOG.info(
-        "Cleanup replication barriers: totalRows {}, " +
-          "cleanedRows {}, deletedRows {}, deletedBarriers {}, deletedLastPushedSeqIds {}",
-        totalRows, cleanedRows, deletedRows, deletedBarriers, deletedLastPushedSeqIds);
+      LOG.info("TotalRows={}, cleanedRows={}, deletedRows={}, deletedBarriers={}, " +
+        "deletedLastPushedSeqIds={}", totalRows, cleanedRows, deletedRows,
+        deletedBarriers, deletedLastPushedSeqIds);
     }
   }
 }
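The chore() override above is widened from protected to public and marked synchronized so that, besides the scheduled run, the Master can invoke it directly from the new runReplicationBarrierCleaner() hook without two cleanup passes racing. A standalone, hypothetical sketch of that calling pattern (none of the HBase ScheduledChore plumbing; class and thread names are illustrative):

public class SynchronizedChoreSketch {
  // Body shared by the periodic scheduler and an on-demand caller such as a fixMeta run;
  // synchronized serializes the two paths so only one cleanup pass executes at a time.
  public synchronized void chore() {
    System.out.println("cleanup pass running on " + Thread.currentThread().getName());
  }

  public static void main(String[] args) throws InterruptedException {
    SynchronizedChoreSketch cleaner = new SynchronizedChoreSketch();
    Thread scheduled = new Thread(cleaner::chore, "scheduled-chore");
    Thread onDemand = new Thread(cleaner::chore, "fixMeta-triggered");
    scheduled.start();
    onDemand.start();
    scheduled.join();
    onDemand.join();
  }
}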

MockNoopMasterServices.java

@@ -490,4 +490,7 @@ public class MockNoopMasterServices implements MasterServices {
   public AsyncClusterConnection getAsyncClusterConnection() {
     return null;
   }
+
+  @Override
+  public void runReplicationBarrierCleaner() {}
 }

TestClusterRestartFailover.java

@@ -108,7 +108,7 @@ public class TestClusterRestartFailover extends AbstractTestRestartCluster {
       .filter(p -> (p instanceof ServerCrashProcedure) &&
         ((ServerCrashProcedure) p).getServerName().equals(SERVER_FOR_TEST)).findAny();
     assertTrue("Should have one SCP for " + SERVER_FOR_TEST, procedure.isPresent());
-    assertFalse("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
+    assertTrue("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
       UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST) ==
         Procedure.NO_PROC_ID);