From 8e52339cb802723d85dbcf5802a5c269915c2130 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Fri, 22 Nov 2019 08:26:45 -0800 Subject: [PATCH] HBASE-23307 Add running of ReplicationBarrierCleaner to hbck2 fixMeta invocation (#859) Signed-off-by: Lijin Bin --- .../apache/hadoop/hbase/master/HMaster.java | 7 ++++++ .../apache/hadoop/hbase/master/HbckChore.java | 24 ++++++++++++------- .../hadoop/hbase/master/MasterServices.java | 4 ++++ .../apache/hadoop/hbase/master/MetaFixer.java | 3 +++ .../cleaner/ReplicationBarrierCleaner.java | 13 +++++----- .../hbase/master/MockNoopMasterServices.java | 5 +++- .../master/TestClusterRestartFailover.java | 2 +- 7 files changed, 41 insertions(+), 17 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 43da8c10bee..cc226bd58c3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -3869,4 +3869,11 @@ public class HMaster extends HRegionServer implements MasterServices { return cachedClusterId.getFromCacheOrFetch(); } + @Override + public void runReplicationBarrierCleaner() { + ReplicationBarrierCleaner rbc = this.replicationBarrierCleaner; + if (rbc != null) { + rbc.chore(); + } + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java index cf4368581e7..b25bb152988 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -122,7 +122,6 @@ public class HbckChore extends ScheduledChore { LOG.warn("hbckChore is either disabled or is already running. Can't run the chore"); return; } - running = true; regionInfoMap.clear(); disabledTableRegions.clear(); splitParentRegions.clear(); @@ -130,14 +129,19 @@ public class HbckChore extends ScheduledChore { orphanRegionsOnFS.clear(); inconsistentRegions.clear(); checkingStartTimestamp = EnvironmentEdgeManager.currentTime(); - loadRegionsFromInMemoryState(); - loadRegionsFromRSReport(); + running = true; try { - loadRegionsFromFS(); - } catch (IOException e) { - LOG.warn("Failed to load the regions from filesystem", e); + loadRegionsFromInMemoryState(); + loadRegionsFromRSReport(); + try { + loadRegionsFromFS(); + } catch (IOException e) { + LOG.warn("Failed to load the regions from filesystem", e); + } + saveCheckResultToSnapshot(); + } catch (Throwable t) { + LOG.warn("Unexpected", t); } - saveCheckResultToSnapshot(); running = false; } @@ -262,6 +266,10 @@ public class HbckChore extends ScheduledChore { List regionDirs = FSUtils.getRegionDirs(fs, tableDir); for (Path regionDir : regionDirs) { String encodedRegionName = regionDir.getName(); + if (encodedRegionName == null) { + LOG.warn("Failed get of encoded name from {}", regionDir); + continue; + } HbckRegionInfo hri = regionInfoMap.get(encodedRegionName); if (hri == null) { orphanRegionsOnFS.put(encodedRegionName, regionDir); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java index 41cec5cfb23..0163998cc4f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java @@ -537,4 +537,8 @@ public interface MasterServices extends Server { */ List executeRegionPlansWithThrottling(List 
plans); + /** + * Run the ReplicationBarrierCleaner. + */ + void runReplicationBarrierCleaner(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetaFixer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetaFixer.java index 281df1ecc01..15b4e883356 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetaFixer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetaFixer.java @@ -77,6 +77,9 @@ class MetaFixer { } fixHoles(report); fixOverlaps(report); + // Run the ReplicationBarrierCleaner here; it may clear out rep_barrier rows which + // can help clean up damaged hbase:meta. + this.masterServices.runReplicationBarrierCleaner(); } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/ReplicationBarrierCleaner.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/ReplicationBarrierCleaner.java index ff1da0b1b25..653f7350411 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/ReplicationBarrierCleaner.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/ReplicationBarrierCleaner.java @@ -48,7 +48,6 @@ import org.slf4j.LoggerFactory; */ @InterfaceAudience.Private public class ReplicationBarrierCleaner extends ScheduledChore { - private static final Logger LOG = LoggerFactory.getLogger(ReplicationBarrierCleaner.class); private static final String REPLICATION_BARRIER_CLEANER_INTERVAL = @@ -71,7 +70,9 @@ public class ReplicationBarrierCleaner extends ScheduledChore { } @Override - protected void chore() { + // Public so can be run out of MasterRpcServices. Synchronized so only one + // running instance at a time. 
+ public synchronized void chore() { long totalRows = 0; long cleanedRows = 0; long deletedRows = 0; @@ -168,11 +169,9 @@ public class ReplicationBarrierCleaner extends ScheduledChore { LOG.warn("Failed to clean up replication barrier", e); } if (totalRows > 0) { - LOG.info( - "Cleanup replication barriers: totalRows {}, " + - "cleanedRows {}, deletedRows {}, deletedBarriers {}, deletedLastPushedSeqIds {}", - totalRows, cleanedRows, deletedRows, deletedBarriers, deletedLastPushedSeqIds); + LOG.info("TotalRows={}, cleanedRows={}, deletedRows={}, deletedBarriers={}, " + + "deletedLastPushedSeqIds={}", totalRows, cleanedRows, deletedRows, + deletedBarriers, deletedLastPushedSeqIds); } } - } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java index cbfdd3f7449..b9fff6df1c0 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java @@ -490,4 +490,7 @@ public class MockNoopMasterServices implements MasterServices { public AsyncClusterConnection getAsyncClusterConnection() { return null; } -} \ No newline at end of file + + @Override + public void runReplicationBarrierCleaner() {} +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java index 2e18c160281..a6844fcac09 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java @@ -108,7 +108,7 @@ public class TestClusterRestartFailover extends AbstractTestRestartCluster { .filter(p -> (p instanceof ServerCrashProcedure) && ((ServerCrashProcedure) 
p).getServerName().equals(SERVER_FOR_TEST)).findAny(); assertTrue("Should have one SCP for " + SERVER_FOR_TEST, procedure.isPresent()); - assertFalse("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail", + assertTrue("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail", UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST) == Procedure.NO_PROC_ID);