HBASE-22737 Add a new admin method and shell cmd to trigger the hbck chore to run (#425)

Signed-off-by: stack <stack@apache.org>
This commit is contained in:
Guanghao Zhang 2019-08-01 08:54:47 +08:00
parent 52f8ec8924
commit e68b16a6c1
13 changed files with 178 additions and 62 deletions

View File

@ -24,20 +24,24 @@ import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.RequestConverter;
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.GetTableStateResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.HbckService.BlockingInterface;
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.RequestConverter;
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.AssignsResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.BypassProcedureRequest;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.BypassProcedureResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.GetTableStateResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.HbckService.BlockingInterface;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreRequest;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleServerCrashProcedureResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.UnassignsResponse;
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
/**
* Use {@link ClusterConnection#getHbck()} to obtain an instance of {@link Hbck} instead of
@ -106,8 +110,7 @@ public class HBaseHbck implements Hbck {
public List<Long> assigns(List<String> encodedRegionNames, boolean override)
throws IOException {
try {
MasterProtos.AssignsResponse response =
this.hbck.assigns(rpcControllerFactory.newController(),
AssignsResponse response = this.hbck.assigns(rpcControllerFactory.newController(),
RequestConverter.toAssignRegionsRequest(encodedRegionNames, override));
return response.getPidList();
} catch (ServiceException se) {
@ -120,8 +123,7 @@ public class HBaseHbck implements Hbck {
public List<Long> unassigns(List<String> encodedRegionNames, boolean override)
throws IOException {
try {
MasterProtos.UnassignsResponse response =
this.hbck.unassigns(rpcControllerFactory.newController(),
UnassignsResponse response = this.hbck.unassigns(rpcControllerFactory.newController(),
RequestConverter.toUnassignRegionsRequest(encodedRegionNames, override));
return response.getPidList();
} catch (ServiceException se) {
@ -138,13 +140,13 @@ public class HBaseHbck implements Hbck {
public List<Boolean> bypassProcedure(List<Long> pids, long waitTime, boolean override,
boolean recursive)
throws IOException {
MasterProtos.BypassProcedureResponse response = ProtobufUtil.call(
new Callable<MasterProtos.BypassProcedureResponse>() {
BypassProcedureResponse response = ProtobufUtil.call(
new Callable<BypassProcedureResponse>() {
@Override
public MasterProtos.BypassProcedureResponse call() throws Exception {
public BypassProcedureResponse call() throws Exception {
try {
return hbck.bypassProcedure(rpcControllerFactory.newController(),
MasterProtos.BypassProcedureRequest.newBuilder().addAllProcId(pids).
BypassProcedureRequest.newBuilder().addAllProcId(pids).
setWaitTime(waitTime).setOverride(override).setRecursive(recursive).build());
} catch (Throwable t) {
LOG.error(pids.stream().map(i -> i.toString()).
@ -160,7 +162,7 @@ public class HBaseHbck implements Hbck {
public List<Long> scheduleServerCrashProcedure(List<HBaseProtos.ServerName> serverNames)
throws IOException {
try {
MasterProtos.ScheduleServerCrashProcedureResponse response =
ScheduleServerCrashProcedureResponse response =
this.hbck.scheduleServerCrashProcedure(rpcControllerFactory.newController(),
RequestConverter.toScheduleServerCrashProcedureRequest(serverNames));
return response.getPidList();
@ -172,4 +174,16 @@ public class HBaseHbck implements Hbck {
throw new IOException(se);
}
}
/**
 * Sends a RunHbckChore request through the HBCK service stub and returns the <code>ran</code>
 * flag carried by the response.
 *
 * @return the value of the <code>ran</code> field of the master's response
 * @throws IOException wrapping the {@link ServiceException} raised by the RPC call
 */
@Override
public boolean runHbckChore() throws IOException {
  // The request message declares no fields, so it can be built before the call.
  final RunHbckChoreRequest request = RunHbckChoreRequest.newBuilder().build();
  try {
    return this.hbck.runHbckChore(rpcControllerFactory.newController(), request).getRan();
  } catch (ServiceException se) {
    LOG.debug("Failed to run HBCK chore", se);
    throw new IOException(se);
  }
}
}

View File

@ -106,4 +106,12 @@ public interface Hbck extends Abortable, Closeable {
List<Long> scheduleServerCrashProcedure(List<HBaseProtos.ServerName> serverNames)
throws IOException;
/**
* Request the HBCK chore to run on the master side.
*
* @return <code>true</code> if the HBCK chore ran in response to this request,
* <code>false</code> if it was not run because an HBCK chore run was already in progress
* @throws IOException if a remote or network exception occurs
*/
boolean runHbckChore() throws IOException;
}

View File

@ -350,6 +350,13 @@ message IsNormalizerEnabledResponse {
required bool enabled = 1;
}
/** Request for the HbckService RunHbckChore rpc; it carries no fields. */
message RunHbckChoreRequest {
}
/** Response for the HbckService RunHbckChore rpc. */
message RunHbckChoreResponse {
/** True if the HBCK chore ran for this request, false if it was already running. */
required bool ran = 1;
}
message RunCatalogScanRequest {
}
@ -1080,4 +1087,10 @@ service HbckService {
/** Schedule a ServerCrashProcedure to help recover a crash server */
rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest)
returns(ScheduleServerCrashProcedureResponse);
/**
* Request HBCK chore to run at master side. The response's "ran" field tells whether
* the chore ran for this request or was skipped because it was already running.
*/
rpc RunHbckChore(RunHbckChoreRequest)
returns(RunHbckChoreResponse);
}

View File

@ -371,7 +371,7 @@ public class HMaster extends HRegionServer implements MasterServices {
private ClusterStatusChore clusterStatusChore;
private ClusterStatusPublisher clusterStatusPublisherChore = null;
private HbckChecker hbckChecker;
private HbckChore hbckChore;
CatalogJanitor catalogJanitorChore;
private LogCleaner logCleaner;
private HFileCleaner hfileCleaner;
@ -1031,8 +1031,8 @@ public class HMaster extends HRegionServer implements MasterServices {
getChoreService().scheduleChore(normalizerChore);
this.catalogJanitorChore = new CatalogJanitor(this);
getChoreService().scheduleChore(catalogJanitorChore);
this.hbckChecker = new HbckChecker(this);
getChoreService().scheduleChore(hbckChecker);
this.hbckChore = new HbckChore(this);
getChoreService().scheduleChore(hbckChore);
// NAMESPACE READ!!!!
// Here we expect hbase:namespace to be online. See inside initClusterSchemaService.
@ -1498,7 +1498,7 @@ public class HMaster extends HRegionServer implements MasterServices {
choreService.cancelChore(this.logCleaner);
choreService.cancelChore(this.hfileCleaner);
choreService.cancelChore(this.replicationBarrierCleaner);
choreService.cancelChore(this.hbckChecker);
choreService.cancelChore(this.hbckChore);
}
}
@ -3866,7 +3866,7 @@ public class HMaster extends HRegionServer implements MasterServices {
return super.getWalGroupsReplicationStatus();
}
public HbckChecker getHbckChecker() {
return this.hbckChecker;
public HbckChore getHbckChore() {
return this.hbckChore;
}
}

View File

@ -47,11 +47,11 @@ import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
*/
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class HbckChecker extends ScheduledChore {
private static final Logger LOG = LoggerFactory.getLogger(HbckChecker.class.getName());
public class HbckChore extends ScheduledChore {
private static final Logger LOG = LoggerFactory.getLogger(HbckChore.class.getName());
private static final String HBCK_CHECKER_INTERVAL = "hbase.master.hbck.checker.interval";
private static final int DEFAULT_HBCK_CHECKER_INTERVAL = 60 * 60 * 1000;
private static final String HBCK_CHORE_INTERVAL = "hbase.master.hbck.chore.interval";
private static final int DEFAULT_HBCK_CHORE_INTERVAL = 60 * 60 * 1000;
private final MasterServices master;
@ -100,14 +100,14 @@ public class HbckChecker extends ScheduledChore {
private volatile long checkingStartTimestamp = 0;
private volatile long checkingEndTimestamp = 0;
public HbckChecker(MasterServices master) {
super("HbckChecker-", master,
master.getConfiguration().getInt(HBCK_CHECKER_INTERVAL, DEFAULT_HBCK_CHECKER_INTERVAL));
public HbckChore(MasterServices master) {
super("HbckChore-", master,
master.getConfiguration().getInt(HBCK_CHORE_INTERVAL, DEFAULT_HBCK_CHORE_INTERVAL));
this.master = master;
}
@Override
protected void chore() {
protected synchronized void chore() {
running = true;
regionInfoMap.clear();
orphanRegionsOnRS.clear();
@ -277,6 +277,6 @@ public class HbckChecker extends ScheduledChore {
* Used by the web UI to show when the HBCK checking report was generated.
*/
public long getCheckingEndTimestamp() {
return this.checkingStartTimestamp;
return this.checkingEndTimestamp;
}
}

View File

@ -237,6 +237,8 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunCatalog
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunCatalogScanResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunCleanerChoreRequest;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunCleanerChoreResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreRequest;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.SecurityCapabilitiesRequest;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.SecurityCapabilitiesResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.SetBalancerRunningRequest;
@ -2293,6 +2295,20 @@ public class MasterRpcServices extends RSRpcServices
// HBCK Services
/**
 * RPC entry point that runs the master's HBCK chore on demand.
 * The chore is invoked directly from this call, and only when it does not report itself as
 * running; otherwise nothing is run and the response carries <code>ran = false</code>.
 *
 * @param c the RPC controller (unused here)
 * @param req the (empty) request
 * @return a response whose <code>ran</code> flag says whether this call ran the chore
 * @throws ServiceException declared by the service interface; this body does not throw it itself
 */
@Override
public RunHbckChoreResponse runHbckChore(RpcController c, RunHbckChoreRequest req)
    throws ServiceException {
  rpcPreCheck("runHbckChore");
  LOG.info("{} request HBCK chore to run", master.getClientIdAuditPrefix());
  final HbckChore chore = master.getHbckChore();
  // Decide once, up front, whether this request is going to run the chore.
  final boolean triggered = !chore.isRunning();
  if (triggered) {
    chore.chore();
  }
  return RunHbckChoreResponse.newBuilder().setRan(triggered).build();
}
/**
* Update state of the table in meta only. This is required by hbck in some situations to cleanup
* stuck assign/ unassign regions procedures for the table.

View File

@ -27,7 +27,7 @@
import="java.time.ZonedDateTime"
import="java.time.format.DateTimeFormatter"
%>
<%@ page import="org.apache.hadoop.hbase.master.HbckChecker" %>
<%@ page import="org.apache.hadoop.hbase.master.HbckChore" %>
<%@ page import="org.apache.hadoop.hbase.master.HMaster" %>
<%@ page import="org.apache.hadoop.hbase.ServerName" %>
<%@ page import="org.apache.hadoop.hbase.util.Bytes" %>
@ -38,18 +38,18 @@
<%
HMaster master = (HMaster) getServletContext().getAttribute(HMaster.MASTER);
pageContext.setAttribute("pageTitle", "HBase Master HBCK Report: " + master.getServerName());
HbckChecker hbckChecker = master.getHbckChecker();
HbckChore hbckChore = master.getHbckChore();
Map<String, Pair<ServerName, List<ServerName>>> inconsistentRegions = null;
Map<String, ServerName> orphanRegionsOnRS = null;
List<String> orphanRegionsOnFS = null;
long startTimestamp = 0;
long endTimestamp = 0;
if (hbckChecker != null) {
inconsistentRegions = hbckChecker.getInconsistentRegions();
orphanRegionsOnRS = hbckChecker.getOrphanRegionsOnRS();
orphanRegionsOnFS = hbckChecker.getOrphanRegionsOnFS();
startTimestamp = hbckChecker.getCheckingStartTimestamp();
endTimestamp = hbckChecker.getCheckingEndTimestamp();
if (hbckChore != null) {
inconsistentRegions = hbckChore.getInconsistentRegions();
orphanRegionsOnRS = hbckChore.getOrphanRegionsOnRS();
orphanRegionsOnFS = hbckChore.getOrphanRegionsOnFS();
startTimestamp = hbckChore.getCheckingStartTimestamp();
endTimestamp = hbckChore.getCheckingEndTimestamp();
}
ZonedDateTime zdt = ZonedDateTime.ofInstant(Instant.ofEpochMilli(startTimestamp),
ZoneId.systemDefault());

View File

@ -28,6 +28,7 @@ import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface;
@ -223,6 +224,20 @@ public class TestHbck {
waitOnPids(pids);
}
/**
 * Keeps asking the master to run the HBCK chore until a request is accepted, then checks that
 * the chore's checking-end timestamp moved past the value captured before the request.
 */
@Test
public void testRunHbckChore() throws Exception {
  final HMaster master = TEST_UTIL.getMiniHBaseCluster().getMaster();
  final long previousEnd = master.getHbckChore().getCheckingEndTimestamp();
  final Hbck hbck = getHbck();
  // runHbckChore() returns false while a run is already in progress, so retry until accepted.
  boolean accepted;
  do {
    accepted = hbck.runHbckChore();
  } while (!accepted);
  assertTrue(master.getHbckChore().getCheckingEndTimestamp() > previousEnd);
}
private void waitOnPids(List<Long> pids) {
TEST_UTIL.waitFor(60000, () -> pids.stream().allMatch(procExec::isFinished));
}

View File

@ -31,7 +31,7 @@ import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.master.HbckChecker;
import org.apache.hadoop.hbase.master.HbckChore;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Pair;
@ -43,19 +43,19 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@Category({ MasterTests.class, MediumTests.class })
public class TestHbckChecker extends TestAssignmentManagerBase {
private static final Logger LOG = LoggerFactory.getLogger(TestHbckChecker.class);
public class TestHbckChore extends TestAssignmentManagerBase {
private static final Logger LOG = LoggerFactory.getLogger(TestHbckChore.class);
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestHbckChecker.class);
HBaseClassTestRule.forClass(TestHbckChore.class);
private HbckChecker hbckChecker;
private HbckChore hbckChore;
@Before
public void setUp() throws Exception {
super.setUp();
hbckChecker = new HbckChecker(master);
hbckChore = new HbckChore(master);
}
@Test
@ -65,9 +65,9 @@ public class TestHbckChecker extends TestAssignmentManagerBase {
List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
assertEquals(NSERVERS, serverNames.size());
hbckChecker.choreForTesting();
hbckChore.choreForTesting();
Map<String, Pair<ServerName, List<ServerName>>> inconsistentRegions =
hbckChecker.getInconsistentRegions();
hbckChore.getInconsistentRegions();
// Test for case1: Master thought this region opened, but no regionserver reported it.
assertTrue(inconsistentRegions.containsKey(metaRegionName));
@ -79,8 +79,8 @@ public class TestHbckChecker extends TestAssignmentManagerBase {
// Reported right region location. Then not in problematic regions.
am.reportOnlineRegions(locationInMeta, Collections.singleton(metaRegionNameAsBytes));
hbckChecker.choreForTesting();
inconsistentRegions = hbckChecker.getInconsistentRegions();
hbckChore.choreForTesting();
inconsistentRegions = hbckChore.getInconsistentRegions();
assertFalse(inconsistentRegions.containsKey(metaRegionName));
}
@ -97,9 +97,9 @@ public class TestHbckChecker extends TestAssignmentManagerBase {
assertEquals(NSERVERS, serverNames.size());
// Test for case1: Master thought this region opened, but no regionserver reported it.
hbckChecker.choreForTesting();
hbckChore.choreForTesting();
Map<String, Pair<ServerName, List<ServerName>>> inconsistentRegions =
hbckChecker.getInconsistentRegions();
hbckChore.getInconsistentRegions();
assertTrue(inconsistentRegions.containsKey(regionName));
Pair<ServerName, List<ServerName>> pair = inconsistentRegions.get(regionName);
ServerName locationInMeta = pair.getFirst();
@ -113,8 +113,8 @@ public class TestHbckChecker extends TestAssignmentManagerBase {
final ServerName anotherServer =
serverNames.stream().filter(s -> !s.equals(tempLocationInMeta)).findFirst().get();
am.reportOnlineRegions(anotherServer, Collections.singleton(hri.getRegionName()));
hbckChecker.choreForTesting();
inconsistentRegions = hbckChecker.getInconsistentRegions();
hbckChore.choreForTesting();
inconsistentRegions = hbckChore.getInconsistentRegions();
assertTrue(inconsistentRegions.containsKey(regionName));
pair = inconsistentRegions.get(regionName);
locationInMeta = pair.getFirst();
@ -125,8 +125,8 @@ public class TestHbckChecker extends TestAssignmentManagerBase {
// Test for case3: More than one regionservers reported opened this region.
am.reportOnlineRegions(locationInMeta, Collections.singleton(hri.getRegionName()));
hbckChecker.choreForTesting();
inconsistentRegions = hbckChecker.getInconsistentRegions();
hbckChore.choreForTesting();
inconsistentRegions = hbckChore.getInconsistentRegions();
assertTrue(inconsistentRegions.containsKey(regionName));
pair = inconsistentRegions.get(regionName);
locationInMeta = pair.getFirst();
@ -137,8 +137,8 @@ public class TestHbckChecker extends TestAssignmentManagerBase {
// Reported right region location. Then not in problematic regions.
am.reportOnlineRegions(anotherServer, Collections.EMPTY_SET);
hbckChecker.choreForTesting();
inconsistentRegions = hbckChecker.getInconsistentRegions();
hbckChore.choreForTesting();
inconsistentRegions = hbckChore.getInconsistentRegions();
assertFalse(inconsistentRegions.containsKey(regionName));
}
}

View File

@ -37,6 +37,7 @@ module Hbase
@connection = connection
# Java Admin instance
@admin = @connection.getAdmin
@hbck = @connection.getHbck
@conf = @connection.getConfiguration
end
@ -236,6 +237,12 @@ module Hbase
@admin.isMasterInMaintenanceMode
end
#----------------------------------------------------------------------------------------------
# Request HBCK chore to run.
# Delegates to runHbckChore on the Hbck client obtained from the connection
# and returns that call's result.
def hbck_chore_run
@hbck.runHbckChore
end
#----------------------------------------------------------------------------------------------
# Request a scan of the catalog table (for garbage collection)
# Returns an int signifying the number of entries cleaned

View File

@ -343,6 +343,7 @@ Shell.load_command_group(
unassign
zk_dump
wal_roll
hbck_chore_run
catalogjanitor_run
catalogjanitor_switch
catalogjanitor_enabled

View File

@ -0,0 +1,38 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
module Shell
module Commands
# Shell command 'hbck_chore_run': asks the master to run its HBCK chore.
class HbckChoreRun < Command
# Returns the help text for this command.
def help
<<-EOF
Request HBCK chore to run at master side. It will try to find the orphan
regions on RegionServer or FileSystem and find the inconsistent regions.
You can check the HBCK report at Master web UI.
hbase> hbck_chore_run
EOF
end
# Runs the command by delegating to the admin wrapper's hbck_chore_run.
def command
admin.hbck_chore_run
end
end
end
end

View File

@ -57,6 +57,10 @@ module Hbase
command(:disable, @test_name)
assert(!command(:is_enabled, @test_name))
end
# Smoke test: only invokes the hbck_chore_run command; its return value is not asserted.
define_test 'hbck_chore_run' do
command(:hbck_chore_run)
end
end
# Simple administration methods tests