HBASE-18167 OfflineMetaRepair tool may cause HMaster to abort always
Signed-off-by: tedyu <yuzhihong@gmail.com>
This commit is contained in:
parent
532e0dda16
commit
01027f805b
|
@ -25,6 +25,7 @@ import java.io.InterruptedIOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NavigableMap;
|
||||
|
@ -575,6 +576,34 @@ public class MetaTableAccessor {
|
|||
return hris;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve server names from meta table.
|
||||
* @param connection connection we're using
|
||||
* @return List of region servers.
|
||||
* @throws IOException
|
||||
*/
|
||||
public static Set<ServerName> getServerNames(Connection connection) throws IOException {
|
||||
final Set<ServerName> serverNames = new HashSet<ServerName>();
|
||||
// Fill the above serverNames set with server entries from hbase:meta
|
||||
CollectingVisitor<Result> v = new CollectingVisitor<Result>() {
|
||||
@Override
|
||||
void add(Result r) {
|
||||
if (r == null || r.isEmpty()) return;
|
||||
RegionLocations locations = getRegionLocations(r);
|
||||
if (locations == null) return;
|
||||
for (HRegionLocation loc : locations.getRegionLocations()) {
|
||||
if (loc != null) {
|
||||
if (loc.getServerName() != null) {
|
||||
serverNames.add(loc.getServerName());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
fullScan(connection, v);
|
||||
return serverNames;
|
||||
}
|
||||
|
||||
public static void fullScanMetaAndPrint(Connection connection)
|
||||
throws IOException {
|
||||
Visitor v = new Visitor() {
|
||||
|
|
|
@ -634,17 +634,10 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
}
|
||||
}
|
||||
|
||||
Set<TableName> disabledOrDisablingOrEnabling = null;
|
||||
Map<HRegionInfo, ServerName> allRegions = null;
|
||||
|
||||
if (!failover) {
|
||||
disabledOrDisablingOrEnabling = tableStateManager.getTablesInStates(
|
||||
ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING,
|
||||
ZooKeeperProtos.Table.State.ENABLING);
|
||||
|
||||
// Clean re/start, mark all user regions closed before reassignment
|
||||
allRegions = regionStates.closeAllUserRegions(
|
||||
disabledOrDisablingOrEnabling);
|
||||
// Retrieve user regions except tables region that are in disabled/disabling/enabling states.
|
||||
allRegions = getUserRegionsToAssign();
|
||||
}
|
||||
|
||||
// Now region states are restored
|
||||
|
@ -656,6 +649,15 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
// Process list of dead servers and regions in RIT.
|
||||
// See HBASE-4580 for more information.
|
||||
processDeadServersAndRecoverLostRegions(deadServers);
|
||||
|
||||
// Handle the scenario when meta is rebuilt by OfflineMetaRepair tool.
|
||||
// In this scenario, meta will have only info:regioninfo entries (won't contain info:server)
|
||||
// which leads SSH to skip holding region assignment.
|
||||
if (MetaTableAccessor.getServerNames(server.getConnection()).isEmpty()) {
|
||||
// Need to assign the user region as a fresh startup, otherwise user region assignment will
|
||||
// never happen
|
||||
assignRegionsOnSSHCompletion();
|
||||
}
|
||||
}
|
||||
|
||||
if (!failover && useZKForAssignment) {
|
||||
|
@ -685,6 +687,59 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
return failover;
|
||||
}
|
||||
|
||||
/*
|
||||
* At cluster clean re/start, mark all user regions closed except those of tables that are
|
||||
* excluded, such as disabled/disabling/enabling tables. All user regions and their previous
|
||||
* locations are returned.
|
||||
*/
|
||||
private Map<HRegionInfo, ServerName> getUserRegionsToAssign()
|
||||
throws InterruptedIOException, CoordinatedStateException {
|
||||
Set<TableName> disabledOrDisablingOrEnabling =
|
||||
tableStateManager.getTablesInStates(ZooKeeperProtos.Table.State.DISABLED,
|
||||
ZooKeeperProtos.Table.State.DISABLING, ZooKeeperProtos.Table.State.ENABLING);
|
||||
|
||||
// Clean re/start, mark all user regions closed before reassignment
|
||||
return regionStates.closeAllUserRegions(disabledOrDisablingOrEnabling);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for SSH completion and assign user region which are not in disabled/disabling/enabling
|
||||
* table states.
|
||||
*/
|
||||
private void assignRegionsOnSSHCompletion() {
|
||||
LOG.info("Meta is rebuild by OfflineMetaRepair tool, assigning all user regions.");
|
||||
Thread regionAssignerThread = new Thread("RegionAssignerOnMetaRebuild") {
|
||||
public void run() {
|
||||
long sshTimeout =
|
||||
server.getConfiguration().getLong("hbase.master.initializationmonitor.timeout", 900000);
|
||||
long startTime = EnvironmentEdgeManager.currentTime();
|
||||
// Wait until all dead sercessing is done.
|
||||
while (serverManager.areDeadServersInProgress()) {
|
||||
if (EnvironmentEdgeManager.currentTime() - startTime > sshTimeout) {
|
||||
LOG.warn(
|
||||
"Couldn't assign the regions as SSH was not finished within the specified time in hbase.master.initializationmonitor.timeout parameter.");
|
||||
return;
|
||||
}
|
||||
try {
|
||||
Thread.sleep(100);
|
||||
} catch (InterruptedException e) {
|
||||
LOG.warn("RegionAssignerOnMetaRebuild got interrupted.", e);
|
||||
break;
|
||||
}
|
||||
}
|
||||
LOG.info("SSH has been completed for all dead servers, assigning the user regions.");
|
||||
try {
|
||||
// Assign the regions
|
||||
assignAllUserRegions(getUserRegionsToAssign());
|
||||
} catch (CoordinatedStateException | IOException | InterruptedException e) {
|
||||
LOG.error("Exception occured while assigning user regions.", e);
|
||||
}
|
||||
};
|
||||
};
|
||||
regionAssignerThread.setDaemon(true);
|
||||
regionAssignerThread.start();
|
||||
}
|
||||
|
||||
/**
|
||||
* If region is up in zk in transition, then do fixup and block and wait until
|
||||
* the region is assigned and out of transition. Used on startup for
|
||||
|
|
|
@ -25,6 +25,7 @@ import static org.junit.Assert.assertTrue;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
@ -33,13 +34,17 @@ import org.apache.hadoop.fs.FileSystem;
|
|||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.HTableDescriptor;
|
||||
import org.apache.hadoop.hbase.NamespaceDescriptor;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.client.Admin;
|
||||
import org.apache.hadoop.hbase.client.Connection;
|
||||
import org.apache.hadoop.hbase.client.ConnectionFactory;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.FSUtils;
|
||||
import org.apache.hadoop.hbase.util.HBaseFsck;
|
||||
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
|
||||
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
|
||||
|
@ -77,6 +82,63 @@ public class TestOfflineMetaRebuildBase extends OfflineMetaRebuildTestCore {
|
|||
// bring up the minicluster
|
||||
TEST_UTIL.startMiniZKCluster();
|
||||
TEST_UTIL.restartHBaseCluster(3);
|
||||
validateMetaAndUserTableRows(1, 5);
|
||||
}
|
||||
|
||||
@Test(timeout = 300000)
|
||||
public void testHMasterStartupOnMetaRebuild() throws Exception {
|
||||
// shutdown the minicluster
|
||||
TEST_UTIL.shutdownMiniHBaseCluster();
|
||||
|
||||
// Assign meta in master and restart Hbase
|
||||
TEST_UTIL.getConfiguration().set("hbase.balancer.tablesOnMaster", "hbase:meta");
|
||||
// Set namespace initialization timeout
|
||||
TEST_UTIL.getConfiguration().set("hbase.master.namespace.init.timeout", "150000");
|
||||
TEST_UTIL.restartHBaseCluster(3);
|
||||
TEST_UTIL.getMiniHBaseCluster().waitForActiveAndReadyMaster();
|
||||
|
||||
// Create namespace
|
||||
TEST_UTIL.getHBaseAdmin().createNamespace(NamespaceDescriptor.create("ns1").build());
|
||||
TEST_UTIL.getHBaseAdmin().createNamespace(NamespaceDescriptor.create("ns2").build());
|
||||
// Create tables
|
||||
TEST_UTIL.createTable(TableName.valueOf("ns1:testHMasterStartupOnMetaRebuild"),
|
||||
Bytes.toBytes("cf1"));
|
||||
TEST_UTIL.createTable(TableName.valueOf("ns2:testHMasterStartupOnMetaRebuild"),
|
||||
Bytes.toBytes("cf1"));
|
||||
// Flush meta
|
||||
TEST_UTIL.flush(TableName.META_TABLE_NAME);
|
||||
|
||||
// HMaster graceful shutdown
|
||||
TEST_UTIL.getHBaseCluster().getMaster().shutdown();
|
||||
|
||||
// Kill region servers
|
||||
List<RegionServerThread> regionServerThreads =
|
||||
TEST_UTIL.getHBaseCluster().getRegionServerThreads();
|
||||
for (RegionServerThread regionServerThread : regionServerThreads) {
|
||||
TEST_UTIL.getHBaseCluster()
|
||||
.killRegionServer(regionServerThread.getRegionServer().getServerName());
|
||||
}
|
||||
|
||||
// rebuild meta table from scratch
|
||||
HBaseFsck fsck = new HBaseFsck(conf);
|
||||
assertTrue(fsck.rebuildMeta(false));
|
||||
|
||||
// bring up the minicluster
|
||||
TEST_UTIL.restartHBaseCluster(3);
|
||||
validateMetaAndUserTableRows(3, 7);
|
||||
|
||||
// Remove table and namesapce
|
||||
TEST_UTIL.deleteTable("ns1:testHMasterStartupOnMetaRebuild");
|
||||
TEST_UTIL.deleteTable("ns2:testHMasterStartupOnMetaRebuild");
|
||||
TEST_UTIL.getHBaseAdmin().deleteNamespace("ns1");
|
||||
TEST_UTIL.getHBaseAdmin().deleteNamespace("ns2");
|
||||
}
|
||||
|
||||
/*
|
||||
* Validate meta table region count and user table rows.
|
||||
*/
|
||||
private void validateMetaAndUserTableRows(int totalTableCount, int totalRegionCount)
|
||||
throws Exception {
|
||||
try (Connection connection = ConnectionFactory.createConnection(TEST_UTIL.getConfiguration())) {
|
||||
Admin admin = connection.getAdmin();
|
||||
admin.enableTable(table);
|
||||
|
@ -85,10 +147,10 @@ public class TestOfflineMetaRebuildBase extends OfflineMetaRebuildTestCore {
|
|||
LOG.info("No more RIT in ZK, now doing final test verification");
|
||||
|
||||
// everything is good again.
|
||||
assertEquals(5, scanMeta());
|
||||
assertEquals(totalRegionCount, scanMeta());
|
||||
HTableDescriptor[] htbls = admin.listTables();
|
||||
LOG.info("Tables present after restart: " + Arrays.toString(htbls));
|
||||
assertEquals(1, htbls.length);
|
||||
assertEquals(totalTableCount, htbls.length);
|
||||
}
|
||||
|
||||
assertErrors(doFsck(conf, false), new ERROR_CODE[] {});
|
||||
|
|
Loading…
Reference in New Issue