HBASE-9151 HBCK cannot fix when meta server znode is deleted; this can happen if all region servers are stopped and there are no logs to split.

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1553056 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
rajeshbabu 2013-12-23 01:44:37 +00:00
parent b6c898a138
commit 8d7ed731a9
4 changed files with 60 additions and 35 deletions

View File

@ -1864,12 +1864,9 @@ public class AssignmentManager extends ZooKeeperListener {
} else {
if (region.isMetaRegion()) {
try {
if (i != maximumAttempts) {
Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
if (i == maximumAttempts) i = 1;
continue;
}
// TODO : Ensure HBCK fixes this
LOG.error("Unable to determine a plan to assign hbase:meta even after repeated attempts. Run HBCK to fix this");
} catch (InterruptedException e) {
LOG.error("Got exception while waiting for hbase:meta assignment");
Thread.currentThread().interrupt();

View File

@ -411,11 +411,7 @@ public class HBaseFsck extends Configured {
// get regions according to what is online on each RegionServer
loadDeployedRegions();
// check whether hbase:meta is deployed and online
if (!recordMetaRegion()) {
// Will remove later if we can fix it
errors.reportError("Fatal error: unable to get hbase:meta region location. Exiting...");
return -2;
}
recordMetaRegion();
// Check if hbase:meta is found only once and in the right place
if (!checkMetaRegion()) {
String errorMsg = "hbase:meta table is not consistent. ";
@ -2577,8 +2573,12 @@ public class HBaseFsck extends Configured {
// There will be always one entry in regionInfoMap corresponding to hbase:meta
// Check the deployed servers. It should be exactly one server.
HbckInfo metaHbckInfo = metaRegions.get(0);
List<ServerName> servers = metaHbckInfo.deployedOn;
List<ServerName> servers = new ArrayList<ServerName>();
HbckInfo metaHbckInfo = null;
if (!metaRegions.isEmpty()) {
metaHbckInfo = metaRegions.get(0);
servers = metaHbckInfo.deployedOn;
}
if (servers.size() != 1) {
if (servers.size() == 0) {
errors.reportError(ERROR_CODE.NO_META_REGION, "hbase:meta is not found on any region.");
@ -2586,8 +2586,8 @@ public class HBaseFsck extends Configured {
errors.print("Trying to fix a problem with hbase:meta..");
setShouldRerun();
// try to fix it (treat it as unassigned region)
HBaseFsckRepair.fixUnassigned(admin, metaHbckInfo.metaEntry);
HBaseFsckRepair.waitUntilAssigned(admin, metaHbckInfo.metaEntry);
HBaseFsckRepair.fixUnassigned(admin, HRegionInfo.FIRST_META_REGIONINFO);
HBaseFsckRepair.waitUntilAssigned(admin, HRegionInfo.FIRST_META_REGIONINFO);
}
} else if (servers.size() > 1) {
errors

View File

@ -19,10 +19,10 @@
package org.apache.hadoop.hbase.regionserver;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -50,6 +50,7 @@ public class TestRSKilledWhenInitializing {
private static final Log LOG = LogFactory.getLog(TestRSKilledWhenInitializing.class);
private static boolean masterActive = false;
private static AtomicBoolean firstRS = new AtomicBoolean(true);
/**
* Test verifies whether a region server is removed from the online servers list in master if it went
@ -60,7 +61,8 @@ public class TestRSKilledWhenInitializing {
public void testRSTermnationAfterRegisteringToMasterBeforeCreatingEphemeralNod() throws Exception {
final int NUM_MASTERS = 1;
final int NUM_RS = 1;
final int NUM_RS = 2;
firstRS.set(true);
// Create config to use for this cluster
Configuration conf = HBaseConfiguration.create();
@ -86,21 +88,20 @@ public class TestRSKilledWhenInitializing {
}
masterActive = true;
cluster.getRegionServers().get(0).start();
cluster.getRegionServers().get(1).start();
Thread.sleep(10000);
List<ServerName> onlineServersList =
master.getMaster().getServerManager().getOnlineServersList();
while (!onlineServersList.isEmpty()) {
while (onlineServersList.size() != 1) {
Thread.sleep(100);
onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
}
assertTrue(onlineServersList.isEmpty());
master.getMaster().stop("stopping master");
master.join();
assertEquals(onlineServersList.size(), 1);
cluster.shutdown();
} finally {
masterActive = false;
TEST_UTIL.shutdownMiniZKCluster();
TEST_UTIL.shutdownMiniDFSCluster();
TEST_UTIL.cleanupTestDir();
firstRS.set(true);
TEST_UTIL.shutdownMiniCluster();
}
}
@ -112,6 +113,7 @@ public class TestRSKilledWhenInitializing {
@Override
protected void handleReportForDutyResponse(RegionServerStartupResponse c) throws IOException {
if (firstRS.getAndSet(false)) {
for (NameStringPair e : c.getMapEntriesList()) {
String key = e.getName();
// The hostname the master sees us as.
@ -125,6 +127,9 @@ public class TestRSKilledWhenInitializing {
Threads.sleep(100);
}
super.kill();
} else {
super.handleReportForDutyResponse(c);
}
}
}
}

View File

@ -93,6 +93,7 @@ import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@ -210,6 +211,28 @@ public class TestHBaseFsck {
meta.close();
}
/**
 * Exercises hbck after hbase:meta has been closed on the server holding it and its
 * location has been removed through {@link MetaRegionTracker#deleteMetaLocation}.
 * The first hbck run is expected to report UNKNOWN, NO_META_REGION and NULL_META_REGION;
 * a second run is then expected to report no errors.
 */
@Test(timeout=180000)
public void testFixAssignmentsWhenMETAinTransition() throws Exception {
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
HBaseAdmin admin = null;
try {
// Close hbase:meta on whichever server the mini cluster reports as holding it.
admin = new HBaseAdmin(TEST_UTIL.getConfiguration());
admin.closeRegion(cluster.getServerHoldingMeta(),
HRegionInfo.FIRST_META_REGIONINFO);
} finally {
// Release the admin even if closing the region threw.
if (admin != null) {
admin.close();
}
}
// regionStates is declared outside this method; presumably the master's region-state
// tracker -- verify against the test class setup.
regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
// Remove the recorded hbase:meta location using the master's ZooKeeper handle.
MetaRegionTracker.deleteMetaLocation(cluster.getMaster().getZooKeeper());
// Precondition for the hbck runs below: meta must not be reported online.
assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));
// NOTE(review): the boolean passed to doFsck looks like the "fix" flag -- confirm
// against the helper's definition.
HBaseFsck hbck = doFsck(conf, true);
assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION,
ERROR_CODE.NULL_META_REGION });
// A follow-up run must come back clean.
assertNoErrors(doFsck(conf, false));
}
/**
* Create a new region in META.
*/