HBASE-9151 HBCK cannot fix the cluster when the meta server znode has been deleted; this can happen if all region servers are stopped and there are no logs to split.
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1553056 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b6c898a138
commit
8d7ed731a9
@ -1864,12 +1864,9 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||||||
} else {
|
} else {
|
||||||
if (region.isMetaRegion()) {
|
if (region.isMetaRegion()) {
|
||||||
try {
|
try {
|
||||||
if (i != maximumAttempts) {
|
Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
|
||||||
Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
|
if (i == maximumAttempts) i = 1;
|
||||||
continue;
|
continue;
|
||||||
}
|
|
||||||
// TODO : Ensure HBCK fixes this
|
|
||||||
LOG.error("Unable to determine a plan to assign hbase:meta even after repeated attempts. Run HBCK to fix this");
|
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
LOG.error("Got exception while waiting for hbase:meta assignment");
|
LOG.error("Got exception while waiting for hbase:meta assignment");
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
|
@ -411,11 +411,7 @@ public class HBaseFsck extends Configured {
|
|||||||
// get regions according to what is online on each RegionServer
|
// get regions according to what is online on each RegionServer
|
||||||
loadDeployedRegions();
|
loadDeployedRegions();
|
||||||
// check whether hbase:meta is deployed and online
|
// check whether hbase:meta is deployed and online
|
||||||
if (!recordMetaRegion()) {
|
recordMetaRegion();
|
||||||
// Will remove later if we can fix it
|
|
||||||
errors.reportError("Fatal error: unable to get hbase:meta region location. Exiting...");
|
|
||||||
return -2;
|
|
||||||
}
|
|
||||||
// Check if hbase:meta is found only once and in the right place
|
// Check if hbase:meta is found only once and in the right place
|
||||||
if (!checkMetaRegion()) {
|
if (!checkMetaRegion()) {
|
||||||
String errorMsg = "hbase:meta table is not consistent. ";
|
String errorMsg = "hbase:meta table is not consistent. ";
|
||||||
@ -2577,8 +2573,12 @@ public class HBaseFsck extends Configured {
|
|||||||
|
|
||||||
// There will be always one entry in regionInfoMap corresponding to hbase:meta
|
// There will be always one entry in regionInfoMap corresponding to hbase:meta
|
||||||
// Check the deployed servers. It should be exactly one server.
|
// Check the deployed servers. It should be exactly one server.
|
||||||
HbckInfo metaHbckInfo = metaRegions.get(0);
|
List<ServerName> servers = new ArrayList<ServerName>();
|
||||||
List<ServerName> servers = metaHbckInfo.deployedOn;
|
HbckInfo metaHbckInfo = null;
|
||||||
|
if (!metaRegions.isEmpty()) {
|
||||||
|
metaHbckInfo = metaRegions.get(0);
|
||||||
|
servers = metaHbckInfo.deployedOn;
|
||||||
|
}
|
||||||
if (servers.size() != 1) {
|
if (servers.size() != 1) {
|
||||||
if (servers.size() == 0) {
|
if (servers.size() == 0) {
|
||||||
errors.reportError(ERROR_CODE.NO_META_REGION, "hbase:meta is not found on any region.");
|
errors.reportError(ERROR_CODE.NO_META_REGION, "hbase:meta is not found on any region.");
|
||||||
@ -2586,8 +2586,8 @@ public class HBaseFsck extends Configured {
|
|||||||
errors.print("Trying to fix a problem with hbase:meta..");
|
errors.print("Trying to fix a problem with hbase:meta..");
|
||||||
setShouldRerun();
|
setShouldRerun();
|
||||||
// try to fix it (treat it as unassigned region)
|
// try to fix it (treat it as unassigned region)
|
||||||
HBaseFsckRepair.fixUnassigned(admin, metaHbckInfo.metaEntry);
|
HBaseFsckRepair.fixUnassigned(admin, HRegionInfo.FIRST_META_REGIONINFO);
|
||||||
HBaseFsckRepair.waitUntilAssigned(admin, metaHbckInfo.metaEntry);
|
HBaseFsckRepair.waitUntilAssigned(admin, HRegionInfo.FIRST_META_REGIONINFO);
|
||||||
}
|
}
|
||||||
} else if (servers.size() > 1) {
|
} else if (servers.size() > 1) {
|
||||||
errors
|
errors
|
||||||
|
@ -19,10 +19,10 @@
|
|||||||
package org.apache.hadoop.hbase.regionserver;
|
package org.apache.hadoop.hbase.regionserver;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertTrue;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
@ -50,6 +50,7 @@ public class TestRSKilledWhenInitializing {
|
|||||||
private static final Log LOG = LogFactory.getLog(TestRSKilledWhenInitializing.class);
|
private static final Log LOG = LogFactory.getLog(TestRSKilledWhenInitializing.class);
|
||||||
|
|
||||||
private static boolean masterActive = false;
|
private static boolean masterActive = false;
|
||||||
|
private static AtomicBoolean firstRS = new AtomicBoolean(true);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test verifies whether a region server is removing from online servers list in master if it went
|
* Test verifies whether a region server is removing from online servers list in master if it went
|
||||||
@ -60,7 +61,8 @@ public class TestRSKilledWhenInitializing {
|
|||||||
public void testRSTermnationAfterRegisteringToMasterBeforeCreatingEphemeralNod() throws Exception {
|
public void testRSTermnationAfterRegisteringToMasterBeforeCreatingEphemeralNod() throws Exception {
|
||||||
|
|
||||||
final int NUM_MASTERS = 1;
|
final int NUM_MASTERS = 1;
|
||||||
final int NUM_RS = 1;
|
final int NUM_RS = 2;
|
||||||
|
firstRS.set(true);
|
||||||
// Create config to use for this cluster
|
// Create config to use for this cluster
|
||||||
Configuration conf = HBaseConfiguration.create();
|
Configuration conf = HBaseConfiguration.create();
|
||||||
|
|
||||||
@ -86,21 +88,20 @@ public class TestRSKilledWhenInitializing {
|
|||||||
}
|
}
|
||||||
masterActive = true;
|
masterActive = true;
|
||||||
cluster.getRegionServers().get(0).start();
|
cluster.getRegionServers().get(0).start();
|
||||||
|
cluster.getRegionServers().get(1).start();
|
||||||
Thread.sleep(10000);
|
Thread.sleep(10000);
|
||||||
List<ServerName> onlineServersList =
|
List<ServerName> onlineServersList =
|
||||||
master.getMaster().getServerManager().getOnlineServersList();
|
master.getMaster().getServerManager().getOnlineServersList();
|
||||||
while (!onlineServersList.isEmpty()) {
|
while (onlineServersList.size() != 1) {
|
||||||
Thread.sleep(100);
|
Thread.sleep(100);
|
||||||
onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
|
onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
|
||||||
}
|
}
|
||||||
assertTrue(onlineServersList.isEmpty());
|
assertEquals(onlineServersList.size(), 1);
|
||||||
master.getMaster().stop("stopping master");
|
cluster.shutdown();
|
||||||
master.join();
|
|
||||||
} finally {
|
} finally {
|
||||||
masterActive = false;
|
masterActive = false;
|
||||||
TEST_UTIL.shutdownMiniZKCluster();
|
firstRS.set(true);
|
||||||
TEST_UTIL.shutdownMiniDFSCluster();
|
TEST_UTIL.shutdownMiniCluster();
|
||||||
TEST_UTIL.cleanupTestDir();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -112,19 +113,23 @@ public class TestRSKilledWhenInitializing {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void handleReportForDutyResponse(RegionServerStartupResponse c) throws IOException {
|
protected void handleReportForDutyResponse(RegionServerStartupResponse c) throws IOException {
|
||||||
for (NameStringPair e : c.getMapEntriesList()) {
|
if (firstRS.getAndSet(false)) {
|
||||||
String key = e.getName();
|
for (NameStringPair e : c.getMapEntriesList()) {
|
||||||
// The hostname the master sees us as.
|
String key = e.getName();
|
||||||
if (key.equals(HConstants.KEY_FOR_HOSTNAME_SEEN_BY_MASTER)) {
|
// The hostname the master sees us as.
|
||||||
String hostnameFromMasterPOV = e.getValue();
|
if (key.equals(HConstants.KEY_FOR_HOSTNAME_SEEN_BY_MASTER)) {
|
||||||
assertEquals(super.getRpcServer().getListenerAddress().getHostName(),
|
String hostnameFromMasterPOV = e.getValue();
|
||||||
hostnameFromMasterPOV);
|
assertEquals(super.getRpcServer().getListenerAddress().getHostName(),
|
||||||
|
hostnameFromMasterPOV);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
while (!masterActive) {
|
||||||
|
Threads.sleep(100);
|
||||||
|
}
|
||||||
|
super.kill();
|
||||||
|
} else {
|
||||||
|
super.handleReportForDutyResponse(c);
|
||||||
}
|
}
|
||||||
while (!masterActive) {
|
|
||||||
Threads.sleep(100);
|
|
||||||
}
|
|
||||||
super.kill();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -93,6 +93,7 @@ import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
|
|||||||
import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
|
import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
|
||||||
import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
|
import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
|
||||||
import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
|
import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
|
||||||
|
import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
|
||||||
import org.apache.zookeeper.KeeperException;
|
import org.apache.zookeeper.KeeperException;
|
||||||
import org.junit.AfterClass;
|
import org.junit.AfterClass;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
@ -210,6 +211,28 @@ public class TestHBaseFsck {
|
|||||||
meta.close();
|
meta.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(timeout=180000)
|
||||||
|
public void testFixAssignmentsWhenMETAinTransition() throws Exception {
|
||||||
|
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
|
||||||
|
HBaseAdmin admin = null;
|
||||||
|
try {
|
||||||
|
admin = new HBaseAdmin(TEST_UTIL.getConfiguration());
|
||||||
|
admin.closeRegion(cluster.getServerHoldingMeta(),
|
||||||
|
HRegionInfo.FIRST_META_REGIONINFO);
|
||||||
|
} finally {
|
||||||
|
if (admin != null) {
|
||||||
|
admin.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
|
||||||
|
MetaRegionTracker.deleteMetaLocation(cluster.getMaster().getZooKeeper());
|
||||||
|
assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));
|
||||||
|
HBaseFsck hbck = doFsck(conf, true);
|
||||||
|
assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION,
|
||||||
|
ERROR_CODE.NULL_META_REGION });
|
||||||
|
assertNoErrors(doFsck(conf, false));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new region in META.
|
* Create a new region in META.
|
||||||
*/
|
*/
|
||||||
|
Loading…
x
Reference in New Issue
Block a user