From 8d7ed731a9549c32e5f52a4efce2653f8a8fa2bc Mon Sep 17 00:00:00 2001 From: rajeshbabu Date: Mon, 23 Dec 2013 01:44:37 +0000 Subject: [PATCH] HBASE-9151 HBCK cannot fix when meta server znode deleted, this can happen if all region servers stopped and there are no logs to split. git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1553056 13f79535-47bb-0310-9956-ffa450edef68 --- .../hbase/master/AssignmentManager.java | 9 ++-- .../apache/hadoop/hbase/util/HBaseFsck.java | 18 ++++---- .../TestRSKilledWhenInitializing.java | 45 ++++++++++--------- .../hadoop/hbase/util/TestHBaseFsck.java | 23 ++++++++++ 4 files changed, 60 insertions(+), 35 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index b1a594d4604..39cbed1b657 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -1864,12 +1864,9 @@ public class AssignmentManager extends ZooKeeperListener { } else { if (region.isMetaRegion()) { try { - if (i != maximumAttempts) { - Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment); - continue; - } - // TODO : Ensure HBCK fixes this - LOG.error("Unable to determine a plan to assign hbase:meta even after repeated attempts. Run HBCK to fix this"); + Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment); + if (i == maximumAttempts) i = 1; + continue; } catch (InterruptedException e) { LOG.error("Got exception while waiting for hbase:meta assignment"); Thread.currentThread().interrupt(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java index 0ecc3ff1698..85e45128226 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java @@ -411,11 +411,7 @@ public class HBaseFsck extends Configured { // get regions according to what is online on each RegionServer loadDeployedRegions(); // check whether hbase:meta is deployed and online - if (!recordMetaRegion()) { - // Will remove later if we can fix it - errors.reportError("Fatal error: unable to get hbase:meta region location. Exiting..."); - return -2; - } + recordMetaRegion(); // Check if hbase:meta is found only once and in the right place if (!checkMetaRegion()) { String errorMsg = "hbase:meta table is not consistent. "; @@ -2577,8 +2573,12 @@ public class HBaseFsck extends Configured { // There will be always one entry in regionInfoMap corresponding to hbase:meta // Check the deployed servers. It should be exactly one server. - HbckInfo metaHbckInfo = metaRegions.get(0); - List servers = metaHbckInfo.deployedOn; + List servers = new ArrayList(); + HbckInfo metaHbckInfo = null; + if (!metaRegions.isEmpty()) { + metaHbckInfo = metaRegions.get(0); + servers = metaHbckInfo.deployedOn; + } if (servers.size() != 1) { if (servers.size() == 0) { errors.reportError(ERROR_CODE.NO_META_REGION, "hbase:meta is not found on any region."); @@ -2586,8 +2586,8 @@ public class HBaseFsck extends Configured { errors.print("Trying to fix a problem with hbase:meta.."); setShouldRerun(); // try to fix it (treat it as unassigned region) - HBaseFsckRepair.fixUnassigned(admin, metaHbckInfo.metaEntry); - HBaseFsckRepair.waitUntilAssigned(admin, metaHbckInfo.metaEntry); + HBaseFsckRepair.fixUnassigned(admin, HRegionInfo.FIRST_META_REGIONINFO); + HBaseFsckRepair.waitUntilAssigned(admin, HRegionInfo.FIRST_META_REGIONINFO); } } else if (servers.size() > 1) { errors diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java index e0ada2e9ede..6ac8230e1b5 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java @@ -19,10 +19,10 @@ package org.apache.hadoop.hbase.regionserver; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; import java.io.IOException; import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -50,6 +50,7 @@ public class TestRSKilledWhenInitializing { private static final Log LOG = LogFactory.getLog(TestRSKilledWhenInitializing.class); private static boolean masterActive = false; + private static AtomicBoolean firstRS = new AtomicBoolean(true); /** * Test verifies whether a region server is removing from online servers list in master if it went @@ -60,7 +61,8 @@ public class TestRSKilledWhenInitializing { public void testRSTermnationAfterRegisteringToMasterBeforeCreatingEphemeralNod() throws Exception { final int NUM_MASTERS = 1; - final int NUM_RS = 1; + final int NUM_RS = 2; + firstRS.set(true); // Create config to use for this cluster Configuration conf = HBaseConfiguration.create(); @@ -86,21 +88,20 @@ public class TestRSKilledWhenInitializing { } masterActive = true; cluster.getRegionServers().get(0).start(); + cluster.getRegionServers().get(1).start(); Thread.sleep(10000); List onlineServersList = master.getMaster().getServerManager().getOnlineServersList(); - while (!onlineServersList.isEmpty()) { + while (onlineServersList.size() != 1) { Thread.sleep(100); onlineServersList = master.getMaster().getServerManager().getOnlineServersList(); } - assertTrue(onlineServersList.isEmpty()); - master.getMaster().stop("stopping master"); - master.join(); + assertEquals(onlineServersList.size(), 1); + cluster.shutdown(); } finally { masterActive = false; - TEST_UTIL.shutdownMiniZKCluster(); - TEST_UTIL.shutdownMiniDFSCluster(); - TEST_UTIL.cleanupTestDir(); + firstRS.set(true); + TEST_UTIL.shutdownMiniCluster(); } } @@ -112,19 +113,23 @@ public class TestRSKilledWhenInitializing { @Override protected void handleReportForDutyResponse(RegionServerStartupResponse c) throws IOException { - for (NameStringPair e : c.getMapEntriesList()) { - String key = e.getName(); - // The hostname the master sees us as. - if (key.equals(HConstants.KEY_FOR_HOSTNAME_SEEN_BY_MASTER)) { - String hostnameFromMasterPOV = e.getValue(); - assertEquals(super.getRpcServer().getListenerAddress().getHostName(), - hostnameFromMasterPOV); + if (firstRS.getAndSet(false)) { + for (NameStringPair e : c.getMapEntriesList()) { + String key = e.getName(); + // The hostname the master sees us as. + if (key.equals(HConstants.KEY_FOR_HOSTNAME_SEEN_BY_MASTER)) { + String hostnameFromMasterPOV = e.getValue(); + assertEquals(super.getRpcServer().getListenerAddress().getHostName(), + hostnameFromMasterPOV); + } } + while (!masterActive) { + Threads.sleep(100); + } + super.kill(); + } else { + super.handleReportForDutyResponse(c); } - while (!masterActive) { - Threads.sleep(100); - } - super.kill(); } } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java index 153777efc5b..8cddae8e009 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java @@ -93,6 +93,7 @@ import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter; import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo; import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker; import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil; +import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker; import org.apache.zookeeper.KeeperException; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -210,6 +211,28 @@ public class TestHBaseFsck { meta.close(); } + @Test(timeout=180000) + public void testFixAssignmentsWhenMETAinTransition() throws Exception { + MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + HBaseAdmin admin = null; + try { + admin = new HBaseAdmin(TEST_UTIL.getConfiguration()); + admin.closeRegion(cluster.getServerHoldingMeta(), + HRegionInfo.FIRST_META_REGIONINFO); + } finally { + if (admin != null) { + admin.close(); + } + } + regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO); + MetaRegionTracker.deleteMetaLocation(cluster.getMaster().getZooKeeper()); + assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO)); + HBaseFsck hbck = doFsck(conf, true); + assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION, + ERROR_CODE.NULL_META_REGION }); + assertNoErrors(doFsck(conf, false)); + } + /** * Create a new region in META. */