From e8ba7b2320e2704ce413a2f906f443071cdf799d Mon Sep 17 00:00:00 2001 From: Apekshit Sharma Date: Fri, 8 Dec 2017 17:16:06 -0800 Subject: [PATCH] HBASE-19457 Debugging flaky TestTruncateTableProcedure - Adds debug logging for future ease - Removes 60s timeout since testRecoveryAndDoubleExecutionPreserveSplits is only halfway after a minute. - Adds some comments - Logging change: Some places report "regionState=" while others just "state=". State machine procs also have "state=" in their logs. Let me change all region related logging to "regionState=" so that 1) it's consistent everywhere, 2) more filtered results when searching through logs. --- .../apache/hadoop/hbase/util/CommonFSUtils.java | 2 +- .../hbase/procedure2/StateMachineProcedure.java | 3 +++ .../hbase/master/assignment/AssignmentManager.java | 13 +++++++------ .../master/procedure/TruncateTableProcedure.java | 6 ++++++ .../procedure/MasterProcedureTestingUtility.java | 7 ++++--- .../procedure/TestTruncateTableProcedure.java | 14 ++++++++------ 6 files changed, 29 insertions(+), 16 deletions(-) diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/CommonFSUtils.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/CommonFSUtils.java index bdf148eff15..38ae4766278 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/CommonFSUtils.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/CommonFSUtils.java @@ -733,7 +733,7 @@ public abstract class CommonFSUtils { */ public static void logFileSystemState(final FileSystem fs, final Path root, Log LOG) throws IOException { - LOG.debug("Current file system:"); + LOG.debug("File system contents for path " + root); logFSTree(LOG, fs, root, "|-"); } diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/StateMachineProcedure.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/StateMachineProcedure.java index bb1d4fc5427..893ee0cd57e 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/StateMachineProcedure.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/StateMachineProcedure.java @@ -176,6 +176,9 @@ public abstract class StateMachineProcedure this.cycles++; } + if (LOG.isDebugEnabled()) { + LOG.debug(toString()); + } stateFlow = executeFromState(env, state); if (!hasMoreState()) setNextState(EOF_STATE); if (subProcList != null && !subProcList.isEmpty()) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 26c6bb76b57..cebe0b04658 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -805,7 +805,7 @@ public class AssignmentManager implements ServerListener { } if (LOG.isTraceEnabled()) { - LOG.trace(String.format("Update region transition serverName=%s region=%s state=%s", + LOG.trace(String.format("Update region transition serverName=%s region=%s regionState=%s", serverName, regionNode, state)); } @@ -838,7 +838,7 @@ public class AssignmentManager implements ServerListener { checkFailoverCleanupCompleted(parent); if (state != TransitionCode.READY_TO_SPLIT) { - throw new UnexpectedStateException("unsupported split state=" + state + + throw new UnexpectedStateException("unsupported split regionState=" + state + " for parent region " + parent + " maybe an old RS (< 2.0) had the operation in progress"); } @@ -870,7 +870,7 @@ public class AssignmentManager implements ServerListener { checkFailoverCleanupCompleted(merged); if (state != TransitionCode.READY_TO_MERGE) { - throw new UnexpectedStateException("Unsupported merge state=" + state + + throw new UnexpectedStateException("Unsupported merge regionState=" + state + " for regionA=" + hriA + " regionB=" + hriB + " merged=" + merged + " maybe an old RS (< 2.0) had the operation in progress"); } @@ -884,7 +884,8 @@ public class AssignmentManager implements ServerListener { // If the RS is < 2.0 throw an exception to abort the operation, we are handling the merge if (regionStates.getOrCreateServer(serverName).getVersionNumber() < 0x0200000) { throw new UnsupportedOperationException(String.format( - "Merge not handled yet: state=%s merged=%s hriA=%s hriB=%s", state, merged, hriA, hriB)); + "Merge not handled yet: regionState=%s merged=%s hriA=%s hriB=%s", state, merged, hriA, + hriB)); } } @@ -1162,7 +1163,7 @@ public class AssignmentManager implements ServerListener { LOG.info("waiting for RS to join"); Threads.sleep(250); } - LOG.info("RS joined " + master.getServerManager().countOfRegionServers()); + LOG.info("RS joined. Num RS = " + master.getServerManager().countOfRegionServers()); // This method will assign all user regions if a clean server startup or // it will reconstruct master state and cleanup any leftovers from previous master process. @@ -1188,7 +1189,7 @@ public class AssignmentManager implements ServerListener { // hbase1 to hbase2? Am I restoring a SNAPSHOT or otherwise adding a region to hbase:meta? // In any of these cases, state is empty. For now, presume OFFLINE but there are probably // cases where we need to probe more to be sure this correct; TODO informed by experience. - LOG.info(regionInfo.getEncodedName() + " state=null; presuming " + State.OFFLINE); + LOG.info(regionInfo.getEncodedName() + " regionState=null; presuming " + State.OFFLINE); localState = State.OFFLINE; } synchronized (regionNode) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TruncateTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TruncateTableProcedure.java index c82f8d1f1a0..79b22712272 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TruncateTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TruncateTableProcedure.java @@ -104,6 +104,12 @@ public class TruncateTableProcedure break; case TRUNCATE_TABLE_CLEAR_FS_LAYOUT: DeleteTableProcedure.deleteFromFs(env, getTableName(), regions, true); + // NOTE: It's very important that we create new HRegions before next state, so that + // they get persisted in procedure state before we start using them for anything. + // Otherwise, if we create them in next step and master crashes after creating fs + // layout but before saving state, region re-created after recovery will have different + // regionId(s) and encoded names. That will lead to unwanted regions in FS layout + // (which were created before the crash). if (!preserveSplits) { // if we are not preserving splits, generate a new single region regions = Arrays.asList(ModifyRegionUtils.createRegionInfos(tableDescriptor, null)); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java index 14859669008..c68c01ffb48 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java @@ -170,11 +170,11 @@ public class MasterProcedureTestingUtility { final Path tableDir = FSUtils.getTableDir(master.getMasterFileSystem().getRootDir(), tableName); assertTrue(fs.exists(tableDir)); FSUtils.logFileSystemState(fs, tableDir, LOG); - List allRegionDirs = FSUtils.getRegionDirs(fs, tableDir); + List unwantedRegionDirs = FSUtils.getRegionDirs(fs, tableDir); for (int i = 0; i < regions.length; ++i) { Path regionDir = new Path(tableDir, regions[i].getEncodedName()); assertTrue(regions[i] + " region dir does not exist", fs.exists(regionDir)); - assertTrue(allRegionDirs.remove(regionDir)); + assertTrue(unwantedRegionDirs.remove(regionDir)); List allFamilyDirs = FSUtils.getFamilyDirs(fs, regionDir); for (int j = 0; j < family.length; ++j) { final Path familyDir = new Path(regionDir, family[j]); @@ -191,7 +191,8 @@ public class MasterProcedureTestingUtility { } assertTrue("found extraneous families: " + allFamilyDirs, allFamilyDirs.isEmpty()); } - assertTrue("found extraneous regions: " + allRegionDirs, allRegionDirs.isEmpty()); + assertTrue("found extraneous regions: " + unwantedRegionDirs, unwantedRegionDirs.isEmpty()); + LOG.debug("Table directory layout is as expected."); // check meta assertTrue(MetaTableAccessor.tableExists(master.getConnection(), tableName)); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestTruncateTableProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestTruncateTableProcedure.java index c5a17a034e8..2368af55619 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestTruncateTableProcedure.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestTruncateTableProcedure.java @@ -49,7 +49,7 @@ public class TestTruncateTableProcedure extends TestTableDDLProcedureBase { @Rule public TestName name = new TestName(); - @Test(timeout=60000) + @Test public void testTruncateNotExistentTable() throws Exception { final TableName tableName = TableName.valueOf(name.getMethodName()); @@ -64,7 +64,7 @@ public class TestTruncateTableProcedure extends TestTableDDLProcedureBase { assertTrue(ProcedureTestingUtility.getExceptionCause(result) instanceof TableNotFoundException); } - @Test(timeout=60000) + @Test public void testTruncateNotDisabledTable() throws Exception { final TableName tableName = TableName.valueOf(name.getMethodName()); @@ -82,13 +82,13 @@ public class TestTruncateTableProcedure extends TestTableDDLProcedureBase { ProcedureTestingUtility.getExceptionCause(result) instanceof TableNotDisabledException); } - @Test(timeout=60000) + @Test public void testSimpleTruncatePreserveSplits() throws Exception { final TableName tableName = TableName.valueOf(name.getMethodName()); testSimpleTruncate(tableName, true); } - @Test(timeout=60000) + @Test public void testSimpleTruncateNoPreserveSplits() throws Exception { final TableName tableName = TableName.valueOf(name.getMethodName()); testSimpleTruncate(tableName, false); @@ -116,6 +116,8 @@ public class TestTruncateTableProcedure extends TestTableDDLProcedureBase { new TruncateTableProcedure(procExec.getEnvironment(), tableName, preserveSplits)); ProcedureTestingUtility.assertProcNotFailed(procExec, procId); + // If truncate procedure completed successfully, it means all regions were assigned correctly + // and table is enabled now. UTIL.waitUntilAllRegionsAssigned(tableName); // validate the table regions and layout @@ -137,13 +139,13 @@ public class TestTruncateTableProcedure extends TestTableDDLProcedureBase { assertEquals(50, UTIL.countRows(tableName)); } - @Test(timeout=60000) + @Test public void testRecoveryAndDoubleExecutionPreserveSplits() throws Exception { final TableName tableName = TableName.valueOf(name.getMethodName()); testRecoveryAndDoubleExecution(tableName, true); } - @Test(timeout=60000) + @Test public void testRecoveryAndDoubleExecutionNoPreserveSplits() throws Exception { final TableName tableName = TableName.valueOf(name.getMethodName()); testRecoveryAndDoubleExecution(tableName, false);