From fd86de98e192d55f221f1891b712e927761736d9 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Fri, 3 Nov 2017 15:13:13 -0700 Subject: [PATCH] HBASE-19165 TODO Handle stuck in transition: rit=OPENING, location=ve0538.... --- .../hbase/zookeeper/RecoverableZooKeeper.java | 3 ++- .../hadoop/hbase/master/ServerManager.java | 2 +- .../master/assignment/AssignmentManager.java | 19 ++++++++++++++----- .../master/assignment/RegionStateStore.java | 17 ++++++++++++----- .../hbase/master/assignment/RegionStates.java | 10 +++++++++- .../procedure/EnableTableProcedure.java | 6 +++--- .../hbase/zookeeper/ZooKeeperMainServer.java | 4 +++- .../TestRestoreSnapshotProcedure.java | 2 +- 8 files changed, 45 insertions(+), 18 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java index 1c3138bfa6f..327bafbdc8f 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java @@ -295,12 +295,13 @@ public class RecoverableZooKeeper { private void retryOrThrow(RetryCounter retryCounter, KeeperException e, String opName) throws KeeperException { - LOG.debug("Possibly transient ZooKeeper, quorum=" + quorumServers + ", exception=" + e); if (!retryCounter.shouldRetry()) { LOG.error("ZooKeeper " + opName + " failed after " + retryCounter.getMaxAttempts() + " attempts"); throw e; } + LOG.debug("Retry, connectivity issue (JVM Pause?); quorum=" + quorumServers + "," + + "exception=" + e); } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index c0143030a4a..f19995fb8dd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -656,7 +656,7 @@ public class ServerManager { } if (!master.getAssignmentManager().isFailoverCleanupDone()) { - LOG.info("AssignmentManager hasn't finished failover cleanup; waiting"); + LOG.debug("AssignmentManager failover cleanup not done."); } for (Map.Entry entry : requeuedDeadServers.entrySet()) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 8bdf4d5dfd1..f4a3ca87f78 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -935,7 +935,7 @@ public class AssignmentManager implements ServerListener { wakeServerReportEvent(serverNode); } - public void checkOnlineRegionsReportForMeta(final ServerStateNode serverNode, + void checkOnlineRegionsReportForMeta(final ServerStateNode serverNode, final Set regionNames) { try { for (byte[] regionName: regionNames) { @@ -951,7 +951,7 @@ public class AssignmentManager implements ServerListener { final RegionStateNode regionNode = regionStates.getOrCreateRegionNode(hri); LOG.info("META REPORTED: " + regionNode); if (!reportTransition(regionNode, serverNode, TransitionCode.OPENED, 0)) { - LOG.warn("META REPORTED but no procedure found"); + LOG.warn("META REPORTED but no procedure found (complete?)"); regionNode.setRegionLocation(serverNode.getServerName()); } else if (LOG.isTraceEnabled()) { LOG.trace("META REPORTED: " + regionNode); @@ -1183,17 +1183,26 @@ public class AssignmentManager implements ServerListener { public void visitRegionState(final RegionInfo regionInfo, final State state, final ServerName regionLocation, final ServerName lastHost, final long openSeqNum) { final RegionStateNode regionNode = 
regionStates.getOrCreateRegionNode(regionInfo); + State localState = state; + if (localState == null) { + // No region state column data in hbase:meta table! Am I doing a rolling upgrade from + // hbase1 to hbase2? Am I restoring a SNAPSHOT or otherwise adding a region to hbase:meta? + // In any of these cases, state is empty. For now, presume OFFLINE but there are probably + // cases where we need to probe more to be sure this is correct; TODO informed by experience. + LOG.info(regionInfo.getEncodedName() + " state=null; presuming " + State.OFFLINE); + localState = State.OFFLINE; + } synchronized (regionNode) { if (!regionNode.isInTransition()) { - regionNode.setState(state); + regionNode.setState(localState); regionNode.setLastHost(lastHost); regionNode.setRegionLocation(regionLocation); regionNode.setOpenSeqNum(openSeqNum); - if (state == State.OPEN) { + if (localState == State.OPEN) { assert regionLocation != null : "found null region location for " + regionNode; regionStates.addRegionToServer(regionLocation, regionNode); - } else if (state == State.OFFLINE || regionInfo.isOffline()) { + } else if (localState == State.OFFLINE || regionInfo.isOffline()) { regionStates.addToOfflineRegions(regionNode); } else { // These regions should have a procedure in replay diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java index f9a1b438c8a..723a7761d0a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java @@ -90,10 +90,15 @@ public class RegionStateStore { @Override public boolean visit(final Result r) throws IOException { if (r != null && !r.isEmpty()) { - long st = System.currentTimeMillis(); + long st = 0; + if (LOG.isTraceEnabled()) { + st = System.currentTimeMillis(); + } 
visitMetaEntry(visitor, r); - long et = System.currentTimeMillis(); - LOG.info("[T] LOAD META PERF " + StringUtils.humanTimeDiff(et - st)); + if (LOG.isTraceEnabled()) { + long et = System.currentTimeMillis(); + LOG.trace("[T] LOAD META PERF " + StringUtils.humanTimeDiff(et - st)); + } } else if (isDebugEnabled) { LOG.debug("NULL result from meta - ignoring but this is strange."); } @@ -310,11 +315,13 @@ public class RegionStateStore { /** * Pull the region state from a catalog table {@link Result}. * @param r Result to pull the region state from - * @return the region state, or OPEN if there's no value written. + * @return the region state, or null if unknown. */ protected State getRegionState(final Result r, int replicaId) { Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY, getStateColumn(replicaId)); - if (cell == null || cell.getValueLength() == 0) return State.OPENING; + if (cell == null || cell.getValueLength() == 0) { + return null; + } return State.valueOf(Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength())); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java index 3b58fe26ce8..f404abbc82b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java @@ -542,11 +542,19 @@ public class RegionStates { return !getTableRegionStates(tableName).isEmpty(); } + /** + * @return Return online regions of table; does not include OFFLINE or SPLITTING regions. + */ public List getRegionsOfTable(final TableName table) { return getRegionsOfTable(table, false); } - List getRegionsOfTable(final TableName table, final boolean offline) { + /** + * @return Return the regions of the table; does not include OFFLINE unless you set + * offline to true. 
Does not include regions that are in the + * {@link State#SPLIT} state. + */ + public List getRegionsOfTable(final TableName table, final boolean offline) { final ArrayList nodes = getTableRegionStateNodes(table); final ArrayList hris = new ArrayList(nodes.size()); for (RegionStateNode node: nodes) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/EnableTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/EnableTableProcedure.java index 6c94effd0bc..aaac96d6bc7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/EnableTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/EnableTableProcedure.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -120,9 +120,9 @@ public class EnableTableProcedure // Get the replica count int regionReplicaCount = hTableDescriptor.getRegionReplication(); - // Get the regions for the table from the memory + // Get the regions for the table from memory; get both online and offline regions ('true'). 
List regionsOfTable = - env.getAssignmentManager().getRegionStates().getRegionsOfTable(tableName); + env.getAssignmentManager().getRegionStates().getRegionsOfTable(tableName, true); if (regionReplicaCount > 1) { int currentMaxReplica = 0; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperMainServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperMainServer.java index 20a2d6dcc7e..341fbbdfa82 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperMainServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperMainServer.java @@ -56,7 +56,9 @@ public class ZooKeeperMainServer { while (!this.zk.getState().isConnected()) { Thread.sleep(1); if (stopWatch.elapsed(TimeUnit.SECONDS) > 10) { - throw new InterruptedException("Failed connect " + this.zk); + throw new InterruptedException("Failed connect after waiting " + + stopWatch.elapsed(TimeUnit.SECONDS) + "seconds; state=" + this.zk.getState() + + "; " + this.zk); } } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestRestoreSnapshotProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestRestoreSnapshotProcedure.java index 58042d80698..13146f72b20 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestRestoreSnapshotProcedure.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestRestoreSnapshotProcedure.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information