diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/PleaseRestartMasterException.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/PleaseRestartMasterException.java
new file mode 100644
index 00000000000..62f84e9495b
--- /dev/null
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/PleaseRestartMasterException.java
@@ -0,0 +1,34 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase;
+
+import org.apache.yetus.audience.InterfaceAudience;
+
+/**
+ * Thrown if the master requires restart.
+ */
+@InterfaceAudience.Public
+public class PleaseRestartMasterException extends HBaseIOException {
+
+  public PleaseRestartMasterException(final String s) {
+    super(s);
+  }
+
+}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index f92dd4c63c8..9086ad3ea8c 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -70,6 +70,7 @@ import org.apache.hadoop.hbase.InvalidFamilyOperationException;
 import org.apache.hadoop.hbase.MasterNotRunningException;
 import org.apache.hadoop.hbase.NamespaceDescriptor;
 import org.apache.hadoop.hbase.PleaseHoldException;
+import org.apache.hadoop.hbase.PleaseRestartMasterException;
 import org.apache.hadoop.hbase.RegionMetrics;
 import org.apache.hadoop.hbase.ReplicationPeerNotFoundException;
 import org.apache.hadoop.hbase.ServerMetrics;
@@ -175,6 +176,7 @@ import org.apache.hadoop.hbase.quotas.SpaceQuotaSnapshotNotifier;
 import org.apache.hadoop.hbase.quotas.SpaceQuotaSnapshotNotifierFactory;
 import org.apache.hadoop.hbase.quotas.SpaceViolationPolicy;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.regionserver.NoSuchColumnFamilyException;
 import org.apache.hadoop.hbase.regionserver.RSRpcServices;
 import org.apache.hadoop.hbase.replication.ReplicationException;
 import org.apache.hadoop.hbase.replication.ReplicationLoadSource;
@@ -191,6 +193,7 @@ import org.apache.hadoop.hbase.security.UserProvider;
 import org.apache.hadoop.hbase.trace.TraceUtil;
 import org.apache.hadoop.hbase.util.Addressing;
 import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.FSTableDescriptors;
 import org.apache.hadoop.hbase.util.HBaseFsck;
 import org.apache.hadoop.hbase.util.HFileArchiveUtil;
 import org.apache.hadoop.hbase.util.IdLock;
@@ -953,9 +956,26 @@ public class HMaster extends HRegionServer implements MasterServices {
     if (!waitForMetaOnline()) {
       return;
     }
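+    // A null value for either family descriptor below means that column family is missing from
+    // hbase:meta, which is the case when meta was written by HBase 1; the HBase 1 to 2 upgrade
+    // handling further down keys off these two values.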
+    TableDescriptor metaDescriptor = tableDescriptors.get(
+      TableName.META_TABLE_NAME);
+    final ColumnFamilyDescriptor tableFamilyDesc = metaDescriptor
+      .getColumnFamily(HConstants.TABLE_FAMILY);
+    final ColumnFamilyDescriptor replBarrierFamilyDesc =
+      metaDescriptor.getColumnFamily(HConstants.REPLICATION_BARRIER_FAMILY);
+
     this.assignmentManager.joinCluster();
     // The below depends on hbase:meta being online.
-    this.tableStateManager.start();
+    try {
+      this.tableStateManager.start();
+    } catch (NoSuchColumnFamilyException e) {
+      if (tableFamilyDesc == null && replBarrierFamilyDesc == null) {
+        LOG.info("TableStateManager could not be started. This is expected"
+          + " during HBase 1 to 2 upgrade.", e);
+      } else {
+        throw e;
+      }
+    }
+
     this.assignmentManager.processOfflineRegions();
     // this must be called after the above processOfflineRegions to prevent race
     this.assignmentManager.wakeMetaLoadedEvent();
@@ -1025,7 +1045,17 @@ public class HMaster extends HRegionServer implements MasterServices {
       return;
     }
     status.setStatus("Starting cluster schema service");
-    initClusterSchemaService();
+    try {
+      initClusterSchemaService();
+    } catch (IllegalStateException e) {
+      if (e.getCause() != null && e.getCause() instanceof NoSuchColumnFamilyException
+          && tableFamilyDesc == null && replBarrierFamilyDesc == null) {
+        LOG.info("ClusterSchema service could not be initialized. This is "
+          + "expected during HBase 1 to 2 upgrade.", e);
+      } else {
+        throw e;
+      }
+    }
 
     if (this.cpHost != null) {
       try {
@@ -1047,6 +1077,29 @@
     // Set master as 'initialized'.
     setInitialized(true);
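+    // Upgrade path from HBase 1: hbase:meta is missing the 'table' and 'rep_barrier' families,
+    // which is why tableStateManager.start() and initClusterSchemaService() were bypassed above.
+    // Add the missing families now, then ask for a restart of this master so that the next
+    // activation runs the full init routine against a complete meta descriptor.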
+    if (tableFamilyDesc == null && replBarrierFamilyDesc == null) {
+      // create missing CFs in meta table after master is set to 'initialized'.
+      createMissingCFsInMetaDuringUpgrade(metaDescriptor);
+
+      // Throwing this Exception to abort the active master is painful, but it
+      // seems to be the only way to add the missing CFs to meta while upgrading
+      // from HBase 1 to 2 (where HBase 2 has HBASE-23055 & HBASE-23782 checked-in).
+      // So, why abort the active master after the missing CFs have been added?
+      // To reach this point we have already bypassed the NoSuchColumnFamilyException
+      // in initClusterSchemaService(), meaning the ClusterSchemaService is not
+      // correctly initialized, and we bypassed tableStateManager.start() in the
+      // same way. Our main task, adding the missing CFs to meta (possible only
+      // after the master state is set to initialized), is done, but at the cost
+      // of skipping those important steps of the active master init routine.
+      // Aborting the active master now means the next active master init will
+      // not face any issues and all mandatory services will be started during
+      // its init phase.
+      throw new PleaseRestartMasterException("Aborting active master after missing"
+        + " CFs are successfully added in meta. Subsequent active master "
+        + "initialization should be uninterrupted");
+    }
+
     if (maintenanceMode) {
       LOG.info("Detected repair mode, skipping final initialization steps.");
       return;
     }
@@ -1106,6 +1159,38 @@
     }
   }
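+  /**
+   * Adds the missing 'table' and 'rep_barrier' column families to hbase:meta by submitting a
+   * modify-table procedure and waits up to roughly 30 seconds for that procedure to complete.
+   */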
+  private void createMissingCFsInMetaDuringUpgrade(
+      TableDescriptor metaDescriptor) throws IOException {
+    TableDescriptor newMetaDesc =
+      TableDescriptorBuilder.newBuilder(metaDescriptor)
+        .setColumnFamily(FSTableDescriptors.getTableFamilyDescForMeta(conf))
+        .setColumnFamily(FSTableDescriptors.getReplBarrierFamilyDescForMeta())
+        .build();
+    long pid = this.modifyTable(TableName.META_TABLE_NAME, () -> newMetaDesc,
+      0, 0, false);
+    int tries = 30;
+    while (!(getMasterProcedureExecutor().isFinished(pid))
+        && getMasterProcedureExecutor().isRunning() && tries > 0) {
+      try {
+        Thread.sleep(1000);
+      } catch (InterruptedException e) {
+        throw new IOException("Wait interrupted", e);
+      }
+      tries--;
+    }
+    if (tries <= 0) {
+      throw new HBaseIOException(
+        "Failed to add table and rep_barrier CFs to meta within the expected time.");
+    } else {
+      Procedure result = getMasterProcedureExecutor().getResult(pid);
+      if (result != null && result.isFailed()) {
+        throw new IOException(
+          "Failed to add table and rep_barrier CFs to meta. "
+            + MasterProcedureUtil.unwrapRemoteIOException(result));
+      }
+    }
+  }
+
   /**
    * Check hbase:meta is up and ready for reading. For use during Master startup only.
    * @return True if meta is UP and online and startup can progress. Otherwise, meta is not online
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSTableDescriptors.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSTableDescriptors.java
index 76a9328b9e5..f40736d6340 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSTableDescriptors.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSTableDescriptors.java
@@ -39,6 +39,7 @@ import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.TableDescriptors;
 import org.apache.hadoop.hbase.TableInfoMissingException;
 import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
 import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
 import org.apache.hadoop.hbase.client.CoprocessorDescriptorBuilder;
 import org.apache.hadoop.hbase.client.TableDescriptor;
@@ -139,6 +140,31 @@ public class FSTableDescriptors implements TableDescriptors {
     }
   }
 
+  public static ColumnFamilyDescriptor getTableFamilyDescForMeta(
+      final Configuration conf) {
+    return ColumnFamilyDescriptorBuilder
+      .newBuilder(HConstants.TABLE_FAMILY)
+      .setMaxVersions(conf.getInt(HConstants.HBASE_META_VERSIONS,
+        HConstants.DEFAULT_HBASE_META_VERSIONS))
+      .setInMemory(true)
+      .setBlocksize(8 * 1024)
+      .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
+      // Disable blooms for meta. Needs work. Seems to mess w/ getClosestOrBefore.
+      .setBloomFilterType(BloomType.NONE)
+      .build();
+  }
+
+  public static ColumnFamilyDescriptor getReplBarrierFamilyDescForMeta() {
+    return ColumnFamilyDescriptorBuilder
+      .newBuilder(HConstants.REPLICATION_BARRIER_FAMILY)
+      .setMaxVersions(HConstants.ALL_VERSIONS)
+      .setInMemory(true)
+      .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
+      // Disable blooms for meta. Needs work. Seems to mess w/ getClosestOrBefore.
+      .setBloomFilterType(BloomType.NONE)
+      .build();
+  }
+
   public static TableDescriptorBuilder createMetaTableDescriptorBuilder(final Configuration conf)
       throws IOException {
     // TODO We used to set CacheDataInL1 for META table. When we have BucketCache in file mode, now
@@ -155,23 +181,8 @@ public class FSTableDescriptors implements TableDescriptors {
         // Disable blooms for meta. Needs work. Seems to mess w/ getClosestOrBefore.
         .setBloomFilterType(BloomType.NONE)
         .build())
-      .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(HConstants.TABLE_FAMILY)
-        .setMaxVersions(conf.getInt(HConstants.HBASE_META_VERSIONS,
-          HConstants.DEFAULT_HBASE_META_VERSIONS))
-        .setInMemory(true)
-        .setBlocksize(8 * 1024)
-        .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
-        // Disable blooms for meta. Needs work. Seems to mess w/ getClosestOrBefore.
-        .setBloomFilterType(BloomType.NONE)
-        .build())
-      .setColumnFamily(ColumnFamilyDescriptorBuilder
-        .newBuilder(HConstants.REPLICATION_BARRIER_FAMILY)
-        .setMaxVersions(HConstants.ALL_VERSIONS)
-        .setInMemory(true)
-        .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
-        // Disable blooms for meta. Needs work. Seems to mess w/ getClosestOrBefore.
-        .setBloomFilterType(BloomType.NONE)
-        .build())
+      .setColumnFamily(getTableFamilyDescForMeta(conf))
+      .setColumnFamily(getReplBarrierFamilyDescForMeta())
       .setCoprocessor(CoprocessorDescriptorBuilder.newBuilder(
         MultiRowMutationEndpoint.class.getName())
         .setPriority(Coprocessor.PRIORITY_SYSTEM)