HBASE-25902 Add missing CFs in meta during HBase 1 to 2 Upgrade (#3417)

Signed-off-by: Michael Stack <stack@apache.org>
This commit is contained in:
Viraj Jasani 2021-07-01 15:14:23 +05:30 committed by GitHub
parent 399b58e7c6
commit 5b5c92f427
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 149 additions and 19 deletions

View File

@ -0,0 +1,34 @@
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase;
import org.apache.yetus.audience.InterfaceAudience;
/**
 * Signals that the master must be restarted before it can continue.
 * <p>
 * This is an {@link HBaseIOException}; the supplied text becomes the exception message.
 */
@InterfaceAudience.Public
public class PleaseRestartMasterException extends HBaseIOException {

  /**
   * Creates the exception with a human-readable explanation.
   * @param message description of why the master has to be restarted
   */
  public PleaseRestartMasterException(final String message) {
    super(message);
  }
}

View File

@ -70,6 +70,7 @@ import org.apache.hadoop.hbase.InvalidFamilyOperationException;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.PleaseHoldException;
import org.apache.hadoop.hbase.PleaseRestartMasterException;
import org.apache.hadoop.hbase.RegionMetrics;
import org.apache.hadoop.hbase.ReplicationPeerNotFoundException;
import org.apache.hadoop.hbase.ServerMetrics;
@ -175,6 +176,7 @@ import org.apache.hadoop.hbase.quotas.SpaceQuotaSnapshotNotifier;
import org.apache.hadoop.hbase.quotas.SpaceQuotaSnapshotNotifierFactory;
import org.apache.hadoop.hbase.quotas.SpaceViolationPolicy;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.NoSuchColumnFamilyException;
import org.apache.hadoop.hbase.regionserver.RSRpcServices;
import org.apache.hadoop.hbase.replication.ReplicationException;
import org.apache.hadoop.hbase.replication.ReplicationLoadSource;
@ -191,6 +193,7 @@ import org.apache.hadoop.hbase.security.UserProvider;
import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.util.Addressing;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSTableDescriptors;
import org.apache.hadoop.hbase.util.HBaseFsck;
import org.apache.hadoop.hbase.util.HFileArchiveUtil;
import org.apache.hadoop.hbase.util.IdLock;
@ -953,9 +956,26 @@ public class HMaster extends HRegionServer implements MasterServices {
if (!waitForMetaOnline()) {
return;
}
TableDescriptor metaDescriptor = tableDescriptors.get(
TableName.META_TABLE_NAME);
final ColumnFamilyDescriptor tableFamilyDesc = metaDescriptor
.getColumnFamily(HConstants.TABLE_FAMILY);
final ColumnFamilyDescriptor replBarrierFamilyDesc =
metaDescriptor.getColumnFamily(HConstants.REPLICATION_BARRIER_FAMILY);
this.assignmentManager.joinCluster();
// The below depends on hbase:meta being online.
this.tableStateManager.start();
try {
this.tableStateManager.start();
} catch (NoSuchColumnFamilyException e) {
if (tableFamilyDesc == null && replBarrierFamilyDesc == null) {
LOG.info("TableStates manager could not be started. This is expected"
+ " during HBase 1 to 2 upgrade.", e);
} else {
throw e;
}
}
this.assignmentManager.processOfflineRegions();
// this must be called after the above processOfflineRegions to prevent race
this.assignmentManager.wakeMetaLoadedEvent();
@ -1025,7 +1045,17 @@ public class HMaster extends HRegionServer implements MasterServices {
return;
}
status.setStatus("Starting cluster schema service");
initClusterSchemaService();
try {
initClusterSchemaService();
} catch (IllegalStateException e) {
if (e.getCause() != null && e.getCause() instanceof NoSuchColumnFamilyException
&& tableFamilyDesc == null && replBarrierFamilyDesc == null) {
LOG.info("ClusterSchema service could not be initialized. This is "
+ "expected during HBase 1 to 2 upgrade", e);
} else {
throw e;
}
}
if (this.cpHost != null) {
try {
@ -1047,6 +1077,29 @@ public class HMaster extends HRegionServer implements MasterServices {
// Set master as 'initialized'.
setInitialized(true);
if (tableFamilyDesc == null && replBarrierFamilyDesc == null) {
// create missing CFs in meta table after master is set to 'initialized'.
createMissingCFsInMetaDuringUpgrade(metaDescriptor);
// Throwing this Exception to abort the active master is painful, but it
// seems to be the only way to add the missing CFs to meta while upgrading from
// HBase 1 to 2 (where HBase 2 has HBASE-23055 & HBASE-23782 checked in).
// So, why do we abort the active master after adding the missing CFs to meta?
// By the time we reach here, we have already bypassed NoSuchColumnFamilyException
// in initClusterSchemaService(), meaning ClusterSchemaService was not
// correctly initialized. Similarly, we bypassed
// tableStateManager.start() as well. Hence, it is better to abort the
// current active master: our main task - adding the missing CFs
// to the meta table - is done (possible only after master state is set to
// initialized), at the expense of bypassing a few important tasks of the
// active master init routine. We abort the active master now so that the
// next active master init will not face any issues and all mandatory
// services will be started during the master init phase.
throw new PleaseRestartMasterException("Aborting active master after missing"
+ " CFs are successfully added in meta. Subsequent active master "
+ "initialization should be uninterrupted");
}
if (maintenanceMode) {
LOG.info("Detected repair mode, skipping final initialization steps.");
return;
@ -1106,6 +1159,38 @@ public class HMaster extends HRegionServer implements MasterServices {
}
}
/**
 * Adds the {@code table} and {@code rep_barrier} column families to the hbase:meta descriptor
 * and waits for the resulting modify-table procedure to complete.
 * <p>
 * The procedure is polled once per second for at most 30 attempts. Success is decided by the
 * procedure's own state rather than by the number of attempts left, so a procedure that
 * finishes during the final wait is not misreported as a timeout, and an executor that stops
 * before the procedure finishes is not mistaken for success.
 * @param metaDescriptor current descriptor of hbase:meta, used as the base of the new one
 * @throws IOException if the wait is interrupted, the procedure does not finish in the given
 *           time, the procedure executor stops before the procedure finishes, or the procedure
 *           itself fails
 */
private void createMissingCFsInMetaDuringUpgrade(
    TableDescriptor metaDescriptor) throws IOException {
  TableDescriptor newMetaDesc =
    TableDescriptorBuilder.newBuilder(metaDescriptor)
      .setColumnFamily(FSTableDescriptors.getTableFamilyDescForMeta(conf))
      .setColumnFamily(FSTableDescriptors.getReplBarrierFamilyDescForMeta())
      .build();
  long pid = this.modifyTable(TableName.META_TABLE_NAME, () -> newMetaDesc,
    0, 0, false);
  int tries = 30;
  while (!(getMasterProcedureExecutor().isFinished(pid))
      && getMasterProcedureExecutor().isRunning() && tries > 0) {
    try {
      Thread.sleep(1000);
    } catch (InterruptedException e) {
      // Restore the interrupt status so that code further up the stack can observe it.
      Thread.currentThread().interrupt();
      throw new IOException("Wait interrupted", e);
    }
    tries--;
  }
  // Decide on the actual procedure state: the loop can also exit because the procedure
  // finished on the very last attempt, or because the executor stopped running.
  if (!getMasterProcedureExecutor().isFinished(pid)) {
    if (tries <= 0) {
      throw new HBaseIOException(
        "Failed to add table and rep_barrier CFs to meta in a given time.");
    }
    throw new HBaseIOException(
      "Failed to add table and rep_barrier CFs to meta. Procedure executor stopped"
        + " before the procedure finished.");
  }
  Procedure<?> result = getMasterProcedureExecutor().getResult(pid);
  if (result != null && result.isFailed()) {
    // Keep the original message text, and additionally chain the cause so the stack
    // trace of the underlying failure is not lost.
    IOException cause = MasterProcedureUtil.unwrapRemoteIOException(result);
    throw new IOException(
      "Failed to add table and rep_barrier CFs to meta. " + cause, cause);
  }
}
/**
* Check hbase:meta is up and ready for reading. For use during Master startup only.
* @return True if meta is UP and online and startup can progress. Otherwise, meta is not online

View File

@ -39,6 +39,7 @@ import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableDescriptors;
import org.apache.hadoop.hbase.TableInfoMissingException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.CoprocessorDescriptorBuilder;
import org.apache.hadoop.hbase.client.TableDescriptor;
@ -139,6 +140,31 @@ public class FSTableDescriptors implements TableDescriptors {
}
}
/**
 * Builds the descriptor of the {@link HConstants#TABLE_FAMILY} column family for meta.
 * <p>
 * The family is in-memory, uses an 8 KB block size, has local replication scope and no bloom
 * filter. Its max versions is read from {@link HConstants#HBASE_META_VERSIONS}, falling back
 * to {@link HConstants#DEFAULT_HBASE_META_VERSIONS}.
 * @param conf configuration consulted for the max-versions setting
 * @return the built column family descriptor
 */
public static ColumnFamilyDescriptor getTableFamilyDescForMeta(
    final Configuration conf) {
  final ColumnFamilyDescriptorBuilder familyBuilder =
    ColumnFamilyDescriptorBuilder.newBuilder(HConstants.TABLE_FAMILY);
  final int maxVersions = conf.getInt(HConstants.HBASE_META_VERSIONS,
    HConstants.DEFAULT_HBASE_META_VERSIONS);
  familyBuilder.setMaxVersions(maxVersions);
  familyBuilder.setInMemory(true);
  familyBuilder.setBlocksize(8 * 1024);
  familyBuilder.setScope(HConstants.REPLICATION_SCOPE_LOCAL);
  // Blooms stay disabled for meta: they need work and seem to mess with getClosestOrBefore.
  familyBuilder.setBloomFilterType(BloomType.NONE);
  return familyBuilder.build();
}
/**
 * Builds the descriptor of the {@link HConstants#REPLICATION_BARRIER_FAMILY} column family
 * for meta.
 * <p>
 * The family keeps {@link HConstants#ALL_VERSIONS} versions, is in-memory, has local
 * replication scope and no bloom filter.
 * @return the built column family descriptor
 */
public static ColumnFamilyDescriptor getReplBarrierFamilyDescForMeta() {
  final ColumnFamilyDescriptorBuilder familyBuilder =
    ColumnFamilyDescriptorBuilder.newBuilder(HConstants.REPLICATION_BARRIER_FAMILY);
  familyBuilder.setMaxVersions(HConstants.ALL_VERSIONS);
  familyBuilder.setInMemory(true);
  familyBuilder.setScope(HConstants.REPLICATION_SCOPE_LOCAL);
  // Blooms stay disabled for meta: they need work and seem to mess with getClosestOrBefore.
  familyBuilder.setBloomFilterType(BloomType.NONE);
  return familyBuilder.build();
}
public static TableDescriptorBuilder createMetaTableDescriptorBuilder(final Configuration conf)
throws IOException {
// TODO We used to set CacheDataInL1 for META table. When we have BucketCache in file mode, now
@ -155,23 +181,8 @@ public class FSTableDescriptors implements TableDescriptors {
// Disable blooms for meta. Needs work. Seems to mess w/ getClosestOrBefore.
.setBloomFilterType(BloomType.NONE)
.build())
.setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(HConstants.TABLE_FAMILY)
.setMaxVersions(conf.getInt(HConstants.HBASE_META_VERSIONS,
HConstants.DEFAULT_HBASE_META_VERSIONS))
.setInMemory(true)
.setBlocksize(8 * 1024)
.setScope(HConstants.REPLICATION_SCOPE_LOCAL)
// Disable blooms for meta. Needs work. Seems to mess w/ getClosestOrBefore.
.setBloomFilterType(BloomType.NONE)
.build())
.setColumnFamily(ColumnFamilyDescriptorBuilder
.newBuilder(HConstants.REPLICATION_BARRIER_FAMILY)
.setMaxVersions(HConstants.ALL_VERSIONS)
.setInMemory(true)
.setScope(HConstants.REPLICATION_SCOPE_LOCAL)
// Disable blooms for meta. Needs work. Seems to mess w/ getClosestOrBefore.
.setBloomFilterType(BloomType.NONE)
.build())
.setColumnFamily(getTableFamilyDescForMeta(conf))
.setColumnFamily(getReplBarrierFamilyDescForMeta())
.setCoprocessor(CoprocessorDescriptorBuilder.newBuilder(
MultiRowMutationEndpoint.class.getName())
.setPriority(Coprocessor.PRIORITY_SYSTEM)