HBASE-20592 Create a tool to verify tables do not have prefix tree encoding

Signed-off-by: Mike Drob <mdrob@apache.org>
Peter Somogyi 2018-05-25 15:03:17 +02:00
parent 26e54e42cd
commit 53d29d53c4
5 changed files with 156 additions and 2 deletions


@@ -106,6 +106,7 @@ if [ $# = 0 ]; then
echo " regionsplitter Run RegionSplitter tool"
echo " rowcounter Run RowCounter tool"
echo " cellcounter Run CellCounter tool"
echo " pre-upgrade Run Pre-Upgrade validator tool"
echo " CLASSNAME Run the class named CLASSNAME"
exit 1
fi
@@ -465,6 +466,8 @@ elif [ "$COMMAND" = "rowcounter" ] ; then
CLASS='org.apache.hadoop.hbase.mapreduce.RowCounter'
elif [ "$COMMAND" = "cellcounter" ] ; then
CLASS='org.apache.hadoop.hbase.mapreduce.CellCounter'
elif [ "$COMMAND" = "pre-upgrade" ] ; then
CLASS='org.apache.hadoop.hbase.tool.PreUpgradeValidator'
else
CLASS=$COMMAND
fi


@@ -0,0 +1,129 @@
/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.tool;

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.util.AbstractHBaseTool;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLine;

/**
* Tool for validating that a cluster can be upgraded from HBase 1.x to HBase 2.0
* <p>
* Available validations:
* <ul>
* <li>all: Run all pre-upgrade validations</li>
* <li>validateDBE: Check Data Block Encoding for column families</li>
* </ul>
* </p>
*/
@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
public class PreUpgradeValidator extends AbstractHBaseTool {
public static final String NAME = "pre-upgrade";
private static final Logger LOG = LoggerFactory.getLogger(PreUpgradeValidator.class);
private static final byte[] DATA_BLOCK_ENCODING = Bytes.toBytes("DATA_BLOCK_ENCODING");
private boolean validateAll;
private boolean validateDBE;

/**
* Check Data Block Encodings for all column families.
*
* @return true if all column families use Data Block Encodings compatible with HBase 2, false otherwise
* @throws IOException if a remote or network exception occurs
*/
private boolean validateDBE() throws IOException {
int incompatibilities = 0;
LOG.info("Validating Data Block Encodings");
try (Connection connection = ConnectionFactory.createConnection(getConf());
Admin admin = connection.getAdmin()) {
List<TableDescriptor> tableDescriptors = admin.listTableDescriptors();
String encoding = "";
for (TableDescriptor td : tableDescriptors) {
ColumnFamilyDescriptor[] columnFamilies = td.getColumnFamilies();
for (ColumnFamilyDescriptor cfd : columnFamilies) {
try {
encoding = Bytes.toString(cfd.getValue(DATA_BLOCK_ENCODING));
// IllegalArgumentException will be thrown if encoding is incompatible with 2.0
DataBlockEncoding.valueOf(encoding);
} catch (IllegalArgumentException e) {
incompatibilities++;
LOG.warn("Incompatible DataBlockEncoding for table: {}, cf: {}, encoding: {}",
td.getTableName().getNameAsString(), cfd.getNameAsString(), encoding);
}
}
}
}
if (incompatibilities > 0) {
LOG.warn("There are {} column families with incompatible Data Block Encodings. Do not "
+ "upgrade until these encodings are converted to a supported one.", incompatibilities);
LOG.warn("Check http://hbase.apache.org/book.html#upgrade2.0.prefix-tree.removed "
+ "for instructions.");
return false;
} else {
LOG.info("The used Data Block Encodings are compatible with HBase 2.0.");
return true;
}
}

@Override
protected void addOptions() {
addOptNoArg("all", "Run all pre-upgrade validations");
addOptNoArg("validateDBE", "Validate that Data Block Encodings are compatible on the cluster");
}

@Override
protected void processOptions(CommandLine cmd) {
validateAll = cmd.hasOption("all");
validateDBE = cmd.hasOption("validateDBE");
}

@Override
protected int doWork() throws Exception {
boolean validationFailed = false;
if (validateDBE || validateAll) {
// validateDBE() returns true when all encodings are compatible
if (!validateDBE()) {
validationFailed = true;
}
}
return validationFailed ? 1 : 0;
}

public static void main(String[] args) {
new PreUpgradeValidator().doStaticMain(args);
}
}
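
For illustration, a minimal standalone sketch of the probe at the core of `validateDBE()` (the class name is hypothetical; it assumes an HBase 2.0 classpath, where the `PREFIX_TREE` constant has been removed from the `DataBlockEncoding` enum):

[source,java]
----
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;

public class EncodingCheckExample {
  public static void main(String[] args) {
    // Encoding name as stored in a column family descriptor.
    String stored = "PREFIX_TREE";
    try {
      // valueOf throws IllegalArgumentException for names the enum
      // no longer contains, e.g. PREFIX_TREE on an HBase 2.0 classpath.
      DataBlockEncoding.valueOf(stored);
      System.out.println(stored + " is compatible with HBase 2.0");
    } catch (IllegalArgumentException e) {
      System.out.println(stored + " is NOT compatible with HBase 2.0");
    }
  }
}
----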


@@ -321,7 +321,7 @@ Version 3 added two additional pieces of information to the reserved keys in the
When reading a Version 3 HFile the presence of `MAX_TAGS_LEN` is used to determine how to deserialize the cells within a data block.
Therefore, consumers must read the file's info block prior to reading any data blocks.
-When writing a Version 3 HFile, HBase will always include `MAX_TAGS_LEN ` when flushing the memstore to underlying filesystem and when using prefix tree encoding for data blocks, as described in <<compression,compression>>.
+When writing a Version 3 HFile, HBase will always include `MAX_TAGS_LEN` when flushing the memstore to underlying filesystem.
When compacting extant files, the default writer will omit `MAX_TAGS_LEN` if all of the files selected do not themselves contain any cells with tags.


@@ -125,7 +125,7 @@ The compression or codec type to use depends on the characteristics of your data
In general, you need to weigh your options between smaller size and faster compression/decompression. Following are some general guidelines, expanded from a discussion at link:http://search-hadoop.com/m/lL12B1PFVhp1[Documenting Guidance on compression and codecs].
* If you have long keys (compared to the values) or many columns, use a prefix encoder.
-FAST_DIFF is recommended, as more testing is needed for Prefix Tree encoding.
+FAST_DIFF is recommended.
* If the values are large (and not precompressed, such as images), use a data block compressor.
* Use GZIP for [firstterm]_cold data_, which is accessed infrequently.
GZIP compression uses more CPU resources than Snappy or LZO, but provides a higher compression ratio.
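
As a side note to the FAST_DIFF recommendation above, a hedged sketch of declaring that encoding with the HBase 2 client API (the class, table, and family names are placeholders):

[source,java]
----
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.util.Bytes;

public class FastDiffEncodingExample {
  public static void main(String[] args) {
    // Column family "cf" declared with FAST_DIFF data block encoding.
    ColumnFamilyDescriptor cf = ColumnFamilyDescriptorBuilder
        .newBuilder(Bytes.toBytes("cf"))
        .setDataBlockEncoding(DataBlockEncoding.FAST_DIFF)
        .build();
    // Table descriptor for a hypothetical table using that family.
    TableDescriptor table = TableDescriptorBuilder
        .newBuilder(TableName.valueOf("my_table"))
        .setColumnFamily(cf)
        .build();
    System.out.println(table);
  }
}
----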


@@ -793,6 +793,28 @@ For general usage instructions, pass the `-h` option.
The LoadTestTool has received many updates in recent HBase releases, including support for namespaces, support for tags, cell-level ACLS and visibility labels, testing security-related features, ability to specify the number of regions per server, tests for multi-get RPC calls, and tests relating to replication.
[[ops.pre-upgrade]]
=== Pre-Upgrade validator
The Pre-Upgrade validator tool can be used to check the cluster for known incompatibilities before upgrading from HBase 1 to HBase 2.
To run all the checks, use the `-all` flag.
[source, bash]
----
$ bin/hbase pre-upgrade -all
----
==== DataBlockEncoding validation
HBase 2.0 removed the `PREFIX_TREE` Data Block Encoding, so column families can no longer use it.
To verify that none of the column families in the cluster use an incompatible Data Block Encoding, run the following command.
[source, bash]
----
$ bin/hbase pre-upgrade -validateDBE
----
This check validates all column families and prints out any incompatibilities.
To change a `PREFIX_TREE` encoding to a supported one, check <<upgrade2.0.prefix-tree.removed,_prefix-tree_ encoding removed>>.
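
As a companion to that link, a hedged sketch of performing the conversion programmatically with the HBase 1.x client API (it must run before the upgrade, while `PREFIX_TREE` still exists in the `DataBlockEncoding` enum; the table name `my_table` and family `cf` are placeholders):

[source,java]
----
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.util.Bytes;

public class ConvertPrefixTreeExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    try (Connection connection = ConnectionFactory.createConnection(conf);
        Admin admin = connection.getAdmin()) {
      TableName table = TableName.valueOf("my_table"); // placeholder
      HColumnDescriptor cf =
          admin.getTableDescriptor(table).getFamily(Bytes.toBytes("cf"));
      if (cf.getDataBlockEncoding() == DataBlockEncoding.PREFIX_TREE) {
        // Switch to a supported prefix encoder before upgrading.
        cf.setDataBlockEncoding(DataBlockEncoding.FAST_DIFF);
        admin.modifyColumn(table, cf);
      }
    }
  }
}
----

The new encoding applies to HFiles as they are rewritten, so a major compaction of the table is needed to convert data already on disk.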
[[ops.regionmgt]]
== Region Management