From fb9aa8a95a999a30002b290fad8aba3675be944f Mon Sep 17 00:00:00 2001
From: brandboat
Date: Tue, 7 Aug 2018 12:52:05 +0800
Subject: [PATCH] HBASE-18201 add UT and docs for DataBlockEncodingTool

Signed-off-by: Reid Chan
---
 .../hbase/io/encoding/EncodedDataBlock.java   |  41 ++++-
 .../regionserver/DataBlockEncodingTool.java   |  64 +++++---
 .../TestDataBlockEncodingTool.java            | 147 ++++++++++++++++++
 src/main/asciidoc/_chapters/ops_mgt.adoc      |  24 +++
 4 files changed, 251 insertions(+), 25 deletions(-)
 create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataBlockEncodingTool.java

diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/encoding/EncodedDataBlock.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/encoding/EncodedDataBlock.java
index af6865607af..3c16cffaef2 100644
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/encoding/EncodedDataBlock.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/encoding/EncodedDataBlock.java
@@ -23,7 +23,9 @@ import java.io.DataOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 
 import org.apache.commons.lang3.NotImplementedException;
 import org.apache.hadoop.hbase.Cell;
@@ -57,6 +59,16 @@ public class EncodedDataBlock {
   private final HFileBlockEncodingContext encodingCtx;
   private HFileContext meta;
 
+  private final DataBlockEncoding encoding;
+
+  // This handles the case where some cells include tags and others do not.
+  // isTagsLenZero records, per cell, whether the tags length was zero before
+  // encoding, because we need to check that again after decoding.
+  // The ROW_INDEX_V1 encoder drops the tags segment when the tags length is
+  // zero while decoding cells back to a byte array; the other encoders do not.
+  // So we need a way to restore the zero tags length in the decoded byte array.
+  private List<Boolean> isTagsLenZero = new ArrayList<>();
+
   /**
    * Create a buffer which will be encoded using dataBlockEncoder.
    * @param dataBlockEncoder Algorithm used for compression.
@@ -69,6 +81,7 @@ public class EncodedDataBlock {
     Preconditions.checkNotNull(encoding,
         "Cannot create encoded data block with null encoder");
     this.dataBlockEncoder = dataBlockEncoder;
+    this.encoding = encoding;
     encodingCtx = dataBlockEncoder.newDataBlockEncodingContext(encoding,
         HConstants.HFILEBLOCK_DUMMY_HEADER, meta);
     this.rawKVs = rawKVs;
@@ -90,6 +103,7 @@ public class EncodedDataBlock {
     return new Iterator<Cell>() {
       private ByteBuffer decompressedData = null;
+      private Iterator<Boolean> it = isTagsLenZero.iterator();
 
       @Override
       public boolean hasNext() {
@@ -116,10 +130,18 @@
         int vlen = decompressedData.getInt();
         int tagsLen = 0;
         ByteBufferUtils.skip(decompressedData, klen + vlen);
-        // Read the tag length in case when steam contain tags
+        // Read the tags length in case the stream contains tags
         if (meta.isIncludesTags()) {
-          tagsLen = ((decompressedData.get() & 0xff) << 8) ^ (decompressedData.get() & 0xff);
-          ByteBufferUtils.skip(decompressedData, tagsLen);
+          boolean noTags = true;
+          if (it.hasNext()) {
+            noTags = it.next();
+          }
+          // ROW_INDEX_V1 does not put the tags length back into the cell when it
+          // is zero, so in that case there is no short to read here.
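+          // (For reference, the serialized KeyValue layout with tags is
+          // <keyLen:int><valueLen:int><key><value><tagsLen:short><tags...>;
+          // ROW_INDEX_V1 simply omits the trailing short when tagsLen is zero.)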
+          if (!(encoding.equals(DataBlockEncoding.ROW_INDEX_V1) && noTags)) {
+            tagsLen = ((decompressedData.get() & 0xff) << 8) ^ (decompressedData.get() & 0xff);
+            ByteBufferUtils.skip(decompressedData, tagsLen);
+          }
         }
         KeyValue kv = new KeyValue(decompressedData.array(), offset,
             (int) KeyValue.getKeyValueDataStructureSize(klen, vlen, tagsLen));
@@ -247,6 +269,7 @@ public class EncodedDataBlock {
       if (this.meta.isIncludesTags()) {
         tagsLength = ((in.get() & 0xff) << 8) ^ (in.get() & 0xff);
         ByteBufferUtils.skip(in, tagsLength);
+        this.isTagsLenZero.add(tagsLength == 0);
       }
       if (this.meta.isIncludesMvcc()) {
         memstoreTS = ByteBufferUtils.readVLong(in);
@@ -260,6 +283,16 @@ public class EncodedDataBlock {
       baos.flush();
       baosBytes = baos.toByteArray();
       this.dataBlockEncoder.endBlockEncoding(encodingCtx, out, baosBytes);
+      // In endBlockEncoding(encodingCtx, out, baosBytes), the ROW_INDEX_V1 encoder
+      // writes an integer into the output stream (out), while the other encoders
+      // write it into the byte array (baosBytes) directly. For ROW_INDEX_V1 we
+      // therefore call baos.toByteArray() once more after endBlockEncoding, so
+      // that the integer written to the stream is captured in baosBytes as well.
+      // The if branch is needed: without it and the extra toByteArray() call,
+      // baosBytes would not contain the integer written in endBlockEncoding.
+      if (this.encoding.equals(DataBlockEncoding.ROW_INDEX_V1)) {
+        baosBytes = baos.toByteArray();
+      }
     } catch (IOException e) {
       throw new RuntimeException(String.format(
           "Bug in encoding part of algorithm %s. " +
@@ -271,6 +304,6 @@ public class EncodedDataBlock {
 
   @Override
   public String toString() {
-    return dataBlockEncoder.toString();
+    return encoding.name();
   }
 }
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/DataBlockEncodingTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/DataBlockEncodingTool.java
index 91435f3780b..59f6678a08e 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/DataBlockEncodingTool.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/DataBlockEncodingTool.java
@@ -18,6 +18,7 @@ package org.apache.hadoop.hbase.regionserver;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.DecimalFormat;
@@ -44,6 +45,7 @@ import org.apache.hadoop.hbase.io.hfile.HFileContext;
 import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
 import org.apache.hadoop.hbase.io.hfile.HFileReaderImpl;
 import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.io.WritableUtils;
 import org.apache.hadoop.io.compress.CompressionOutputStream;
 import org.apache.hadoop.io.compress.Compressor;
 import org.apache.hadoop.io.compress.Decompressor;
@@ -91,8 +93,8 @@ public class DataBlockEncodingTool {
   /** If this is specified, no correctness testing will be done */
   private static final String OPT_OMIT_CORRECTNESS_TEST = "c";
 
-  /** What encoding algorithm to test */
-  private static final String OPT_ENCODING_ALGORITHM = "a";
+  /** What compression algorithm to test */
+  private static final String OPT_COMPRESSION_ALGORITHM = "a";
 
   /** Number of times to run each benchmark */
   private static final String OPT_BENCHMARK_N_TIMES = "t";
@@ -132,7 +134,10 @@ public class DataBlockEncodingTool {
   private final Compressor compressor;
   private final Decompressor decompressor;
 
-  private static enum Manipulation {
+  // Whether the HFile under test contains cells with tags.
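+  // Set from the HFile's own context in testCodecs(...) before the benchmarks run.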
+  private static boolean USE_TAG = false;
+
+  private enum Manipulation {
     ENCODING,
     DECODING,
     COMPRESSION,
@@ -192,8 +197,25 @@
         }
       }
 
-      uncompressedOutputStream.write(currentKV.getBuffer(),
-          currentKV.getOffset(), currentKV.getLength());
+      // Append a zero tagsLen to cells that have no tags: when the scanner
+      // converts the byte array back to a KeyValue it drops the tagsLen part
+      // if it is zero, but we still need that part to tell whether the current
+      // cell carries tags. USE_TAG means the HFile contains cells with tags;
+      // a cell whose tagsLen is 0 may still sit next to cells that have them.
+      if (USE_TAG && currentKV.getTagsLength() == 0) {
+        uncompressedOutputStream.write(currentKV.getBuffer(),
+            currentKV.getOffset(), currentKV.getLength());
+        // write tagsLen = 0.
+        uncompressedOutputStream.write(Bytes.toBytes((short) 0));
+      } else {
+        uncompressedOutputStream.write(currentKV.getBuffer(),
+            currentKV.getOffset(), currentKV.getLength());
+      }
+
+      if (includesMemstoreTS) {
+        WritableUtils.writeVLong(
+            new DataOutputStream(uncompressedOutputStream), currentKV.getSequenceId());
+      }
 
       previousKey = currentKey;
 
@@ -209,16 +231,16 @@
     }
     rawKVs = uncompressedOutputStream.toByteArray();
-    boolean useTag = (currentKV.getTagsLength() > 0);
     for (DataBlockEncoding encoding : encodings) {
       if (encoding == DataBlockEncoding.NONE) {
         continue;
       }
 
       DataBlockEncoder d = encoding.getEncoder();
       HFileContext meta = new HFileContextBuilder()
-          .withCompression(Compression.Algorithm.NONE)
-          .withIncludesMvcc(includesMemstoreTS)
-          .withIncludesTags(useTag).build();
+          .withDataBlockEncoding(encoding)
+          .withCompression(Compression.Algorithm.NONE)
+          .withIncludesMvcc(includesMemstoreTS)
+          .withIncludesTags(USE_TAG).build();
       codecs.add(new EncodedDataBlock(d, encoding, rawKVs, meta));
     }
   }
@@ -396,8 +418,10 @@
     try {
       for (int itTime = 0; itTime < benchmarkNTimes; ++itTime) {
         final long startTime = System.nanoTime();
-        compressingStream.resetState();
+        // The compressedStream must be reset before compressingStream.resetState(),
+        // because with GZ, resetState() writes a header into the output stream.
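+        // (In the opposite order, reset() would throw away the header that
+        // resetState() had just written, corrupting the measured output.)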
         compressedStream.reset();
+        compressingStream.resetState();
         compressingStream.write(buffer, offset, length);
         compressingStream.flush();
         compressedStream.toByteArray();
@@ -438,12 +462,6 @@
       }
       decompressedStream.close();
 
-      // iterate over KeyValues
-      KeyValue kv;
-      for (int pos = 0; pos < length; pos += kv.getLength()) {
-        kv = new KeyValue(newBuf, pos);
-      }
-
     } catch (IOException e) {
       throw new RuntimeException(String.format(
           "Decoding path in '%s' algorithm cause exception ", name), e);
@@ -596,8 +614,9 @@
     hsf.initReader();
     StoreFileReader reader = hsf.getReader();
     reader.loadFileInfo();
-    KeyValueScanner scanner = reader.getStoreFileScanner(true, true, false, 0, 0, false);
-
+    KeyValueScanner scanner = reader.getStoreFileScanner(true, true,
+        false, hsf.getMaxMemStoreTS(), 0, false);
+    USE_TAG = reader.getHFileReader().getFileContext().isIncludesTags();
     // run the utilities
     DataBlockEncodingTool comp = new DataBlockEncodingTool(compressionName);
     int majorVersion = reader.getHFileVersion();
@@ -654,7 +673,7 @@
         "Measure read throughput");
     options.addOption(OPT_OMIT_CORRECTNESS_TEST, false,
         "Omit corectness tests.");
-    options.addOption(OPT_ENCODING_ALGORITHM, true,
+    options.addOption(OPT_COMPRESSION_ALGORITHM, true,
         "What kind of compression algorithm use for comparison.");
     options.addOption(OPT_BENCHMARK_N_TIMES, true,
         "Number of times to run each benchmark. Default value: " +
@@ -680,6 +699,9 @@
     int kvLimit = Integer.MAX_VALUE;
     if (cmd.hasOption(OPT_KV_LIMIT)) {
       kvLimit = Integer.parseInt(cmd.getOptionValue(OPT_KV_LIMIT));
+      if (kvLimit <= 0) {
+        LOG.error("KV_LIMIT should not be less than 1.");
+      }
     }
 
     // basic argument sanity checks
@@ -692,9 +714,9 @@
 
     String pathName = cmd.getOptionValue(OPT_HFILE_NAME);
     String compressionName = DEFAULT_COMPRESSION.getName();
-    if (cmd.hasOption(OPT_ENCODING_ALGORITHM)) {
+    if (cmd.hasOption(OPT_COMPRESSION_ALGORITHM)) {
       compressionName =
-          cmd.getOptionValue(OPT_ENCODING_ALGORITHM).toLowerCase(Locale.ROOT);
+          cmd.getOptionValue(OPT_COMPRESSION_ALGORITHM).toLowerCase(Locale.ROOT);
     }
     boolean doBenchmark = cmd.hasOption(OPT_MEASURE_THROUGHPUT);
     boolean doVerify = !cmd.hasOption(OPT_OMIT_CORRECTNESS_TEST);
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataBlockEncodingTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataBlockEncodingTool.java
new file mode 100644
index 00000000000..7a6b3d2171c
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataBlockEncodingTool.java
@@ -0,0 +1,147 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.ArrayBackedTag;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.Tag;
+import org.apache.hadoop.hbase.io.compress.Compression;
+import org.apache.hadoop.hbase.io.hfile.HFileContext;
+import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
+import org.apache.hadoop.hbase.testclassification.MiscTests;
+import org.apache.hadoop.hbase.testclassification.SmallTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.Before;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Test DataBlockEncodingTool.
+ */
+@Category({MiscTests.class, SmallTests.class})
+public class TestDataBlockEncodingTool {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+      HBaseClassTestRule.forClass(TestDataBlockEncodingTool.class);
+
+  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+  private static final String ROOT_DIR =
+      TEST_UTIL.getDataTestDir("TestDataBlockEncodingTool").toString();
+  private static final Configuration conf = TEST_UTIL.getConfiguration();
+  private static FileSystem fs;
+  private static StoreFileWriter sfw;
+
+  @Before
+  public void setUp() throws IOException {
+    fs = TEST_UTIL.getTestFileSystem();
+  }
+
+  private void testHFile(String fileName, boolean useTags, boolean allTags) throws IOException {
+    Path path = new Path(ROOT_DIR, fileName);
+    try {
+      createHFileWithTags(path, useTags, allTags);
+      testDataBlockingTool(path);
+    } finally {
+      if (fs.exists(path)) {
+        fs.delete(path, false);
+      }
+    }
+  }
+
+  private void createHFileWithTags(Path path, boolean useTags, boolean allTags)
+      throws IOException {
+    HFileContext meta = new HFileContextBuilder()
+        .withBlockSize(64 * 1024)
+        .withIncludesTags(useTags).build();
+    sfw =
+        new StoreFileWriter.Builder(conf, fs)
+            .withFilePath(path)
+            .withFileContext(meta).build();
+    long now = System.currentTimeMillis();
+    byte[] FAMILY = Bytes.toBytes("cf");
+    byte[] QUALIFIER = Bytes.toBytes("q");
+    try {
+      for (char d = 'a'; d <= 'z'; d++) {
+        for (char e = 'a'; e <= 'z'; e++) {
+          byte[] b = new byte[] { (byte) d, (byte) e };
+          KeyValue kv;
+          if (useTags) {
+            if (allTags) {
+              // Write cells with tags to the HFile.
+              Tag[] tags = new Tag[]{
+                  new ArrayBackedTag((byte) 0, Bytes.toString(b)),
+                  new ArrayBackedTag((byte) 0, Bytes.toString(b))};
+              kv = new KeyValue(b, FAMILY, QUALIFIER, now, b, tags);
+            } else {
+              // Write half of the cells with tags and half without to the HFile.
+              if ((e - 'a') % 2 == 0) {
+                kv = new KeyValue(b, FAMILY, QUALIFIER, now, b);
+              } else {
+                Tag[] tags = new Tag[]{
+                    new ArrayBackedTag((byte) 0, Bytes.toString(b)),
+                    new ArrayBackedTag((byte) 0, Bytes.toString(b))};
+                kv = new KeyValue(b, FAMILY, QUALIFIER, now, b, tags);
+              }
+            }
+          } else {
+            // Write cells without tags to the HFile.
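+            // (This KeyValue constructor writes no tags section at all, so
+            // getTagsLength() reports 0 for these cells.)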
+            kv = new KeyValue(b, FAMILY, QUALIFIER, now, b);
+          }
+          sfw.append(kv);
+        }
+      }
+      sfw.appendMetadata(0, false);
+    } finally {
+      sfw.close();
+    }
+  }
+
+  private static void testDataBlockingTool(Path path) throws IOException {
+    Configuration conf = HBaseConfiguration.create();
+    int maxKV = Integer.MAX_VALUE;
+    boolean doVerify = true;
+    boolean doBenchmark = true;
+    String testHFilePath = path.toString();
+    DataBlockEncodingTool.testCodecs(conf, maxKV, testHFilePath,
+        Compression.Algorithm.GZ.getName(), doBenchmark, doVerify);
+  }
+
+  @Test
+  public void testHFileAllCellsWithTags() throws IOException {
+    testHFile("1234567890", true, true);
+  }
+
+  @Test
+  public void testHFileAllCellsWithoutTags() throws IOException {
+    testHFile("1234567089", false, false);
+  }
+
+  @Test
+  public void testHFileHalfCellsWithTags() throws IOException {
+    testHFile("1234560789", true, false);
+  }
+}
\ No newline at end of file
diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc
index 7f9a73a7e38..07408ff7566 100644
--- a/src/main/asciidoc/_chapters/ops_mgt.adoc
+++ b/src/main/asciidoc/_chapters/ops_mgt.adoc
@@ -1005,6 +1005,30 @@ drop_namespace 'pre_upgrade_cleanup'
 
 For further information, please refer to
 link:https://issues.apache.org/jira/browse/HBASE-20649?focusedCommentId=16535476#comment-16535476[HBASE-20649].
 
+=== Data Block Encoding Tool
+
+Tests various compression algorithms with different data block encoders for key compression on an existing HFile.
+Useful for testing, debugging and benchmarking.
+
+You must specify `-f`, the full path of the HFile.
+
+The result shows both the performance (MB/s) of compression/decompression and encoding/decoding, and the data savings on the HFile.
+
+----
+
+$ bin/hbase org.apache.hadoop.hbase.regionserver.DataBlockEncodingTool
+Usage: hbase org.apache.hadoop.hbase.regionserver.DataBlockEncodingTool
+Options:
+  -f HFile to analyse (REQUIRED)
+  -n Maximum number of key/value pairs to process in a single benchmark run.
+  -b Whether to run a benchmark to measure read throughput.
+  -c If this is specified, no correctness testing will be done.
+  -a Which compression algorithm to use for the comparison. Default value: GZ.
+  -t Number of times to run each benchmark. Default value: 12.
+  -omit Number of first runs of every benchmark to omit from statistics. Default value: 2.
+
+----
+
 [[ops.regionmgt]]
 == Region Management
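
As background for the tag handling in EncodedDataBlock above, here is a minimal,
self-contained illustration of the asymmetry being worked around. It is not part
of the patch: the class name TagsLenDemo is invented for illustration, while the
HBase types are the same ones the test above already imports. A KeyValue built
without tags reports a tags length of zero, and ROW_INDEX_V1 omits the two-byte
tags-length field for such cells, whereas the other encoders always write it.

import org.apache.hadoop.hbase.ArrayBackedTag;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.Tag;
import org.apache.hadoop.hbase.util.Bytes;

public class TagsLenDemo {
  public static void main(String[] args) {
    long now = System.currentTimeMillis();
    byte[] row = Bytes.toBytes("r");
    byte[] family = Bytes.toBytes("cf");
    byte[] qualifier = Bytes.toBytes("q");
    byte[] value = Bytes.toBytes("v");

    // A cell without tags: its tags length is 0, and ROW_INDEX_V1 would not
    // serialize the two-byte tags-length field for it at all.
    KeyValue plain = new KeyValue(row, family, qualifier, now, value);

    // A cell with one tag: its tags length is > 0, so every encoder serializes
    // the tags-length short followed by the tag bytes.
    Tag[] tags = new Tag[] { new ArrayBackedTag((byte) 0, "t") };
    KeyValue tagged = new KeyValue(row, family, qualifier, now, value, tags);

    System.out.println(plain.getTagsLength());  // prints 0
    System.out.println(tagged.getTagsLength()); // prints a value > 0
  }
}

This is why EncodedDataBlock records isTagsLenZero while encoding, and why
DataBlockEncodingTool appends an explicit zero tagsLen short for untagged cells
when the file mixes tagged and untagged data.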