HBASE-18201 add UT and docs for DataBlockEncodingTool

Signed-off-by: Reid Chan <reidchan@apache.org>
2018-08-07 12:52:05 +08:00 · 2018-08-07 12:52:05 +08:00 · fb9aa8a95a
parent 28c3336bfe
commit fb9aa8a95a
4 changed files with 251 additions and 25 deletions
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/encoding/EncodedDataBlock.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/encoding/EncodedDataBlock.java
@ -23,7 +23,9 @@ import java.io.DataOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;

 import org.apache.commons.lang3.NotImplementedException;
 import org.apache.hadoop.hbase.Cell;
@ -57,6 +59,16 @@ public class EncodedDataBlock {
  private final HFileBlockEncodingContext encodingCtx;
  private HFileContext meta;

+  private final DataBlockEncoding encoding;
+
+  // The is for one situation that there are some cells includes tags and others are not.
+  // isTagsLenZero stores if cell tags length is zero before doing encoding since we need
+  // to check cell tags length is zero or not after decoding.
+  // Encoders ROW_INDEX_V1 would abandon tags segment if tags is 0 after decode cells to
+  // byte array, other encoders won't do that. So we have to find a way to add tagsLen zero
+  // in the decoded byte array.
+  private List<Boolean> isTagsLenZero = new ArrayList<>();
+
  /**
   * Create a buffer which will be encoded using dataBlockEncoder.
   * @param dataBlockEncoder Algorithm used for compression.
@ -69,6 +81,7 @@ public class EncodedDataBlock {
    Preconditions.checkNotNull(encoding,
        "Cannot create encoded data block with null encoder");
    this.dataBlockEncoder = dataBlockEncoder;
+    this.encoding = encoding;
    encodingCtx = dataBlockEncoder.newDataBlockEncodingContext(encoding,
        HConstants.HFILEBLOCK_DUMMY_HEADER, meta);
    this.rawKVs = rawKVs;
@ -90,6 +103,7 @@ public class EncodedDataBlock {

    return new Iterator<Cell>() {
      private ByteBuffer decompressedData = null;
+      private Iterator<Boolean> it = isTagsLenZero.iterator();

      @Override
      public boolean hasNext() {
@ -116,11 +130,19 @@ public class EncodedDataBlock {
        int vlen = decompressedData.getInt();
        int tagsLen = 0;
        ByteBufferUtils.skip(decompressedData, klen + vlen);
-        // Read the tag length in case when steam contain tags
+        // Read the tag length in case when stream contain tags
        if (meta.isIncludesTags()) {
+          boolean noTags = true;
+          if (it.hasNext()) {
+            noTags = it.next();
+          }
+          // ROW_INDEX_V1 will not put tagsLen back in cell if it is zero, there is no need
+          // to read short here.
+          if (!(encoding.equals(DataBlockEncoding.ROW_INDEX_V1) && noTags)) {
            tagsLen = ((decompressedData.get() & 0xff) << 8) ^ (decompressedData.get() & 0xff);
            ByteBufferUtils.skip(decompressedData, tagsLen);
          }
+        }
        KeyValue kv = new KeyValue(decompressedData.array(), offset,
            (int) KeyValue.getKeyValueDataStructureSize(klen, vlen, tagsLen));
        if (meta.isIncludesMvcc()) {
@ -247,6 +269,7 @@ public class EncodedDataBlock {
        if (this.meta.isIncludesTags()) {
          tagsLength = ((in.get() & 0xff) << 8) ^ (in.get() & 0xff);
          ByteBufferUtils.skip(in, tagsLength);
+          this.isTagsLenZero.add(tagsLength == 0);
        }
        if (this.meta.isIncludesMvcc()) {
          memstoreTS = ByteBufferUtils.readVLong(in);
@ -260,6 +283,16 @@ public class EncodedDataBlock {
      baos.flush();
      baosBytes = baos.toByteArray();
      this.dataBlockEncoder.endBlockEncoding(encodingCtx, out, baosBytes);
+      // In endBlockEncoding(encodingCtx, out, baosBytes), Encoder ROW_INDEX_V1 write integer in
+      // out while the others write integer in baosBytes(byte array). We need to add
+      // baos.toByteArray() after endBlockEncoding again to make sure the integer writes in
+      // outputstream with Encoder ROW_INDEX_V1 dump to byte array (baosBytes).
+      // The if branch is necessary because Encoders excepts ROW_INDEX_V1 write integer in
+      // baosBytes directly, without if branch and do toByteArray() again, baosBytes won't
+      // contains the integer wrotten in endBlockEncoding.
+      if (this.encoding.equals(DataBlockEncoding.ROW_INDEX_V1)) {
+        baosBytes = baos.toByteArray();
+      }
    } catch (IOException e) {
      throw new RuntimeException(String.format(
          "Bug in encoding part of algorithm %s. " +
@ -271,6 +304,6 @@ public class EncodedDataBlock {

  @Override
  public String toString() {
-    return dataBlockEncoder.toString();
+    return encoding.name();
  }
 }
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/DataBlockEncodingTool.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/DataBlockEncodingTool.java
@ -18,6 +18,7 @@ package org.apache.hadoop.hbase.regionserver;

 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.DecimalFormat;
@ -44,6 +45,7 @@ import org.apache.hadoop.hbase.io.hfile.HFileContext;
 import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
 import org.apache.hadoop.hbase.io.hfile.HFileReaderImpl;
 import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.io.WritableUtils;
 import org.apache.hadoop.io.compress.CompressionOutputStream;
 import org.apache.hadoop.io.compress.Compressor;
 import org.apache.hadoop.io.compress.Decompressor;
@ -91,8 +93,8 @@ public class DataBlockEncodingTool {
  /** If this is specified, no correctness testing will be done */
  private static final String OPT_OMIT_CORRECTNESS_TEST = "c";

-  /** What encoding algorithm to test */
-  private static final String OPT_ENCODING_ALGORITHM = "a";
+  /** What compression algorithm to test */
+  private static final String OPT_COMPRESSION_ALGORITHM = "a";

  /** Number of times to run each benchmark */
  private static final String OPT_BENCHMARK_N_TIMES = "t";
@ -132,7 +134,10 @@ public class DataBlockEncodingTool {
  private final Compressor compressor;
  private final Decompressor decompressor;

-  private static enum Manipulation {
+  // Check if HFile use Tag.
+  private static boolean USE_TAG = false;
+
+  private enum Manipulation {
    ENCODING,
    DECODING,
    COMPRESSION,
@ -192,8 +197,25 @@ public class DataBlockEncodingTool {
        }
      }

+      // Add tagsLen zero to cells don't include tags. Since the process of
+      // scanner converts byte array to KV would abandon tagsLen part if tagsLen
+      // is zero. But we still needs the tagsLen part to check if current cell
+      // include tags. If USE_TAG is true, HFile contains cells with tags,
+      // if the cell tagsLen equals 0, it means other cells may have tags.
+      if (USE_TAG && currentKV.getTagsLength() == 0) {
        uncompressedOutputStream.write(currentKV.getBuffer(),
            currentKV.getOffset(), currentKV.getLength());
+        // write tagsLen = 0.
+        uncompressedOutputStream.write(Bytes.toBytes((short) 0));
+      } else {
+        uncompressedOutputStream.write(currentKV.getBuffer(),
+            currentKV.getOffset(), currentKV.getLength());
+      }
+
+      if(includesMemstoreTS) {
+        WritableUtils.writeVLong(
+            new DataOutputStream(uncompressedOutputStream), currentKV.getSequenceId());
+      }

      previousKey = currentKey;

@ -209,16 +231,16 @@ public class DataBlockEncodingTool {
    }

    rawKVs = uncompressedOutputStream.toByteArray();
-    boolean useTag = (currentKV.getTagsLength() > 0);
    for (DataBlockEncoding encoding : encodings) {
      if (encoding == DataBlockEncoding.NONE) {
        continue;
      }
      DataBlockEncoder d = encoding.getEncoder();
      HFileContext meta = new HFileContextBuilder()
+          .withDataBlockEncoding(encoding)
          .withCompression(Compression.Algorithm.NONE)
          .withIncludesMvcc(includesMemstoreTS)
-                          .withIncludesTags(useTag).build();
+          .withIncludesTags(USE_TAG).build();
      codecs.add(new EncodedDataBlock(d, encoding, rawKVs, meta ));
    }
  }
@ -396,8 +418,10 @@ public class DataBlockEncodingTool {
    try {
      for (int itTime = 0; itTime < benchmarkNTimes; ++itTime) {
        final long startTime = System.nanoTime();
-        compressingStream.resetState();
+        // The compressedStream should reset before compressingStream resetState since in GZ
+        // resetStatue will write header in the outputstream.
        compressedStream.reset();
+        compressingStream.resetState();
        compressingStream.write(buffer, offset, length);
        compressingStream.flush();
        compressedStream.toByteArray();
@ -438,12 +462,6 @@ public class DataBlockEncodingTool {
        }
        decompressedStream.close();

-        // iterate over KeyValues
-        KeyValue kv;
-        for (int pos = 0; pos < length; pos += kv.getLength()) {
-          kv = new KeyValue(newBuf, pos);
-        }
-
      } catch (IOException e) {
        throw new RuntimeException(String.format(
            "Decoding path in '%s' algorithm cause exception ", name), e);
@ -596,8 +614,9 @@ public class DataBlockEncodingTool {
    hsf.initReader();
    StoreFileReader reader = hsf.getReader();
    reader.loadFileInfo();
-    KeyValueScanner scanner = reader.getStoreFileScanner(true, true, false, 0, 0, false);
-
+    KeyValueScanner scanner = reader.getStoreFileScanner(true, true,
+        false, hsf.getMaxMemStoreTS(), 0, false);
+    USE_TAG = reader.getHFileReader().getFileContext().isIncludesTags();
    // run the utilities
    DataBlockEncodingTool comp = new DataBlockEncodingTool(compressionName);
    int majorVersion = reader.getHFileVersion();
@ -654,7 +673,7 @@ public class DataBlockEncodingTool {
        "Measure read throughput");
    options.addOption(OPT_OMIT_CORRECTNESS_TEST, false,
        "Omit corectness tests.");
-    options.addOption(OPT_ENCODING_ALGORITHM, true,
+    options.addOption(OPT_COMPRESSION_ALGORITHM, true,
        "What kind of compression algorithm use for comparison.");
    options.addOption(OPT_BENCHMARK_N_TIMES,
        true, "Number of times to run each benchmark. Default value: " +
@ -680,6 +699,9 @@ public class DataBlockEncodingTool {
    int kvLimit = Integer.MAX_VALUE;
    if (cmd.hasOption(OPT_KV_LIMIT)) {
      kvLimit = Integer.parseInt(cmd.getOptionValue(OPT_KV_LIMIT));
+      if (kvLimit <= 0) {
+        LOG.error("KV_LIMIT should not less than 1.");
+      }
    }

    // basic argument sanity checks
@ -692,9 +714,9 @@ public class DataBlockEncodingTool {

    String pathName = cmd.getOptionValue(OPT_HFILE_NAME);
    String compressionName = DEFAULT_COMPRESSION.getName();
-    if (cmd.hasOption(OPT_ENCODING_ALGORITHM)) {
+    if (cmd.hasOption(OPT_COMPRESSION_ALGORITHM)) {
      compressionName =
-          cmd.getOptionValue(OPT_ENCODING_ALGORITHM).toLowerCase(Locale.ROOT);
+          cmd.getOptionValue(OPT_COMPRESSION_ALGORITHM).toLowerCase(Locale.ROOT);
    }
    boolean doBenchmark = cmd.hasOption(OPT_MEASURE_THROUGHPUT);
    boolean doVerify = !cmd.hasOption(OPT_OMIT_CORRECTNESS_TEST);
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataBlockEncodingTool.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataBlockEncodingTool.java
@ -0,0 +1,147 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.ArrayBackedTag;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.Tag;
+import org.apache.hadoop.hbase.io.compress.Compression;
+import org.apache.hadoop.hbase.io.hfile.HFileContext;
+import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
+import org.apache.hadoop.hbase.testclassification.MiscTests;
+import org.apache.hadoop.hbase.testclassification.SmallTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.Before;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ *  Test DataBlockEncodingTool.
+ */
+@Category({MiscTests.class, SmallTests.class})
+public class TestDataBlockEncodingTool {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+      HBaseClassTestRule.forClass(TestDataBlockEncodingTool.class);
+
+  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+  private static final String ROOT_DIR =
+      TEST_UTIL.getDataTestDir("TestDataBlockEncodingTool").toString();
+  private static final Configuration conf = TEST_UTIL.getConfiguration();
+  private static FileSystem fs;
+  private static StoreFileWriter sfw;
+
+  @Before
+  public void setUp() throws IOException {
+    fs = TEST_UTIL.getTestFileSystem();
+  }
+
+  private void testHFile(String fileName, boolean useTags, boolean allTags) throws IOException {
+    Path path = new Path(ROOT_DIR, fileName);
+    try {
+      createHFileWithTags(path, useTags, allTags);
+      testDataBlockingTool(path);
+    } finally {
+      if (fs.exists(path)) {
+        fs.delete(path, false);
+      }
+    }
+  }
+
+  private void createHFileWithTags(Path path, boolean useTags, boolean allTags) throws IOException {
+    HFileContext meta = new HFileContextBuilder()
+        .withBlockSize(64 * 1024)
+        .withIncludesTags(useTags).build();
+    sfw =
+        new StoreFileWriter.Builder(conf, fs)
+            .withFilePath(path)
+            .withFileContext(meta).build();
+    long now = System.currentTimeMillis();
+    byte[] FAMILY = Bytes.toBytes("cf");
+    byte[] QUALIFIER = Bytes.toBytes("q");
+    try {
+      for (char d = 'a'; d <= 'z'; d++) {
+        for (char e = 'a'; e <= 'z'; e++) {
+          byte[] b = new byte[] { (byte) d, (byte) e };
+          KeyValue kv;
+          if (useTags) {
+            if (allTags) {
+              // Write cells with tags to HFile.
+              Tag[] tags = new Tag[]{
+                new ArrayBackedTag((byte) 0, Bytes.toString(b)),
+                new ArrayBackedTag((byte) 0, Bytes.toString(b))};
+              kv = new KeyValue(b, FAMILY, QUALIFIER, now, b, tags);
+            } else {
+              // Write half cells with tags and half without tags to HFile.
+              if ((e - 'a') % 2 == 0) {
+                kv = new KeyValue(b, FAMILY, QUALIFIER, now, b);
+              } else {
+                Tag[] tags = new Tag[]{
+                  new ArrayBackedTag((byte) 0, Bytes.toString(b)),
+                  new ArrayBackedTag((byte) 0, Bytes.toString(b))};
+                kv = new KeyValue(b, FAMILY, QUALIFIER, now, b, tags);
+              }
+            }
+          } else {
+            // Write cells without tags to HFile.
+            kv = new KeyValue(b, FAMILY, QUALIFIER, now, b);
+          }
+          sfw.append(kv);
+        }
+      }
+      sfw.appendMetadata(0, false);
+    } finally {
+      sfw.close();
+    }
+  }
+
+  private static void testDataBlockingTool(Path path) throws IOException {
+    Configuration conf = HBaseConfiguration.create();
+    int maxKV = Integer.MAX_VALUE;
+    boolean doVerify = true;
+    boolean doBenchmark = true;
+    String testHFilePath = path.toString();
+    DataBlockEncodingTool.testCodecs(conf, maxKV, testHFilePath,
+        Compression.Algorithm.GZ.getName(), doBenchmark, doVerify);
+  }
+
+  @Test
+  public void testHFileAllCellsWithTags() throws IOException {
+    testHFile("1234567890", true, true);
+  }
+
+  @Test
+  public void testHFileAllCellsWithoutTags() throws IOException {
+    testHFile("1234567089", false, false);
+  }
+
+  @Test
+  public void testHFileHalfCellsWithTags() throws IOException {
+    testHFile("1234560789", true, false);
+  }
+}
--- a/src/main/asciidoc/_chapters/ops_mgt.adoc
+++ b/src/main/asciidoc/_chapters/ops_mgt.adoc
@ -1005,6 +1005,30 @@ drop_namespace 'pre_upgrade_cleanup'
 For further information, please refer to
 link:https://issues.apache.org/jira/browse/HBASE-20649?focusedCommentId=16535476#comment-16535476[HBASE-20649].

+=== Data Block Encoding Tool
+
+Tests various compression algorithms with different data block encoder for key compression on an existing HFile.
+Useful for testing, debugging and benchmarking.
+
+You must specify `-f` which is the full path of the HFile.
+
+The result shows both the performance (MB/s) of compression/decompression and encoding/decoding, and the data savings on the HFile.
+
+----
+
+$ bin/hbase org.apache.hadoop.hbase.regionserver.DataBlockEncodingTool
+Usages: hbase org.apache.hadoop.hbase.regionserver.DataBlockEncodingTool
+Options:
+        -f HFile to analyse (REQUIRED)
+        -n Maximum number of key/value pairs to process in a single benchmark run.
+        -b Whether to run a benchmark to measure read throughput.
+        -c If this is specified, no correctness testing will be done.
+        -a What kind of compression algorithm use for test. Default value: GZ.
+        -t Number of times to run each benchmark. Default value: 12.
+        -omit Number of first runs of every benchmark to omit from statistics. Default value: 2.
+
+----
+
 [[ops.regionmgt]]
 == Region Management