HBASE-14755 Fix some broken links and HTML problems
This commit is contained in:
parent
68b94886a5
commit
bfa3689190

@@ -44,7 +44,7 @@ import org.apache.hadoop.mapreduce.Partitioner;
* <p>This class is not suitable as partitioner creating hfiles
* for incremental bulk loads as region spread will likely change between time of
* hfile creation and load time. See {@link LoadIncrementalHFiles}
-* and <a href="http://hbase.apache.org/docs/current/bulk-loads.html">Bulk Load</a>.
+* and <a href="http://hbase.apache.org/book.html#arch.bulk.load">Bulk Load</a>.
*
* @param <KEY> The type of the key.
* @param <VALUE> The type of the value.

pom.xml
@@ -2754,6 +2754,9 @@
      <javadocDir>${project.reporting.outputDirectory}/devapidocs</javadocDir>
      <testJavadocDir>${project.reporting.outputDirectory}/testdevapidocs</testJavadocDir>
      <destDir>${project.reporting.outputDirectory}/xref</destDir>
+     <excludes>
+       <exclude>**/generated/*</exclude>
+     </excludes>
    </configuration>
  </plugin>

@@ -2767,7 +2770,6 @@
      <id>devapi</id>
      <reports>
        <report>aggregate</report>
        <report>test-aggregate</report>
      </reports>
      <configuration>
        <destDir>devapidocs</destDir>

@@ -2778,7 +2780,8 @@
        <exclude>**/protobuf/*</exclude>
        <exclude>**/*.scala</exclude>
      </sourceFileExcludes>
-     <excludePackageNames>*.generated.master:*.generated:org.apache.hadoop.hbase.tmpl.common:com.google.protobuf:org.apache.hadoop.hbase.spark</excludePackageNames>
+     <excludePackageNames>org.apache.hadoop.hbase.tmpl.common:com.google.protobuf:org.apache.hadoop.hbase.spark:org.apache.hadoop.hbase.generated*</excludePackageNames>
      <show>private</show> <!-- (shows all classes and members) -->
      <quiet>true</quiet>
      <linksource>true</linksource>
      <sourcetab>2</sourcetab>

@@ -2814,13 +2817,14 @@
      <configuration>
        <destDir>testdevapidocs</destDir>
        <name>Developer API</name>
-       <description>The full HBase API, including private and unstable APIs</description>
+       <description>The full HBase API test code, including private and unstable APIs</description>
        <sourceFileExcludes>
          <exclude>**/generated/*</exclude>
          <exclude>**/protobuf/*</exclude>
          <exclude>**/*.scala</exclude>
        </sourceFileExcludes>
-       <excludePackageNames>*.generated.master:*.generated:org.apache.hadoop.hbase.tmpl.common:com.google.protobuf:org.apache.hadoop.hbase.spark</excludePackageNames>
+       <excludePackageNames>org.apache.hadoop.hbase.tmpl.common:com.google.protobuf:org.apache.hadoop.hbase.spark:org.apache.hadoop.hbase.generated*</excludePackageNames>
        <show>private</show> <!-- (shows all classes and members) -->
        <quiet>true</quiet>
        <linksource>true</linksource>
        <sourcetab>2</sourcetab>

@@ -2869,7 +2873,7 @@
      <name>User API</name>
      <description>The HBase Application Programmer's API</description>
      <excludePackageNames>
-       org.apache.hadoop.hbase.backup*:org.apache.hadoop.hbase.catalog:org.apache.hadoop.hbase.client.coprocessor:org.apache.hadoop.hbase.client.metrics:org.apache.hadoop.hbase.codec*:org.apache.hadoop.hbase.constraint:org.apache.hadoop.hbase.coprocessor.*:org.apache.hadoop.hbase.executor:org.apache.hadoop.hbase.fs:*.generated.*:org.apache.hadoop.hbase.io.hfile.*:org.apache.hadoop.hbase.mapreduce.hadoopbackport:org.apache.hadoop.hbase.mapreduce.replication:org.apache.hadoop.hbase.master.*:org.apache.hadoop.hbase.metrics*:org.apache.hadoop.hbase.migration:org.apache.hadoop.hbase.monitoring:org.apache.hadoop.hbase.p*:org.apache.hadoop.hbase.regionserver.compactions:org.apache.hadoop.hbase.regionserver.handler:org.apache.hadoop.hbase.regionserver.snapshot:org.apache.hadoop.hbase.replication.*:org.apache.hadoop.hbase.rest.filter:org.apache.hadoop.hbase.rest.model:org.apache.hadoop.hbase.rest.p*:org.apache.hadoop.hbase.security.*:org.apache.hadoop.hbase.thrift*:org.apache.hadoop.hbase.tmpl.*:org.apache.hadoop.hbase.tool:org.apache.hadoop.hbase.trace:org.apache.hadoop.hbase.util.byterange*:org.apache.hadoop.hbase.util.test:org.apache.hadoop.hbase.util.vint:org.apache.hadoop.hbase.zookeeper.lock:org.apache.hadoop.metrics2*
+       org.apache.hadoop.hbase.backup*:org.apache.hadoop.hbase.catalog:org.apache.hadoop.hbase.client.coprocessor:org.apache.hadoop.hbase.client.metrics:org.apache.hadoop.hbase.codec*:org.apache.hadoop.hbase.constraint:org.apache.hadoop.hbase.coprocessor.*:org.apache.hadoop.hbase.executor:org.apache.hadoop.hbase.fs:*.generated.*:org.apache.hadoop.hbase.io.hfile.*:org.apache.hadoop.hbase.mapreduce.hadoopbackport:org.apache.hadoop.hbase.mapreduce.replication:org.apache.hadoop.hbase.master.*:org.apache.hadoop.hbase.metrics*:org.apache.hadoop.hbase.migration:org.apache.hadoop.hbase.monitoring:org.apache.hadoop.hbase.p*:org.apache.hadoop.hbase.regionserver.compactions:org.apache.hadoop.hbase.regionserver.handler:org.apache.hadoop.hbase.regionserver.snapshot:org.apache.hadoop.hbase.replication.*:org.apache.hadoop.hbase.rest.filter:org.apache.hadoop.hbase.rest.model:org.apache.hadoop.hbase.rest.p*:org.apache.hadoop.hbase.security.*:org.apache.hadoop.hbase.thrift*:org.apache.hadoop.hbase.tmpl.*:org.apache.hadoop.hbase.tool:org.apache.hadoop.hbase.trace:org.apache.hadoop.hbase.util.byterange*:org.apache.hadoop.hbase.util.test:org.apache.hadoop.hbase.util.vint:org.apache.hadoop.hbase.zookeeper.lock:org.apache.hadoop.metrics2*:org.apache.hadoop.hbase.io.compress*
      </excludePackageNames>
      <!-- switch on dependency-driven aggregation -->
      <includeDependencySources>false</includeDependencySources>

@@ -2877,11 +2881,8 @@
      <!-- include ONLY dependencies I control -->
      <dependencySourceInclude>org.apache.hbase:hbase-annotations</dependencySourceInclude>
    </dependencySourceIncludes>
    <outputDirectory>${project.reporting.outputDirectory}/devapidocs</outputDirectory>
    <name>Developer API</name>
    <description>The full HBase API, including private and unstable APIs</description>
    <sourceFilesExclude>**/generated/*</sourceFilesExclude>
    <excludePackageNames>org.apache.hadoop.hbase.generated.master:org.apache.hadoop.hbase.protobuf.generated:org.apache.hadoop.hbase.tmpl.common</excludePackageNames>
    <show>protected</show> <!-- (shows only public and protected classes and members) -->
    <quiet>true</quiet>
    <linksource>true</linksource>
    <sourcetab>2</sourcetab>

@@ -2909,6 +2910,7 @@
        <inherited>false</inherited>
      </configuration>
    </reportSet>
    <!-- User Test API -->
    <reportSet>
      <id>testuserapi</id>
      <reports>

@@ -2928,7 +2930,7 @@
      <name>User API</name>
      <description>The HBase Application Programmer's API</description>
      <excludePackageNames>
-       org.apache.hadoop.hbase.backup*:org.apache.hadoop.hbase.catalog:org.apache.hadoop.hbase.client.coprocessor:org.apache.hadoop.hbase.client.metrics:org.apache.hadoop.hbase.codec*:org.apache.hadoop.hbase.constraint:org.apache.hadoop.hbase.coprocessor.*:org.apache.hadoop.hbase.executor:org.apache.hadoop.hbase.fs:*.generated.*:org.apache.hadoop.hbase.io.hfile.*:org.apache.hadoop.hbase.mapreduce.hadoopbackport:org.apache.hadoop.hbase.mapreduce.replication:org.apache.hadoop.hbase.master.*:org.apache.hadoop.hbase.metrics*:org.apache.hadoop.hbase.migration:org.apache.hadoop.hbase.monitoring:org.apache.hadoop.hbase.p*:org.apache.hadoop.hbase.regionserver.compactions:org.apache.hadoop.hbase.regionserver.handler:org.apache.hadoop.hbase.regionserver.snapshot:org.apache.hadoop.hbase.replication.*:org.apache.hadoop.hbase.rest.filter:org.apache.hadoop.hbase.rest.model:org.apache.hadoop.hbase.rest.p*:org.apache.hadoop.hbase.security.*:org.apache.hadoop.hbase.thrift*:org.apache.hadoop.hbase.tmpl.*:org.apache.hadoop.hbase.tool:org.apache.hadoop.hbase.trace:org.apache.hadoop.hbase.util.byterange*:org.apache.hadoop.hbase.util.test:org.apache.hadoop.hbase.util.vint:org.apache.hadoop.hbase.zookeeper.lock:org.apache.hadoop.metrics2*
+       org.apache.hadoop.hbase.backup*:org.apache.hadoop.hbase.catalog:org.apache.hadoop.hbase.client.coprocessor:org.apache.hadoop.hbase.client.metrics:org.apache.hadoop.hbase.codec*:org.apache.hadoop.hbase.constraint:org.apache.hadoop.hbase.coprocessor.*:org.apache.hadoop.hbase.executor:org.apache.hadoop.hbase.fs:*.generated.*:org.apache.hadoop.hbase.io.hfile.*:org.apache.hadoop.hbase.mapreduce.hadoopbackport:org.apache.hadoop.hbase.mapreduce.replication:org.apache.hadoop.hbase.master.*:org.apache.hadoop.hbase.metrics*:org.apache.hadoop.hbase.migration:org.apache.hadoop.hbase.monitoring:org.apache.hadoop.hbase.p*:org.apache.hadoop.hbase.regionserver.compactions:org.apache.hadoop.hbase.regionserver.handler:org.apache.hadoop.hbase.regionserver.snapshot:org.apache.hadoop.hbase.replication.*:org.apache.hadoop.hbase.rest.filter:org.apache.hadoop.hbase.rest.model:org.apache.hadoop.hbase.rest.p*:org.apache.hadoop.hbase.security.*:org.apache.hadoop.hbase.thrift*:org.apache.hadoop.hbase.tmpl.*:org.apache.hadoop.hbase.tool:org.apache.hadoop.hbase.trace:org.apache.hadoop.hbase.util.byterange*:org.apache.hadoop.hbase.util.test:org.apache.hadoop.hbase.util.vint:org.apache.hadoop.hbase.zookeeper.lock:org.apache.hadoop.metrics2*:org.apache.hadoop.hbase.io.compress*
      </excludePackageNames>
      <!-- switch on dependency-driven aggregation -->
      <includeDependencySources>false</includeDependencySources>

@@ -2936,11 +2938,8 @@
      <!-- include ONLY dependencies I control -->
      <dependencySourceInclude>org.apache.hbase:hbase-annotations</dependencySourceInclude>
    </dependencySourceIncludes>
    <outputDirectory>${project.reporting.outputDirectory}/devapidocs</outputDirectory>
    <name>Developer API</name>
    <description>The full HBase API, including private and unstable APIs</description>
    <sourceFilesExclude>**/generated/*</sourceFilesExclude>
    <excludePackageNames>org.apache.hadoop.hbase.generated.master:org.apache.hadoop.hbase.protobuf.generated:org.apache.hadoop.hbase.tmpl.common</excludePackageNames>
    <show>protected</show> <!-- (shows only public and protected classes and members) -->
    <quiet>true</quiet>
    <linksource>true</linksource>
    <sourcetab>2</sourcetab>
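
These reportSets are rendered during site generation. A minimal invocation sketch (assuming a stock Maven setup; the heap setting is illustrative, since the aggregate javadoc runs are memory-hungry):

[source, bash]
----
$ MAVEN_OPTS="-Xmx3g" mvn clean site
----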

@@ -44,16 +44,16 @@ An HFile in version 1 format is structured as follows:
.HFile V1 Format
image::hfile.png[HFile Version 1]

==== Block index format in version 1

The block index in version 1 is very straightforward.
For each entry, it contains:

. Offset (long)
. Uncompressed size (int)
. Key (a serialized byte array written using Bytes.writeByteArray)
.. Key length as a variable-length integer (VInt)
.. Key bytes

The number of entries in the block index is stored in the fixed file trailer, and has to be passed in to the method that reads the block index.

@@ -66,7 +66,7 @@ We fix this limitation in version 2, where we store on-disk block size instead o

Note: this feature was introduced in HBase 0.92

==== Motivation

We found it necessary to revise the HFile format after encountering high memory usage and slow startup times caused by large Bloom filters and block indexes in the region server.
Bloom filters can get as large as 100 MB per HFile, which adds up to 2 GB when aggregated over 20 regions.

@@ -80,7 +80,7 @@ Bloom filter blocks and index blocks (we call these "inline blocks") become inte

HFile is a low-level file format by design, and it should not deal with application-specific details such as Bloom filters, which are handled at StoreFile level.
Therefore, we call Bloom filter blocks in an HFile "inline" blocks.
We also supply HFile with an interface to write those inline blocks.

Another format modification aimed at reducing the region server startup time is to use a contiguous "load-on-open" section that has to be loaded in memory at the time an HFile is being opened.
Currently, as an HFile opens, there are separate seek operations to read the trailer, data/meta indexes, and file info.

@@ -91,57 +91,57 @@ In version 2, we seek once to read the trailer and seek again to read everything
==== Overview of Version 2

The version of HBase introducing the above features reads both version 1 and 2 HFiles, but only writes version 2 HFiles.
A version 2 HFile is structured as follows:

.HFile Version 2 Structure
image:hfilev2.png[HFile Version 2]

==== Unified version 2 block format

In version 2, every block in the data section contains the following fields:

. 8 bytes: Block type, a sequence of bytes equivalent to version 1's "magic records". Supported block types are:
.. DATA – data blocks
.. LEAF_INDEX – leaf-level index blocks in a multi-level block index
.. BLOOM_CHUNK – Bloom filter chunks
.. META – meta blocks (not used for Bloom filters in version 2 anymore)
.. INTERMEDIATE_INDEX – intermediate-level index blocks in a multi-level block index
.. ROOT_INDEX – root-level index blocks in a multi-level block index
.. FILE_INFO – the ``file info'' block, a small key-value map of metadata
.. BLOOM_META – a Bloom filter metadata block in the load-on-open section
.. TRAILER – a fixed-size file trailer.
As opposed to the above, this is not an HFile v2 block but a fixed-size (for each HFile version) data structure
.. INDEX_V1 – this block type is only used for legacy HFile v1 blocks
. Compressed size of the block's data, not including the header (int).
+
Can be used for skipping the current data block when scanning HFile data.
. Uncompressed size of the block's data, not including the header (int)
+
This is equal to the compressed size if the compression algorithm is NONE
. File offset of the previous block of the same type (long)
+
Can be used for seeking to the previous data/index block
. Compressed data (or uncompressed data if the compression algorithm is NONE).
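
To make the field layout concrete, here is a minimal sketch of reading the fixed fields that precede each block's data. It is illustrative only (class and method names are invented; the real reader is the HFileBlock code in the HBase source):

[source,java]
----
import java.io.DataInputStream;
import java.io.IOException;

public final class V2BlockHeaderSketch {
  public static void readFixedFields(DataInputStream in) throws IOException {
    byte[] blockType = new byte[8];                   // magic record, e.g. DATA, LEAF_INDEX
    in.readFully(blockType);
    int compressedSizeWithoutHeader = in.readInt();   // lets a scanner skip this block
    int uncompressedSizeWithoutHeader = in.readInt(); // equals the compressed size for NONE
    long prevBlockOffset = in.readLong();             // seek back to previous block of same type
    // compressed (or raw) block data follows
  }
}
----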

The above format of blocks is used in the following HFile sections:

Scanned block section::
The section is named so because it contains all data blocks that need to be read when an HFile is scanned sequentially.
Also contains leaf block index and Bloom chunk blocks.
Non-scanned block section::
This section still contains unified-format v2 blocks but it does not have to be read when doing a sequential scan.
This section contains "meta" blocks and intermediate-level index blocks.

We are supporting "meta" blocks in version 2 the same way they were supported in version 1, even though we do not store Bloom filter data in these blocks anymore.

==== Block index in version 2

There are three types of block indexes in HFile version 2, stored in two different formats (root and non-root):

. Data index -- version 2 multi-level block index, consisting of:
.. Version 2 root index, stored in the data block index section of the file
.. Optionally, version 2 intermediate levels, stored in the non-root format in the data index section of the file. Intermediate levels can only be present if leaf level blocks are present
.. Optionally, version 2 leaf levels, stored in the non-root format inline with data blocks
. Meta index -- version 2 root index format only, stored in the meta index section of the file
. Bloom index -- version 2 root index format only, stored in the ``load-on-open'' section as part of Bloom filter metadata.

@@ -150,19 +150,19 @@ There are three types of block indexes in HFile version 2, stored in two differe
This format applies to:

. Root level of the version 2 data index
. Entire meta and Bloom indexes in version 2, which are always single-level.

A version 2 root index block is a sequence of entries of the following format, similar to entries of a version 1 block index, but storing on-disk size instead of uncompressed size.

. Offset (long)
+
This offset may point to a data block or to a deeper-level index block.

. On-disk size (int)
. Key (a serialized byte array stored using Bytes.writeByteArray)
+
.. Key length (VInt)
.. Key bytes

A single-level version 2 block index consists of just a single root index block.
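
A sketch of decoding one root index entry as laid out above (names are illustrative; Bytes.readByteArray is the reader counterpart of Bytes.writeByteArray):

[source,java]
----
import java.io.DataInput;
import java.io.IOException;
import org.apache.hadoop.hbase.util.Bytes;

public final class RootIndexEntrySketch {
  long offset;      // points to a data block or a deeper-level index block
  int onDiskSize;
  byte[] key;

  static RootIndexEntrySketch read(DataInput in) throws IOException {
    RootIndexEntrySketch entry = new RootIndexEntrySketch();
    entry.offset = in.readLong();
    entry.onDiskSize = in.readInt();
    entry.key = Bytes.readByteArray(in);  // VInt length followed by the key bytes
    return entry;
  }
}
----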

@@ -172,13 +172,13 @@ For the data index and the meta index the number of entries is stored in the tra
For a multi-level block index we also store the following fields in the root index block in the load-on-open section of the HFile, in addition to the data structure described above:

. Middle leaf index block offset
. Middle leaf block on-disk size (meaning the leaf index block containing the reference to the ``middle'' data block of the file)
. The index of the mid-key (defined below) in the middle leaf-level block.

These additional fields are used to efficiently retrieve the mid-key of the HFile used in HFile splits, which we define as the first key of the block with a zero-based index of (n – 1) / 2, if the total number of blocks in the HFile is n.
This definition is consistent with how the mid-key was determined in HFile version 1, and is reasonable in general, because blocks are likely to be the same size on average, but we don't have any estimates on individual key/value pair sizes.

@@ -189,47 +189,47 @@ When reading the HFile and the mid-key is requested, we retrieve the middle leaf
==== Non-root block index format in version 2

This format applies to intermediate-level and leaf index blocks of a version 2 multi-level data block index.
Every non-root index block is structured as follows.

. numEntries: the number of entries (int).
. entryOffsets: the ``secondary index'' of offsets of entries in the block, to facilitate a quick binary search on the key (numEntries + 1 int values). The last value is the total length of all entries in this index block.
For example, in a non-root index block with entry sizes 60, 80, 50 the ``secondary index'' will contain the following int array: {0, 60, 140, 190}.
. Entries.
Each entry contains:
+
.. Offset of the block referenced by this entry in the file (long)
.. On-disk size of the referenced block (int)
.. Key.
The length can be calculated from entryOffsets.
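
Since each entry is a fixed-width offset (8 bytes) and on-disk size (4 bytes) followed by the key, the key length falls out of adjacent entryOffsets values. A small sketch, using the {0, 60, 140, 190} example above:

[source,java]
----
public final class NonRootIndexSketch {
  // Total length of entry i; for entryOffsets {0, 60, 140, 190}, entryLength(offsets, 1) == 80.
  static int entryLength(int[] entryOffsets, int i) {
    return entryOffsets[i + 1] - entryOffsets[i];
  }

  // Key length of entry i: strip the long offset (8 bytes) and the int on-disk size (4 bytes).
  static int keyLength(int[] entryOffsets, int i) {
    return entryLength(entryOffsets, i) - 8 - 4;
  }
}
----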

==== Bloom filters in version 2

In contrast with version 1, in a version 2 HFile Bloom filter metadata is stored in the load-on-open section of the HFile for quick startup.

. A compound Bloom filter.
+
.. Bloom filter version = 3 (int). There used to be a DynamicByteBloomFilter class that had the Bloom filter version number 2
.. The total byte size of all compound Bloom filter chunks (long)
.. Number of hash functions (int)
.. Type of hash functions (int)
.. The total key count inserted into the Bloom filter (long)
.. The maximum total number of keys in the Bloom filter (long)
.. The number of chunks (int)
.. Comparator class used for Bloom filter keys, a UTF-8 encoded string stored using Bytes.writeByteArray
.. Bloom block index in the version 2 root block index format
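
A sketch of reading those metadata fields in order (field names are illustrative; the real logic lives in the compound Bloom filter code in the HBase source):

[source,java]
----
import java.io.DataInput;
import java.io.IOException;
import org.apache.hadoop.hbase.util.Bytes;

public final class BloomMetaSketch {
  static void read(DataInput in) throws IOException {
    int version = in.readInt();            // expected to be 3
    long totalByteSize = in.readLong();    // all chunks combined
    int hashCount = in.readInt();
    int hashType = in.readInt();
    long keyCount = in.readLong();
    long maxKeys = in.readLong();
    int numChunks = in.readInt();
    byte[] comparatorClassName = Bytes.readByteArray(in);  // UTF-8 class name
    // followed by the Bloom block index in the version 2 root index format
  }
}
----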

==== File Info format in versions 1 and 2

-The file info block is a serialized link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/io/HbaseMapWritable.html[HbaseMapWritable] (essentially a map from byte arrays to byte arrays) with the following keys, among others.
+The file info block is a serialized link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/io/HbaseMapWritable.html[HbaseMapWritable] (essentially a map from byte arrays to byte arrays) with the following keys, among others.
StoreFile-level logic adds more keys to this.

[cols="1,1", frame="all"]
|===
|hfile.LASTKEY| The last key of the file (byte array)
|hfile.AVG_KEY_LEN| The average key length in the file (int)
|hfile.AVG_VALUE_LEN| The average value length in the file (int)
|===

File info format did not change in version 2.

@@ -242,7 +242,7 @@ This is because we need to know the comparator at the time of parsing the load-o

The following table shows common and different fields between fixed file trailers in versions 1 and 2.
Note that the size of the trailer is different depending on the version, so it is ``fixed'' only within one version.
However, the version is always stored as the last four-byte integer in the file.
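
This makes version detection cheap: read the last four bytes before deciding how large a trailer to parse. A sketch (in later HFile versions this integer also packs a minor version into its high bits, which the sketch ignores):

[source,java]
----
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;

public final class TrailerVersionSketch {
  static int readVersion(FSDataInputStream in, long fileSize) throws IOException {
    in.seek(fileSize - 4);  // the version is always the last four-byte integer
    return in.readInt();
  }
}
----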

.Differences between HFile Versions 1 and 2
[cols="1,1", frame="all"]

@@ -290,42 +290,42 @@ This optimization (implemented by the getShortMidpointKey method) is inspired by
Note: this feature was introduced in HBase 0.98

[[hfilev3.motivation]]
==== Motivation

Version 3 of HFile makes changes needed to ease management of encryption at rest and cell-level metadata (which in turn is needed for cell-level ACLs and cell-level visibility labels). For more information see <<hbase.encryption.server,hbase.encryption.server>>, <<hbase.tags,hbase.tags>>, <<hbase.accesscontrol.configuration,hbase.accesscontrol.configuration>>, and <<hbase.visibility.labels,hbase.visibility.labels>>.

[[hfilev3.overview]]
==== Overview

The version of HBase introducing the above features reads HFiles in versions 1, 2, and 3 but only writes version 3 HFiles.
Version 3 HFiles are structured the same as version 2 HFiles.
For more information see <<hfilev2.overview,hfilev2.overview>>.

[[hfilev3.infoblock]]
==== File Info Block in Version 3

Version 3 added two additional pieces of information to the reserved keys in the file info block.

[cols="1,1", frame="all"]
|===
| hfile.MAX_TAGS_LEN | The maximum number of bytes needed to store the serialized tags for any single cell in this hfile (int)
| hfile.TAGS_COMPRESSED | Does the block encoder for this hfile compress tags? (boolean). Should only be present if hfile.MAX_TAGS_LEN is also present.
|===

When reading a Version 3 HFile the presence of `MAX_TAGS_LEN` is used to determine how to deserialize the cells within a data block.
Therefore, consumers must read the file's info block prior to reading any data blocks.

When writing a Version 3 HFile, HBase will always include `MAX_TAGS_LEN` when flushing the memstore to the underlying filesystem and when using prefix tree encoding for data blocks, as described in <<compression,compression>>.

When compacting extant files, the default writer will omit `MAX_TAGS_LEN` if all of the files selected do not themselves contain any cells with tags.

See <<compaction,compaction>> for details on the compaction file selection algorithm.

[[hfilev3.datablock]]
==== Data Blocks in Version 3

Within an HFile, HBase cells are stored in data blocks as a sequence of KeyValues (see <<hfilev1.overview,hfilev1.overview>>, or link:http://www.larsgeorge.com/2009/10/hbase-architecture-101-storage.html[Lars George's
excellent introduction to HBase Storage]). In version 3, these KeyValues optionally will include a set of 0 or more tags:

[cols="1,1", frame="all"]
|===

@@ -335,14 +335,14 @@ Within an HFile, HBase cells are stored in data blocks as a sequence of KeyValue
2+| Key bytes (variable)
2+| Value bytes (variable)
| | Tags Length (2 bytes)
| | Tags bytes (variable)
|===

If the info block for a given HFile contains an entry for `MAX_TAGS_LEN`, each cell will have the length of that cell's tags included, even if that length is zero.
The actual tags are stored as a sequence of tag length (2 bytes), tag type (1 byte), tag bytes (variable). The format of an individual tag's bytes depends on the tag type.
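
A sketch of walking one cell's tag sequence (assuming, as in the HBase implementation, that the 2-byte tag length covers the type byte plus the tag payload):

[source,java]
----
import java.nio.ByteBuffer;

public final class TagWalkSketch {
  static void walkTags(ByteBuffer buf, int tagsLength) {
    int end = buf.position() + tagsLength;
    while (buf.position() < end) {
      short tagLen = buf.getShort();            // tag length (2 bytes)
      byte tagType = buf.get();                 // tag type (1 byte)
      byte[] tagPayload = new byte[tagLen - 1]; // rest of the tag's bytes
      buf.get(tagPayload);                      // interpretation depends on tagType
    }
  }
}
----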

Note that the dependence on the contents of the info block implies that prior to reading any data blocks you must first process a file's info block.
It also implies that prior to writing a data block you must know if the file's info block will include `MAX_TAGS_LEN`.

[[hfilev3.fixedtrailer]]
==== Fixed File Trailer in Version 3

@@ -350,6 +350,6 @@ It also implies that prior to writing a data block you must know if the file's i
The fixed file trailers written with HFile version 3 are always serialized with protocol buffers.
Additionally, it adds an optional field to the version 2 protocol buffer named encryption_key.
If HBase is configured to encrypt HFiles this field will store a data encryption key for this particular HFile, encrypted with the current cluster master key using AES.
For more information see <<hbase.encryption.server,hbase.encryption.server>>.

:numbered:

@@ -138,7 +138,10 @@ A region with an empty start key is the first region in a table.
If a region has both an empty start and an empty end key, it is the only region in the table
====

-In the (hopefully unlikely) event that programmatic processing of catalog metadata is required, see the link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/util/Writables.html#getHRegionInfo%28byte[]%29[Writables] utility.
+In the (hopefully unlikely) event that programmatic processing of catalog metadata
+is required, see the
+link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/util/Writables.html#getHRegionInfo%28byte[]%29[Writables]
+utility.

[[arch.catalog.startup]]
=== Startup Sequencing

@@ -235,11 +238,11 @@ Please use link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/client/C
[[client.writebuffer]]
=== WriteBuffer and Batch Methods

-In HBase 1.0 and later, link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/client/HTable.html[HTable] is deprecated in favor of link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/client/Table.html[Table]. `Table` does not use autoflush. To do buffered writes, use the BufferedMutator class.
+In HBase 1.0 and later, link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/client/HTable.html[HTable] is deprecated in favor of link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/client/Table.html[Table]. `Table` does not use autoflush. To do buffered writes, use the BufferedMutator class.
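
A minimal buffered-write sketch against the 1.0+ API (table, family, and values are examples only):

[source,java]
----
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class BufferedWriteExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    try (Connection connection = ConnectionFactory.createConnection(conf);
         BufferedMutator mutator = connection.getBufferedMutator(TableName.valueOf("test"))) {
      Put put = new Put(Bytes.toBytes("row1"));
      put.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("q"), Bytes.toBytes("value"));
      mutator.mutate(put);  // buffered; flushed when the buffer fills or the mutator closes
    }
  }
}
----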

Before a `Table` or `HTable` instance is discarded, invoke either `close()` or `flushCommits()`, so `Put`s will not be lost.

-For additional information on write durability, review the link:../acid-semantics.html[ACID semantics] page.
+For additional information on write durability, review the link:/acid-semantics.html[ACID semantics] page.

For fine-grained control of batching of ``Put``s or ``Delete``s, see the link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/client/Table.html#batch%28java.util.List%29[batch] methods on Table.

@@ -759,7 +762,7 @@ When we go to look for a cached block, we look first in L1 and if none found, th
Let us call this deploy format, _Raw L1+L2_.

Other BucketCache configs include: specifying a location to persist cache to across restarts, how many threads to use writing the cache, etc.
-See the link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/io/hfile/CacheConfig.html[CacheConfig.html] class for configuration options and descriptions.
+See the link:https://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/io/hfile/CacheConfig.html[CacheConfig.html] class for configuration options and descriptions.

@@ -1368,8 +1371,10 @@ The RegionServer splits a region, offlines the split region and then adds the da
See <<disable.splitting>> for how to manually manage splits (and for why you might do this).

==== Custom Split Policies
-You can override the default split policy using a custom link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/regionserver/RegionSplitPolicy.html[RegionSplitPolicy](HBase 0.94+). Typically a custom split policy should extend
-HBase's default split policy: link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/regionserver/IncreasingToUpperBoundRegionSplitPolicy.html[IncreasingToUpperBoundRegionSplitPolicy].
+You can override the default split policy using a custom
+link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/regionserver/RegionSplitPolicy.html[RegionSplitPolicy](HBase 0.94+).
+Typically a custom split policy should extend HBase's default split policy:
+link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/regionserver/IncreasingToUpperBoundRegionSplitPolicy.html[IncreasingToUpperBoundRegionSplitPolicy].

The policy can be set globally through the HBase configuration or on a per-table
basis.
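
For the global route, a sketch of the hbase-site.xml entry (the property name below is the key read by RegionSplitPolicy; verify it against your release):

[source,xml]
----
<property>
  <name>hbase.regionserver.region.split.policy</name>
  <value>org.apache.hadoop.hbase.regionserver.IncreasingToUpperBoundRegionSplitPolicy</value>
</property>
----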

@@ -1388,7 +1393,7 @@
HTableDescriptor tableDesc = new HTableDescriptor("test");
tableDesc.setValue(HTableDescriptor.SPLIT_POLICY, ConstantSizeRegionSplitPolicy.class.getName());
tableDesc.addFamily(new HColumnDescriptor(Bytes.toBytes("cf1")));
admin.createTable(tableDesc);
----

[source]

@@ -1398,7 +1403,10 @@ hbase> create 'test', {METHOD => 'table_att', CONFIG => {'SPLIT_POLICY' => 'org.
{NAME => 'cf1'}
----

-The default split policy can be overwritten using a custom link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/regionserver/RegionSplitPolicy.html[RegionSplitPolicy(HBase 0.94+)]. Typically a custom split policy should extend HBase's default split policy: link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/regionserver/ConstantSizeRegionSplitPolicy.html[ConstantSizeRegionSplitPolicy].
+The default split policy can be overwritten using a custom
+link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/regionserver/RegionSplitPolicy.html[RegionSplitPolicy(HBase 0.94+)].
+Typically a custom split policy should extend HBase's default split policy:
+link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/regionserver/ConstantSizeRegionSplitPolicy.html[ConstantSizeRegionSplitPolicy].

The policy can be set globally through the HBaseConfiguration used or on a per table basis:
[source,java]

@@ -1445,9 +1453,15 @@ Using a Custom Algorithm::
The RegionSplitter tool is provided with HBase, and uses a _SplitAlgorithm_ to determine split points for you.
As parameters, you give it the algorithm, desired number of regions, and column families.
It includes two split algorithms.
-The first is the `link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/util/RegionSplitter.HexStringSplit.html[HexStringSplit]` algorithm, which assumes the row keys are hexadecimal strings.
-The second, `link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/util/RegionSplitter.UniformSplit.html[UniformSplit]`, assumes the row keys are random byte arrays.
-You will probably need to develop your own `link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/util/RegionSplitter.SplitAlgorithm.html[SplitAlgorithm]`, using the provided ones as models.
+The first is the
+`link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/util/RegionSplitter.HexStringSplit.html[HexStringSplit]`
+algorithm, which assumes the row keys are hexadecimal strings.
+The second,
+`link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/util/RegionSplitter.UniformSplit.html[UniformSplit]`,
+assumes the row keys are random byte arrays.
+You will probably need to develop your own
+`link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/util/RegionSplitter.SplitAlgorithm.html[SplitAlgorithm]`,
+using the provided ones as models.
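
A usage sketch (table name, region count, and column family are examples):

[source, bash]
----
$ ./bin/hbase org.apache.hadoop.hbase.util.RegionSplitter test_table HexStringSplit -c 10 -f cf1
----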

=== Online Region Merges

@@ -515,8 +515,7 @@ To implement this functionality we will take the help of Observer Coprocessor.
Following are the implementation steps:

. Write a class that extends the
-// Below URL is more than 100 characters long.
-link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/coprocessor/BaseRegionObserver.html[BaseRegionObserver]
+link:https://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/coprocessor/BaseRegionObserver.html[BaseRegionObserver]
class.

. Override the 'preGetOp()' method (Note that 'preGet()' method is now deprecated). The reason for
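
A skeleton of the class these steps describe, with the override left as a placeholder for the application's own logic:

[source,java]
----
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;

public class ExampleRegionObserver extends BaseRegionObserver {
  @Override
  public void preGetOp(ObserverContext<RegionCoprocessorEnvironment> e, Get get,
      List<Cell> results) throws IOException {
    // inspect or rewrite the Get here, or populate 'results' and bypass the default path
  }
}
----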

@@ -552,7 +552,7 @@ hash-joins). So which is the best approach? It depends on what you are trying to

== ACID

-See link:http://hbase.apache.org/acid-semantics.html[ACID Semantics].
+See link:/acid-semantics.html[ACID Semantics].
Lars Hofhansl has also written a note on link:http://hadoop-hbase.blogspot.com/2012/03/acid-in-hbase.html[ACID in HBase].

ifdef::backend-docbook[]

@@ -637,10 +637,14 @@ See link:https://issues.apache.org/jira/browse/HBASE-4391[HBASE-4391 Add ability
[[compaction.tool]]
=== Offline Compaction Tool

-See the usage for the link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/regionserver/CompactionTool.html[Compaction
-Tool].
-Run it like this +./bin/hbase
-org.apache.hadoop.hbase.regionserver.CompactionTool+
+See the usage for the
+link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/regionserver/CompactionTool.html[CompactionTool].
+Run it like:
+
+[source, bash]
+----
+$ ./bin/hbase org.apache.hadoop.hbase.regionserver.CompactionTool
+----

=== `hbase clean`

@@ -196,7 +196,8 @@ tableDesc.addFamily(cfDesc);
----
====

-See the API documentation for link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/io/hfile/CacheConfig.html[CacheConfig].
+See the API documentation for
+link:https://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/io/hfile/CacheConfig.html[CacheConfig].

[[perf.rs.memstore.size]]
=== `hbase.regionserver.global.memstore.size`

@@ -1334,7 +1334,7 @@ static Table createTableAndWriteDataWithLabels(TableName tableName, String... la

<<reading_cells_with_labels>>
==== Reading Cells with Labels
-When you issue a Scan or Get, HBase uses your default set of authorizations to filter out cells that you do not have access to. A superuser can set the default set of authorizations for a given user by using the `set_auths` HBase Shell command or the link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/security/visibility/VisibilityClient.html#setAuths(org.apache.hadoop.conf.Configuration,%20java.lang.String\[\],%20java.lang.String)[VisibilityClient.setAuths()] method.
+When you issue a Scan or Get, HBase uses your default set of authorizations to filter out cells that you do not have access to. A superuser can set the default set of authorizations for a given user by using the `set_auths` HBase Shell command or the link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/security/visibility/VisibilityClient.html#setAuths(org.apache.hadoop.hbase.client.Connection,%20java.lang.String[],%20java.lang.String)[VisibilityClient.setAuths()] method.
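
A sketch of the corrected call, which takes a Connection rather than a Configuration (labels and user name are examples):

[source,java]
----
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.security.visibility.VisibilityClient;

public class SetAuthsExample {
  public static void main(String[] args) throws Throwable {
    Configuration conf = HBaseConfiguration.create();
    try (Connection connection = ConnectionFactory.createConnection(conf)) {
      VisibilityClient.setAuths(connection, new String[] { "secret", "confidential" }, "alice");
    }
  }
}
----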

You can specify a different authorization during the Scan or Get, by passing the AUTHORIZATIONS option in HBase Shell, or the link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/client/Scan.html#setAuthorizations%28org.apache.hadoop.hbase.security.visibility.Authorizations%29[setAuthorizations()] method if you use the API. This authorization will be combined with your default set as an additional filter. It will further filter your results, rather than giving you additional authorization.

@@ -1582,7 +1582,8 @@ Rotate the Master Key::
=== Secure Bulk Load

Bulk loading in secure mode is a bit more involved than normal setup, since the client has to transfer the ownership of the files generated from the MapReduce job to HBase.
-Secure bulk loading is implemented by a coprocessor, named link:http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/security/access/SecureBulkLoadEndpoint.html[SecureBulkLoadEndpoint], which uses a staging directory configured by the configuration property `hbase.bulkload.staging.dir`, which defaults to _/tmp/hbase-staging/_.
+Secure bulk loading is implemented by a coprocessor, named
+link:http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/security/access/SecureBulkLoadEndpoint.html[SecureBulkLoadEndpoint], which uses a staging directory configured by the configuration property `hbase.bulkload.staging.dir`, which defaults to _/tmp/hbase-staging/_.

.Secure Bulk Load Algorithm

@@ -282,7 +282,7 @@ under the License.
use HBase as a map-reduce data source to overcome traditional query speed limits
in MySQL.</dd>

-<dt><a href=">http://www.tokenizer.org">Shopping Engine at Tokenizer</a></dt>
+<dt><a href="http://www.tokenizer.org">Shopping Engine at Tokenizer</a></dt>
<dd>Shopping Engine at Tokenizer is a web crawler; it uses HBase to store URLs
and Outlinks (AnchorText + LinkedURL): more than a billion. It was initially
designed as Nutch-Hadoop extension, then (due to very specific 'shopping'