diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSelector.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSelector.java index f9572b3b63c..c3ce868757f 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSelector.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSelector.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSelectorFactory.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSelectorFactory.java index 4db0d31134c..485afb40d26 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSelectorFactory.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSelectorFactory.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSimpleSelector.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSimpleSelector.java index 410a2c3066e..f0f067286c5 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSimpleSelector.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSimpleSelector.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaMode.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaMode.java index 0f126e1139c..40062e32e83 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaMode.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaMode.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/CatalogReplicationSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/CatalogReplicationSource.java index f36514d0c21..8cb7860e73f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/CatalogReplicationSource.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/CatalogReplicationSource.java @@ -36,8 +36,12 @@ class CatalogReplicationSource extends ReplicationSource { @Override public void logPositionAndCleanOldLogs(WALEntryBatch entryBatch) { - // Noop. This implementation does not persist state to backing storage nor does it keep its - // WALs in a general map up in ReplicationSourceManager so just skip calling through to the - // default implemenentation. + // Noop. This CatalogReplicationSource implementation does not persist state to backing storage + // nor does it keep its WALs in a general map up in ReplicationSourceManager -- + // CatalogReplicationSource is used by the Catalog Read Replica feature which resets every time + // the WAL source process crashes. Skip calling through to the default implementation. 
+ // See "4.1 Skip maintaining zookeeper replication queue (offsets/WALs)" in the + // design doc attached to HBASE-18070 'Enable memstore replication for meta replica' for detail + // and background on why there is no need to keep WAL state. } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java index 860cbd3cb5e..81d68657ec6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java @@ -1015,7 +1015,10 @@ public class ReplicationSourceManager implements ReplicationListener { /** * Create, initialize, and start the Catalog ReplicationSource. * Presumes called one-time only (caller must ensure one-time only call). + * This ReplicationSource is NOT created via {@link ReplicationSourceFactory}. * @see #addSource(String) This is a specialization of the addSource call. + * @see #catalogReplicationSource for a note on this ReplicationSource's lifecycle (and more on + * why the special handling). */ private ReplicationSourceInterface createCatalogReplicationSource(RegionInfo regionInfo) throws IOException { @@ -1026,6 +1029,11 @@ public class ReplicationSourceManager implements ReplicationListener { if (instantiate) { walProvider = this.walFactory.getMetaProvider(); } + // Here we do a specialization on what {@link ReplicationSourceFactory} does. There is no need + // for persisting offset into WALs up in zookeeper (via ReplicationQueueInfo) as the catalog + // read replicas feature that makes use of the source does a reset on a crash of the WAL + // source process. See "4.1 Skip maintaining zookeeper replication queue (offsets/WALs)" in the + // design doc attached to HBASE-18070 'Enable memstore replication for meta replica' for detail. 
CatalogReplicationSourcePeer peer = new CatalogReplicationSourcePeer(this.conf, this.clusterId.toString(), "meta_" + ServerRegionReplicaUtil.REGION_REPLICA_REPLICATION_PEER); final ReplicationSourceInterface crs = new CatalogReplicationSource(); diff --git a/src/main/asciidoc/_chapters/architecture.adoc b/src/main/asciidoc/_chapters/architecture.adoc index a10a09adae9..48ecc996281 100644 --- a/src/main/asciidoc/_chapters/architecture.adoc +++ b/src/main/asciidoc/_chapters/architecture.adoc @@ -2865,13 +2865,33 @@ The first mechanism is store file refresher which is introduced in HBase-1.0+. S For turning this feature on, you should configure `hbase.regionserver.storefile.refresh.period` to a non-zero value. See Configuration section below. -==== Asnyc WAL replication -The second mechanism for propagation of writes to secondaries is done via “Async WAL Replication” feature and is only available in HBase-1.1+. This works similarly to HBase’s multi-datacenter replication, but instead the data from a region is replicated to the secondary regions. Each secondary replica always receives and observes the writes in the same order that the primary region committed them. In some sense, this design can be thought of as “in-cluster replication”, where instead of replicating to a different datacenter, the data goes to secondary regions to keep secondary region’s in-memory state up to date. The data files are shared between the primary region and the other replicas, so that there is no extra storage overhead. However, the secondary regions will have recent non-flushed data in their memstores, which increases the memory overhead. The primary region writes flush, compaction, and bulk load events to its WAL as well, which are also replicated through wal replication to secondaries. When they observe the flush/compaction or bulk load event, the secondary regions replay the event to pick up the new files and drop the old ones. 
+[[async.wal.replication]] +==== Async WAL replication +The second mechanism for propagation of writes to secondaries is done via the +“Async WAL Replication” feature. It is only available in HBase-1.1+. This works +similarly to HBase’s multi-datacenter replication, but instead the data from a +region is replicated to the secondary regions. Each secondary replica always +receives and observes the writes in the same order that the primary region +committed them. In some sense, this design can be thought of as “in-cluster +replication”, where instead of replicating to a different datacenter, the data +goes to secondary regions to keep secondary region’s in-memory state up to date. +The data files are shared between the primary region and the other replicas, so +that there is no extra storage overhead. However, the secondary regions will +have recent non-flushed data in their memstores, which increases the memory +overhead. The primary region writes flush, compaction, and bulk load events +to its WAL as well, which are also replicated through wal replication to +secondaries. When they observe the flush/compaction or bulk load event, the +secondary regions replay the event to pick up the new files and drop the old +ones. Committing writes in the same order as in primary ensures that the secondaries won’t diverge from the primary regions data, but since the log replication is asynchronous, the data might still be stale in secondary regions. Since this feature works as a replication endpoint, the performance and latency characteristics is expected to be similar to inter-cluster replication. -Async WAL Replication is *disabled* by default. You can enable this feature by setting `hbase.region.replica.replication.enabled` to `true`. -Asyn WAL Replication feature will add a new replication peer named `region_replica_replication` as a replication peer when you create a table with region replication > 1 for the first time. 
Once enabled, if you want to disable this feature, you need to do two actions: +Async WAL Replication is *disabled* by default. You can enable this feature by +setting `hbase.region.replica.replication.enabled` to `true`. The Async WAL +Replication feature will add a new replication peer named +`region_replica_replication` as a replication peer when you create a table with +region replication > 1 for the first time. Once enabled, if you want to disable +this feature, you need to do two actions in the following order: * Set configuration property `hbase.region.replica.replication.enabled` to false in `hbase-site.xml` (see Configuration section below) * Disable the replication peer named `region_replica_replication` in the cluster using hbase shell or `Admin` class: [source,bourne] @@ -2879,12 +2899,52 @@ Asyn WAL Replication feature will add a new replication peer named `region_repli hbase> disable_peer 'region_replica_replication' ---- +Async WAL Replication and the `hbase:meta` table is a little more involved and gets its own section below; see <<async.wal.replication.meta>>. + === Store File TTL In both of the write propagation approaches mentioned above, store files of the primary will be opened in secondaries independent of the primary region. So for files that the primary compacted away, the secondaries might still be referring to these files for reading. Both features are using HFileLinks to refer to files, but there is no protection (yet) for guaranteeing that the file will not be deleted prematurely. Thus, as a guard, you should set the configuration property `hbase.master.hfilecleaner.ttl` to a larger value, such as 1 hour to guarantee that you will not receive IOExceptions for requests going to replicas. +[[async.wal.replication.meta]] === Region replication for META table’s region -Currently, Async WAL Replication is not done for the META table’s WAL. The meta table’s secondary replicas still refreshes themselves from the persistent store files. 
Hence the `hbase.regionserver.meta.storefile.refresh.period` needs to be set to a certain non-zero value for refreshing the meta store files. Note that this configuration is configured differently than -`hbase.regionserver.storefile.refresh.period`. +Async WAL Replication does not work for the META table’s WAL. +The meta table’s secondary replicas refresh themselves from the persistent store +files every `hbase.regionserver.meta.storefile.refresh.period` (a non-zero value). +Note how the META replication period is distinct from the user-space +`hbase.regionserver.storefile.refresh.period` value. + +==== Async WAL Replication for META table as of hbase-2.4.0+ ==== +Async WAL replication for META is added as a new feature in 2.4.0. It is still under +active development. Use with caution. Set +`hbase.region.replica.replication.catalog.enabled` to enable async WAL Replication +for META region replicas. It is off by default. + +Regarding META replicas count, up to hbase-2.4.0, you would set the special +property `hbase.meta.replica.count`. Now you can alter the META table as you +would a user-space table (if `hbase.meta.replica.count` is set, it will take +precedence over what is set for replica count in the META table updating META +replica count to match). + +===== Load Balancing META table load ===== + +hbase-2.4.0 also adds a *new* client-side `LoadBalance` mode. When enabled +client-side, clients will try to read META replicas first before falling back on +the primary. Before this, the replica lookup mode -- now named `HedgedRead` in +hbase-2.4.0 -- had clients read the primary and if no response after a +configurable amount of time had elapsed, it would start up reads against the +replicas. + +The new `LoadBalance` mode helps alleviate hotspotting on the META +table by distributing META read load. 
+ +To enable the meta replica locator's load balance mode, please set the following +configuration on the *client-side* (only): set `hbase.locator.meta.replicas.mode` +to `LoadBalance`. Valid options for this configuration are `None`, `HedgedRead`, and +`LoadBalance`. Option parsing is case insensitive. The default mode is `None` (which falls +through to `HedgedRead`, the current default). Do NOT put this configuration in any +server-side hbase configuration, Master or RegionServer (Master could make decisions +based off stale state -- to be avoided). + +`LoadBalance` is also a new feature. Use with caution. === Memory accounting The secondary region replicas refer to the data files of the primary region replica, but they have their own memstores (in HBase-1.1+) and uses block cache as well. However, one distinction is that the secondary region replicas cannot flush the data when there is memory pressure for their memstores. They can only free up memstore memory when the primary region does a flush and this flush is replicated to the secondary. Since in a region server hosting primary replicas for some regions and secondaries for some others, the secondaries might cause extra flushes to the primary regions in the same host. In extreme situations, there can be no memory left for adding new writes coming from the primary via wal replication. For unblocking this situation (and since secondary cannot flush by itself), the secondary is allowed to do a “store file refresh” by doing a file system list operation to pick up new files from primary, and possibly dropping its memstore. This refresh will only be performed if the memstore size of the biggest secondary region replica is at least `hbase.region.replica.storefile.refresh.memstore.multiplier` (default 4) times bigger than the biggest memstore of a primary replica. 
One caveat is that if this is performed, the secondary can observe partial row updates across column families (since column families are flushed independently). The default should be good to not do this operation frequently. You can set this value to a large number to disable this feature if desired, but be warned that it might cause the replication to block forever.