From 779c4e231695f0a1056505887ca768d0d9acad84 Mon Sep 17 00:00:00 2001 From: Bryan Beaudreault Date: Mon, 11 Jul 2022 22:00:04 -0400 Subject: [PATCH] HBASE-27186 Report block cache size metrics separately for L1 and L2 (#4608) Signed-off-by: Andrew Purtell --- .../MetricsRegionServerSource.java | 19 ++++++++ .../MetricsRegionServerSourceImpl.java | 14 ++++++ .../MetricsRegionServerWrapper.java | 45 ++++++++++++++++++ .../hbase/io/hfile/CombinedBlockCache.java | 4 ++ .../MetricsRegionServerWrapperImpl.java | 47 +++++++++++++++++++ .../MetricsRegionServerWrapperStub.java | 45 ++++++++++++++++++ .../regionserver/TestMetricsRegionServer.java | 9 ++++ src/main/asciidoc/_chapters/architecture.adoc | 4 ++ 8 files changed, 187 insertions(+) diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java index 7bffc57d0c0..a271b1d2187 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java @@ -316,6 +316,8 @@ public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSo String BLOCK_CACHE_FREE_DESC = "Size of the block cache that is not occupied."; String BLOCK_CACHE_COUNT = "blockCacheCount"; String BLOCK_CACHE_COUNT_DESC = "Number of block in the block cache."; + String BLOCK_CACHE_DATA_BLOCK_COUNT = "blockCacheDataBlockCount"; + String BLOCK_CACHE_DATA_BLOCK_COUNT_DESC = "Number of DATA block in the block cache."; String BLOCK_CACHE_SIZE = "blockCacheSize"; String BLOCK_CACHE_SIZE_DESC = "Size of the block cache."; String BLOCK_CACHE_HIT_COUNT = "blockCacheHitCount"; @@ -365,6 +367,15 @@ public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSo String BLOCK_CACHE_GENERAL_BLOOM_META_HIT_COUNT = 
"blockCacheGeneralBloomMetaHitCount"; String BLOCK_CACHE_DELETE_FAMILY_BLOOM_HIT_COUNT = "blockCacheDeleteFamilyBloomHitCount"; String BLOCK_CACHE_TRAILER_HIT_COUNT = "blockCacheTrailerHitCount"; + String L1_CACHE_FREE_SIZE = "l1CacheFreeSize"; + String L1_CACHE_FREE_SIZE_DESC = "Amount of free bytes in the L1 cache"; + String L1_CACHE_SIZE = "l1CacheSize"; + String L1_CACHE_SIZE_DESC = "Size of the L1 cache in bytes"; + String L1_CACHE_COUNT = "l1CacheCount"; + String L1_CACHE_COUNT_DESC = "Count of blocks in the L1 cache"; + String L1_CACHE_EVICTION_COUNT = "l1CacheEvictionCount"; + String L1_CACHE_EVICTION_COUNT_DESC = "Count of blocks evicted from the L1 cache"; + String L1_CACHE_HIT_COUNT = "l1CacheHitCount"; String L1_CACHE_HIT_COUNT_DESC = "L1 cache hit count."; String L1_CACHE_MISS_COUNT = "l1CacheMissCount"; @@ -373,6 +384,14 @@ public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSo String L1_CACHE_HIT_RATIO_DESC = "L1 cache hit ratio."; String L1_CACHE_MISS_RATIO = "l1CacheMissRatio"; String L1_CACHE_MISS_RATIO_DESC = "L1 cache miss ratio."; + String L2_CACHE_FREE_SIZE = "l2CacheFreeSize"; + String L2_CACHE_FREE_SIZE_DESC = "Amount of free bytes in the L2 cache"; + String L2_CACHE_SIZE = "l2CacheSize"; + String L2_CACHE_SIZE_DESC = "Size of the L2 cache in bytes"; + String L2_CACHE_COUNT = "l2CacheCount"; + String L2_CACHE_COUNT_DESC = "Count of blocks in the L2 cache"; + String L2_CACHE_EVICTION_COUNT = "l2CacheEvictionCount"; + String L2_CACHE_EVICTION_COUNT_DESC = "Count of blocks evicted from the L2 cache"; String L2_CACHE_HIT_COUNT = "l2CacheHitCount"; String L2_CACHE_HIT_COUNT_DESC = "L2 cache hit count."; String L2_CACHE_MISS_COUNT = "l2CacheMissCount"; diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSourceImpl.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSourceImpl.java index f8cebd2ec60..d264649c381 100644 --- 
a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSourceImpl.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSourceImpl.java @@ -532,11 +532,19 @@ public class MetricsRegionServerSourceImpl extends BaseSourceImpl rsWrap.getBlockCacheFreeSize()) .addGauge(Interns.info(BLOCK_CACHE_COUNT, BLOCK_CACHE_COUNT_DESC), rsWrap.getBlockCacheCount()) + .addGauge(Interns.info(BLOCK_CACHE_DATA_BLOCK_COUNT, BLOCK_CACHE_DATA_BLOCK_COUNT_DESC), + rsWrap.getBlockCacheDataBlockCount()) .addGauge(Interns.info(BLOCK_CACHE_SIZE, BLOCK_CACHE_SIZE_DESC), rsWrap.getBlockCacheSize()) .addGauge(Interns.info(BLOCK_CACHE_HIT_PERCENT, BLOCK_CACHE_HIT_PERCENT_DESC), rsWrap.getBlockCacheHitPercent()) .addGauge(Interns.info(BLOCK_CACHE_EXPRESS_HIT_PERCENT, BLOCK_CACHE_EXPRESS_HIT_PERCENT_DESC), rsWrap.getBlockCacheHitCachingPercent()) + .addGauge(Interns.info(L1_CACHE_SIZE, L1_CACHE_SIZE_DESC), rsWrap.getL1CacheSize()) + .addGauge(Interns.info(L1_CACHE_FREE_SIZE, L1_CACHE_FREE_SIZE_DESC), + rsWrap.getL1CacheFreeSize()) + .addGauge(Interns.info(L1_CACHE_COUNT, L1_CACHE_COUNT_DESC), rsWrap.getL1CacheCount()) + .addCounter(Interns.info(L1_CACHE_EVICTION_COUNT, L1_CACHE_EVICTION_COUNT_DESC), + rsWrap.getL1CacheEvictedCount()) .addGauge(Interns.info(L1_CACHE_HIT_COUNT, L1_CACHE_HIT_COUNT_DESC), rsWrap.getL1CacheHitCount()) .addGauge(Interns.info(L1_CACHE_MISS_COUNT, L1_CACHE_MISS_COUNT_DESC), @@ -545,6 +553,12 @@ public class MetricsRegionServerSourceImpl extends BaseSourceImpl rsWrap.getL1CacheHitRatio()) .addGauge(Interns.info(L1_CACHE_MISS_RATIO, L1_CACHE_MISS_RATIO_DESC), rsWrap.getL1CacheMissRatio()) + .addGauge(Interns.info(L2_CACHE_SIZE, L2_CACHE_SIZE_DESC), rsWrap.getL2CacheSize()) + .addGauge(Interns.info(L2_CACHE_FREE_SIZE, L2_CACHE_FREE_SIZE_DESC), + rsWrap.getL2CacheFreeSize()) + .addGauge(Interns.info(L2_CACHE_COUNT, L2_CACHE_COUNT_DESC), rsWrap.getL2CacheCount()) + 
.addCounter(Interns.info(L2_CACHE_EVICTION_COUNT, L2_CACHE_EVICTION_COUNT_DESC), + rsWrap.getL2CacheEvictedCount()) .addGauge(Interns.info(L2_CACHE_HIT_COUNT, L2_CACHE_HIT_COUNT_DESC), rsWrap.getL2CacheHitCount()) .addGauge(Interns.info(L2_CACHE_MISS_COUNT, L2_CACHE_MISS_COUNT_DESC), diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapper.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapper.java index c654ba844e3..d4f33737c44 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapper.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapper.java @@ -267,6 +267,11 @@ public interface MetricsRegionServerWrapper { */ long getBlockCacheCount(); + /** + * Get the number of DATA blocks in the block cache. + */ + long getBlockCacheDataBlockCount(); + /** * Get the total size (in bytes) of the block cache. */ @@ -317,6 +322,26 @@ public interface MetricsRegionServerWrapper { */ long getBlockCacheFailedInsertions(); + /** + * Cache size (bytes) of L1 cache + */ + long getL1CacheSize(); + + /** + * Free cache size (bytes) of L1 cache + */ + long getL1CacheFreeSize(); + + /** + * Number of blocks in L1 cache + */ + long getL1CacheCount(); + + /** + * Number of blocks evicted from L1 cache + */ + long getL1CacheEvictedCount(); + /** * Hit count of L1 cache. */ @@ -337,6 +362,26 @@ public interface MetricsRegionServerWrapper { */ double getL1CacheMissRatio(); + /** + * Cache size (bytes) of L2 cache + */ + long getL2CacheSize(); + + /** + * Free cache size (bytes) of L2 cache + */ + long getL2CacheFreeSize(); + + /** + * Number of blocks in L2 cache + */ + long getL2CacheCount(); + + /** + * Number of blocks evicted from L2 cache + */ + long getL2CacheEvictedCount(); + /** * Hit count of L2 cache. 
*/ diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java index 69a70600a6c..6cd40b0858f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java @@ -390,4 +390,8 @@ public class CombinedBlockCache implements ResizableBlockCache, HeapSize { public FirstLevelBlockCache getFirstLevelCache() { return l1Cache; } + + public BlockCache getSecondLevelCache() { + return l2Cache; + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperImpl.java index 8793f495cb7..ae5e904955e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperImpl.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperImpl.java @@ -66,6 +66,8 @@ class MetricsRegionServerWrapperImpl implements MetricsRegionServerWrapper { private final ByteBuffAllocator allocator; private BlockCache blockCache; + private BlockCache l1Cache = null; + private BlockCache l2Cache = null; private MobFileCache mobFileCache; private CacheStats cacheStats; private CacheStats l1Stats = null; @@ -173,6 +175,14 @@ class MetricsRegionServerWrapperImpl implements MetricsRegionServerWrapper { l1Stats = this.cacheStats; } } + if (this.blockCache != null) { + if (this.blockCache instanceof CombinedBlockCache) { + l1Cache = ((CombinedBlockCache) this.blockCache).getFirstLevelCache(); + l2Cache = ((CombinedBlockCache) this.blockCache).getSecondLevelCache(); + } else { + l1Cache = this.blockCache; + } + } } /** @@ -276,6 +286,11 @@ class MetricsRegionServerWrapperImpl implements MetricsRegionServerWrapper { return this.blockCache != null ? 
this.blockCache.getBlockCount() : 0L; } + @Override + public long getBlockCacheDataBlockCount() { + return this.blockCache != null ? this.blockCache.getDataBlockCount() : 0L; + } + @Override public long getMemStoreLimit() { return this.regionServer.getRegionServerAccounting().getGlobalMemStoreLimit(); @@ -354,6 +369,46 @@ class MetricsRegionServerWrapperImpl implements MetricsRegionServerWrapper { return this.cacheStats != null ? this.cacheStats.getFailedInserts() : 0L; } + @Override + public long getL1CacheSize() { + return this.l1Cache != null ? this.l1Cache.getCurrentSize() : 0L; + } + + @Override + public long getL1CacheFreeSize() { + return this.l1Cache != null ? this.l1Cache.getFreeSize() : 0L; + } + + @Override + public long getL1CacheCount() { + return this.l1Cache != null ? this.l1Cache.getBlockCount() : 0L; + } + + @Override + public long getL1CacheEvictedCount() { + return this.l1Stats != null ? this.l1Stats.getEvictedCount() : 0L; + } + + @Override + public long getL2CacheSize() { + return this.l2Cache != null ? this.l2Cache.getCurrentSize() : 0L; + } + + @Override + public long getL2CacheFreeSize() { + return this.l2Cache != null ? this.l2Cache.getFreeSize() : 0L; + } + + @Override + public long getL2CacheCount() { + return this.l2Cache != null ? this.l2Cache.getBlockCount() : 0L; + } + + @Override + public long getL2CacheEvictedCount() { + return this.l2Stats != null ? this.l2Stats.getEvictedCount() : 0L; + } + @Override public long getL1CacheHitCount() { return this.l1Stats != null ? 
this.l1Stats.getHitCount() : 0L; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperStub.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperStub.java index e451683a367..d604cf00d49 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperStub.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperStub.java @@ -292,6 +292,11 @@ public class MetricsRegionServerWrapperStub implements MetricsRegionServerWrappe return 414; } + @Override + public long getBlockCacheDataBlockCount() { + return 300; + } + @Override public long getBlockCacheSize() { return 415; @@ -342,6 +347,26 @@ public class MetricsRegionServerWrapperStub implements MetricsRegionServerWrappe return 36; } + @Override + public long getL1CacheSize() { + return 123; + } + + @Override + public long getL1CacheFreeSize() { + return 100; + } + + @Override + public long getL1CacheCount() { + return 50; + } + + @Override + public long getL1CacheEvictedCount() { + return 1000; + } + @Override public long getL1CacheHitCount() { return 200; @@ -362,6 +387,26 @@ public class MetricsRegionServerWrapperStub implements MetricsRegionServerWrappe return 20; } + @Override + public long getL2CacheSize() { + return 456; + } + + @Override + public long getL2CacheFreeSize() { + return 200; + } + + @Override + public long getL2CacheCount() { + return 75; + } + + @Override + public long getL2CacheEvictedCount() { + return 2000; + } + @Override public long getL2CacheHitCount() { return 800; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestMetricsRegionServer.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestMetricsRegionServer.java index 70e37bc5408..a44f3c6b62e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestMetricsRegionServer.java +++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestMetricsRegionServer.java @@ -104,6 +104,7 @@ public class TestMetricsRegionServer { HELPER.assertGauge("flushQueueLength", 412, serverSource); HELPER.assertGauge("blockCacheFreeSize", 413, serverSource); HELPER.assertGauge("blockCacheCount", 414, serverSource); + HELPER.assertGauge("blockCacheDataBlockCount", 300, serverSource); HELPER.assertGauge("blockCacheSize", 415, serverSource); HELPER.assertCounter("blockCacheHitCount", 416, serverSource); HELPER.assertCounter("blockCacheMissCount", 417, serverSource); @@ -111,10 +112,18 @@ public class TestMetricsRegionServer { HELPER.assertGauge("blockCacheCountHitPercent", 98, serverSource); HELPER.assertGauge("blockCacheExpressHitPercent", 97, serverSource); HELPER.assertCounter("blockCacheFailedInsertionCount", 36, serverSource); + HELPER.assertGauge("l1CacheFreeSize", 100, serverSource); + HELPER.assertGauge("l1CacheSize", 123, serverSource); + HELPER.assertGauge("l1CacheCount", 50, serverSource); + HELPER.assertCounter("l1CacheEvictionCount", 1000, serverSource); HELPER.assertGauge("l1CacheHitCount", 200, serverSource); HELPER.assertGauge("l1CacheMissCount", 100, serverSource); HELPER.assertGauge("l1CacheHitRatio", 80, serverSource); HELPER.assertGauge("l1CacheMissRatio", 20, serverSource); + HELPER.assertGauge("l2CacheFreeSize", 200, serverSource); + HELPER.assertGauge("l2CacheSize", 456, serverSource); + HELPER.assertGauge("l2CacheCount", 75, serverSource); + HELPER.assertCounter("l2CacheEvictionCount", 2000, serverSource); HELPER.assertGauge("l2CacheHitCount", 800, serverSource); HELPER.assertGauge("l2CacheMissCount", 200, serverSource); HELPER.assertGauge("l2CacheHitRatio", 90, serverSource); diff --git a/src/main/asciidoc/_chapters/architecture.adoc b/src/main/asciidoc/_chapters/architecture.adoc index 91192f9614c..23d069c1d91 100644 --- a/src/main/asciidoc/_chapters/architecture.adoc +++ b/src/main/asciidoc/_chapters/architecture.adoc @@ 
-946,6 +946,10 @@ Bloom Filters:: Currently the recommended way to measure HFile indexes and bloom filters sizes is to look at the region server web UI and checkout the relevant metrics. For keys, sampling can be done by using the HFile command line tool and look for the average key size metric. Since HBase 0.98.3, you can view details on BlockCache stats and metrics in a special Block Cache section in the UI. +As of HBase 2.4.14, you can estimate how much of the block cache is used by HFile indexes and bloom filters, as opposed to DATA blocks, using the blockCacheCount and blockCacheDataBlockCount metrics in JMX. The +formula `(blockCacheCount - blockCacheDataBlockCount) * blockSize` gives an estimate of the memory held by non-DATA blocks, which can be useful when preparing to enable the BucketCache. You +should make sure the post-BucketCache configuration gives the on-heap LRU cache enough memory to hold at least as many non-DATA blocks as were cached before BucketCache was enabled. +Once BucketCache is enabled, L1 metrics such as l1CacheSize, l1CacheCount, and l1CacheEvictionCount can help you further tune the size. It's generally bad to use block caching when the WSS doesn't fit in memory. This is the case when you have for example 40GB available across all your region servers' block caches but you need to process 1TB of data.