HBASE-27186 Report block cache size metrics separately for L1 and L2 (#4608)

Signed-off-by: Andrew Purtell <apurtell@apache.org>
This commit is contained in:
Bryan Beaudreault 2022-07-11 22:00:04 -04:00 committed by GitHub
parent 5bc8670322
commit 779c4e2316
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 187 additions and 0 deletions

View File

@ -316,6 +316,8 @@ public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSo
String BLOCK_CACHE_FREE_DESC = "Size of the block cache that is not occupied.";
String BLOCK_CACHE_COUNT = "blockCacheCount";
String BLOCK_CACHE_COUNT_DESC = "Number of block in the block cache.";
// Metric name and description for the count of DATA blocks held in the block cache.
String BLOCK_CACHE_DATA_BLOCK_COUNT = "blockCacheDataBlockCount";
String BLOCK_CACHE_DATA_BLOCK_COUNT_DESC = "Number of DATA block in the block cache.";
String BLOCK_CACHE_SIZE = "blockCacheSize";
String BLOCK_CACHE_SIZE_DESC = "Size of the block cache.";
String BLOCK_CACHE_HIT_COUNT = "blockCacheHitCount";
@ -365,6 +367,15 @@ public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSo
String BLOCK_CACHE_GENERAL_BLOOM_META_HIT_COUNT = "blockCacheGeneralBloomMetaHitCount";
String BLOCK_CACHE_DELETE_FAMILY_BLOOM_HIT_COUNT = "blockCacheDeleteFamilyBloomHitCount";
String BLOCK_CACHE_TRAILER_HIT_COUNT = "blockCacheTrailerHitCount";
// L1 cache capacity metrics: each metric name is paired with its human-readable description.
// Free bytes remaining in the L1 cache.
String L1_CACHE_FREE_SIZE = "l1CacheFreeSize";
String L1_CACHE_FREE_SIZE_DESC = "Amount of free bytes in the L1 cache";
// Bytes currently occupied in the L1 cache.
String L1_CACHE_SIZE = "l1CacheSize";
String L1_CACHE_SIZE_DESC = "Size of the L1 cache in bytes";
// Number of blocks held in the L1 cache.
String L1_CACHE_COUNT = "l1CacheCount";
String L1_CACHE_COUNT_DESC = "Count of blocks in the L1 cache";
// Number of blocks evicted from the L1 cache.
String L1_CACHE_EVICTION_COUNT = "l1CacheEvictionCount";
String L1_CACHE_EVICTION_COUNT_DESC = "Count of blocks evicted from the L1 cache";
String L1_CACHE_HIT_COUNT = "l1CacheHitCount";
String L1_CACHE_HIT_COUNT_DESC = "L1 cache hit count.";
String L1_CACHE_MISS_COUNT = "l1CacheMissCount";
@ -373,6 +384,14 @@ public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSo
String L1_CACHE_HIT_RATIO_DESC = "L1 cache hit ratio.";
String L1_CACHE_MISS_RATIO = "l1CacheMissRatio";
String L1_CACHE_MISS_RATIO_DESC = "L1 cache miss ratio.";
// L2 cache capacity metrics: each metric name is paired with its human-readable description.
// Free bytes remaining in the L2 cache.
String L2_CACHE_FREE_SIZE = "l2CacheFreeSize";
String L2_CACHE_FREE_SIZE_DESC = "Amount of free bytes in the L2 cache";
// Bytes currently occupied in the L2 cache.
String L2_CACHE_SIZE = "l2CacheSize";
String L2_CACHE_SIZE_DESC = "Size of the L2 cache in bytes";
// Number of blocks held in the L2 cache.
String L2_CACHE_COUNT = "l2CacheCount";
String L2_CACHE_COUNT_DESC = "Count of blocks in the L2 cache";
// Number of blocks evicted from the L2 cache.
String L2_CACHE_EVICTION_COUNT = "l2CacheEvictionCount";
String L2_CACHE_EVICTION_COUNT_DESC = "Count of blocks evicted from the L2 cache";
String L2_CACHE_HIT_COUNT = "l2CacheHitCount";
String L2_CACHE_HIT_COUNT_DESC = "L2 cache hit count.";
String L2_CACHE_MISS_COUNT = "l2CacheMissCount";

View File

@ -532,11 +532,19 @@ public class MetricsRegionServerSourceImpl extends BaseSourceImpl
rsWrap.getBlockCacheFreeSize())
.addGauge(Interns.info(BLOCK_CACHE_COUNT, BLOCK_CACHE_COUNT_DESC),
rsWrap.getBlockCacheCount())
.addGauge(Interns.info(BLOCK_CACHE_DATA_BLOCK_COUNT, BLOCK_CACHE_DATA_BLOCK_COUNT_DESC),
rsWrap.getBlockCacheDataBlockCount())
.addGauge(Interns.info(BLOCK_CACHE_SIZE, BLOCK_CACHE_SIZE_DESC), rsWrap.getBlockCacheSize())
.addGauge(Interns.info(BLOCK_CACHE_HIT_PERCENT, BLOCK_CACHE_HIT_PERCENT_DESC),
rsWrap.getBlockCacheHitPercent())
.addGauge(Interns.info(BLOCK_CACHE_EXPRESS_HIT_PERCENT, BLOCK_CACHE_EXPRESS_HIT_PERCENT_DESC),
rsWrap.getBlockCacheHitCachingPercent())
.addGauge(Interns.info(L1_CACHE_SIZE, L1_CACHE_SIZE_DESC), rsWrap.getL1CacheSize())
.addGauge(Interns.info(L1_CACHE_FREE_SIZE, L1_CACHE_FREE_SIZE_DESC),
rsWrap.getL1CacheFreeSize())
.addGauge(Interns.info(L1_CACHE_COUNT, L1_CACHE_COUNT_DESC), rsWrap.getL1CacheCount())
.addCounter(Interns.info(L1_CACHE_EVICTION_COUNT, L1_CACHE_EVICTION_COUNT_DESC),
rsWrap.getL1CacheEvictedCount())
.addGauge(Interns.info(L1_CACHE_HIT_COUNT, L1_CACHE_HIT_COUNT_DESC),
rsWrap.getL1CacheHitCount())
.addGauge(Interns.info(L1_CACHE_MISS_COUNT, L1_CACHE_MISS_COUNT_DESC),
@ -545,6 +553,12 @@ public class MetricsRegionServerSourceImpl extends BaseSourceImpl
rsWrap.getL1CacheHitRatio())
.addGauge(Interns.info(L1_CACHE_MISS_RATIO, L1_CACHE_MISS_RATIO_DESC),
rsWrap.getL1CacheMissRatio())
.addGauge(Interns.info(L2_CACHE_SIZE, L2_CACHE_SIZE_DESC), rsWrap.getL2CacheSize())
.addGauge(Interns.info(L2_CACHE_FREE_SIZE, L2_CACHE_FREE_SIZE_DESC),
rsWrap.getL2CacheFreeSize())
.addGauge(Interns.info(L2_CACHE_COUNT, L2_CACHE_COUNT_DESC), rsWrap.getL2CacheCount())
.addCounter(Interns.info(L2_CACHE_EVICTION_COUNT, L2_CACHE_EVICTION_COUNT_DESC),
rsWrap.getL2CacheEvictedCount())
.addGauge(Interns.info(L2_CACHE_HIT_COUNT, L2_CACHE_HIT_COUNT_DESC),
rsWrap.getL2CacheHitCount())
.addGauge(Interns.info(L2_CACHE_MISS_COUNT, L2_CACHE_MISS_COUNT_DESC),

View File

@ -267,6 +267,11 @@ public interface MetricsRegionServerWrapper {
*/
long getBlockCacheCount();
/**
 * Get the number of DATA blocks in the block cache.
 * @return the number of DATA blocks in the block cache
 */
long getBlockCacheDataBlockCount();
/**
* Get the total size (in bytes) of the block cache.
*/
@ -317,6 +322,26 @@ public interface MetricsRegionServerWrapper {
*/
long getBlockCacheFailedInsertions();
/**
 * Cache size (bytes) of L1 cache
 * @return the size of the L1 cache, in bytes
 */
long getL1CacheSize();
/**
 * Free cache size (bytes) of L1 cache
 * @return the free space in the L1 cache, in bytes
 */
long getL1CacheFreeSize();
/**
 * Number of blocks in L1 cache
 * @return the number of blocks in the L1 cache
 */
long getL1CacheCount();
/**
 * Number of blocks evicted from L1 cache
 * @return the number of blocks evicted from the L1 cache
 */
long getL1CacheEvictedCount();
/**
* Hit count of L1 cache.
*/
@ -337,6 +362,26 @@ public interface MetricsRegionServerWrapper {
*/
double getL1CacheMissRatio();
/**
 * Cache size (bytes) of L2 cache
 * @return the size of the L2 cache, in bytes
 */
long getL2CacheSize();
/**
 * Free cache size (bytes) of L2 cache
 * @return the free space in the L2 cache, in bytes
 */
long getL2CacheFreeSize();
/**
 * Number of blocks in L2 cache
 * @return the number of blocks in the L2 cache
 */
long getL2CacheCount();
/**
 * Number of blocks evicted from L2 cache
 * @return the number of blocks evicted from the L2 cache
 */
long getL2CacheEvictedCount();
/**
* Hit count of L2 cache.
*/

View File

@ -390,4 +390,8 @@ public class CombinedBlockCache implements ResizableBlockCache, HeapSize {
public FirstLevelBlockCache getFirstLevelCache() {
return l1Cache;
}
/**
 * Exposes the second-level (L2) cache that this combined cache delegates to.
 * @return the L2 block cache instance held by this combined cache
 */
public BlockCache getSecondLevelCache() {
  return this.l2Cache;
}
}

View File

@ -66,6 +66,8 @@ class MetricsRegionServerWrapperImpl implements MetricsRegionServerWrapper {
private final ByteBuffAllocator allocator;
private BlockCache blockCache;
// First-level cache: the first level of a CombinedBlockCache, or the block cache itself when
// it is not a CombinedBlockCache. Stays null when no block cache is present.
private BlockCache l1Cache = null;
// Second-level cache: only assigned when the block cache is a CombinedBlockCache; null otherwise.
private BlockCache l2Cache = null;
private MobFileCache mobFileCache;
private CacheStats cacheStats;
private CacheStats l1Stats = null;
@ -173,6 +175,14 @@ class MetricsRegionServerWrapperImpl implements MetricsRegionServerWrapper {
l1Stats = this.cacheStats;
}
}
if (this.blockCache != null) {
if (this.blockCache instanceof CombinedBlockCache) {
l1Cache = ((CombinedBlockCache) this.blockCache).getFirstLevelCache();
l2Cache = ((CombinedBlockCache) this.blockCache).getSecondLevelCache();
} else {
l1Cache = this.blockCache;
}
}
}
/**
@ -276,6 +286,11 @@ class MetricsRegionServerWrapperImpl implements MetricsRegionServerWrapper {
return this.blockCache != null ? this.blockCache.getBlockCount() : 0L;
}
/**
 * Reports how many DATA blocks the block cache holds; 0 when there is no block cache.
 */
@Override
public long getBlockCacheDataBlockCount() {
  if (this.blockCache == null) {
    return 0L;
  }
  return this.blockCache.getDataBlockCount();
}
@Override
public long getMemStoreLimit() {
return this.regionServer.getRegionServerAccounting().getGlobalMemStoreLimit();
@ -354,6 +369,38 @@ class MetricsRegionServerWrapperImpl implements MetricsRegionServerWrapper {
return this.cacheStats != null ? this.cacheStats.getFailedInserts() : 0L;
}
/**
 * Returns the current size (bytes) of the L1 cache, or 0 when no L1 cache is set.
 */
@Override
public long getL1CacheSize() {
  return this.l1Cache != null ? this.l1Cache.getCurrentSize() : 0L;
}
/**
 * Returns the free size (bytes) of the L1 cache, or 0 when no L1 cache is set.
 */
@Override
public long getL1CacheFreeSize() {
  return this.l1Cache != null ? this.l1Cache.getFreeSize() : 0L;
}
/**
 * Returns the number of blocks in the L1 cache, or 0 when no L1 cache is set.
 */
@Override
public long getL1CacheCount() {
  return this.l1Cache != null ? this.l1Cache.getBlockCount() : 0L;
}
/**
 * Returns the evicted count reported by the L1 cache stats, or 0 when no L1 stats are set.
 */
@Override
public long getL1CacheEvictedCount() {
  return this.l1Stats != null ? this.l1Stats.getEvictedCount() : 0L;
}
/**
 * Returns the current size (bytes) of the L2 cache, or 0 when no L2 cache is set.
 */
@Override
public long getL2CacheSize() {
  return this.l2Cache != null ? this.l2Cache.getCurrentSize() : 0L;
}
/**
 * Returns the free size (bytes) of the L2 cache, or 0 when no L2 cache is set.
 */
@Override
public long getL2CacheFreeSize() {
  return this.l2Cache != null ? this.l2Cache.getFreeSize() : 0L;
}
/**
 * Returns the number of blocks in the L2 cache, or 0 when no L2 cache is set.
 */
@Override
public long getL2CacheCount() {
  return this.l2Cache != null ? this.l2Cache.getBlockCount() : 0L;
}
/**
 * Returns the evicted count reported by the L2 cache stats, or 0 when no L2 stats are set.
 */
@Override
public long getL2CacheEvictedCount() {
  return this.l2Stats != null ? this.l2Stats.getEvictedCount() : 0L;
}
@Override
public long getL1CacheHitCount() {
return this.l1Stats != null ? this.l1Stats.getHitCount() : 0L;

View File

@ -292,6 +292,11 @@ public class MetricsRegionServerWrapperStub implements MetricsRegionServerWrappe
return 414;
}
/** Fixed stub value for the DATA block count of the block cache. */
@Override
public long getBlockCacheDataBlockCount() {
  return 300L;
}
@Override
public long getBlockCacheSize() {
return 415;
@ -342,6 +347,26 @@ public class MetricsRegionServerWrapperStub implements MetricsRegionServerWrappe
return 36;
}
/** Fixed stub value for the L1 cache size. */
@Override
public long getL1CacheSize() {
  return 123L;
}
/** Fixed stub value for the L1 cache free size. */
@Override
public long getL1CacheFreeSize() {
  return 100L;
}
/** Fixed stub value for the L1 cache block count. */
@Override
public long getL1CacheCount() {
  return 50L;
}
/** Fixed stub value for the L1 cache evicted-block count. */
@Override
public long getL1CacheEvictedCount() {
  return 1000L;
}
@Override
public long getL1CacheHitCount() {
return 200;
@ -362,6 +387,26 @@ public class MetricsRegionServerWrapperStub implements MetricsRegionServerWrappe
return 20;
}
/** Fixed stub value for the L2 cache size. */
@Override
public long getL2CacheSize() {
  return 456L;
}
/** Fixed stub value for the L2 cache free size. */
@Override
public long getL2CacheFreeSize() {
  return 200L;
}
/** Fixed stub value for the L2 cache block count. */
@Override
public long getL2CacheCount() {
  return 75L;
}
/** Fixed stub value for the L2 cache evicted-block count. */
@Override
public long getL2CacheEvictedCount() {
  return 2000L;
}
@Override
public long getL2CacheHitCount() {
return 800;

View File

@ -104,6 +104,7 @@ public class TestMetricsRegionServer {
HELPER.assertGauge("flushQueueLength", 412, serverSource);
HELPER.assertGauge("blockCacheFreeSize", 413, serverSource);
HELPER.assertGauge("blockCacheCount", 414, serverSource);
HELPER.assertGauge("blockCacheDataBlockCount", 300, serverSource);
HELPER.assertGauge("blockCacheSize", 415, serverSource);
HELPER.assertCounter("blockCacheHitCount", 416, serverSource);
HELPER.assertCounter("blockCacheMissCount", 417, serverSource);
@ -111,10 +112,18 @@ public class TestMetricsRegionServer {
HELPER.assertGauge("blockCacheCountHitPercent", 98, serverSource);
HELPER.assertGauge("blockCacheExpressHitPercent", 97, serverSource);
HELPER.assertCounter("blockCacheFailedInsertionCount", 36, serverSource);
HELPER.assertGauge("l1CacheFreeSize", 100, serverSource);
HELPER.assertGauge("l1CacheSize", 123, serverSource);
HELPER.assertGauge("l1CacheCount", 50, serverSource);
HELPER.assertCounter("l1CacheEvictionCount", 1000, serverSource);
HELPER.assertGauge("l1CacheHitCount", 200, serverSource);
HELPER.assertGauge("l1CacheMissCount", 100, serverSource);
HELPER.assertGauge("l1CacheHitRatio", 80, serverSource);
HELPER.assertGauge("l1CacheMissRatio", 20, serverSource);
HELPER.assertGauge("l2CacheFreeSize", 200, serverSource);
HELPER.assertGauge("l2CacheSize", 456, serverSource);
HELPER.assertGauge("l2CacheCount", 75, serverSource);
HELPER.assertCounter("l2CacheEvictionCount", 2000, serverSource);
HELPER.assertGauge("l2CacheHitCount", 800, serverSource);
HELPER.assertGauge("l2CacheMissCount", 200, serverSource);
HELPER.assertGauge("l2CacheHitRatio", 90, serverSource);

View File

@ -946,6 +946,10 @@ Bloom Filters::
Currently the recommended way to measure the sizes of HFile indexes and bloom filters is to look at the region server web UI and check out the relevant metrics.
For keys, sampling can be done by using the HFile command line tool and looking for the average key size metric.
Since HBase 0.98.3, you can view details on BlockCache stats and metrics in a special Block Cache section in the UI.
As of HBase 2.4.14, you can estimate the space used by HFile indexes and bloom filters, as opposed to DATA blocks, using blockCacheCount and blockCacheDataBlockCount in JMX. The
formula `(blockCacheCount - blockCacheDataBlockCount) * blockSize` will give you an estimate which can be useful when trying to enable the BucketCache. You
should make sure the post-BucketCache config gives enough memory to the on-heap LRU cache to hold at least the same number of non-DATA blocks as it held before BucketCache was enabled.
Once BucketCache is enabled, the L1 metrics like l1CacheSize, l1CacheCount, and l1CacheEvictionCount can help you further tune the size.
It's generally bad to use block caching when the WSS doesn't fit in memory.
This is the case when you have for example 40GB available across all your region servers' block caches but you need to process 1TB of data.