From 2067fcb64638df2dde8100f44cce431baa23ceb8 Mon Sep 17 00:00:00 2001
From: litao
Date: Wed, 30 Nov 2022 23:44:21 +0800
Subject: [PATCH] HDFS-16550. Allow JN edit cache size to be set as a fraction
 of heap memory (#4209)

---
 .../org/apache/hadoop/hdfs/DFSConfigKeys.java |  5 +++-
 .../qjournal/server/JournaledEditsCache.java  | 28 ++++++++++++++-----
 .../src/main/resources/hdfs-default.xml       | 18 +++++++++++-
 .../markdown/HDFSHighAvailabilityWithQJM.md   | 10 +++++++
 .../src/site/markdown/ObserverNameNode.md     | 18 ++++++++++++
 .../server/TestJournaledEditsCache.java       | 21 ++++++++++++++
 6 files changed, 91 insertions(+), 9 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
index f766c48d7c5..0e10bc61c99 100755
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
@@ -1424,7 +1424,10 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final long DFS_JOURNALNODE_SYNC_INTERVAL_DEFAULT = 2*60*1000L;
   public static final String DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY =
       "dfs.journalnode.edit-cache-size.bytes";
-  public static final int DFS_JOURNALNODE_EDIT_CACHE_SIZE_DEFAULT = 1024 * 1024;
+
+  public static final String DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY =
+      "dfs.journalnode.edit-cache-size.fraction";
+  public static final float DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_DEFAULT = 0.5f;
 
   // Journal-node related configs for the client side.
   public static final String DFS_QJOURNAL_QUEUE_SIZE_LIMIT_KEY = "dfs.qjournal.queued-edits.limit.mb";
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/JournaledEditsCache.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/JournaledEditsCache.java
index 65f54609ef3..339b7fa7b68 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/JournaledEditsCache.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/JournaledEditsCache.java
@@ -40,6 +40,7 @@ import org.apache.hadoop.hdfs.server.namenode.EditLogFileOutputStream;
 import org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader;
 import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp;
 import org.apache.hadoop.util.AutoCloseableLock;
+import org.apache.hadoop.util.Preconditions;
 
 /**
  * An in-memory cache of edits in their serialized form. This is used to serve
@@ -121,12 +122,18 @@ class JournaledEditsCache {
   // ** End lock-protected fields **
 
   JournaledEditsCache(Configuration conf) {
+    float fraction = conf.getFloat(DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY,
+        DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_DEFAULT);
+    Preconditions.checkArgument((fraction > 0 && fraction < 1.0f),
+        String.format("Cache config %s is set at %f, it should be a positive float value, " +
+            "less than 1.0. The recommended value is less than 0.9.",
+        DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY, fraction));
     capacity = conf.getInt(DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY,
-        DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_DEFAULT);
+        (int) (Runtime.getRuntime().maxMemory() * fraction));
     if (capacity > 0.9 * Runtime.getRuntime().maxMemory()) {
       Journal.LOG.warn(String.format("Cache capacity is set at %d bytes but " +
           "maximum JVM memory is only %d bytes. It is recommended that you " +
-          "decrease the cache size or increase the heap size.",
+          "decrease the cache size/fraction or increase the heap size.",
           capacity, Runtime.getRuntime().maxMemory()));
     }
     Journal.LOG.info("Enabling the journaled edits cache with a capacity " +
@@ -277,11 +284,12 @@ class JournaledEditsCache {
       initialize(INVALID_TXN_ID);
       Journal.LOG.warn(String.format("A single batch of edits was too " +
           "large to fit into the cache: startTxn = %d, endTxn = %d, " +
-          "input length = %d. The capacity of the cache (%s) must be " +
+          "input length = %d. The cache size (%s) or cache fraction (%s) must be " +
           "increased for it to work properly (current capacity %d)." +
           "Cache is now empty.", newStartTxn, newEndTxn, inputData.length,
-          DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY, capacity));
+          DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY,
+          DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY, capacity));
       return;
     }
     if (dataMap.isEmpty()) {
@@ -388,10 +396,11 @@ class JournaledEditsCache {
     } else {
       return new CacheMissException(lowestTxnId - requestedTxnId,
           "Oldest txn ID available in the cache is %d, but requested txns " +
-          "starting at %d. The cache size (%s) may need to be increased " +
-          "to hold more transactions (currently %d bytes containing %d " +
+          "starting at %d. The cache size (%s) or cache fraction (%s) may need to be " +
+          "increased to hold more transactions (currently %d bytes containing %d " +
           "transactions)", lowestTxnId, requestedTxnId,
-          DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY, capacity,
+          DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY,
+          DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY, capacity,
           highestTxnId - lowestTxnId + 1);
     }
   }
@@ -414,4 +423,9 @@ class JournaledEditsCache {
     }
   }
 
+  @VisibleForTesting
+  int getCapacity() {
+    return capacity;
+  }
+
 }
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
index 2a0a4945faa..bf7f99aa1fa 100755
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
@@ -4945,7 +4945,7 @@
 
 <property>
   <name>dfs.journalnode.edit-cache-size.bytes</name>
-  <value>1048576</value>
+  <value></value>
   <description>
     The size, in bytes, of the in-memory cache of edits to keep on the
     JournalNode. This cache is used to serve edits for tailing via the RPC-based
@@ -4955,6 +4955,22 @@
   </description>
 </property>
 
+<property>
+  <name>dfs.journalnode.edit-cache-size.fraction</name>
+  <value>0.5f</value>
+  <description>
+    The fraction of the JVM's maximum heap memory used to compute the size of
+    the edits cache kept in the JournalNode's memory. This config is an
+    alternative to dfs.journalnode.edit-cache-size.bytes. The cache serves
+    edits for tailing via the RPC-based mechanism and is only enabled when
+    dfs.ha.tail-edits.in-progress is true. Transactions range in size but
+    average around 200 bytes, so a 1MB cache can hold around 5000 transactions;
+    choose a fraction that yields a reasonable cache size for the configured
+    maximum heap. The recommended value is less than 0.9. If
+    dfs.journalnode.edit-cache-size.bytes is set, this parameter does not take effect.
+  </description>
+</property>
+
 <property>
   <name>dfs.journalnode.kerberos.internal.spnego.principal</name>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithQJM.md b/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithQJM.md
index 5591f4f2245..b6b408db8b4 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithQJM.md
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithQJM.md
@@ -502,6 +502,16 @@ lag time will be much longer. The relevant configurations are:
   the oldest data in the cache was at transaction ID 20, a value of 10 would
   be added to the average.
 
+* **dfs.journalnode.edit-cache-size.fraction** - The fraction of the JVM's maximum heap
+  memory used to compute the size of the edits cache kept in the JournalNode's memory.
+  This config is an alternative to dfs.journalnode.edit-cache-size.bytes. The cache
+  serves edits for tailing via the RPC-based mechanism and is only enabled when
+  dfs.ha.tail-edits.in-progress is true. Transactions range in size but average around
+  200 bytes, so a 1MB cache can hold around 5000 transactions; choose a fraction that
+  yields a reasonable cache size for the configured maximum heap. The recommended value
+  is less than 0.9. If dfs.journalnode.edit-cache-size.bytes is set, this parameter
+  does not take effect.
+
 This feature is primarily useful in conjunction with the Standby/Observer Read feature. Using this
 feature, read requests can be serviced from non-active NameNodes; thus tailing in-progress edits
 provides these nodes with the ability to serve requests with data which is much more fresh. See the
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/ObserverNameNode.md b/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/ObserverNameNode.md
index 00aeb5bd2e0..74026ec8625 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/ObserverNameNode.md
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/ObserverNameNode.md
@@ -194,6 +194,24 @@ few configurations to your **hdfs-site.xml**:
       <value>1048576</value>
     </property>
 
+* **dfs.journalnode.edit-cache-size.fraction** - the fraction of the JVM's
+  maximum heap memory to use for the JournalNode's in-memory edits cache.
+
+  Used to compute the size of the edits cache that is kept in the
+  JournalNode's memory. This config is an alternative to
+  dfs.journalnode.edit-cache-size.bytes. The cache serves edits for tailing
+  via the RPC-based mechanism and is only enabled when
+  dfs.ha.tail-edits.in-progress is true. Transactions range in size but
+  average around 200 bytes, so a 1MB cache can hold around 5000 transactions;
+  choose a fraction that yields a reasonable cache size for the configured
+  maximum heap. The recommended value is less than 0.9. If
+  dfs.journalnode.edit-cache-size.bytes is set, this parameter does not take effect.
+
+    <property>
+      <name>dfs.journalnode.edit-cache-size.fraction</name>
+      <value>0.5f</value>
+    </property>
+
 * **dfs.namenode.accesstime.precision** -- whether to enable access time for HDFS file.
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/server/TestJournaledEditsCache.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/server/TestJournaledEditsCache.java
index 2a178a1547e..82b8b587694 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/server/TestJournaledEditsCache.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/server/TestJournaledEditsCache.java
@@ -221,6 +221,27 @@ public class TestJournaledEditsCache {
     cache.retrieveEdits(-1, 10, new ArrayList<>());
   }
 
+  @Test
+  public void testCacheSizeConfigs() {
+    // Assert the default configs.
+    Configuration config = new Configuration();
+    cache = new JournaledEditsCache(config);
+    assertEquals((int) (Runtime.getRuntime().maxMemory() * 0.5f), cache.getCapacity());
+
+    // Set dfs.journalnode.edit-cache-size.bytes.
+    Configuration config1 = new Configuration();
+    config1.setInt(DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY, 1);
+    config1.setFloat(DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY, 0.1f);
+    cache = new JournaledEditsCache(config1);
+    assertEquals(1, cache.getCapacity());
+
+    // Don't set dfs.journalnode.edit-cache-size.bytes.
+    Configuration config2 = new Configuration();
+    config2.setFloat(DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY, 0.1f);
+    cache = new JournaledEditsCache(config2);
+    assertEquals((int) (Runtime.getRuntime().maxMemory() * 0.1f), cache.getCapacity());
+  }
+
   private void storeEdits(int startTxn, int endTxn) throws Exception {
     cache.storeEdits(createTxnData(startTxn, endTxn - startTxn + 1), startTxn, endTxn,
         NameNodeLayoutVersion.CURRENT_LAYOUT_VERSION;
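Note (not part of the patch): a minimal sketch of the sizing arithmetic this change introduces, assuming an illustrative 4 GB JournalNode heap. When dfs.journalnode.edit-cache-size.bytes is unset, the effective capacity is maxMemory * fraction (cast to int in the patch), and at the roughly 200 bytes per transaction cited in the documentation the cache holds about capacity / 200 transactions. The class name, heap size, and configuration values below are hypothetical, chosen only to illustrate the calculation.

// Illustrative only: mirrors the capacity selection added to the
// JournaledEditsCache constructor, with assumed heap and config values.
public class EditCacheSizingSketch {
  public static void main(String[] args) {
    long maxMemory = 4L * 1024 * 1024 * 1024;  // assume a 4 GB JournalNode heap
    float fraction = 0.5f;                     // dfs.journalnode.edit-cache-size.fraction
    int bytesConfig = 0;                       // dfs.journalnode.edit-cache-size.bytes unset

    // The bytes config wins when set; otherwise capacity is a fraction of the heap.
    long capacity = bytesConfig > 0 ? bytesConfig : (long) (maxMemory * fraction);

    // The docs estimate ~200 bytes per transaction on average.
    long approxTxns = capacity / 200;
    System.out.printf("capacity = %d bytes, ~%d cached transactions%n", capacity, approxTxns);
  }
}

With these assumed values the sketch prints a capacity of about 2 GB and roughly ten million cached transactions, which is why the warning added in the patch recommends keeping the cache below 0.9 of the heap rather than sizing it in bytes on large-heap JournalNodes.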