Rename NodeHash to FSTSuffixNodeCache (#13259)

Dzung Bui 2024-11-02 07:14:17 +09:00 committed by GitHub
parent cfdd20f5bc
commit 13285279c2
3 changed files with 31 additions and 13 deletions

FSTCompiler.java

@@ -98,8 +98,8 @@ public class FSTCompiler<T> {
// it will throw exceptions if attempt to call getReverseBytesReader() or writeTo(DataOutput)
private static final FSTReader NULL_FST_READER = new NullFSTReader();
- private final NodeHash<T> dedupHash;
- // a temporary FST used during building for NodeHash cache
+ private final FSTSuffixNodeCache<T> suffixDedupCache;
+ // a temporary FST used during building for FSTSuffixNodeCache cache
final FST<T> fst;
private final T NO_OUTPUT;
@@ -178,9 +178,9 @@ public class FSTCompiler<T> {
if (suffixRAMLimitMB < 0) {
throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB);
} else if (suffixRAMLimitMB > 0) {
- dedupHash = new NodeHash<>(this, suffixRAMLimitMB);
+ suffixDedupCache = new FSTSuffixNodeCache<>(this, suffixRAMLimitMB);
} else {
- dedupHash = null;
+ suffixDedupCache = null;
}
NO_OUTPUT = outputs.getNoOutput();
@@ -379,12 +379,12 @@ public class FSTCompiler<T> {
private CompiledNode compileNode(UnCompiledNode<T> nodeIn) throws IOException {
final long node;
long bytesPosStart = numBytesWritten;
- if (dedupHash != null) {
+ if (suffixDedupCache != null) {
if (nodeIn.numArcs == 0) {
node = addNode(nodeIn);
lastFrozenNode = node;
} else {
- node = dedupHash.add(nodeIn);
+ node = suffixDedupCache.add(nodeIn);
}
} else {
node = addNode(nodeIn);
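For context on when this cache exists at all, here is a minimal sketch of configuring the compiler with a suffix RAM limit. It assumes the FSTCompiler.Builder API of recent Lucene releases (exact signatures vary between versions), and the names SuffixCacheConfigSketch / newCompiler are made up for illustration: a limit greater than 0 makes the constructor create the suffix dedup cache checked in compileNode above, while 0 leaves it null so every node is frozen without dedup.

```java
import java.io.IOException;

import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;

class SuffixCacheConfigSketch {
  // A positive limit makes the FSTCompiler constructor create the suffix dedup
  // cache shown in the diff above; 0 leaves it null, trading a larger
  // (non-minimal) FST for less RAM spent on suffix sharing during the build.
  static FSTCompiler<Long> newCompiler(double suffixRAMLimitMB) throws IOException {
    return new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton())
        .suffixRAMLimitMB(suffixRAMLimitMB)
        .build();
  }
}
```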

NodeHash.java => FSTSuffixNodeCache.java

@@ -31,8 +31,24 @@ import org.apache.lucene.util.packed.PagedGrowableWriter;
// TODO: couldn't we prune naturally back until we see a transition with an output? it's highly
// unlikely (mostly impossible) such suffixes can be shared?
- // Used to dedup states (lookup already-frozen states)
- final class NodeHash<T> {
+ /**
+  * This is essentially an LRU cache to maintain and look up node suffixes. An un-compiled node can
+  * be added to the cache, and if a similar node already exists we return its address in the FST. A
+  * node is considered similar if it has the same label, arcs, outputs and other properties that
+  * identify a node.
+  *
+  * <p>The total size of the cache is controlled through the constructor parameter
+  * <code>ramLimitMB</code>. Implementation-wise, we maintain two lookup tables: a primary table
+  * that nodes are looked up from, and a fallback table that is consulted when the primary lookup
+  * fails. Nodes found in the fallback table are promoted to the primary table. When the primary
+  * table is full, it becomes the new fallback table (the old fallback is discarded) and a fresh
+  * primary table is started.
+  *
+  * <p>To look up a node address we build a special hash table, <code>PagedGrowableHash</code>,
+  * which maps a node's hash value to its address in the FST. Internally it uses {@link
+  * PagedGrowableWriter} to store the mapping, which packs the hash and address long values
+  * efficiently, and {@link ByteBlockPool} to store the actual node content (arcs and outputs).
+  */
+ final class FSTSuffixNodeCache<T> {
// primary table -- we add nodes into this until it reaches the requested tableSizeLimit/2, then
// we move it to fallback
@@ -60,7 +76,7 @@ final class NodeHash<T> {
* recently used suffixes are discarded, and the FST is no longer minimal. Still, larger
* ramLimitMB will make the FST smaller (closer to minimal).
*/
- public NodeHash(FSTCompiler<T> fstCompiler, double ramLimitMB) {
+ public FSTSuffixNodeCache(FSTCompiler<T> fstCompiler, double ramLimitMB) {
if (ramLimitMB <= 0) {
throw new IllegalArgumentException("ramLimitMB must be > 0; got: " + ramLimitMB);
}
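To make the primary/fallback scheme described in the new javadoc concrete, below is an illustrative plain-Java sketch. It is not the Lucene implementation (which packs hash/address longs into PagedGrowableWriter and node bytes into ByteBlockPool); it only shows the promotion-and-swap idea: a lookup that misses the primary table falls through to the fallback table and promotes hits, and once the primary table fills up it becomes the new fallback while the old fallback is dropped.

```java
import java.util.HashMap;
import java.util.Map;

// Illustrative sketch only (not the Lucene code): the two-table LRU idea from
// the javadoc above, using HashMaps in place of PagedGrowableWriter + ByteBlockPool.
final class TwoTableCacheSketch<K, V> {
  private final int limitPerTable; // rough stand-in for the ramLimitMB-derived size limit
  private Map<K, V> primary = new HashMap<>();
  private Map<K, V> fallback = new HashMap<>();

  TwoTableCacheSketch(int limitPerTable) {
    this.limitPerTable = limitPerTable;
  }

  V get(K key) {
    V value = primary.get(key);
    if (value != null) {
      return value;
    }
    value = fallback.get(key);
    if (value != null) {
      put(key, value); // promote a fallback hit back into the primary table
    }
    return value; // null means a true miss: the caller freezes a new node
  }

  void put(K key, V value) {
    if (primary.size() >= limitPerTable) {
      // primary is full: it becomes the fallback; the old fallback table
      // (holding the least recently used entries) is dropped entirely
      fallback = primary;
      primary = new HashMap<>();
    }
    primary.put(key, value);
  }
}
```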

TestNodeHash.java => TestFSTSuffixNodeCache.java

@@ -19,14 +19,16 @@ package org.apache.lucene.util.fst;
import com.carrotsearch.randomizedtesting.generators.RandomBytes;
import org.apache.lucene.tests.util.LuceneTestCase;
- public class TestNodeHash extends LuceneTestCase {
+ public class TestFSTSuffixNodeCache extends LuceneTestCase {
public void testCopyFallbackNodeBytes() {
// we don't need the FSTCompiler in this test
- NodeHash<Object> nodeHash = new NodeHash<>(null, 1);
+ FSTSuffixNodeCache<Object> suffixCache = new FSTSuffixNodeCache<>(null, 1);
- NodeHash<Object>.PagedGrowableHash primaryHashTable = nodeHash.new PagedGrowableHash();
- NodeHash<Object>.PagedGrowableHash fallbackHashTable = nodeHash.new PagedGrowableHash();
+ FSTSuffixNodeCache<Object>.PagedGrowableHash primaryHashTable =
+     suffixCache.new PagedGrowableHash();
+ FSTSuffixNodeCache<Object>.PagedGrowableHash fallbackHashTable =
+     suffixCache.new PagedGrowableHash();
int nodeLength = atLeast(500);
long fallbackHashSlot = 1;
byte[] fallbackBytes = RandomBytes.randomBytesOfLength(random(), nodeLength);