Rename NodeHash to FSTSuffixNodeCache (#13259)

Dzung Bui 2024-11-02 07:14:17 +09:00 committed by GitHub
parent cfdd20f5bc
commit 13285279c2
3 changed files with 31 additions and 13 deletions

FSTCompiler.java

@@ -98,8 +98,8 @@ public class FSTCompiler<T> {
// it will throw exceptions if attempt to call getReverseBytesReader() or writeTo(DataOutput)
private static final FSTReader NULL_FST_READER = new NullFSTReader();
- private final NodeHash<T> dedupHash;
- // a temporary FST used during building for NodeHash cache
+ private final FSTSuffixNodeCache<T> suffixDedupCache;
+ // a temporary FST used during building for FSTSuffixNodeCache cache
final FST<T> fst;
private final T NO_OUTPUT;
@@ -178,9 +178,9 @@ public class FSTCompiler<T> {
if (suffixRAMLimitMB < 0) {
throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB);
} else if (suffixRAMLimitMB > 0) {
- dedupHash = new NodeHash<>(this, suffixRAMLimitMB);
+ suffixDedupCache = new FSTSuffixNodeCache<>(this, suffixRAMLimitMB);
} else {
- dedupHash = null;
+ suffixDedupCache = null;
}
NO_OUTPUT = outputs.getNoOutput();
@@ -379,12 +379,12 @@ public class FSTCompiler<T> {
private CompiledNode compileNode(UnCompiledNode<T> nodeIn) throws IOException {
final long node;
long bytesPosStart = numBytesWritten;
- if (dedupHash != null) {
+ if (suffixDedupCache != null) {
if (nodeIn.numArcs == 0) {
node = addNode(nodeIn);
lastFrozenNode = node;
} else {
- node = dedupHash.add(nodeIn);
+ node = suffixDedupCache.add(nodeIn);
}
} else {
node = addNode(nodeIn);
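For context on when this cache exists at all, here is a minimal sketch of configuring the compiler with a suffix RAM limit. It assumes the FSTCompiler.Builder API of recent Lucene releases (exact signatures vary between versions), and the names SuffixCacheConfigSketch / newCompiler are made up for illustration: a limit greater than 0 makes the constructor create the suffix dedup cache checked in compileNode above, while 0 leaves it null so every node is frozen without dedup.

```java
import java.io.IOException;

import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;

class SuffixCacheConfigSketch {
  // A positive limit makes the FSTCompiler constructor create the suffix dedup
  // cache shown in the diff above; 0 leaves it null, trading a larger
  // (non-minimal) FST for less RAM spent on suffix sharing during the build.
  static FSTCompiler<Long> newCompiler(double suffixRAMLimitMB) throws IOException {
    return new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton())
        .suffixRAMLimitMB(suffixRAMLimitMB)
        .build();
  }
}
```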

NodeHash.java => FSTSuffixNodeCache.java

@@ -31,8 +31,24 @@ import org.apache.lucene.util.packed.PagedGrowableWriter;
// TODO: couldn't we prune naturally back until we see a transition with an output? it's highly
// unlikely (mostly impossible) such suffixes can be shared?
- // Used to dedup states (lookup already-frozen states)
- final class NodeHash<T> {
+ /**
+  * This is essentially an LRU cache to maintain and look up node suffixes. An un-compiled node can
+  * be added to the cache, and if a similar node already exists we return its address in the FST. A
+  * node is considered similar if it has the same label, arcs, outputs and other properties that
+  * identify a node.
+  *
+  * <p>The total size of the cache is controlled through the constructor parameter
+  * <code>ramLimitMB</code>. Implementation-wise, we maintain two lookup tables: a primary table
+  * that nodes are looked up from, and a fallback table that is consulted when the primary lookup
+  * fails. Nodes found in the fallback table are promoted to the primary table. When the primary
+  * table is full, it becomes the new fallback table (the old fallback is discarded) and a fresh
+  * primary table is started.
+  *
+  * <p>To look up a node address we build a special hash table, <code>PagedGrowableHash</code>,
+  * which maps a node's hash value to its address in the FST. Internally it uses {@link
+  * PagedGrowableWriter} to store the mapping, which packs the hash and address long values
+  * efficiently, and {@link ByteBlockPool} to store the actual node content (arcs and outputs).
+  */
+ final class FSTSuffixNodeCache<T> {
// primary table -- we add nodes into this until it reaches the requested tableSizeLimit/2, then
// we move it to fallback
@@ -60,7 +76,7 @@ final class NodeHash<T> {
* recently used suffixes are discarded, and the FST is no longer minimal. Still, larger
* ramLimitMB will make the FST smaller (closer to minimal).
*/
- public NodeHash(FSTCompiler<T> fstCompiler, double ramLimitMB) {
+ public FSTSuffixNodeCache(FSTCompiler<T> fstCompiler, double ramLimitMB) {
if (ramLimitMB <= 0) {
throw new IllegalArgumentException("ramLimitMB must be > 0; got: " + ramLimitMB);
}
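To make the primary/fallback scheme described in the new javadoc concrete, below is an illustrative plain-Java sketch. It is not the Lucene implementation (which packs hash/address longs into PagedGrowableWriter and node bytes into ByteBlockPool); it only shows the promotion-and-swap idea: a lookup that misses the primary table falls through to the fallback table and promotes hits, and once the primary table fills up it becomes the new fallback while the old fallback is dropped.

```java
import java.util.HashMap;
import java.util.Map;

// Illustrative sketch only (not the Lucene code): the two-table LRU idea from
// the javadoc above, using HashMaps in place of PagedGrowableWriter + ByteBlockPool.
final class TwoTableCacheSketch<K, V> {
  private final int limitPerTable; // rough stand-in for the ramLimitMB-derived size limit
  private Map<K, V> primary = new HashMap<>();
  private Map<K, V> fallback = new HashMap<>();

  TwoTableCacheSketch(int limitPerTable) {
    this.limitPerTable = limitPerTable;
  }

  V get(K key) {
    V value = primary.get(key);
    if (value != null) {
      return value;
    }
    value = fallback.get(key);
    if (value != null) {
      put(key, value); // promote a fallback hit back into the primary table
    }
    return value; // null means a true miss: the caller freezes a new node
  }

  void put(K key, V value) {
    if (primary.size() >= limitPerTable) {
      // primary is full: it becomes the fallback; the old fallback table
      // (holding the least recently used entries) is dropped entirely
      fallback = primary;
      primary = new HashMap<>();
    }
    primary.put(key, value);
  }
}
```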

TestNodeHash.java => TestFSTSuffixNodeCache.java

@@ -19,14 +19,16 @@ package org.apache.lucene.util.fst;
import com.carrotsearch.randomizedtesting.generators.RandomBytes;
import org.apache.lucene.tests.util.LuceneTestCase;
- public class TestNodeHash extends LuceneTestCase {
+ public class TestFSTSuffixNodeCache extends LuceneTestCase {
public void testCopyFallbackNodeBytes() {
// we don't need the FSTCompiler in this test
- NodeHash<Object> nodeHash = new NodeHash<>(null, 1);
+ FSTSuffixNodeCache<Object> suffixCache = new FSTSuffixNodeCache<>(null, 1);
- NodeHash<Object>.PagedGrowableHash primaryHashTable = nodeHash.new PagedGrowableHash();
- NodeHash<Object>.PagedGrowableHash fallbackHashTable = nodeHash.new PagedGrowableHash();
+ FSTSuffixNodeCache<Object>.PagedGrowableHash primaryHashTable =
+     suffixCache.new PagedGrowableHash();
+ FSTSuffixNodeCache<Object>.PagedGrowableHash fallbackHashTable =
+     suffixCache.new PagedGrowableHash();
int nodeLength = atLeast(500);
long fallbackHashSlot = 1;
byte[] fallbackBytes = RandomBytes.randomBytesOfLength(random(), nodeLength);