mirror of https://github.com/apache/lucene.git
Rename NodeHash to FSTSuffixNodeCache (#13259)
This commit is contained in:
parent cfdd20f5bc
commit 13285279c2
FSTCompiler.java
@@ -98,8 +98,8 @@ public class FSTCompiler<T> {
   // it will throw exceptions if attempt to call getReverseBytesReader() or writeTo(DataOutput)
   private static final FSTReader NULL_FST_READER = new NullFSTReader();
 
-  private final NodeHash<T> dedupHash;
-  // a temporary FST used during building for NodeHash cache
+  private final FSTSuffixNodeCache<T> suffixDedupCache;
+  // a temporary FST used during building for FSTSuffixNodeCache cache
   final FST<T> fst;
   private final T NO_OUTPUT;
 
@@ -178,9 +178,9 @@ public class FSTCompiler<T> {
     if (suffixRAMLimitMB < 0) {
       throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB);
     } else if (suffixRAMLimitMB > 0) {
-      dedupHash = new NodeHash<>(this, suffixRAMLimitMB);
+      suffixDedupCache = new FSTSuffixNodeCache<>(this, suffixRAMLimitMB);
     } else {
-      dedupHash = null;
+      suffixDedupCache = null;
     }
     NO_OUTPUT = outputs.getNoOutput();
 
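Not part of the diff: a small, hedged sketch of how the three ranges of suffixRAMLimitMB validated above look from the caller's side. It assumes the FSTCompiler.Builder#suffixRAMLimitMB method of recent Lucene releases; names and signatures may differ between versions.

import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;

// Hedged sketch (not from this commit): the suffixRAMLimitMB ranges checked in the
// constructor above, configured through FSTCompiler.Builder. Assumes Lucene 9.9+ API names.
public class SuffixRamLimitSketch {
  public static void main(String[] args) {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();

    // > 0: suffixes are deduplicated via the (renamed) FSTSuffixNodeCache, bounded to roughly 16 MB.
    new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).suffixRAMLimitMB(16);

    // == 0: suffixDedupCache stays null, so no suffix sharing; the FST is larger but building
    // needs less RAM.
    new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).suffixRAMLimitMB(0);

    // < 0: rejected when the compiler is constructed, with
    // IllegalArgumentException("ramLimitMB must be >= 0; got: ...").
  }
}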
@@ -379,12 +379,12 @@ public class FSTCompiler<T> {
   private CompiledNode compileNode(UnCompiledNode<T> nodeIn) throws IOException {
     final long node;
     long bytesPosStart = numBytesWritten;
-    if (dedupHash != null) {
+    if (suffixDedupCache != null) {
       if (nodeIn.numArcs == 0) {
         node = addNode(nodeIn);
         lastFrozenNode = node;
       } else {
-        node = dedupHash.add(nodeIn);
+        node = suffixDedupCache.add(nodeIn);
       }
     } else {
       node = addNode(nodeIn);
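For orientation, a hedged end-to-end sketch of the path that reaches compileNode above: two inputs sharing the suffix "ation" are added in sorted order, so the frozen suffix nodes of the first term can be found in the suffix cache and reused for the second instead of being written again. It assumes the Lucene 9.9-style building API (Builder#suffixRAMLimitMB, compile() returning the FST directly); newer versions obtain the compiled FST slightly differently.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

// Hedged sketch, assuming the Lucene 9.9-style FST building API.
public class SuffixSharingSketch {
  public static void main(String[] args) throws Exception {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    FSTCompiler<Long> compiler =
        new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).suffixRAMLimitMB(1).build();

    IntsRefBuilder scratch = new IntsRefBuilder();
    // Inputs must arrive in sorted order. When "station" is added, the already-frozen
    // suffix nodes of "nation" ("ation") can be looked up in the suffix cache and their
    // addresses reused rather than re-written.
    compiler.add(Util.toIntsRef(new BytesRef("nation"), scratch), 7L);
    compiler.add(Util.toIntsRef(new BytesRef("station"), scratch), 12L);

    FST<Long> fst = compiler.compile();
    System.out.println(Util.get(fst, new BytesRef("station"))); // prints 12
  }
}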
NodeHash.java → FSTSuffixNodeCache.java (renamed)
@@ -31,8 +31,24 @@ import org.apache.lucene.util.packed.PagedGrowableWriter;
 // TODO: couldn't we prune naturally back until we see a transition with an output? it's highly
 // unlikely (mostly impossible) such suffixes can be shared?
 
-// Used to dedup states (lookup already-frozen states)
-final class NodeHash<T> {
+/**
+ * This is essentially a LRU cache to maintain and lookup node suffix. Un-compiled node can be added
+ * into the cache and if a similar node exists we will return its address in the FST. A node is
+ * defined as similar if it has the same label, arcs, outputs & other properties that identify a
+ * node.
+ *
+ * <p>The total size of the cache is controlled through the constructor parameter <code>ramLimitMB
+ * </code> Implementation-wise, we maintain two lookup tables, a primary table where node can be
+ * looked up from, and a fallback lookup table in case the lookup in the primary table fails. Nodes
+ * from the fallback table can also be promoted to the primary table when that happens. When the
+ * primary table is full, we swap it with the fallback table and clear out the primary table.
+ *
+ * <p>To lookup the node address, we build a special hash table which maps from the Node hash value
+ * to the Node address in the FST, called <code>PagedGrowableHash</code>. Internally it uses {@link
+ * PagedGrowableWriter} to store the mapping, which allows efficient packing the hash & address long
+ * values, and uses {@link ByteBlockPool} to store the actual node content (arcs & outputs).
+ */
+final class FSTSuffixNodeCache<T> {
 
   // primary table -- we add nodes into this until it reaches the requested tableSizeLimit/2, then
   // we move it to fallback
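The new Javadoc describes a two-generation scheme: lookups try the primary table first, fall back to the second table, promote fallback hits back into the primary, and when the primary fills up it takes the fallback's place while the old fallback is discarded. Below is a loose, hedged illustration of that policy using plain HashMaps; the real PagedGrowableHash instead packs hash-to-address longs with PagedGrowableWriter and keeps node bytes in a ByteBlockPool.

import java.util.HashMap;
import java.util.Map;

// Illustrative only: the promote/swap policy described in the FSTSuffixNodeCache Javadoc,
// sketched with ordinary maps. Not the actual Lucene data structure.
final class TwoGenerationCache<K, V> {
  private final int primaryLimit; // plays the role of "tableSizeLimit / 2" in the Javadoc
  private Map<K, V> primary = new HashMap<>();
  private Map<K, V> fallback = new HashMap<>();

  TwoGenerationCache(int primaryLimit) {
    this.primaryLimit = primaryLimit;
  }

  V get(K key) {
    V value = primary.get(key);
    if (value != null) {
      return value;
    }
    value = fallback.get(key);
    if (value != null) {
      put(key, value); // promote a fallback hit into the primary table
    }
    return value;
  }

  void put(K key, V value) {
    if (primary.size() >= primaryLimit) {
      // primary is full: it becomes the new fallback, and the old fallback
      // (holding the least recently touched entries) is dropped wholesale
      fallback = primary;
      primary = new HashMap<>();
    }
    primary.put(key, value);
  }
}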
@@ -60,7 +76,7 @@ final class NodeHash<T> {
    * recently used suffixes are discarded, and the FST is no longer minimal. Still, larger
    * ramLimitMB will make the FST smaller (closer to minimal).
    */
-  public NodeHash(FSTCompiler<T> fstCompiler, double ramLimitMB) {
+  public FSTSuffixNodeCache(FSTCompiler<T> fstCompiler, double ramLimitMB) {
     if (ramLimitMB <= 0) {
       throw new IllegalArgumentException("ramLimitMB must be > 0; got: " + ramLimitMB);
     }
TestNodeHash.java → TestFSTSuffixNodeCache.java (renamed)
@@ -19,14 +19,16 @@ package org.apache.lucene.util.fst;
 import com.carrotsearch.randomizedtesting.generators.RandomBytes;
 import org.apache.lucene.tests.util.LuceneTestCase;
 
-public class TestNodeHash extends LuceneTestCase {
+public class TestFSTSuffixNodeCache extends LuceneTestCase {
 
   public void testCopyFallbackNodeBytes() {
     // we don't need the FSTCompiler in this test
-    NodeHash<Object> nodeHash = new NodeHash<>(null, 1);
+    FSTSuffixNodeCache<Object> suffixCache = new FSTSuffixNodeCache<>(null, 1);
 
-    NodeHash<Object>.PagedGrowableHash primaryHashTable = nodeHash.new PagedGrowableHash();
-    NodeHash<Object>.PagedGrowableHash fallbackHashTable = nodeHash.new PagedGrowableHash();
+    FSTSuffixNodeCache<Object>.PagedGrowableHash primaryHashTable =
+        suffixCache.new PagedGrowableHash();
+    FSTSuffixNodeCache<Object>.PagedGrowableHash fallbackHashTable =
+        suffixCache.new PagedGrowableHash();
     int nodeLength = atLeast(500);
     long fallbackHashSlot = 1;
     byte[] fallbackBytes = RandomBytes.randomBytesOfLength(random(), nodeLength);