diff --git a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java index d2d5fc00d8c..cbdd3bb973d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java +++ b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java @@ -38,6 +38,8 @@ public final class ByteBlockPool implements Accountable { /** Abstract class for allocating and freeing byte blocks. */ public abstract static class Allocator { + // TODO: ByteBlockPool assumes the blockSize is always {@link BYTE_BLOCK_SIZE}, but this class + // allows arbitrary values of blockSize. We should make them consistent. protected final int blockSize; protected Allocator(int blockSize) { @@ -215,19 +217,38 @@ public final class ByteBlockPool implements Accountable { /** Appends the bytes in the provided {@link BytesRef} at the current position. */ public void append(final BytesRef bytes) { - int bytesLeft = bytes.length; - int offset = bytes.offset; + append(bytes.bytes, bytes.offset, bytes.length); + } + + /** + * Append the provided byte array at the current position. + * + * @param bytes the byte array to write + */ + public void append(final byte[] bytes) { + append(bytes, 0, bytes.length); + } + + /** + * Append some portion of the provided byte array at the current position. + * + * @param bytes the byte array to write + * @param offset the offset in the byte array + * @param length the number of bytes to write + */ + public void append(final byte[] bytes, int offset, int length) { + int bytesLeft = length; while (bytesLeft > 0) { int bufferLeft = BYTE_BLOCK_SIZE - byteUpto; if (bytesLeft < bufferLeft) { // fits within current buffer - System.arraycopy(bytes.bytes, offset, buffer, byteUpto, bytesLeft); + System.arraycopy(bytes, offset, buffer, byteUpto, bytesLeft); byteUpto += bytesLeft; break; } else { // fill up this buffer and move to next one if (bufferLeft > 0) { - System.arraycopy(bytes.bytes, offset, buffer, byteUpto, bufferLeft); + System.arraycopy(bytes, offset, buffer, byteUpto, bufferLeft); } nextBuffer(); bytesLeft -= bufferLeft; @@ -256,6 +277,18 @@ public final class ByteBlockPool implements Accountable { } } + /** + * Read a single byte at the given offset. + * + * @param offset the offset to read + * @return the byte + */ + public byte readByte(final long offset) { + int bufferIndex = (int) (offset >> BYTE_BLOCK_SHIFT); + int pos = (int) (offset & BYTE_BLOCK_MASK); + return buffers[bufferIndex][pos]; + } + @Override public long ramBytesUsed() { long size = BASE_RAM_BYTES; @@ -269,4 +302,9 @@ public final class ByteBlockPool implements Accountable { } return size; } + + /** the current position (as an absolute offset) of this byte pool */ + public long getPosition() { + return bufferUpto * allocator.blockSize + byteUpto; + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ByteBlockPoolReverseBytesReader.java b/lucene/core/src/java/org/apache/lucene/util/fst/ByteBlockPoolReverseBytesReader.java new file mode 100644 index 00000000000..41ca21d3144 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ByteBlockPoolReverseBytesReader.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import java.io.IOException; +import org.apache.lucene.util.ByteBlockPool; + +/** Reads in reverse from a ByteBlockPool. */ +final class ByteBlockPoolReverseBytesReader extends FST.BytesReader { + + private final ByteBlockPool buf; + // the difference between the FST node address and the hash table copied node address + private long posDelta; + private long pos; + + public ByteBlockPoolReverseBytesReader(ByteBlockPool buf) { + this.buf = buf; + } + + @Override + public byte readByte() { + return buf.readByte(pos--); + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + for (int i = 0; i < len; i++) { + b[offset + i] = buf.readByte(pos--); + } + } + + @Override + public void skipBytes(long numBytes) throws IOException { + pos -= numBytes; + } + + @Override + public long getPosition() { + return pos + posDelta; + } + + @Override + public void setPosition(long pos) { + this.pos = pos - posDelta; + } + + @Override + public boolean reversed() { + return true; + } + + public void setPosDelta(long posDelta) { + this.posDelta = posDelta; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java index 469454d3d52..a03b9b0f12d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java @@ -444,11 +444,7 @@ class BytesStore extends DataOutput implements FSTReader { @Override public FST.BytesReader getReverseBytesReader() { - return getReverseReader(true); - } - - FST.BytesReader getReverseReader(boolean allowSingle) { - if (allowSingle && blocks.size() == 1) { + if (blocks.size() == 1) { return new ReverseBytesReader(blocks.get(0)); } return new FST.BytesReader() { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 3af62410070..53cb18a1263 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -145,7 +145,7 @@ public class FSTCompiler { if (suffixRAMLimitMB < 0) { throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB); } else if (suffixRAMLimitMB > 0) { - dedupHash = new NodeHash<>(this, suffixRAMLimitMB, bytes.getReverseReader(false)); + dedupHash = new NodeHash<>(this, suffixRAMLimitMB); } else { dedupHash = null; } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java index 04c1be414c2..690741682a6 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java @@ -17,6 +17,7 @@ package org.apache.lucene.util.fst; import java.io.IOException; +import org.apache.lucene.util.ByteBlockPool; import 
org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PagedGrowableWriter; @@ -49,14 +50,17 @@ final class NodeHash { private final FSTCompiler fstCompiler; private final FST.Arc scratchArc = new FST.Arc<>(); - private final FST.BytesReader in; + // store the last fallback table node length in getFallback() + private int lastFallbackNodeLength; + // store the last fallback table hash slot in getFallback() + private long lastFallbackHashSlot; /** * ramLimitMB is the max RAM we can use for recording suffixes. If we hit this limit, the least * recently used suffixes are discarded, and the FST is no longer minimal. Still, larger * ramLimitMB will make the FST smaller (closer to minimal). */ - public NodeHash(FSTCompiler fstCompiler, double ramLimitMB, FST.BytesReader in) { + public NodeHash(FSTCompiler fstCompiler, double ramLimitMB) { if (ramLimitMB <= 0) { throw new IllegalArgumentException("ramLimitMB must be > 0; got: " + ramLimitMB); } @@ -70,28 +74,35 @@ final class NodeHash { primaryTable = new PagedGrowableHash(); this.fstCompiler = fstCompiler; - this.in = in; } private long getFallback(FSTCompiler.UnCompiledNode nodeIn, long hash) throws IOException { + this.lastFallbackNodeLength = -1; + this.lastFallbackHashSlot = -1; if (fallbackTable == null) { // no fallback yet (primary table is not yet large enough to swap) return 0; } - long pos = hash & fallbackTable.mask; + long hashSlot = hash & fallbackTable.mask; int c = 0; while (true) { - long node = fallbackTable.get(pos); - if (node == 0) { + long nodeAddress = fallbackTable.getNodeAddress(hashSlot); + if (nodeAddress == 0) { // not found return 0; - } else if (nodesEqual(nodeIn, node)) { - // frozen version of this node is already here - return node; + } else { + int length = fallbackTable.nodesEqual(nodeIn, nodeAddress, hashSlot); + if (length != -1) { + // store the node length for further use + this.lastFallbackNodeLength = length; + this.lastFallbackHashSlot = hashSlot; + // frozen version of this node is already here + return nodeAddress; + } } // quadratic probe (but is it, really?) - pos = (pos + (++c)) & fallbackTable.mask; + hashSlot = (hashSlot + (++c)) & fallbackTable.mask; } } @@ -99,36 +110,60 @@ final class NodeHash { long hash = hash(nodeIn); - long pos = hash & primaryTable.mask; + long hashSlot = hash & primaryTable.mask; int c = 0; while (true) { - long node = primaryTable.get(pos); - if (node == 0) { + long nodeAddress = primaryTable.getNodeAddress(hashSlot); + if (nodeAddress == 0) { // node is not in primary table; is it in fallback table?
- node = getFallback(nodeIn, hash); - if (node != 0) { + nodeAddress = getFallback(nodeIn, hash); + if (nodeAddress != 0) { + assert lastFallbackHashSlot != -1 && lastFallbackNodeLength != -1; + // it was already in fallback -- promote to primary - primaryTable.set(pos, node); + // TODO: Copy directly between two ByteBlockPools to avoid the double copy + primaryTable.setNode( + hashSlot, + nodeAddress, + fallbackTable.getBytes(lastFallbackHashSlot, lastFallbackNodeLength)); } else { // not in fallback either -- freeze & add the incoming node + long startAddress = fstCompiler.bytes.getPosition(); // freeze & add - node = fstCompiler.addNode(nodeIn); + nodeAddress = fstCompiler.addNode(nodeIn); + // TODO: Write the bytes directly from BytesStore // we use 0 as empty marker in hash table, so it better be impossible to get a frozen node // at 0: - assert node != 0; + assert nodeAddress != FST.FINAL_END_NODE && nodeAddress != FST.NON_FINAL_END_NODE; + byte[] buf = new byte[Math.toIntExact(nodeAddress - startAddress + 1)]; + fstCompiler.bytes.copyBytes(startAddress, buf, 0, buf.length); + + primaryTable.setNode(hashSlot, nodeAddress, buf); // confirm frozen hash and unfrozen hash are the same - assert hash(node) == hash : "mismatch frozenHash=" + hash(node) + " vs hash=" + hash; - - primaryTable.set(pos, node); + assert primaryTable.hash(nodeAddress, hashSlot) == hash + : "mismatch frozenHash=" + + primaryTable.hash(nodeAddress, hashSlot) + + " vs hash=" + + hash; } // how many bytes would be used if we had "perfect" hashing: - long ramBytesUsed = primaryTable.count * PackedInts.bitsRequired(node) / 8; + // - x2 for fstNodeAddress (the FST node address) + // - x2 for copiedNodeAddress (the copied node address) + // - the bytes copied out of the FST into the hashtable copiedNodes + // each x2 accounts for approximate hash table overhead halfway between 33.3% and 66.7% occupancy + // note that some of the copiedNodes are shared between fallback and primary tables so this + // computation is pessimistic + long copiedBytes = primaryTable.copiedNodes.getPosition(); + long ramBytesUsed = + primaryTable.count * 2 * PackedInts.bitsRequired(nodeAddress) / 8 + + primaryTable.count * 2 * PackedInts.bitsRequired(copiedBytes) / 8 + + copiedBytes; // NOTE: we could instead use the more precise RAM used, but this leads to unpredictable // quantized behavior due to 2X rehashing where for large ranges of the RAM limit, the @@ -138,30 +173,29 @@ final class NodeHash { // in smaller FSTs, even if the precise RAM used is not always under the limit.
// divide limit by 2 because fallback gets half the RAM and primary gets the other half - // divide by 2 again to account for approximate hash table overhead halfway between 33.3% - // and 66.7% occupancy = 50% - if (ramBytesUsed >= ramLimitBytes / (2 * 2)) { + if (ramBytesUsed >= ramLimitBytes / 2) { // time to fallback -- fallback is now used read-only to promote a node (suffix) to // primary if we encounter it again fallbackTable = primaryTable; // size the new primary table the same size to reduce rehash cost // TODO: we could clear & reuse the previous fallbackTable, instead of allocating a new one, // to reduce GC load - primaryTable = new PagedGrowableHash(node, Math.max(16, primaryTable.entries.size())); - } else if (primaryTable.count > primaryTable.entries.size() * (2f / 3)) { + primaryTable = + new PagedGrowableHash(nodeAddress, Math.max(16, primaryTable.fstNodeAddress.size())); + } else if (primaryTable.count > primaryTable.fstNodeAddress.size() * (2f / 3)) { // rehash at 2/3 occupancy - primaryTable.rehash(node); + primaryTable.rehash(nodeAddress); } - return node; + return nodeAddress; - } else if (nodesEqual(nodeIn, node)) { + } else if (primaryTable.nodesEqual(nodeIn, nodeAddress, hashSlot) != -1) { // same node (in frozen form) is already in primary table - return node; + return nodeAddress; } // quadratic probe (but is it, really?) - pos = (pos + (++c)) & primaryTable.mask; + hashSlot = (hashSlot + (++c)) & primaryTable.mask; } } @@ -186,149 +220,233 @@ final class NodeHash { return h; } - // hash code for a frozen node. this must precisely match the hash computation of an unfrozen - // node! - private long hash(long node) throws IOException { - final int PRIME = 31; - - long h = 0; - fstCompiler.fst.readFirstRealTargetArc(node, scratchArc, in); - while (true) { - h = PRIME * h + scratchArc.label(); - h = PRIME * h + (int) (scratchArc.target() ^ (scratchArc.target() >> 32)); - h = PRIME * h + scratchArc.output().hashCode(); - h = PRIME * h + scratchArc.nextFinalOutput().hashCode(); - if (scratchArc.isFinal()) { - h += 17; - } - if (scratchArc.isLast()) { - break; - } - fstCompiler.fst.readNextRealArc(scratchArc, in); - } - - return h; - } - - /** - * Compares an unfrozen node (UnCompiledNode) with a frozen node at byte location address (long), - * returning true if they are equal.
- */ - private boolean nodesEqual(FSTCompiler.UnCompiledNode node, long address) throws IOException { - fstCompiler.fst.readFirstRealTargetArc(address, scratchArc, in); - - // fail fast for a node with fixed length arcs - if (scratchArc.bytesPerArc() != 0) { - assert node.numArcs > 0; - // the frozen node uses fixed-with arc encoding (same number of bytes per arc), but may be - // sparse or dense - switch (scratchArc.nodeFlags()) { - case FST.ARCS_FOR_BINARY_SEARCH: - // sparse - if (node.numArcs != scratchArc.numArcs()) { - return false; - } - break; - case FST.ARCS_FOR_DIRECT_ADDRESSING: - // dense -- compare both the number of labels allocated in the array (some of which may - // not actually be arcs), and the number of arcs - if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != scratchArc.numArcs() - || node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) { - return false; - } - break; - default: - throw new AssertionError("unhandled scratchArc.nodeFlag() " + scratchArc.nodeFlags()); - } - } - - // compare arc by arc to see if there is a difference - for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) { - final FSTCompiler.Arc arc = node.arcs[arcUpto]; - if (arc.label != scratchArc.label() - || arc.output.equals(scratchArc.output()) == false - || ((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target() - || arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) == false - || arc.isFinal != scratchArc.isFinal()) { - return false; - } - - if (scratchArc.isLast()) { - if (arcUpto == node.numArcs - 1) { - return true; - } else { - return false; - } - } - - fstCompiler.fst.readNextRealArc(scratchArc, in); - } - - // unfrozen node has fewer arcs than frozen node - - return false; - } - /** Inner class because it needs access to hash function and FST bytes. */ private class PagedGrowableHash { - private PagedGrowableWriter entries; + // storing the FST node address where the position is the masked hash of the node arcs + private PagedGrowableWriter fstNodeAddress; + // storing the local copiedNodes address in the same position as fstNodeAddress + // here we are effectively storing a Map from the FST node address to copiedNodes + // address + private PagedGrowableWriter copiedNodeAddress; private long count; private long mask; + // storing the byte slice from the FST for nodes we added to the hash so that we don't need to + // look up from the FST itself, so the FST bytes can stream directly to disk as append-only + // writes. + // each node is written immediately after the previous one + private final ByteBlockPool copiedNodes; + // the {@link FST.BytesReader} to read from copiedNodes. we use this when computing a frozen + // node's hash, or when comparing whether a frozen and an unfrozen node are equal + private final ByteBlockPoolReverseBytesReader bytesReader; // 256K blocks, but note that the final block is sized only as needed so it won't use the full // block size when just a few elements were written to it private static final int BLOCK_SIZE_BYTES = 1 << 18; public PagedGrowableHash() { - entries = new PagedGrowableWriter(16, BLOCK_SIZE_BYTES, 8, PackedInts.COMPACT); + fstNodeAddress = new PagedGrowableWriter(16, BLOCK_SIZE_BYTES, 8, PackedInts.COMPACT); + copiedNodeAddress = new PagedGrowableWriter(16, BLOCK_SIZE_BYTES, 8, PackedInts.COMPACT); mask = 15; + copiedNodes = new ByteBlockPool(new ByteBlockPool.DirectAllocator()); + bytesReader = new ByteBlockPoolReverseBytesReader(copiedNodes); } public PagedGrowableHash(long lastNodeAddress, long size) { - entries = + fstNodeAddress = new PagedGrowableWriter( size, BLOCK_SIZE_BYTES, PackedInts.bitsRequired(lastNodeAddress), PackedInts.COMPACT); + copiedNodeAddress = new PagedGrowableWriter(size, BLOCK_SIZE_BYTES, 8, PackedInts.COMPACT); mask = size - 1; assert (mask & size) == 0 : "size must be a power-of-2; got size=" + size + " mask=" + mask; + copiedNodes = new ByteBlockPool(new ByteBlockPool.DirectAllocator()); + bytesReader = new ByteBlockPoolReverseBytesReader(copiedNodes); } - public long get(long index) { - return entries.get(index); + /** + * Get the copied bytes at the provided hash slot + * + * @param hashSlot the hash slot to read from + * @param length the number of bytes to read + * @return the copied byte array + */ + public byte[] getBytes(long hashSlot, int length) { + long address = copiedNodeAddress.get(hashSlot); + assert address - length + 1 >= 0; + byte[] buf = new byte[length]; + copiedNodes.readBytes(address - length + 1, buf, 0, length); + return buf; } - public void set(long index, long pointer) throws IOException { - entries.set(index, pointer); + /** + * Get the node address from the provided hash slot + * + * @param hashSlot the hash slot to read + * @return the node address + */ + public long getNodeAddress(long hashSlot) { + return fstNodeAddress.get(hashSlot); + } + + /** + * Set the node address and copied bytes at the provided hash slot + * + * @param hashSlot the hash slot to write to + * @param nodeAddress the node address + * @param bytes the node bytes to be copied + */ + public void setNode(long hashSlot, long nodeAddress, byte[] bytes) { + assert fstNodeAddress.get(hashSlot) == 0; + fstNodeAddress.set(hashSlot, nodeAddress); count++; + + copiedNodes.append(bytes); + // write the offset, which points to the last byte of the node we copied since we later read + // this node in reverse + assert copiedNodeAddress.get(hashSlot) == 0; + copiedNodeAddress.set(hashSlot, copiedNodes.getPosition() - 1); } private void rehash(long lastNodeAddress) throws IOException { + // TODO: https://github.com/apache/lucene/issues/12744 + // should we always use a small startBitsPerValue here (e.g. 8) instead of basing it off of + // lastNodeAddress?
+ + // double hash table size on each rehash + long newSize = 2 * fstNodeAddress.size(); + PagedGrowableWriter newCopiedNodeAddress = new PagedGrowableWriter( - 2 * entries.size(), + newSize, + BLOCK_SIZE_BYTES, + PackedInts.bitsRequired(copiedNodes.getPosition()), + PackedInts.COMPACT); + PagedGrowableWriter newFSTNodeAddress = + new PagedGrowableWriter( + newSize, BLOCK_SIZE_BYTES, PackedInts.bitsRequired(lastNodeAddress), PackedInts.COMPACT); - long newMask = newEntries.size() - 1; - for (long idx = 0; idx < entries.size(); idx++) { - long address = entries.get(idx); + long newMask = newFSTNodeAddress.size() - 1; + for (long idx = 0; idx < fstNodeAddress.size(); idx++) { + long address = fstNodeAddress.get(idx); if (address != 0) { - long pos = hash(address) & newMask; + long hashSlot = hash(address, idx) & newMask; int c = 0; while (true) { - if (newEntries.get(pos) == 0) { - newEntries.set(pos, address); + if (newFSTNodeAddress.get(hashSlot) == 0) { + newFSTNodeAddress.set(hashSlot, address); + newCopiedNodeAddress.set(hashSlot, copiedNodeAddress.get(idx)); break; } // quadratic probe - pos = (pos + (++c)) & newMask; + hashSlot = (hashSlot + (++c)) & newMask; } } } mask = newMask; - entries = newEntries; + fstNodeAddress = newFSTNodeAddress; + copiedNodeAddress = newCopiedNodeAddress; + } + + // hash code for a frozen node. this must precisely match the hash computation of an unfrozen + // node! + private long hash(long nodeAddress, long hashSlot) throws IOException { + FST.BytesReader in = getBytesReader(nodeAddress, hashSlot); + + final int PRIME = 31; + + long h = 0; + fstCompiler.fst.readFirstRealTargetArc(nodeAddress, scratchArc, in); + while (true) { + h = PRIME * h + scratchArc.label(); + h = PRIME * h + (int) (scratchArc.target() ^ (scratchArc.target() >> 32)); + h = PRIME * h + scratchArc.output().hashCode(); + h = PRIME * h + scratchArc.nextFinalOutput().hashCode(); + if (scratchArc.isFinal()) { + h += 17; + } + if (scratchArc.isLast()) { + break; + } + fstCompiler.fst.readNextRealArc(scratchArc, in); + } + + return h; + } + + /** + * Compares an unfrozen node (UnCompiledNode) with a frozen node at byte location address + * (long), returning the node length if the two nodes are equal, or -1 otherwise + * + * <p>
The node length will be used to promote the node from the fallback table to the primary + * table + */ + private int nodesEqual(FSTCompiler.UnCompiledNode node, long address, long hashSlot) + throws IOException { + FST.BytesReader in = getBytesReader(address, hashSlot); + fstCompiler.fst.readFirstRealTargetArc(address, scratchArc, in); + + // fail fast for a node with fixed length arcs + if (scratchArc.bytesPerArc() != 0) { + assert node.numArcs > 0; + // the frozen node uses fixed-width arc encoding (same number of bytes per arc), but may be + // sparse or dense + switch (scratchArc.nodeFlags()) { + case FST.ARCS_FOR_BINARY_SEARCH: + // sparse + if (node.numArcs != scratchArc.numArcs()) { + return -1; + } + break; + case FST.ARCS_FOR_DIRECT_ADDRESSING: + // dense -- compare both the number of labels allocated in the array (some of which may + // not actually be arcs), and the number of arcs + if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != scratchArc.numArcs() + || node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) { + return -1; + } + break; + default: + throw new AssertionError("unhandled scratchArc.nodeFlags() " + scratchArc.nodeFlags()); + } + } + + // compare arc by arc to see if there is a difference + for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) { + final FSTCompiler.Arc arc = node.arcs[arcUpto]; + if (arc.label != scratchArc.label() + || arc.output.equals(scratchArc.output()) == false + || ((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target() + || arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) == false + || arc.isFinal != scratchArc.isFinal()) { + return -1; + } + + if (scratchArc.isLast()) { + if (arcUpto == node.numArcs - 1) { + // position is 1 index past the starting address, as we are reading backward + return Math.toIntExact(address - in.getPosition()); + } else { + return -1; + } + } + + fstCompiler.fst.readNextRealArc(scratchArc, in); + } + + // unfrozen node has fewer arcs than frozen node + + return -1; + } + + private FST.BytesReader getBytesReader(long nodeAddress, long hashSlot) { + // make sure the nodeAddress and hashSlot are consistent + assert fstNodeAddress.get(hashSlot) == nodeAddress; + long localAddress = copiedNodeAddress.get(hashSlot); + bytesReader.setPosDelta(nodeAddress - localAddress); + return bytesReader; + } } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java b/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java index b242f004096..c7c4e80872d 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java @@ -79,6 +79,7 @@ public class TestByteBlockPool extends LuceneTestCase { ByteBlockPool pool = new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(bytesUsed)); pool.nextBuffer(); + long totalBytes = 0; List items = new ArrayList<>(); for (int i = 0; i < 100; i++) { int size; @@ -91,6 +92,10 @@ public class TestByteBlockPool extends LuceneTestCase { random().nextBytes(bytes); items.add(bytes); pool.append(new BytesRef(bytes)); + totalBytes += size; + + // make sure we report the correct position + assertEquals(totalBytes, pool.getPosition()); } long position = 0;
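Taken together, the ByteBlockPool changes give the pool a simple absolute-offset API: append(byte[], int, int) spills across the fixed-size blocks, getPosition() reports the total number of bytes written, and readByte(long) maps an absolute offset back to a (block, offset) pair via BYTE_BLOCK_SHIFT and BYTE_BLOCK_MASK. A minimal sketch of how they compose; this is illustrative only, not part of the patch, and the class name is made up:

import org.apache.lucene.util.ByteBlockPool;

public class ByteBlockPoolSketch {
  public static void main(String[] args) {
    ByteBlockPool pool = new ByteBlockPool(new ByteBlockPool.DirectAllocator());
    pool.nextBuffer(); // the pool starts with no buffer; allocate the first 32 KB block

    byte[] payload = new byte[70_000]; // spans three 32 KB blocks, forcing two spills
    for (int i = 0; i < payload.length; i++) {
      payload[i] = (byte) i;
    }

    long start = pool.getPosition(); // absolute offset before the write (0 here)
    pool.append(payload);
    if (pool.getPosition() != start + payload.length) {
      throw new AssertionError("position must advance by exactly the appended length");
    }

    // readByte resolves absolute offsets transparently across block boundaries
    for (long off = start; off < start + payload.length; off++) {
      if (pool.readByte(off) != payload[(int) (off - start)]) {
        throw new AssertionError("mismatch at offset " + off);
      }
    }
  }
}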
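The less obvious convention in PagedGrowableHash is that copiedNodeAddress records the address of the last byte of each copied node, because frozen nodes are only ever read in reverse. A short trace with made-up slot names and sizes:

// Trace of setNode()/getBytes() against an initially empty copiedNodes pool:
//
//   setNode(slotA, addrA, bytesA)   where bytesA.length == 12
//     -> copiedNodes.getPosition() == 12, copiedNodeAddress[slotA] == 11 (last byte of A)
//   setNode(slotB, addrB, bytesB)   where bytesB.length == 20
//     -> copiedNodes.getPosition() == 32, copiedNodeAddress[slotB] == 31 (last byte of B)
//
//   getBytes(slotB, 20) then reads offsets 31 - 20 + 1 = 12 through 31, i.e. exactly bytesB.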
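That last-byte convention is what ByteBlockPoolReverseBytesReader's posDelta builds on: NodeHash keeps handing out FST node addresses, while the bytes actually live in the private copiedNodes pool at smaller offsets. A worked example, assuming it is compiled into org.apache.lucene.util.fst since the reader is package-private; the class name and the concrete numbers are made up:

package org.apache.lucene.util.fst;

import org.apache.lucene.util.ByteBlockPool;

class ReverseReaderSketch {
  static void demo() {
    ByteBlockPool pool = new ByteBlockPool(new ByteBlockPool.DirectAllocator());
    pool.nextBuffer();

    // pretend a 20-byte node frozen at FST addresses 1000..1019 was copied into the pool,
    // so its copy occupies local offsets 0..19
    byte[] frozenNode = new byte[20];
    for (int i = 0; i < frozenNode.length; i++) {
      frozenNode[i] = (byte) (100 + i);
    }
    pool.append(frozenNode);

    long nodeAddress = 1019; // FST address of the node's last byte
    long localAddress = pool.getPosition() - 1; // 19, as stored by setNode()

    ByteBlockPoolReverseBytesReader in = new ByteBlockPoolReverseBytesReader(pool);
    in.setPosDelta(nodeAddress - localAddress); // 1000: translates FST space to pool space

    in.setPosition(nodeAddress); // callers think in FST addresses...
    byte b = in.readByte(); // ...but this reads pool offset 19, i.e. (byte) 119
    if (b != (byte) 119 || in.getPosition() != nodeAddress - 1) {
      throw new AssertionError("posDelta translation is off");
    }
  }
}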
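For intuition about the new RAM-limit arithmetic in add(): the x2 occupancy factors keep the estimate cheap to compute while tracking all three structures. A worked example with made-up numbers, not taken from a real benchmark:

// count = 1,000,000 table entries; nodeAddress ~ 500M -> PackedInts.bitsRequired == 29;
// copiedBytes = 64 MB (1L << 26) -> PackedInts.bitsRequired == 27
long count = 1_000_000;
long copiedBytes = 1L << 26;
long ramBytesUsed =
    count * 2 * 29 / 8 // fstNodeAddress:    ~7.25 MB (x2 for ~50% occupancy)
        + count * 2 * 27 / 8 // copiedNodeAddress: ~6.75 MB (x2 for ~50% occupancy)
        + copiedBytes; // copied node bytes:  64 MB, so ~78 MB total
// with suffixRAMLimitMB = 200 the swap threshold is ramLimitBytes / 2 = 100 MB, so the
// primary table would not yet be demoted to the fallback table.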