diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a3e7cb8ed27..cfe00abdf2e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -62,6 +62,9 @@ API Changes * GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera) +* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. Create FSTReader which contains the common methods + of the two (Anh Dung Bui) + New Features --------------------- @@ -150,7 +153,8 @@ API Changes * GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera) -* GITHUB#12646: Move FST#addNode to FSTCompiler to avoid a circular dependency between FST and FSTCompiler +* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency + between FST and FSTCompiler (Anh Dung Bui) New Features --------------------- @@ -200,7 +204,7 @@ Optimizations * GITHUB#12382: Faster top-level conjunctions on term queries when sorting by descending score. (Adrien Grand) - + * GITHUB#12591: Use stable radix sort to speed up the sorting of update terms. (Guo Feng) * GITHUB#12587: Use radix sort to speed up the sorting of terms in TermInSetQuery. (Guo Feng) @@ -215,7 +219,7 @@ Optimizations * GITHUB#12668: ImpactsEnums now decode frequencies lazily like PostingsEnums. (Adrien Grand) - + * GITHUB#12651: Use 2d array for OnHeapHnswGraph representation. (Patrick Zhai) * GITHUB#12653: Optimize computing number of levels in MultiLevelSkipListWriter#bufferSkip. 
(Shubham Chaudhary) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java index db9e5cfce10..469454d3d52 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java @@ -21,13 +21,12 @@ import java.util.ArrayList; import java.util.List; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; -import org.apache.lucene.util.Accountable; import org.apache.lucene.util.RamUsageEstimator; // TODO: merge with PagedBytes, except PagedBytes doesn't // let you read while writing which FST needs -class BytesStore extends DataOutput implements Accountable { +class BytesStore extends DataOutput implements FSTReader { private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(BytesStore.class) @@ -333,6 +332,11 @@ class BytesStore extends DataOutput implements Accountable { return ((long) blocks.size() - 1) * blockSize + nextWrite; } + @Override + public long size() { + return getPosition(); + } + /** * Pos must be less than the max position written so far! Ie, you cannot "grow" the file with * this! @@ -365,6 +369,7 @@ class BytesStore extends DataOutput implements Accountable { } /** Writes all of our bytes to the target {@link DataOutput}. 
*/ + @Override public void writeTo(DataOutput out) throws IOException { for (byte[] block : blocks) { out.writeBytes(block, 0, block.length); @@ -437,7 +442,8 @@ class BytesStore extends DataOutput implements Accountable { }; } - public FST.BytesReader getReverseReader() { + @Override + public FST.BytesReader getReverseBytesReader() { return getReverseReader(true); } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 7636f819959..7769cfe82b7 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -123,9 +123,7 @@ public final class FST implements Accountable { * A {@link BytesStore}, used during building, or during reading when the FST is very large (more * than 1 GB). If the FST is less than 1 GB then bytesArray is set instead. */ - final BytesStore bytes; - - private final FSTStore fstStore; + private final FSTReader fstReader; private long startNode = -1; @@ -398,15 +396,11 @@ public final class FST implements Accountable { } // make a new empty FST, for building; Builder invokes this - FST(INPUT_TYPE inputType, Outputs outputs, int bytesPageBits) { + FST(INPUT_TYPE inputType, Outputs outputs, FSTReader fstReader) { this.inputType = inputType; this.outputs = outputs; - fstStore = null; - bytes = new BytesStore(bytesPageBits); - // pad: ensure no node gets address 0 which is reserved to mean - // the stop state w/ no arcs - bytes.writeByte((byte) 0); emptyOutput = null; + this.fstReader = fstReader; this.version = VERSION_CURRENT; } @@ -423,8 +417,6 @@ public final class FST implements Accountable { */ public FST(DataInput metaIn, DataInput in, Outputs outputs, FSTStore fstStore) throws IOException { - bytes = null; - this.fstStore = fstStore; this.outputs = outputs; // NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have @@ -438,7 +430,7 @@ public final class FST implements 
Accountable { emptyBytes.copyBytes(metaIn, numBytes); // De-serialize empty-string output: - BytesReader reader = emptyBytes.getReverseReader(); + BytesReader reader = emptyBytes.getReverseBytesReader(); // NoOutputs uses 0 bytes when writing its output, // so we have to check here else BytesStore gets // angry: @@ -466,19 +458,13 @@ public final class FST implements Accountable { startNode = metaIn.readVLong(); long numBytes = metaIn.readVLong(); - this.fstStore.init(in, numBytes); + fstStore.init(in, numBytes); + this.fstReader = fstStore; } @Override public long ramBytesUsed() { - long size = BASE_RAM_BYTES_USED; - if (this.fstStore != null) { - size += this.fstStore.ramBytesUsed(); - } else { - size += bytes.ramBytesUsed(); - } - - return size; + return BASE_RAM_BYTES_USED + fstReader.ramBytesUsed(); } @Override @@ -487,7 +473,7 @@ public final class FST implements Accountable { } void finish(long newStartNode) throws IOException { - assert newStartNode <= bytes.getPosition(); + assert newStartNode <= fstReader.size(); if (startNode != -1) { throw new IllegalStateException("already finished"); } @@ -495,11 +481,10 @@ public final class FST implements Accountable { newStartNode = 0; } startNode = newStartNode; - bytes.finish(); } public long numBytes() { - return bytes.getPosition(); + return fstReader.size(); } public T getEmptyOutput() { @@ -555,16 +540,8 @@ public final class FST implements Accountable { } metaOut.writeByte(t); metaOut.writeVLong(startNode); - if (bytes != null) { - long numBytes = bytes.getPosition(); - metaOut.writeVLong(numBytes); - bytes.writeTo(out); - } else { - assert fstStore != null; - long numBytes = fstStore.size(); - metaOut.writeVLong(numBytes); - fstStore.writeTo(out); - } + metaOut.writeVLong(numBytes()); + fstReader.writeTo(out); } /** Writes an automaton to a file. */ @@ -1141,11 +1118,7 @@ public final class FST implements Accountable { /** Returns a {@link BytesReader} for this FST, positioned at position 0. 
*/ public BytesReader getBytesReader() { - if (this.fstStore != null) { - return this.fstStore.getReverseBytesReader(); - } else { - return bytes.getReverseReader(); - } + return fstReader.getReverseBytesReader(); } /** Reads bytes stored in an FST. */ diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 13f873562a5..f17c220f83d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -137,9 +137,11 @@ public class FSTCompiler { float directAddressingMaxOversizingFactor) { this.allowFixedLengthArcs = allowFixedLengthArcs; this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor; - fst = new FST<>(inputType, outputs, bytesPageBits); - bytes = fst.bytes; - assert bytes != null; + bytes = new BytesStore(bytesPageBits); + // pad: ensure no node gets address 0 which is reserved to mean + // the stop state w/ no arcs + bytes.writeByte((byte) 0); + fst = new FST<>(inputType, outputs, bytes); if (suffixRAMLimitMB < 0) { throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB); } else if (suffixRAMLimitMB > 0) { @@ -317,8 +319,6 @@ public class FSTCompiler { // serializes new node by appending its bytes to the end // of the current byte[] long addNode(FSTCompiler.UnCompiledNode nodeIn) throws IOException { - T NO_OUTPUT = fst.outputs.getNoOutput(); - // System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs); if (nodeIn.numArcs == 0) { if (nodeIn.isFinal) { @@ -859,6 +859,7 @@ public class FSTCompiler { // if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " // root.output=" + root.output); fst.finish(compileNode(root, lastInput.length()).node); + bytes.finish(); return fst; } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTReader.java 
b/lucene/core/src/java/org/apache/lucene/util/fst/FSTReader.java new file mode 100644 index 00000000000..f299fcb7558 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTReader.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.Accountable; + +/** Abstraction for reading bytes necessary for FST. 
*/ +public interface FSTReader extends Accountable { + + /** + * The raw size in bytes of the FST + * + * @return the FST size + */ + long size(); + + /** + * Get the reverse BytesReader for this FST + * + * @return the reverse BytesReader + */ + FST.BytesReader getReverseBytesReader(); + + /** + * Write this FST to another DataOutput + * + * @param out the DataOutput + * @throws IOException if an exception occurs during writing + */ + void writeTo(DataOutput out) throws IOException; +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java index c9caeebbcaf..f50623ff886 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java @@ -18,16 +18,8 @@ package org.apache.lucene.util.fst; import java.io.IOException; import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.DataOutput; -import org.apache.lucene.util.Accountable; -/** Abstraction for reading/writing bytes necessary for FST. 
*/ -public interface FSTStore extends Accountable { +/** A type of {@link FSTReader} which needs data to be initialized before use */ +public interface FSTStore extends FSTReader { void init(DataInput in, long numBytes) throws IOException; - - long size(); - - FST.BytesReader getReverseBytesReader(); - - void writeTo(DataOutput out) throws IOException; } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java index df8b59d0380..4cf441d19a5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java @@ -88,7 +88,7 @@ public final class OnHeapFSTStore implements FSTStore { if (bytesArray != null) { return new ReverseBytesReader(bytesArray); } else { - return bytes.getReverseReader(); + return bytes.getReverseBytesReader(); } } diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java index 3b644309c0a..001e2b092cb 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java @@ -271,7 +271,7 @@ public class TestBytesStore extends LuceneTestCase { System.out.println(" bulk: reversed"); } // reversed - FST.BytesReader r = bytes.getReverseReader(); + FST.BytesReader r = bytes.getReverseBytesReader(); assertTrue(r.reversed()); r.setPosition(totalLength - 1); r.readBytes(actual, 0, actual.length); @@ -306,7 +306,7 @@ public class TestBytesStore extends LuceneTestCase { if (VERBOSE) { System.out.println(" ops: reversed"); } - r = bytes.getReverseReader(); + r = bytes.getReverseBytesReader(); } else { if (VERBOSE) { System.out.println(" ops: forward");