mirror of https://github.com/apache/lucene.git
LUCENE-8635: add option to move FSTs off-heap, and do so for the FST terms index in the default codec for non-primary-key fields if MMapDirectory is being used
This commit is contained in:
parent
6f61bdea0e
commit
ec801b4c54
|
@ -189,6 +189,12 @@ Changes in Runtime Behavior
|
|||
is instead calculated as a function of the sloppy frequency of the matching
|
||||
intervals. (Alan Woodward, Jim Ferenczi)
|
||||
|
||||
* LUCENE-8635: FSTs can now remain off-heap, accessed via
|
||||
IndexInput, and the default codec's term dictionary
|
||||
(BlockTreeTermsReader) will now leave the FST for the terms index
|
||||
off-heap for non-primary-key fields using MMapDirectory, reducing
|
||||
heap usage for such fields. (Ankit Jain)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-8340: LongPoint#newDistanceFeatureQuery may be used to boost scores based on
|
||||
|
|
|
@ -102,6 +102,8 @@ public final class BlockTreeTermsReader extends FieldsProducer {
|
|||
|
||||
// Open input to the main terms dict file (_X.tib)
|
||||
final IndexInput termsIn;
|
||||
// Open input to the terms index file (_X.tip)
|
||||
final IndexInput indexIn;
|
||||
|
||||
//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
|
||||
|
||||
|
@ -118,7 +120,6 @@ public final class BlockTreeTermsReader extends FieldsProducer {
|
|||
/** Sole constructor. */
|
||||
public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state) throws IOException {
|
||||
boolean success = false;
|
||||
IndexInput indexIn = null;
|
||||
|
||||
this.postingsReader = postingsReader;
|
||||
this.segment = state.segmentInfo.name;
|
||||
|
@ -197,13 +198,11 @@ public final class BlockTreeTermsReader extends FieldsProducer {
|
|||
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
|
||||
}
|
||||
}
|
||||
|
||||
indexIn.close();
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
// this.close() will close in:
|
||||
IOUtils.closeWhileHandlingException(indexIn, this);
|
||||
IOUtils.closeWhileHandlingException(this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -237,7 +236,7 @@ public final class BlockTreeTermsReader extends FieldsProducer {
|
|||
@Override
|
||||
public void close() throws IOException {
|
||||
try {
|
||||
IOUtils.close(termsIn, postingsReader);
|
||||
IOUtils.close(indexIn, termsIn, postingsReader);
|
||||
} finally {
|
||||
// Clear so refs to terms index is GCable even if
|
||||
// app hangs onto us:
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.index.IndexOptions;
|
|||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ByteBufferIndexInput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.Accountables;
|
||||
|
@ -34,6 +35,7 @@ import org.apache.lucene.util.RamUsageEstimator;
|
|||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.OffHeapFSTStore;
|
||||
|
||||
/**
|
||||
* BlockTree's implementation of {@link Terms}.
|
||||
|
@ -86,9 +88,14 @@ public final class FieldReader extends Terms implements Accountable {
|
|||
|
||||
if (indexIn != null) {
|
||||
final IndexInput clone = indexIn.clone();
|
||||
//System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
|
||||
clone.seek(indexStartFP);
|
||||
index = new FST<>(clone, ByteSequenceOutputs.getSingleton());
|
||||
// Initialize FST offheap if index is MMapDirectory and
|
||||
// docCount != sumDocFreq implying field is not primary key
|
||||
if (clone instanceof ByteBufferIndexInput && this.docCount != this.sumDocFreq) {
|
||||
index = new FST<>(clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
|
||||
} else {
|
||||
index = new FST<>(clone, ByteSequenceOutputs.getSingleton());
|
||||
}
|
||||
|
||||
/*
|
||||
if (false) {
|
||||
|
|
|
@ -33,7 +33,7 @@ import java.nio.ByteBuffer;
|
|||
* For efficiency, this class requires that the buffers
|
||||
* are a power-of-two (<code>chunkSizePower</code>).
|
||||
*/
|
||||
abstract class ByteBufferIndexInput extends IndexInput implements RandomAccessInput {
|
||||
public abstract class ByteBufferIndexInput extends IndexInput implements RandomAccessInput {
|
||||
protected final long length;
|
||||
protected final long chunkSizeMask;
|
||||
protected final int chunkSizePower;
|
||||
|
|
|
@ -127,8 +127,7 @@ public final class FST<T> implements Accountable {
|
|||
* GB then bytesArray is set instead. */
|
||||
final BytesStore bytes;
|
||||
|
||||
/** Used at read time when the FST fits into a single byte[]. */
|
||||
final byte[] bytesArray;
|
||||
private final FSTStore fstStore;
|
||||
|
||||
private long startNode = -1;
|
||||
|
||||
|
@ -238,7 +237,7 @@ public final class FST<T> implements Accountable {
|
|||
this.inputType = inputType;
|
||||
this.outputs = outputs;
|
||||
version = VERSION_CURRENT;
|
||||
bytesArray = null;
|
||||
fstStore = null;
|
||||
bytes = new BytesStore(bytesPageBits);
|
||||
// pad: ensure no node gets address 0 which is reserved to mean
|
||||
// the stop state w/ no arcs
|
||||
|
@ -251,18 +250,16 @@ public final class FST<T> implements Accountable {
|
|||
|
||||
/** Load a previously saved FST. */
|
||||
public FST(DataInput in, Outputs<T> outputs) throws IOException {
|
||||
this(in, outputs, DEFAULT_MAX_BLOCK_BITS);
|
||||
this(in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS));
|
||||
}
|
||||
|
||||
/** Load a previously saved FST; maxBlockBits allows you to
|
||||
* control the size of the byte[] pages used to hold the FST bytes. */
|
||||
public FST(DataInput in, Outputs<T> outputs, int maxBlockBits) throws IOException {
|
||||
public FST(DataInput in, Outputs<T> outputs, FSTStore fstStore) throws IOException {
|
||||
bytes = null;
|
||||
this.fstStore = fstStore;
|
||||
this.outputs = outputs;
|
||||
|
||||
if (maxBlockBits < 1 || maxBlockBits > 30) {
|
||||
throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
|
||||
}
|
||||
|
||||
// NOTE: only reads most recent format; we don't have
|
||||
// back-compat promise for FSTs (they are experimental):
|
||||
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT);
|
||||
|
@ -302,17 +299,7 @@ public final class FST<T> implements Accountable {
|
|||
startNode = in.readVLong();
|
||||
|
||||
long numBytes = in.readVLong();
|
||||
if (numBytes > 1 << maxBlockBits) {
|
||||
// FST is big: we need multiple pages
|
||||
bytes = new BytesStore(in, numBytes, 1<<maxBlockBits);
|
||||
bytesArray = null;
|
||||
} else {
|
||||
// FST fits into a single block: use ByteArrayBytesStoreReader for less overhead
|
||||
bytes = null;
|
||||
bytesArray = new byte[(int) numBytes];
|
||||
in.readBytes(bytesArray, 0, bytesArray.length);
|
||||
}
|
||||
|
||||
this.fstStore.init(in, numBytes);
|
||||
cacheRootArcs();
|
||||
}
|
||||
|
||||
|
@ -344,11 +331,12 @@ public final class FST<T> implements Accountable {
|
|||
@Override
|
||||
public long ramBytesUsed() {
|
||||
long size = BASE_RAM_BYTES_USED;
|
||||
if (bytesArray != null) {
|
||||
size += bytesArray.length;
|
||||
if (this.fstStore != null) {
|
||||
size += this.fstStore.ramBytesUsed();
|
||||
} else {
|
||||
size += bytes.ramBytesUsed();
|
||||
}
|
||||
|
||||
size += cachedArcsBytesUsed;
|
||||
return size;
|
||||
}
|
||||
|
@ -465,9 +453,8 @@ public final class FST<T> implements Accountable {
|
|||
out.writeVLong(numBytes);
|
||||
bytes.writeTo(out);
|
||||
} else {
|
||||
assert bytesArray != null;
|
||||
out.writeVLong(bytesArray.length);
|
||||
out.writeBytes(bytesArray, 0, bytesArray.length);
|
||||
assert fstStore != null;
|
||||
fstStore.writeTo(out);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1137,8 +1124,8 @@ public final class FST<T> implements Accountable {
|
|||
/** Returns a {@link BytesReader} for this FST, positioned at
|
||||
* position 0. */
|
||||
public BytesReader getBytesReader() {
|
||||
if (bytesArray != null) {
|
||||
return new ReverseBytesReader(bytesArray);
|
||||
if (this.fstStore != null) {
|
||||
return this.fstStore.getReverseBytesReader();
|
||||
} else {
|
||||
return bytes.getReverseReader();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.util.fst;
|
||||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/** Abstraction for reading/writing bytes necessary for FST. */
|
||||
public interface FSTStore extends Accountable {
|
||||
void init(DataInput in, long numBytes) throws IOException;
|
||||
FST.BytesReader getReverseBytesReader();
|
||||
void writeTo(DataOutput out) throws IOException;
|
||||
}
|
|
@ -0,0 +1,69 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.util.fst;
|
||||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/** Provides off heap storage of finite state machine (FST),
|
||||
* using underlying index input instead of byte store on heap
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class OffHeapFSTStore implements FSTStore {
|
||||
|
||||
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OffHeapFSTStore.class);
|
||||
|
||||
private IndexInput in;
|
||||
private long offset;
|
||||
private long numBytes;
|
||||
|
||||
@Override
|
||||
public void init(DataInput in, long numBytes) throws IOException {
|
||||
if (in instanceof IndexInput) {
|
||||
this.in = (IndexInput) in;
|
||||
this.numBytes = numBytes;
|
||||
this.offset = this.in.getFilePointer();
|
||||
} else {
|
||||
throw new IllegalArgumentException("parameter:in should be an instance of IndexInput for using OffHeapFSTStore, not a "
|
||||
+ in.getClass().getName());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
return BASE_RAM_BYTES_USED;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FST.BytesReader getReverseBytesReader() {
|
||||
try {
|
||||
return new ReverseRandomAccessReader(in.randomAccessSlice(offset, numBytes));
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(DataOutput out) throws IOException {
|
||||
throw new UnsupportedOperationException("writeToOutput operation is not supported for OffHeapFSTStore");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.util.fst;
|
||||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/** Provides storage of finite state machine (FST),
|
||||
* using byte array or byte store allocated on heap.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class OnHeapFSTStore implements FSTStore {
|
||||
|
||||
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OnHeapFSTStore.class);
|
||||
|
||||
/** A {@link BytesStore}, used during building, or during reading when
|
||||
* the FST is very large (more than 1 GB). If the FST is less than 1
|
||||
* GB then bytesArray is set instead. */
|
||||
private BytesStore bytes;
|
||||
|
||||
/** Used at read time when the FST fits into a single byte[]. */
|
||||
private byte[] bytesArray;
|
||||
|
||||
private final int maxBlockBits;
|
||||
|
||||
public OnHeapFSTStore(int maxBlockBits) {
|
||||
if (maxBlockBits < 1 || maxBlockBits > 30) {
|
||||
throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
|
||||
}
|
||||
|
||||
this.maxBlockBits = maxBlockBits;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void init(DataInput in, long numBytes) throws IOException {
|
||||
if (numBytes > 1 << this.maxBlockBits) {
|
||||
// FST is big: we need multiple pages
|
||||
bytes = new BytesStore(in, numBytes, 1<<this.maxBlockBits);
|
||||
} else {
|
||||
// FST fits into a single block: use ByteArrayBytesStoreReader for less overhead
|
||||
bytesArray = new byte[(int) numBytes];
|
||||
in.readBytes(bytesArray, 0, bytesArray.length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
long size = BASE_RAM_BYTES_USED;
|
||||
if (bytesArray != null) {
|
||||
size += bytesArray.length;
|
||||
} else {
|
||||
size += bytes.ramBytesUsed();
|
||||
}
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FST.BytesReader getReverseBytesReader() {
|
||||
if (bytesArray != null) {
|
||||
return new ReverseBytesReader(bytesArray);
|
||||
} else {
|
||||
return bytes.getReverseReader();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(DataOutput out) throws IOException {
|
||||
if (bytes != null) {
|
||||
long numBytes = bytes.getPosition();
|
||||
out.writeVLong(numBytes);
|
||||
bytes.writeTo(out);
|
||||
} else {
|
||||
assert bytesArray != null;
|
||||
out.writeVLong(bytesArray.length);
|
||||
out.writeBytes(bytesArray, 0, bytesArray.length);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.util.fst;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.RandomAccessInput;
|
||||
|
||||
/** Implements reverse read from a RandomAccessInput. */
|
||||
final class ReverseRandomAccessReader extends FST.BytesReader {
|
||||
private final RandomAccessInput in;
|
||||
private long pos;
|
||||
|
||||
public ReverseRandomAccessReader(RandomAccessInput in) {
|
||||
this.in = in;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte readByte() throws IOException {
|
||||
return in.readByte(pos--);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readBytes(byte[] b, int offset, int len) throws IOException {
|
||||
int i = offset, end = offset + len;
|
||||
while (i < end) {
|
||||
b[i++] = in.readByte(pos--);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void skipBytes(long count) {
|
||||
pos -= count;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getPosition() {
|
||||
return pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setPosition(long pos) {
|
||||
this.pos = pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean reversed() {
|
||||
return true;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue