LUCENE-8635: add option to move FSTs off-heap, and do so for the FST terms index in the default codec for non-primary-key fields if MMapDirectory is being used

This commit is contained in:
Mike McCandless 2019-02-19 12:52:22 -05:00
parent 6f61bdea0e
commit ec801b4c54
9 changed files with 293 additions and 35 deletions

View File

@ -189,6 +189,12 @@ Changes in Runtime Behavior
is instead calculated as a function of the sloppy frequency of the matching
intervals. (Alan Woodward, Jim Ferenczi)
* LUCENE-8635: FSTs can now remain off-heap, accessed via
IndexInput, and the default codec's term dictionary
(BlockTreeTermsReader) will now leave the FST for the terms index
off-heap for non-primary-key fields using MMapDirectory, reducing
heap usage for such fields. (Ankit Jain)
New Features
* LUCENE-8340: LongPoint#newDistanceFeatureQuery may be used to boost scores based on

View File

@ -102,6 +102,8 @@ public final class BlockTreeTermsReader extends FieldsProducer {
// Open input to the main terms dict file (_X.tib)
final IndexInput termsIn;
// Open input to the terms index file (_X.tip)
final IndexInput indexIn;
//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
@ -118,7 +120,6 @@ public final class BlockTreeTermsReader extends FieldsProducer {
/** Sole constructor. */
public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state) throws IOException {
boolean success = false;
IndexInput indexIn = null;
this.postingsReader = postingsReader;
this.segment = state.segmentInfo.name;
@ -197,13 +198,11 @@ public final class BlockTreeTermsReader extends FieldsProducer {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
}
}
indexIn.close();
success = true;
} finally {
if (!success) {
// this.close() will close in:
IOUtils.closeWhileHandlingException(indexIn, this);
IOUtils.closeWhileHandlingException(this);
}
}
}
@ -237,7 +236,7 @@ public final class BlockTreeTermsReader extends FieldsProducer {
@Override
public void close() throws IOException {
try {
IOUtils.close(termsIn, postingsReader);
IOUtils.close(indexIn, termsIn, postingsReader);
} finally {
// Clear so refs to terms index is GCable even if
// app hangs onto us:

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteBufferIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
@ -34,6 +35,7 @@ import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.OffHeapFSTStore;
/**
* BlockTree's implementation of {@link Terms}.
@ -86,9 +88,14 @@ public final class FieldReader extends Terms implements Accountable {
if (indexIn != null) {
final IndexInput clone = indexIn.clone();
//System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
clone.seek(indexStartFP);
index = new FST<>(clone, ByteSequenceOutputs.getSingleton());
// Initialize FST offheap if index is MMapDirectory and
// docCount != sumDocFreq implying field is not primary key
if (clone instanceof ByteBufferIndexInput && this.docCount != this.sumDocFreq) {
index = new FST<>(clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
} else {
index = new FST<>(clone, ByteSequenceOutputs.getSingleton());
}
/*
if (false) {

View File

@ -33,7 +33,7 @@ import java.nio.ByteBuffer;
* For efficiency, this class requires that the buffers
* are a power-of-two (<code>chunkSizePower</code>).
*/
abstract class ByteBufferIndexInput extends IndexInput implements RandomAccessInput {
public abstract class ByteBufferIndexInput extends IndexInput implements RandomAccessInput {
protected final long length;
protected final long chunkSizeMask;
protected final int chunkSizePower;

View File

@ -127,8 +127,7 @@ public final class FST<T> implements Accountable {
* GB then bytesArray is set instead. */
final BytesStore bytes;
/** Used at read time when the FST fits into a single byte[]. */
final byte[] bytesArray;
private final FSTStore fstStore;
private long startNode = -1;
@ -238,7 +237,7 @@ public final class FST<T> implements Accountable {
this.inputType = inputType;
this.outputs = outputs;
version = VERSION_CURRENT;
bytesArray = null;
fstStore = null;
bytes = new BytesStore(bytesPageBits);
// pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs
@ -251,18 +250,16 @@ public final class FST<T> implements Accountable {
/** Load a previously saved FST. */
public FST(DataInput in, Outputs<T> outputs) throws IOException {
this(in, outputs, DEFAULT_MAX_BLOCK_BITS);
this(in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS));
}
/** Load a previously saved FST; maxBlockBits allows you to
* control the size of the byte[] pages used to hold the FST bytes. */
public FST(DataInput in, Outputs<T> outputs, int maxBlockBits) throws IOException {
public FST(DataInput in, Outputs<T> outputs, FSTStore fstStore) throws IOException {
bytes = null;
this.fstStore = fstStore;
this.outputs = outputs;
if (maxBlockBits < 1 || maxBlockBits > 30) {
throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
}
// NOTE: only reads most recent format; we don't have
// back-compat promise for FSTs (they are experimental):
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT);
@ -302,17 +299,7 @@ public final class FST<T> implements Accountable {
startNode = in.readVLong();
long numBytes = in.readVLong();
if (numBytes > 1 << maxBlockBits) {
// FST is big: we need multiple pages
bytes = new BytesStore(in, numBytes, 1<<maxBlockBits);
bytesArray = null;
} else {
// FST fits into a single block: use ByteArrayBytesStoreReader for less overhead
bytes = null;
bytesArray = new byte[(int) numBytes];
in.readBytes(bytesArray, 0, bytesArray.length);
}
this.fstStore.init(in, numBytes);
cacheRootArcs();
}
@ -344,11 +331,12 @@ public final class FST<T> implements Accountable {
@Override
public long ramBytesUsed() {
long size = BASE_RAM_BYTES_USED;
if (bytesArray != null) {
size += bytesArray.length;
if (this.fstStore != null) {
size += this.fstStore.ramBytesUsed();
} else {
size += bytes.ramBytesUsed();
}
size += cachedArcsBytesUsed;
return size;
}
@ -465,9 +453,8 @@ public final class FST<T> implements Accountable {
out.writeVLong(numBytes);
bytes.writeTo(out);
} else {
assert bytesArray != null;
out.writeVLong(bytesArray.length);
out.writeBytes(bytesArray, 0, bytesArray.length);
assert fstStore != null;
fstStore.writeTo(out);
}
}
@ -1137,8 +1124,8 @@ public final class FST<T> implements Accountable {
/** Returns a {@link BytesReader} for this FST, positioned at
* position 0. */
public BytesReader getBytesReader() {
if (bytesArray != null) {
return new ReverseBytesReader(bytesArray);
if (this.fstStore != null) {
return this.fstStore.getReverseBytesReader();
} else {
return bytes.getReverseReader();
}

View File

@ -0,0 +1,30 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.Accountable;
import java.io.IOException;
/** Abstraction for reading/writing bytes necessary for FST. */
public interface FSTStore extends Accountable {
void init(DataInput in, long numBytes) throws IOException;
FST.BytesReader getReverseBytesReader();
void writeTo(DataOutput out) throws IOException;
}

View File

@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.RamUsageEstimator;
import java.io.IOException;
/** Provides off heap storage of finite state machine (FST),
* using underlying index input instead of byte store on heap
*
* @lucene.experimental
*/
public final class OffHeapFSTStore implements FSTStore {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OffHeapFSTStore.class);
private IndexInput in;
private long offset;
private long numBytes;
@Override
public void init(DataInput in, long numBytes) throws IOException {
if (in instanceof IndexInput) {
this.in = (IndexInput) in;
this.numBytes = numBytes;
this.offset = this.in.getFilePointer();
} else {
throw new IllegalArgumentException("parameter:in should be an instance of IndexInput for using OffHeapFSTStore, not a "
+ in.getClass().getName());
}
}
@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED;
}
@Override
public FST.BytesReader getReverseBytesReader() {
try {
return new ReverseRandomAccessReader(in.randomAccessSlice(offset, numBytes));
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
public void writeTo(DataOutput out) throws IOException {
throw new UnsupportedOperationException("writeToOutput operation is not supported for OffHeapFSTStore");
}
}

View File

@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.RamUsageEstimator;
import java.io.IOException;
/** Provides storage of finite state machine (FST),
* using byte array or byte store allocated on heap.
*
* @lucene.experimental
*/
public final class OnHeapFSTStore implements FSTStore {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OnHeapFSTStore.class);
/** A {@link BytesStore}, used during building, or during reading when
* the FST is very large (more than 1 GB). If the FST is less than 1
* GB then bytesArray is set instead. */
private BytesStore bytes;
/** Used at read time when the FST fits into a single byte[]. */
private byte[] bytesArray;
private final int maxBlockBits;
public OnHeapFSTStore(int maxBlockBits) {
if (maxBlockBits < 1 || maxBlockBits > 30) {
throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
}
this.maxBlockBits = maxBlockBits;
}
@Override
public void init(DataInput in, long numBytes) throws IOException {
if (numBytes > 1 << this.maxBlockBits) {
// FST is big: we need multiple pages
bytes = new BytesStore(in, numBytes, 1<<this.maxBlockBits);
} else {
// FST fits into a single block: use ByteArrayBytesStoreReader for less overhead
bytesArray = new byte[(int) numBytes];
in.readBytes(bytesArray, 0, bytesArray.length);
}
}
@Override
public long ramBytesUsed() {
long size = BASE_RAM_BYTES_USED;
if (bytesArray != null) {
size += bytesArray.length;
} else {
size += bytes.ramBytesUsed();
}
return size;
}
@Override
public FST.BytesReader getReverseBytesReader() {
if (bytesArray != null) {
return new ReverseBytesReader(bytesArray);
} else {
return bytes.getReverseReader();
}
}
@Override
public void writeTo(DataOutput out) throws IOException {
if (bytes != null) {
long numBytes = bytes.getPosition();
out.writeVLong(numBytes);
bytes.writeTo(out);
} else {
assert bytesArray != null;
out.writeVLong(bytesArray.length);
out.writeBytes(bytesArray, 0, bytesArray.length);
}
}
}

View File

@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;
import java.io.IOException;
import org.apache.lucene.store.RandomAccessInput;
/** Implements reverse read from a RandomAccessInput. */
final class ReverseRandomAccessReader extends FST.BytesReader {
private final RandomAccessInput in;
private long pos;
public ReverseRandomAccessReader(RandomAccessInput in) {
this.in = in;
}
@Override
public byte readByte() throws IOException {
return in.readByte(pos--);
}
@Override
public void readBytes(byte[] b, int offset, int len) throws IOException {
int i = offset, end = offset + len;
while (i < end) {
b[i++] = in.readByte(pos--);
}
}
@Override
public void skipBytes(long count) {
pos -= count;
}
@Override
public long getPosition() {
return pos;
}
@Override
public void setPosition(long pos) {
this.pos = pos;
}
@Override
public boolean reversed() {
return true;
}
}