LUCENE-7531: Removed packing support from FST.

This commit is contained in:
Adrien Grand 2016-11-10 14:09:52 +01:00
parent 6b9f11311a
commit c4c5e868d2
15 changed files with 81 additions and 680 deletions

View File

@ -27,7 +27,7 @@
<dependencies>
<dependency org="mecab" name="mecab-ipadic" rev="${/mecab/mecab-ipadic}" conf="ipadic">
<artifact name="ipadic" type=".tar.gz" url="http://mecab.googlecode.com/files/mecab-ipadic-2.7.0-20070801.tar.gz"/>
<artifact name="ipadic" type=".tar.gz" url="http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz"/>
</dependency>
<dependency org="mecab" name="mecab-naist-jdic" rev="${/mecab/mecab-naist-jdic}" conf="naist">
<artifact name="mecab-naist-jdic" type=".tar.gz" url="http://sourceforge.jp/frs/redir.php?m=iij&amp;f=/naist-jdic/53500/mecab-naist-jdic-0.6.3b-20111013.tar.gz"/>

View File

@ -28,7 +28,6 @@ import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.BitUtil;
public final class ConnectionCostsWriter {

View File

@ -33,12 +33,10 @@ import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.packed.PackedInts;
import com.ibm.icu.text.Normalizer2;
@ -133,7 +131,7 @@ public class TokenInfoDictionaryBuilder {
System.out.println(" encode...");
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, PackedInts.DEFAULT, true, 15);
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0
String lastValue = null;

View File

@ -48,7 +48,6 @@ import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.PackedInts;
/*
TODO:
@ -363,8 +362,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
final Builder<Output> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
FST_OUTPUTS, false,
PackedInts.COMPACT, true, 15);
FST_OUTPUTS, true, 15);
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}

View File

@ -81,9 +81,6 @@ import org.apache.lucene.util.packed.PackedInts;
// loads itself in ram?
public final class MemoryPostingsFormat extends PostingsFormat {
private final boolean doPackFST;
private final float acceptableOverheadRatio;
public MemoryPostingsFormat() {
this(false, PackedInts.DEFAULT);
}
@ -97,13 +94,11 @@ public final class MemoryPostingsFormat extends PostingsFormat {
*/
public MemoryPostingsFormat(boolean doPackFST, float acceptableOverheadRatio) {
super("Memory");
this.doPackFST = doPackFST;
this.acceptableOverheadRatio = acceptableOverheadRatio;
}
@Override
public String toString() {
return "PostingsFormat(name=" + getName() + " doPackFST= " + doPackFST + ")";
return "PostingsFormat(name=" + getName() + ")";
}
private final static class TermsWriter {
@ -111,16 +106,12 @@ public final class MemoryPostingsFormat extends PostingsFormat {
private final FieldInfo field;
private final Builder<BytesRef> builder;
private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
private final boolean doPackFST;
private final float acceptableOverheadRatio;
private int termCount;
public TermsWriter(IndexOutput out, FieldInfo field, boolean doPackFST, float acceptableOverheadRatio) {
public TermsWriter(IndexOutput out, FieldInfo field) {
this.out = out;
this.field = field;
this.doPackFST = doPackFST;
this.acceptableOverheadRatio = acceptableOverheadRatio;
builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, doPackFST, acceptableOverheadRatio, true, 15);
builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
}
private class PostingsWriter {
@ -307,8 +298,7 @@ public final class MemoryPostingsFormat extends PostingsFormat {
TermsEnum termsEnum = terms.iterator();
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
TermsWriter termsWriter = new TermsWriter(out, fieldInfo,
doPackFST, acceptableOverheadRatio);
TermsWriter termsWriter = new TermsWriter(out, fieldInfo);
FixedBitSet docsSeen = new FixedBitSet(state.segmentInfo.maxDoc());
long sumTotalTermFreq = 0;

View File

@ -48,7 +48,6 @@ import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.PackedInts;
/*
TODO:
@ -456,8 +455,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
outputs, false,
PackedInts.COMPACT, true, 15);
outputs, true, 15);
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}

View File

@ -23,7 +23,6 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
import org.apache.lucene.util.packed.PackedInts;
// TODO: could we somehow stream an FST to disk while we
// build it?
@ -69,10 +68,6 @@ public class Builder<T> {
private final int shareMaxTailLength;
private final IntsRefBuilder lastInput = new IntsRefBuilder();
// for packing
private final boolean doPackFST;
private final float acceptableOverheadRatio;
// NOTE: cutting this over to ArrayList instead loses ~6%
// in build performance on 9.8M Wikipedia terms; so we
@ -99,11 +94,10 @@ public class Builder<T> {
/**
* Instantiates an FST/FSA builder without any pruning. A shortcut
* to {@link #Builder(FST.INPUT_TYPE, int, int, boolean,
* boolean, int, Outputs, boolean, float,
* boolean, int)} with pruning options turned off.
* boolean, int, Outputs, boolean, int)} with pruning options turned off.
*/
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, false, PackedInts.COMPACT, true, 15);
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
}
/**
@ -143,11 +137,6 @@ public class Builder<T> {
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
* singleton output object.
*
* @param doPackFST Pass true to create a packed FST.
*
* @param acceptableOverheadRatio How to trade speed for space when building the FST. This option
* is only relevant when doPackFST is true. @see PackedInts#getMutable(int, int, float)
*
* @param allowArrayArcs Pass false to disable the array arc optimization
* while building the FST; this will make the resulting
* FST smaller but slower to traverse.
@ -159,16 +148,13 @@ public class Builder<T> {
*/
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
boolean doPackFST, float acceptableOverheadRatio, boolean allowArrayArcs,
int bytesPageBits) {
boolean allowArrayArcs, int bytesPageBits) {
this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2;
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength;
this.doPackFST = doPackFST;
this.acceptableOverheadRatio = acceptableOverheadRatio;
this.allowArrayArcs = allowArrayArcs;
fst = new FST<>(inputType, outputs, doPackFST, acceptableOverheadRatio, bytesPageBits);
fst = new FST<>(inputType, outputs, bytesPageBits);
bytes = fst.bytes;
assert bytes != null;
if (doShareSuffix) {
@ -496,11 +482,7 @@ public class Builder<T> {
//if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output);
fst.finish(compileNode(root, lastInput.length()).node);
if (doPackFST) {
return fst.pack(this, 3, Math.max(10, (int) (getNodeCount()/4)), acceptableOverheadRatio);
} else {
return fst;
}
return fst;
}
private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {

View File

@ -24,13 +24,9 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
@ -38,13 +34,9 @@ import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
// TODO: break this into WritableFST and ReadOnlyFST.. then
// we can have subclasses of ReadOnlyFST to handle the
@ -90,14 +82,6 @@ public final class FST<T> implements Accountable {
static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
// Arcs are stored as fixed-size (per entry) array, so
// that we can find an arc using binary search. We do
// this when number of arcs is > NUM_ARCS_ARRAY:
// If set, the target node is delta coded vs current
// position:
private static final int BIT_TARGET_DELTA = 1 << 6;
// We use this as a marker (because this one flag is
// illegal by itself ...):
private static final byte ARCS_AS_FIXED_ARRAY = BIT_ARC_HAS_FINAL_OUTPUT;
@ -137,7 +121,9 @@ public final class FST<T> implements Accountable {
/** Don't store arcWithOutputCount anymore */
private static final int VERSION_NO_NODE_ARC_COUNTS = 5;
private static final int VERSION_CURRENT = VERSION_NO_NODE_ARC_COUNTS;
private static final int VERSION_PACKED_REMOVED = 6;
private static final int VERSION_CURRENT = VERSION_PACKED_REMOVED;
// Never serialized; just used to represent the virtual
// final node w/ no arcs:
@ -168,9 +154,6 @@ public final class FST<T> implements Accountable {
public final Outputs<T> outputs;
private final boolean packed;
private PackedInts.Reader nodeRefToAddress;
private Arc<T> cachedRootArcs[];
/** Represents a single arc. */
@ -273,18 +256,11 @@ public final class FST<T> implements Accountable {
return (flags & bit) != 0;
}
private GrowableWriter nodeAddress;
// TODO: we could be smarter here, and prune periodically
// as we go; high in-count nodes will "usually" become
// clear early on:
private GrowableWriter inCounts;
private final int version;
// make a new empty FST, for building; Builder invokes
// this ctor
FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio, int bytesPageBits) {
FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) {
this.inputType = inputType;
this.outputs = outputs;
version = VERSION_CURRENT;
@ -293,17 +269,8 @@ public final class FST<T> implements Accountable {
// pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs
bytes.writeByte((byte) 0);
if (willPackFST) {
nodeAddress = new GrowableWriter(15, 8, acceptableOverheadRatio);
inCounts = new GrowableWriter(1, 8, acceptableOverheadRatio);
} else {
nodeAddress = null;
inCounts = null;
}
emptyOutput = null;
packed = false;
nodeRefToAddress = null;
}
public static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
@ -324,8 +291,12 @@ public final class FST<T> implements Accountable {
// NOTE: only reads most recent format; we don't have
// back-compat promise for FSTs (they are experimental):
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_NO_NODE_ARC_COUNTS);
packed = in.readByte() == 1;
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_CURRENT);
if (version < VERSION_PACKED_REMOVED) {
if (in.readByte() == 1) {
throw new CorruptIndexException("Cannot read packed FSTs anymore", in);
}
}
if (in.readByte() == 1) {
// accepts empty string
// 1 KB blocks:
@ -334,17 +305,12 @@ public final class FST<T> implements Accountable {
emptyBytes.copyBytes(in, numBytes);
// De-serialize empty-string output:
BytesReader reader;
if (packed) {
reader = emptyBytes.getForwardReader();
} else {
reader = emptyBytes.getReverseReader();
// NoOutputs uses 0 bytes when writing its output,
// so we have to check here else BytesStore gets
// angry:
if (numBytes > 0) {
reader.setPosition(numBytes-1);
}
BytesReader reader = emptyBytes.getReverseReader();
// NoOutputs uses 0 bytes when writing its output,
// so we have to check here else BytesStore gets
// angry:
if (numBytes > 0) {
reader.setPosition(numBytes-1);
}
emptyOutput = outputs.readFinalOutput(reader);
} else {
@ -364,11 +330,6 @@ public final class FST<T> implements Accountable {
default:
throw new IllegalStateException("invalid input type " + t);
}
if (packed) {
nodeRefToAddress = PackedInts.getReader(in);
} else {
nodeRefToAddress = null;
}
startNode = in.readVLong();
if (version < VERSION_NO_NODE_ARC_COUNTS) {
in.readVLong();
@ -424,31 +385,13 @@ public final class FST<T> implements Accountable {
} else {
size += bytes.ramBytesUsed();
}
if (packed) {
size += nodeRefToAddress.ramBytesUsed();
} else if (nodeAddress != null) {
size += nodeAddress.ramBytesUsed();
size += inCounts.ramBytesUsed();
}
size += cachedArcsBytesUsed;
return size;
}
@Override
public Collection<Accountable> getChildResources() {
List<Accountable> resources = new ArrayList<>();
if (packed) {
resources.add(Accountables.namedAccountable("node ref to address", nodeRefToAddress));
} else if (nodeAddress != null) {
resources.add(Accountables.namedAccountable("node addresses", nodeAddress));
resources.add(Accountables.namedAccountable("in counts", inCounts));
}
return resources;
}
@Override
public String toString() {
return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs + ",packed=" + packed;
return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs;
}
void finish(long newStartNode) throws IOException {
@ -463,16 +406,6 @@ public final class FST<T> implements Accountable {
bytes.finish();
cacheRootArcs();
}
private long getNodeAddress(long node) {
if (nodeAddress != null) {
// Deref
return nodeAddress.get((int) node);
} else {
// Straight
return node;
}
}
// Optionally caches first 128 labels
@SuppressWarnings({"rawtypes","unchecked"})
@ -527,18 +460,7 @@ public final class FST<T> implements Accountable {
if (startNode == -1) {
throw new IllegalStateException("call finish first");
}
if (nodeAddress != null) {
throw new IllegalStateException("cannot save an FST pre-packed FST; it must first be packed");
}
if (packed && !(nodeRefToAddress instanceof PackedInts.Mutable)) {
throw new IllegalStateException("cannot save a FST which has been loaded from disk ");
}
CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT);
if (packed) {
out.writeByte((byte) 1);
} else {
out.writeByte((byte) 0);
}
// TODO: really we should encode this as an arc, arriving
// to the root node, instead of special casing here:
if (emptyOutput != null) {
@ -552,16 +474,14 @@ public final class FST<T> implements Accountable {
byte[] emptyOutputBytes = new byte[(int) ros.getFilePointer()];
ros.writeTo(emptyOutputBytes, 0);
if (!packed) {
// reverse
final int stopAt = emptyOutputBytes.length/2;
int upto = 0;
while(upto < stopAt) {
final byte b = emptyOutputBytes[upto];
emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.length-upto-1];
emptyOutputBytes[emptyOutputBytes.length-upto-1] = b;
upto++;
}
// reverse
final int stopAt = emptyOutputBytes.length/2;
int upto = 0;
while(upto < stopAt) {
final byte b = emptyOutputBytes[upto];
emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.length-upto-1];
emptyOutputBytes[emptyOutputBytes.length-upto-1] = b;
upto++;
}
out.writeVInt(emptyOutputBytes.length);
out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length);
@ -577,9 +497,6 @@ public final class FST<T> implements Accountable {
t = 2;
}
out.writeByte(t);
if (packed) {
((PackedInts.Mutable) nodeRefToAddress).save(out);
}
out.writeVLong(startNode);
if (bytes != null) {
long numBytes = bytes.getPosition();
@ -705,8 +622,6 @@ public final class FST<T> implements Accountable {
if (!targetHasArcs) {
flags += BIT_STOP_NODE;
} else if (inCounts != null) {
inCounts.set((int) target.node, inCounts.get((int) target.node) + 1);
}
if (arc.output != NO_OUTPUT) {
@ -810,30 +725,8 @@ public final class FST<T> implements Accountable {
builder.bytes.reverse(startAddress, thisNodeAddress);
// PackedInts uses int as the index, so we cannot handle
// > 2.1B nodes when packing:
if (nodeAddress != null && builder.nodeCount == Integer.MAX_VALUE) {
throw new IllegalStateException("cannot create a packed FST with more than 2.1 billion nodes");
}
builder.nodeCount++;
final long node;
if (nodeAddress != null) {
// Nodes are addressed by 1+ord:
if ((int) builder.nodeCount == nodeAddress.size()) {
nodeAddress = nodeAddress.resize(ArrayUtil.oversize(nodeAddress.size() + 1, nodeAddress.getBitsPerValue()));
inCounts = inCounts.resize(ArrayUtil.oversize(inCounts.size() + 1, inCounts.getBitsPerValue()));
}
nodeAddress.set((int) builder.nodeCount, thisNodeAddress);
// System.out.println(" write nodeAddress[" + nodeCount + "] = " + endAddress);
node = builder.nodeCount;
} else {
node = thisNodeAddress;
}
//System.out.println(" ret node=" + node + " address=" + thisNodeAddress + " nodeAddress=" + nodeAddress);
return node;
return thisNodeAddress;
}
/** Fills virtual 'start' arc, ie, an empty incoming arc to
@ -876,13 +769,13 @@ public final class FST<T> implements Accountable {
arc.flags = BIT_LAST_ARC;
return arc;
} else {
in.setPosition(getNodeAddress(follow.target));
in.setPosition(follow.target);
arc.node = follow.target;
final byte b = in.readByte();
if (b == ARCS_AS_FIXED_ARRAY) {
// array: jump straight to end
arc.numArcs = in.readVInt();
if (packed || version >= VERSION_VINT_TARGET) {
if (version >= VERSION_VINT_TARGET) {
arc.bytesPerArc = in.readVInt();
} else {
arc.bytesPerArc = in.readInt();
@ -906,8 +799,6 @@ public final class FST<T> implements Accountable {
}
if (arc.flag(BIT_STOP_NODE)) {
} else if (arc.flag(BIT_TARGET_NEXT)) {
} else if (packed) {
in.readVLong();
} else {
readUnpackedNodeTarget(in);
}
@ -964,7 +855,7 @@ public final class FST<T> implements Accountable {
}
public Arc<T> readFirstRealTargetArc(long node, Arc<T> arc, final BytesReader in) throws IOException {
final long address = getNodeAddress(node);
final long address = node;
in.setPosition(address);
//System.out.println(" readFirstRealTargtArc address="
//+ address);
@ -975,7 +866,7 @@ public final class FST<T> implements Accountable {
//System.out.println(" fixedArray");
// this is first arc in a fixed-array
arc.numArcs = in.readVInt();
if (packed || version >= VERSION_VINT_TARGET) {
if (version >= VERSION_VINT_TARGET) {
arc.bytesPerArc = in.readVInt();
} else {
arc.bytesPerArc = in.readInt();
@ -1002,7 +893,7 @@ public final class FST<T> implements Accountable {
if (!targetHasArcs(follow)) {
return false;
} else {
in.setPosition(getNodeAddress(follow.target));
in.setPosition(follow.target);
return in.readByte() == ARCS_AS_FIXED_ARRAY;
}
}
@ -1029,7 +920,7 @@ public final class FST<T> implements Accountable {
//System.out.println(" nextArc fake " +
//arc.nextArc);
long pos = getNodeAddress(arc.nextArc);
long pos = arc.nextArc;
in.setPosition(pos);
final byte b = in.readByte();
@ -1038,7 +929,7 @@ public final class FST<T> implements Accountable {
in.readVInt();
// Skip bytesPerArc:
if (packed || version >= VERSION_VINT_TARGET) {
if (version >= VERSION_VINT_TARGET) {
in.readVInt();
} else {
in.readInt();
@ -1107,41 +998,18 @@ public final class FST<T> implements Accountable {
arc.nextArc = in.getPosition();
// TODO: would be nice to make this lazy -- maybe
// caller doesn't need the target and is scanning arcs...
if (nodeAddress == null) {
if (!arc.flag(BIT_LAST_ARC)) {
if (arc.bytesPerArc == 0) {
// must scan
seekToNextNode(in);
} else {
in.setPosition(arc.posArcsStart);
in.skipBytes(arc.bytesPerArc * arc.numArcs);
}
}
arc.target = in.getPosition();
} else {
arc.target = arc.node - 1;
assert arc.target > 0;
}
} else {
if (packed) {
final long pos = in.getPosition();
final long code = in.readVLong();
if (arc.flag(BIT_TARGET_DELTA)) {
// Address is delta-coded from current address:
arc.target = pos + code;
//System.out.println(" delta pos=" + pos + " delta=" + code + " target=" + arc.target);
} else if (code < nodeRefToAddress.size()) {
// Deref
arc.target = nodeRefToAddress.get((int) code);
//System.out.println(" deref code=" + code + " target=" + arc.target);
if (!arc.flag(BIT_LAST_ARC)) {
if (arc.bytesPerArc == 0) {
// must scan
seekToNextNode(in);
} else {
// Absolute
arc.target = code;
//System.out.println(" abs code=" + code);
in.setPosition(arc.posArcsStart);
in.skipBytes(arc.bytesPerArc * arc.numArcs);
}
} else {
arc.target = readUnpackedNodeTarget(in);
}
arc.target = in.getPosition();
} else {
arc.target = readUnpackedNodeTarget(in);
arc.nextArc = in.getPosition();
}
return arc;
@ -1228,7 +1096,7 @@ public final class FST<T> implements Accountable {
return null;
}
in.setPosition(getNodeAddress(follow.target));
in.setPosition(follow.target);
arc.node = follow.target;
@ -1237,7 +1105,7 @@ public final class FST<T> implements Accountable {
if (in.readByte() == ARCS_AS_FIXED_ARRAY) {
// Arcs are full array; do binary search:
arc.numArcs = in.readVInt();
if (packed || version >= VERSION_VINT_TARGET) {
if (version >= VERSION_VINT_TARGET) {
arc.bytesPerArc = in.readVInt();
} else {
arc.bytesPerArc = in.readInt();
@ -1303,11 +1171,7 @@ public final class FST<T> implements Accountable {
}
if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) {
if (packed) {
in.readVLong();
} else {
readUnpackedNodeTarget(in);
}
readUnpackedNodeTarget(in);
}
if (flag(flags, BIT_LAST_ARC)) {
@ -1340,18 +1204,10 @@ public final class FST<T> implements Accountable {
/** Returns a {@link BytesReader} for this FST, positioned at
* position 0. */
public BytesReader getBytesReader() {
if (packed) {
if (bytesArray != null) {
return new ForwardBytesReader(bytesArray);
} else {
return bytes.getForwardReader();
}
if (bytesArray != null) {
return new ReverseBytesReader(bytesArray);
} else {
if (bytesArray != null) {
return new ReverseBytesReader(bytesArray);
} else {
return bytes.getReverseReader();
}
return bytes.getReverseReader();
}
}
@ -1476,395 +1332,4 @@ public final class FST<T> implements Accountable {
}
*/
// Creates a packed FST
private FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) {
version = VERSION_CURRENT;
packed = true;
this.inputType = inputType;
bytesArray = null;
bytes = new BytesStore(bytesPageBits);
this.outputs = outputs;
}
/** Expert: creates an FST by packing this one. This
* process requires substantial additional RAM (currently
* up to ~8 bytes per node depending on
* <code>acceptableOverheadRatio</code>), but then should
* produce a smaller FST.
*
* <p>The implementation of this method uses ideas from
* <a target="_blank" href="http://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf">Smaller Representation of Finite State Automata</a>,
* which describes techniques to reduce the size of a FST.
* However, this is not a strict implementation of the
* algorithms described in this paper.
*/
FST<T> pack(Builder<T> builder, int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException {
// NOTE: maxDerefNodes is intentionally int: we cannot
// support > 2.1B deref nodes
// TODO: other things to try
// - renumber the nodes to get more next / better locality?
// - allow multiple input labels on an arc, so
// singular chain of inputs can take one arc (on
// wikipedia terms this could save another ~6%)
// - in the ord case, the output '1' is presumably
// very common (after NO_OUTPUT)... maybe use a bit
// for it..?
// - use spare bits in flags.... for top few labels /
// outputs / targets
if (nodeAddress == null) {
throw new IllegalArgumentException("this FST was not built with willPackFST=true");
}
T NO_OUTPUT = outputs.getNoOutput();
Arc<T> arc = new Arc<>();
final BytesReader r = getBytesReader();
final int topN = Math.min(maxDerefNodes, inCounts.size());
// Find top nodes with highest number of incoming arcs:
NodeQueue q = new NodeQueue(topN);
// TODO: we could use more RAM efficient selection algo here...
NodeAndInCount bottom = null;
for(int node=0; node<inCounts.size(); node++) {
if (inCounts.get(node) >= minInCountDeref) {
if (bottom == null) {
q.add(new NodeAndInCount(node, (int) inCounts.get(node)));
if (q.size() == topN) {
bottom = q.top();
}
} else if (inCounts.get(node) > bottom.count) {
q.insertWithOverflow(new NodeAndInCount(node, (int) inCounts.get(node)));
}
}
}
// Free up RAM:
inCounts = null;
final Map<Integer,Integer> topNodeMap = new HashMap<>();
for(int downTo=q.size()-1;downTo>=0;downTo--) {
NodeAndInCount n = q.pop();
topNodeMap.put(n.node, downTo);
//System.out.println("map node=" + n.node + " inCount=" + n.count + " to newID=" + downTo);
}
// +1 because node ords start at 1 (0 is reserved as stop node):
final GrowableWriter newNodeAddress = new GrowableWriter(
PackedInts.bitsRequired(builder.bytes.getPosition()), (int) (1 + builder.nodeCount), acceptableOverheadRatio);
// Fill initial coarse guess:
for(int node=1;node<=builder.nodeCount;node++) {
newNodeAddress.set(node, 1 + builder.bytes.getPosition() - nodeAddress.get(node));
}
int absCount;
int deltaCount;
int topCount;
int nextCount;
FST<T> fst;
// Iterate until we converge:
while(true) {
//System.out.println("\nITER");
boolean changed = false;
// for assert:
boolean negDelta = false;
fst = new FST<>(inputType, outputs, builder.bytes.getBlockBits());
final BytesStore writer = fst.bytes;
// Skip 0 byte since 0 is reserved target:
writer.writeByte((byte) 0);
absCount = deltaCount = topCount = nextCount = 0;
int changedCount = 0;
long addressError = 0;
//int totWasted = 0;
// Since we re-reverse the bytes, we now write the
// nodes backwards, so that BIT_TARGET_NEXT is
// unchanged:
for(int node=(int) builder.nodeCount;node>=1;node--) {
final long address = writer.getPosition();
//System.out.println(" node: " + node + " address=" + address);
if (address != newNodeAddress.get(node)) {
addressError = address - newNodeAddress.get(node);
//System.out.println(" change: " + (address - newNodeAddress[node]));
changed = true;
newNodeAddress.set(node, address);
changedCount++;
}
int nodeArcCount = 0;
int bytesPerArc = 0;
boolean retry = false;
// for assert:
boolean anyNegDelta = false;
// Retry loop: possibly iterate more than once, if
// this is an array'd node and bytesPerArc changes:
writeNode:
while(true) { // retry writing this node
//System.out.println(" cycle: retry");
readFirstRealTargetArc(node, arc, r);
final boolean useArcArray = arc.bytesPerArc != 0;
if (useArcArray) {
// Write false first arc:
if (bytesPerArc == 0) {
bytesPerArc = arc.bytesPerArc;
}
writer.writeByte(ARCS_AS_FIXED_ARRAY);
writer.writeVInt(arc.numArcs);
writer.writeVInt(bytesPerArc);
//System.out.println("node " + node + ": " + arc.numArcs + " arcs");
}
int maxBytesPerArc = 0;
//int wasted = 0;
while(true) { // iterate over all arcs for this node
//System.out.println(" cycle next arc");
final long arcStartPos = writer.getPosition();
nodeArcCount++;
byte flags = 0;
if (arc.isLast()) {
flags += BIT_LAST_ARC;
}
/*
if (!useArcArray && nodeUpto < nodes.length-1 && arc.target == nodes[nodeUpto+1]) {
flags += BIT_TARGET_NEXT;
}
*/
if (!useArcArray && node != 1 && arc.target == node-1) {
flags += BIT_TARGET_NEXT;
if (!retry) {
nextCount++;
}
}
if (arc.isFinal()) {
flags += BIT_FINAL_ARC;
if (arc.nextFinalOutput != NO_OUTPUT) {
flags += BIT_ARC_HAS_FINAL_OUTPUT;
}
} else {
assert arc.nextFinalOutput == NO_OUTPUT;
}
if (!targetHasArcs(arc)) {
flags += BIT_STOP_NODE;
}
if (arc.output != NO_OUTPUT) {
flags += BIT_ARC_HAS_OUTPUT;
}
final long absPtr;
final boolean doWriteTarget = targetHasArcs(arc) && (flags & BIT_TARGET_NEXT) == 0;
if (doWriteTarget) {
final Integer ptr = topNodeMap.get(arc.target);
if (ptr != null) {
absPtr = ptr;
} else {
absPtr = topNodeMap.size() + newNodeAddress.get((int) arc.target) + addressError;
}
long delta = newNodeAddress.get((int) arc.target) + addressError - writer.getPosition() - 2;
if (delta < 0) {
//System.out.println("neg: " + delta);
anyNegDelta = true;
delta = 0;
}
if (delta < absPtr) {
flags |= BIT_TARGET_DELTA;
}
} else {
absPtr = 0;
}
assert flags != ARCS_AS_FIXED_ARRAY;
writer.writeByte(flags);
fst.writeLabel(writer, arc.label);
if (arc.output != NO_OUTPUT) {
outputs.write(arc.output, writer);
}
if (arc.nextFinalOutput != NO_OUTPUT) {
outputs.writeFinalOutput(arc.nextFinalOutput, writer);
}
if (doWriteTarget) {
long delta = newNodeAddress.get((int) arc.target) + addressError - writer.getPosition();
if (delta < 0) {
anyNegDelta = true;
//System.out.println("neg: " + delta);
delta = 0;
}
if (flag(flags, BIT_TARGET_DELTA)) {
//System.out.println(" delta");
writer.writeVLong(delta);
if (!retry) {
deltaCount++;
}
} else {
/*
if (ptr != null) {
System.out.println(" deref");
} else {
System.out.println(" abs");
}
*/
writer.writeVLong(absPtr);
if (!retry) {
if (absPtr >= topNodeMap.size()) {
absCount++;
} else {
topCount++;
}
}
}
}
if (useArcArray) {
final int arcBytes = (int) (writer.getPosition() - arcStartPos);
//System.out.println(" " + arcBytes + " bytes");
maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes);
// NOTE: this may in fact go "backwards", if
// somehow (rarely, possibly never) we use
// more bytesPerArc in this rewrite than the
// incoming FST did... but in this case we
// will retry (below) so it's OK to ovewrite
// bytes:
//wasted += bytesPerArc - arcBytes;
writer.skipBytes((int) (arcStartPos + bytesPerArc - writer.getPosition()));
}
if (arc.isLast()) {
break;
}
readNextRealArc(arc, r);
}
if (useArcArray) {
if (maxBytesPerArc == bytesPerArc || (retry && maxBytesPerArc <= bytesPerArc)) {
// converged
//System.out.println(" bba=" + bytesPerArc + " wasted=" + wasted);
//totWasted += wasted;
break;
}
} else {
break;
}
//System.out.println(" retry this node maxBytesPerArc=" + maxBytesPerArc + " vs " + bytesPerArc);
// Retry:
bytesPerArc = maxBytesPerArc;
writer.truncate(address);
nodeArcCount = 0;
retry = true;
anyNegDelta = false;
}
negDelta |= anyNegDelta;
}
if (!changed) {
// We don't renumber the nodes (just reverse their
// order) so nodes should only point forward to
// other nodes because we only produce acyclic FSTs
// w/ nodes only pointing "forwards":
assert !negDelta;
//System.out.println("TOT wasted=" + totWasted);
// Converged!
break;
}
}
long maxAddress = 0;
for (long key : topNodeMap.keySet()) {
maxAddress = Math.max(maxAddress, newNodeAddress.get((int) key));
}
PackedInts.Mutable nodeRefToAddressIn = PackedInts.getMutable(topNodeMap.size(),
PackedInts.bitsRequired(maxAddress), acceptableOverheadRatio);
for(Map.Entry<Integer,Integer> ent : topNodeMap.entrySet()) {
nodeRefToAddressIn.set(ent.getValue(), newNodeAddress.get(ent.getKey()));
}
fst.nodeRefToAddress = nodeRefToAddressIn;
fst.startNode = newNodeAddress.get((int) startNode);
//System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode);
if (emptyOutput != null) {
fst.setEmptyOutput(emptyOutput);
}
fst.bytes.finish();
fst.cacheRootArcs();
//final int size = fst.sizeInBytes();
//System.out.println("nextCount=" + nextCount + " topCount=" + topCount + " deltaCount=" + deltaCount + " absCount=" + absCount);
return fst;
}
private static class NodeAndInCount implements Comparable<NodeAndInCount> {
final int node;
final int count;
public NodeAndInCount(int node, int count) {
this.node = node;
this.count = count;
}
@Override
public int compareTo(NodeAndInCount other) {
if (count > other.count) {
return 1;
} else if (count < other.count) {
return -1;
} else {
// Tie-break: smaller node compares as greater than
return other.node - node;
}
}
}
private static class NodeQueue extends PriorityQueue<NodeAndInCount> {
public NodeQueue(int topN) {
super(topN, false);
}
@Override
public boolean lessThan(NodeAndInCount a, NodeAndInCount b) {
final int cmp = a.compareTo(b);
assert cmp != 0;
return cmp < 0;
}
}
}

View File

@ -24,7 +24,6 @@
* <li>Fast and low memory overhead construction of the minimal FST
* (but inputs must be provided in sorted order)</li>
* <li>Low object overhead and quick deserialization (byte[] representation)</li>
* <li>Optional two-pass compression: {@link org.apache.lucene.util.fst.FST#pack FST.pack()}</li>
* <li>{@link org.apache.lucene.util.fst.Util#getByOutput Lookup-by-output} when the
* outputs are in sorted order (e.g., ordinals or file pointers)</li>
* <li>Pluggable {@link org.apache.lucene.util.fst.Outputs Outputs} representation</li>

View File

@ -29,7 +29,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TimeUnits;
import org.apache.lucene.util.packed.PackedInts;
import org.junit.Ignore;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
@ -47,16 +46,14 @@ public class Test2BFST extends LuceneTestCase {
Directory dir = new MMapDirectory(createTempDir("2BFST"));
for(int doPackIter=0;doPackIter<2;doPackIter++) {
boolean doPack = doPackIter == 1;
for(int iter=0;iter<1;iter++) {
// Build FST w/ NoOutputs and stop when nodeCount > 2.2B
if (!doPack) {
{
System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
Outputs<Object> outputs = NoOutputs.getSingleton();
Object NO_OUTPUT = outputs.getNoOutput();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
doPack, PackedInts.COMPACT, true, 15);
true, 15);
int count = 0;
Random r = new Random(seed);
@ -135,10 +132,10 @@ public class Test2BFST extends LuceneTestCase {
// Build FST w/ ByteSequenceOutputs and stop when FST
// size = 3GB
{
System.out.println("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes");
System.out.println("\nTEST: 3 GB size; outputs=bytes");
Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
doPack, PackedInts.COMPACT, true, 15);
true, 15);
byte[] outputBytes = new byte[20];
BytesRef output = new BytesRef(outputBytes);
@ -212,10 +209,10 @@ public class Test2BFST extends LuceneTestCase {
// Build FST w/ PositiveIntOutputs and stop when FST
// size = 3GB
{
System.out.println("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long");
System.out.println("\nTEST: 3 GB size; outputs=long");
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
doPack, PackedInts.COMPACT, true, 15);
true, 15);
long output = 1;

View File

@ -76,7 +76,6 @@ import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.util.fst.FSTTester.getRandomString;
import static org.apache.lucene.util.fst.FSTTester.simpleRandomString;
@ -328,9 +327,7 @@ public class TestFSTs extends LuceneTestCase {
writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final boolean doRewrite = random().nextBoolean();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, doRewrite, PackedInts.DEFAULT, true, 15);
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
boolean storeOrd = random().nextBoolean();
if (VERBOSE) {
@ -464,16 +461,14 @@ public class TestFSTs extends LuceneTestCase {
private int inputMode;
private final Outputs<T> outputs;
private final Builder<T> builder;
private final boolean doPack;
public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean doPack, boolean noArcArrays) {
public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean noArcArrays) {
this.dirOut = dirOut;
this.wordsFileIn = wordsFileIn;
this.inputMode = inputMode;
this.outputs = outputs;
this.doPack = doPack;
builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, doPack, PackedInts.DEFAULT, !noArcArrays, 15);
builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15);
}
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
@ -622,7 +617,6 @@ public class TestFSTs extends LuceneTestCase {
boolean storeOrds = false;
boolean storeDocFreqs = false;
boolean verify = true;
boolean doPack = false;
boolean noArcArrays = false;
Path wordsFileIn = null;
Path dirOut = null;
@ -647,8 +641,6 @@ public class TestFSTs extends LuceneTestCase {
storeOrds = true;
} else if (args[idx].equals("-noverify")) {
verify = false;
} else if (args[idx].equals("-pack")) {
doPack = true;
} else if (args[idx].startsWith("-")) {
System.err.println("Unrecognized option: " + args[idx]);
System.exit(-1);
@ -677,7 +669,7 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton();
final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton();
final PairOutputs<Long,Long> outputs = new PairOutputs<>(o1, o2);
new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
Random rand;
@Override
public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
@ -691,7 +683,7 @@ public class TestFSTs extends LuceneTestCase {
} else if (storeOrds) {
// Store only ords
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
@Override
public Long getOutput(IntsRef input, int ord) {
return (long) ord;
@ -700,7 +692,7 @@ public class TestFSTs extends LuceneTestCase {
} else if (storeDocFreqs) {
// Store only docFreq
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
Random rand;
@Override
public Long getOutput(IntsRef input, int ord) {
@ -714,7 +706,7 @@ public class TestFSTs extends LuceneTestCase {
// Store nothing
final NoOutputs outputs = NoOutputs.getSingleton();
final Object NO_OUTPUT = outputs.getNoOutput();
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
@Override
public Object getOutput(IntsRef input, int ord) {
return NO_OUTPUT;
@ -1118,7 +1110,7 @@ public class TestFSTs extends LuceneTestCase {
public void testFinalOutputOnEndState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, random().nextBoolean(), PackedInts.DEFAULT, true, 15);
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
final FST<Long> fst = builder.finish();
@ -1132,8 +1124,7 @@ public class TestFSTs extends LuceneTestCase {
public void testInternalFinalState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final boolean willRewrite = random().nextBoolean();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, willRewrite, PackedInts.DEFAULT, true, 15);
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
final FST<Long> fst = builder.finish();

View File

@ -50,7 +50,6 @@ import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.PackedInts;
/*
TODO:
@ -354,8 +353,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
FST_OUTPUTS, false,
PackedInts.COMPACT, true, 15);
FST_OUTPUTS, true, 15);
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}

View File

@ -26,7 +26,6 @@ import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.*;
import org.apache.lucene.util.packed.PackedInts;
/**
* Finite state automata based implementation of "autocomplete" functionality.
@ -237,8 +236,7 @@ public class FSTCompletionBuilder {
final Object empty = outputs.getNoOutput();
final Builder<Object> builder = new Builder<>(
FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
shareMaxTailLength, outputs, false,
PackedInts.DEFAULT, true, 15);
shareMaxTailLength, outputs, true, 15);
BytesRefBuilder scratch = new BytesRefBuilder();
BytesRef entry;

View File

@ -40,7 +40,6 @@ import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.packed.PackedInts;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
@ -273,25 +272,14 @@ public class FSTTester<T> {
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
}
final boolean willRewrite = random.nextBoolean();
final Builder<T> builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2,
prune1==0 && prune2==0,
allowRandomSuffixSharing ? random.nextBoolean() : true,
allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
outputs,
willRewrite,
PackedInts.DEFAULT,
true,
15);
if (LuceneTestCase.VERBOSE) {
if (willRewrite) {
System.out.println("TEST: packed FST");
} else {
System.out.println("TEST: non-packed FST");
}
}
for(InputOutput<T> pair : pairs) {
if (pair.output instanceof List) {
@ -306,7 +294,7 @@ public class FSTTester<T> {
}
FST<T> fst = builder.finish();
if (random.nextBoolean() && fst != null && !willRewrite) {
if (random.nextBoolean() && fst != null) {
IOContext context = LuceneTestCase.newIOContext(random);
IndexOutput out = dir.createOutput("fst.bin", context);
fst.save(out);