mirror of https://github.com/apache/lucene.git
LUCENE-7531: Removed packing support from FST.
This commit is contained in:
parent
6b9f11311a
commit
c4c5e868d2
|
@ -27,7 +27,7 @@
|
|||
|
||||
<dependencies>
|
||||
<dependency org="mecab" name="mecab-ipadic" rev="${/mecab/mecab-ipadic}" conf="ipadic">
|
||||
<artifact name="ipadic" type=".tar.gz" url="http://mecab.googlecode.com/files/mecab-ipadic-2.7.0-20070801.tar.gz"/>
|
||||
<artifact name="ipadic" type=".tar.gz" url="http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz"/>
|
||||
</dependency>
|
||||
<dependency org="mecab" name="mecab-naist-jdic" rev="${/mecab/mecab-naist-jdic}" conf="naist">
|
||||
<artifact name="mecab-naist-jdic" type=".tar.gz" url="http://sourceforge.jp/frs/redir.php?m=iij&f=/naist-jdic/53500/mecab-naist-jdic-0.6.3b-20111013.tar.gz"/>
|
||||
|
|
Binary file not shown.
|
@ -28,7 +28,6 @@ import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
|
|||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.OutputStreamDataOutput;
|
||||
import org.apache.lucene.util.BitUtil;
|
||||
|
||||
public final class ConnectionCostsWriter {
|
||||
|
||||
|
|
|
@ -33,12 +33,10 @@ import java.util.Comparator;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
import org.apache.lucene.util.fst.Builder;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
|
||||
|
@ -133,7 +131,7 @@ public class TokenInfoDictionaryBuilder {
|
|||
System.out.println(" encode...");
|
||||
|
||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, PackedInts.DEFAULT, true, 15);
|
||||
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
|
||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||
long ord = -1; // first ord will be 0
|
||||
String lastValue = null;
|
||||
|
|
|
@ -48,7 +48,6 @@ import org.apache.lucene.util.fst.Builder;
|
|||
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/*
|
||||
TODO:
|
||||
|
@ -363,8 +362,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
|
|||
|
||||
final Builder<Output> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
|
||||
0, 0, true, false, Integer.MAX_VALUE,
|
||||
FST_OUTPUTS, false,
|
||||
PackedInts.COMPACT, true, 15);
|
||||
FST_OUTPUTS, true, 15);
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" compile index for prefix=" + prefix);
|
||||
//}
|
||||
|
|
|
@ -81,9 +81,6 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
// loads itself in ram?
|
||||
public final class MemoryPostingsFormat extends PostingsFormat {
|
||||
|
||||
private final boolean doPackFST;
|
||||
private final float acceptableOverheadRatio;
|
||||
|
||||
public MemoryPostingsFormat() {
|
||||
this(false, PackedInts.DEFAULT);
|
||||
}
|
||||
|
@ -97,13 +94,11 @@ public final class MemoryPostingsFormat extends PostingsFormat {
|
|||
*/
|
||||
public MemoryPostingsFormat(boolean doPackFST, float acceptableOverheadRatio) {
|
||||
super("Memory");
|
||||
this.doPackFST = doPackFST;
|
||||
this.acceptableOverheadRatio = acceptableOverheadRatio;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "PostingsFormat(name=" + getName() + " doPackFST= " + doPackFST + ")";
|
||||
return "PostingsFormat(name=" + getName() + ")";
|
||||
}
|
||||
|
||||
private final static class TermsWriter {
|
||||
|
@ -111,16 +106,12 @@ public final class MemoryPostingsFormat extends PostingsFormat {
|
|||
private final FieldInfo field;
|
||||
private final Builder<BytesRef> builder;
|
||||
private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||
private final boolean doPackFST;
|
||||
private final float acceptableOverheadRatio;
|
||||
private int termCount;
|
||||
|
||||
public TermsWriter(IndexOutput out, FieldInfo field, boolean doPackFST, float acceptableOverheadRatio) {
|
||||
public TermsWriter(IndexOutput out, FieldInfo field) {
|
||||
this.out = out;
|
||||
this.field = field;
|
||||
this.doPackFST = doPackFST;
|
||||
this.acceptableOverheadRatio = acceptableOverheadRatio;
|
||||
builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, doPackFST, acceptableOverheadRatio, true, 15);
|
||||
builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
||||
}
|
||||
|
||||
private class PostingsWriter {
|
||||
|
@ -307,8 +298,7 @@ public final class MemoryPostingsFormat extends PostingsFormat {
|
|||
TermsEnum termsEnum = terms.iterator();
|
||||
|
||||
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
|
||||
TermsWriter termsWriter = new TermsWriter(out, fieldInfo,
|
||||
doPackFST, acceptableOverheadRatio);
|
||||
TermsWriter termsWriter = new TermsWriter(out, fieldInfo);
|
||||
|
||||
FixedBitSet docsSeen = new FixedBitSet(state.segmentInfo.maxDoc());
|
||||
long sumTotalTermFreq = 0;
|
||||
|
|
|
@ -48,7 +48,6 @@ import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
|||
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/*
|
||||
TODO:
|
||||
|
@ -456,8 +455,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||
final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
|
||||
0, 0, true, false, Integer.MAX_VALUE,
|
||||
outputs, false,
|
||||
PackedInts.COMPACT, true, 15);
|
||||
outputs, true, 15);
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" compile index for prefix=" + prefix);
|
||||
//}
|
||||
|
|
|
@ -23,7 +23,6 @@ import org.apache.lucene.util.ArrayUtil;
|
|||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
// TODO: could we somehow stream an FST to disk while we
|
||||
// build it?
|
||||
|
@ -69,10 +68,6 @@ public class Builder<T> {
|
|||
private final int shareMaxTailLength;
|
||||
|
||||
private final IntsRefBuilder lastInput = new IntsRefBuilder();
|
||||
|
||||
// for packing
|
||||
private final boolean doPackFST;
|
||||
private final float acceptableOverheadRatio;
|
||||
|
||||
// NOTE: cutting this over to ArrayList instead loses ~6%
|
||||
// in build performance on 9.8M Wikipedia terms; so we
|
||||
|
@ -99,11 +94,10 @@ public class Builder<T> {
|
|||
/**
|
||||
* Instantiates an FST/FSA builder without any pruning. A shortcut
|
||||
* to {@link #Builder(FST.INPUT_TYPE, int, int, boolean,
|
||||
* boolean, int, Outputs, boolean, float,
|
||||
* boolean, int)} with pruning options turned off.
|
||||
* boolean, int, Outputs, boolean, int)} with pruning options turned off.
|
||||
*/
|
||||
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
|
||||
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, false, PackedInts.COMPACT, true, 15);
|
||||
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -143,11 +137,6 @@ public class Builder<T> {
|
|||
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
|
||||
* singleton output object.
|
||||
*
|
||||
* @param doPackFST Pass true to create a packed FST.
|
||||
*
|
||||
* @param acceptableOverheadRatio How to trade speed for space when building the FST. This option
|
||||
* is only relevant when doPackFST is true. @see PackedInts#getMutable(int, int, float)
|
||||
*
|
||||
* @param allowArrayArcs Pass false to disable the array arc optimization
|
||||
* while building the FST; this will make the resulting
|
||||
* FST smaller but slower to traverse.
|
||||
|
@ -159,16 +148,13 @@ public class Builder<T> {
|
|||
*/
|
||||
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
|
||||
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
|
||||
boolean doPackFST, float acceptableOverheadRatio, boolean allowArrayArcs,
|
||||
int bytesPageBits) {
|
||||
boolean allowArrayArcs, int bytesPageBits) {
|
||||
this.minSuffixCount1 = minSuffixCount1;
|
||||
this.minSuffixCount2 = minSuffixCount2;
|
||||
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
|
||||
this.shareMaxTailLength = shareMaxTailLength;
|
||||
this.doPackFST = doPackFST;
|
||||
this.acceptableOverheadRatio = acceptableOverheadRatio;
|
||||
this.allowArrayArcs = allowArrayArcs;
|
||||
fst = new FST<>(inputType, outputs, doPackFST, acceptableOverheadRatio, bytesPageBits);
|
||||
fst = new FST<>(inputType, outputs, bytesPageBits);
|
||||
bytes = fst.bytes;
|
||||
assert bytes != null;
|
||||
if (doShareSuffix) {
|
||||
|
@ -496,11 +482,7 @@ public class Builder<T> {
|
|||
//if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output);
|
||||
fst.finish(compileNode(root, lastInput.length()).node);
|
||||
|
||||
if (doPackFST) {
|
||||
return fst.pack(this, 3, Math.max(10, (int) (getNodeCount()/4)), acceptableOverheadRatio);
|
||||
} else {
|
||||
return fst;
|
||||
}
|
||||
return fst;
|
||||
}
|
||||
|
||||
private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {
|
||||
|
|
|
@ -24,13 +24,9 @@ import java.io.InputStream;
|
|||
import java.io.OutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
|
@ -38,13 +34,9 @@ import org.apache.lucene.store.InputStreamDataInput;
|
|||
import org.apache.lucene.store.OutputStreamDataOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.Accountables;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Constants;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.packed.GrowableWriter;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
// TODO: break this into WritableFST and ReadOnlyFST.. then
|
||||
// we can have subclasses of ReadOnlyFST to handle the
|
||||
|
@ -90,14 +82,6 @@ public final class FST<T> implements Accountable {
|
|||
|
||||
static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
|
||||
|
||||
// Arcs are stored as fixed-size (per entry) array, so
|
||||
// that we can find an arc using binary search. We do
|
||||
// this when number of arcs is > NUM_ARCS_ARRAY:
|
||||
|
||||
// If set, the target node is delta coded vs current
|
||||
// position:
|
||||
private static final int BIT_TARGET_DELTA = 1 << 6;
|
||||
|
||||
// We use this as a marker (because this one flag is
|
||||
// illegal by itself ...):
|
||||
private static final byte ARCS_AS_FIXED_ARRAY = BIT_ARC_HAS_FINAL_OUTPUT;
|
||||
|
@ -137,7 +121,9 @@ public final class FST<T> implements Accountable {
|
|||
/** Don't store arcWithOutputCount anymore */
|
||||
private static final int VERSION_NO_NODE_ARC_COUNTS = 5;
|
||||
|
||||
private static final int VERSION_CURRENT = VERSION_NO_NODE_ARC_COUNTS;
|
||||
private static final int VERSION_PACKED_REMOVED = 6;
|
||||
|
||||
private static final int VERSION_CURRENT = VERSION_PACKED_REMOVED;
|
||||
|
||||
// Never serialized; just used to represent the virtual
|
||||
// final node w/ no arcs:
|
||||
|
@ -168,9 +154,6 @@ public final class FST<T> implements Accountable {
|
|||
|
||||
public final Outputs<T> outputs;
|
||||
|
||||
private final boolean packed;
|
||||
private PackedInts.Reader nodeRefToAddress;
|
||||
|
||||
private Arc<T> cachedRootArcs[];
|
||||
|
||||
/** Represents a single arc. */
|
||||
|
@ -273,18 +256,11 @@ public final class FST<T> implements Accountable {
|
|||
return (flags & bit) != 0;
|
||||
}
|
||||
|
||||
private GrowableWriter nodeAddress;
|
||||
|
||||
// TODO: we could be smarter here, and prune periodically
|
||||
// as we go; high in-count nodes will "usually" become
|
||||
// clear early on:
|
||||
private GrowableWriter inCounts;
|
||||
|
||||
private final int version;
|
||||
|
||||
// make a new empty FST, for building; Builder invokes
|
||||
// this ctor
|
||||
FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio, int bytesPageBits) {
|
||||
FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) {
|
||||
this.inputType = inputType;
|
||||
this.outputs = outputs;
|
||||
version = VERSION_CURRENT;
|
||||
|
@ -293,17 +269,8 @@ public final class FST<T> implements Accountable {
|
|||
// pad: ensure no node gets address 0 which is reserved to mean
|
||||
// the stop state w/ no arcs
|
||||
bytes.writeByte((byte) 0);
|
||||
if (willPackFST) {
|
||||
nodeAddress = new GrowableWriter(15, 8, acceptableOverheadRatio);
|
||||
inCounts = new GrowableWriter(1, 8, acceptableOverheadRatio);
|
||||
} else {
|
||||
nodeAddress = null;
|
||||
inCounts = null;
|
||||
}
|
||||
|
||||
emptyOutput = null;
|
||||
packed = false;
|
||||
nodeRefToAddress = null;
|
||||
}
|
||||
|
||||
public static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
|
||||
|
@ -324,8 +291,12 @@ public final class FST<T> implements Accountable {
|
|||
|
||||
// NOTE: only reads most recent format; we don't have
|
||||
// back-compat promise for FSTs (they are experimental):
|
||||
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_NO_NODE_ARC_COUNTS);
|
||||
packed = in.readByte() == 1;
|
||||
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_CURRENT);
|
||||
if (version < VERSION_PACKED_REMOVED) {
|
||||
if (in.readByte() == 1) {
|
||||
throw new CorruptIndexException("Cannot read packed FSTs anymore", in);
|
||||
}
|
||||
}
|
||||
if (in.readByte() == 1) {
|
||||
// accepts empty string
|
||||
// 1 KB blocks:
|
||||
|
@ -334,17 +305,12 @@ public final class FST<T> implements Accountable {
|
|||
emptyBytes.copyBytes(in, numBytes);
|
||||
|
||||
// De-serialize empty-string output:
|
||||
BytesReader reader;
|
||||
if (packed) {
|
||||
reader = emptyBytes.getForwardReader();
|
||||
} else {
|
||||
reader = emptyBytes.getReverseReader();
|
||||
// NoOutputs uses 0 bytes when writing its output,
|
||||
// so we have to check here else BytesStore gets
|
||||
// angry:
|
||||
if (numBytes > 0) {
|
||||
reader.setPosition(numBytes-1);
|
||||
}
|
||||
BytesReader reader = emptyBytes.getReverseReader();
|
||||
// NoOutputs uses 0 bytes when writing its output,
|
||||
// so we have to check here else BytesStore gets
|
||||
// angry:
|
||||
if (numBytes > 0) {
|
||||
reader.setPosition(numBytes-1);
|
||||
}
|
||||
emptyOutput = outputs.readFinalOutput(reader);
|
||||
} else {
|
||||
|
@ -364,11 +330,6 @@ public final class FST<T> implements Accountable {
|
|||
default:
|
||||
throw new IllegalStateException("invalid input type " + t);
|
||||
}
|
||||
if (packed) {
|
||||
nodeRefToAddress = PackedInts.getReader(in);
|
||||
} else {
|
||||
nodeRefToAddress = null;
|
||||
}
|
||||
startNode = in.readVLong();
|
||||
if (version < VERSION_NO_NODE_ARC_COUNTS) {
|
||||
in.readVLong();
|
||||
|
@ -424,31 +385,13 @@ public final class FST<T> implements Accountable {
|
|||
} else {
|
||||
size += bytes.ramBytesUsed();
|
||||
}
|
||||
if (packed) {
|
||||
size += nodeRefToAddress.ramBytesUsed();
|
||||
} else if (nodeAddress != null) {
|
||||
size += nodeAddress.ramBytesUsed();
|
||||
size += inCounts.ramBytesUsed();
|
||||
}
|
||||
size += cachedArcsBytesUsed;
|
||||
return size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<Accountable> getChildResources() {
|
||||
List<Accountable> resources = new ArrayList<>();
|
||||
if (packed) {
|
||||
resources.add(Accountables.namedAccountable("node ref to address", nodeRefToAddress));
|
||||
} else if (nodeAddress != null) {
|
||||
resources.add(Accountables.namedAccountable("node addresses", nodeAddress));
|
||||
resources.add(Accountables.namedAccountable("in counts", inCounts));
|
||||
}
|
||||
return resources;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs + ",packed=" + packed;
|
||||
return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs;
|
||||
}
|
||||
|
||||
void finish(long newStartNode) throws IOException {
|
||||
|
@ -463,16 +406,6 @@ public final class FST<T> implements Accountable {
|
|||
bytes.finish();
|
||||
cacheRootArcs();
|
||||
}
|
||||
|
||||
private long getNodeAddress(long node) {
|
||||
if (nodeAddress != null) {
|
||||
// Deref
|
||||
return nodeAddress.get((int) node);
|
||||
} else {
|
||||
// Straight
|
||||
return node;
|
||||
}
|
||||
}
|
||||
|
||||
// Optionally caches first 128 labels
|
||||
@SuppressWarnings({"rawtypes","unchecked"})
|
||||
|
@ -527,18 +460,7 @@ public final class FST<T> implements Accountable {
|
|||
if (startNode == -1) {
|
||||
throw new IllegalStateException("call finish first");
|
||||
}
|
||||
if (nodeAddress != null) {
|
||||
throw new IllegalStateException("cannot save an FST pre-packed FST; it must first be packed");
|
||||
}
|
||||
if (packed && !(nodeRefToAddress instanceof PackedInts.Mutable)) {
|
||||
throw new IllegalStateException("cannot save a FST which has been loaded from disk ");
|
||||
}
|
||||
CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT);
|
||||
if (packed) {
|
||||
out.writeByte((byte) 1);
|
||||
} else {
|
||||
out.writeByte((byte) 0);
|
||||
}
|
||||
// TODO: really we should encode this as an arc, arriving
|
||||
// to the root node, instead of special casing here:
|
||||
if (emptyOutput != null) {
|
||||
|
@ -552,16 +474,14 @@ public final class FST<T> implements Accountable {
|
|||
byte[] emptyOutputBytes = new byte[(int) ros.getFilePointer()];
|
||||
ros.writeTo(emptyOutputBytes, 0);
|
||||
|
||||
if (!packed) {
|
||||
// reverse
|
||||
final int stopAt = emptyOutputBytes.length/2;
|
||||
int upto = 0;
|
||||
while(upto < stopAt) {
|
||||
final byte b = emptyOutputBytes[upto];
|
||||
emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.length-upto-1];
|
||||
emptyOutputBytes[emptyOutputBytes.length-upto-1] = b;
|
||||
upto++;
|
||||
}
|
||||
// reverse
|
||||
final int stopAt = emptyOutputBytes.length/2;
|
||||
int upto = 0;
|
||||
while(upto < stopAt) {
|
||||
final byte b = emptyOutputBytes[upto];
|
||||
emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.length-upto-1];
|
||||
emptyOutputBytes[emptyOutputBytes.length-upto-1] = b;
|
||||
upto++;
|
||||
}
|
||||
out.writeVInt(emptyOutputBytes.length);
|
||||
out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length);
|
||||
|
@ -577,9 +497,6 @@ public final class FST<T> implements Accountable {
|
|||
t = 2;
|
||||
}
|
||||
out.writeByte(t);
|
||||
if (packed) {
|
||||
((PackedInts.Mutable) nodeRefToAddress).save(out);
|
||||
}
|
||||
out.writeVLong(startNode);
|
||||
if (bytes != null) {
|
||||
long numBytes = bytes.getPosition();
|
||||
|
@ -705,8 +622,6 @@ public final class FST<T> implements Accountable {
|
|||
|
||||
if (!targetHasArcs) {
|
||||
flags += BIT_STOP_NODE;
|
||||
} else if (inCounts != null) {
|
||||
inCounts.set((int) target.node, inCounts.get((int) target.node) + 1);
|
||||
}
|
||||
|
||||
if (arc.output != NO_OUTPUT) {
|
||||
|
@ -810,30 +725,8 @@ public final class FST<T> implements Accountable {
|
|||
|
||||
builder.bytes.reverse(startAddress, thisNodeAddress);
|
||||
|
||||
// PackedInts uses int as the index, so we cannot handle
|
||||
// > 2.1B nodes when packing:
|
||||
if (nodeAddress != null && builder.nodeCount == Integer.MAX_VALUE) {
|
||||
throw new IllegalStateException("cannot create a packed FST with more than 2.1 billion nodes");
|
||||
}
|
||||
|
||||
builder.nodeCount++;
|
||||
final long node;
|
||||
if (nodeAddress != null) {
|
||||
|
||||
// Nodes are addressed by 1+ord:
|
||||
if ((int) builder.nodeCount == nodeAddress.size()) {
|
||||
nodeAddress = nodeAddress.resize(ArrayUtil.oversize(nodeAddress.size() + 1, nodeAddress.getBitsPerValue()));
|
||||
inCounts = inCounts.resize(ArrayUtil.oversize(inCounts.size() + 1, inCounts.getBitsPerValue()));
|
||||
}
|
||||
nodeAddress.set((int) builder.nodeCount, thisNodeAddress);
|
||||
// System.out.println(" write nodeAddress[" + nodeCount + "] = " + endAddress);
|
||||
node = builder.nodeCount;
|
||||
} else {
|
||||
node = thisNodeAddress;
|
||||
}
|
||||
|
||||
//System.out.println(" ret node=" + node + " address=" + thisNodeAddress + " nodeAddress=" + nodeAddress);
|
||||
return node;
|
||||
return thisNodeAddress;
|
||||
}
|
||||
|
||||
/** Fills virtual 'start' arc, ie, an empty incoming arc to
|
||||
|
@ -876,13 +769,13 @@ public final class FST<T> implements Accountable {
|
|||
arc.flags = BIT_LAST_ARC;
|
||||
return arc;
|
||||
} else {
|
||||
in.setPosition(getNodeAddress(follow.target));
|
||||
in.setPosition(follow.target);
|
||||
arc.node = follow.target;
|
||||
final byte b = in.readByte();
|
||||
if (b == ARCS_AS_FIXED_ARRAY) {
|
||||
// array: jump straight to end
|
||||
arc.numArcs = in.readVInt();
|
||||
if (packed || version >= VERSION_VINT_TARGET) {
|
||||
if (version >= VERSION_VINT_TARGET) {
|
||||
arc.bytesPerArc = in.readVInt();
|
||||
} else {
|
||||
arc.bytesPerArc = in.readInt();
|
||||
|
@ -906,8 +799,6 @@ public final class FST<T> implements Accountable {
|
|||
}
|
||||
if (arc.flag(BIT_STOP_NODE)) {
|
||||
} else if (arc.flag(BIT_TARGET_NEXT)) {
|
||||
} else if (packed) {
|
||||
in.readVLong();
|
||||
} else {
|
||||
readUnpackedNodeTarget(in);
|
||||
}
|
||||
|
@ -964,7 +855,7 @@ public final class FST<T> implements Accountable {
|
|||
}
|
||||
|
||||
public Arc<T> readFirstRealTargetArc(long node, Arc<T> arc, final BytesReader in) throws IOException {
|
||||
final long address = getNodeAddress(node);
|
||||
final long address = node;
|
||||
in.setPosition(address);
|
||||
//System.out.println(" readFirstRealTargtArc address="
|
||||
//+ address);
|
||||
|
@ -975,7 +866,7 @@ public final class FST<T> implements Accountable {
|
|||
//System.out.println(" fixedArray");
|
||||
// this is first arc in a fixed-array
|
||||
arc.numArcs = in.readVInt();
|
||||
if (packed || version >= VERSION_VINT_TARGET) {
|
||||
if (version >= VERSION_VINT_TARGET) {
|
||||
arc.bytesPerArc = in.readVInt();
|
||||
} else {
|
||||
arc.bytesPerArc = in.readInt();
|
||||
|
@ -1002,7 +893,7 @@ public final class FST<T> implements Accountable {
|
|||
if (!targetHasArcs(follow)) {
|
||||
return false;
|
||||
} else {
|
||||
in.setPosition(getNodeAddress(follow.target));
|
||||
in.setPosition(follow.target);
|
||||
return in.readByte() == ARCS_AS_FIXED_ARRAY;
|
||||
}
|
||||
}
|
||||
|
@ -1029,7 +920,7 @@ public final class FST<T> implements Accountable {
|
|||
//System.out.println(" nextArc fake " +
|
||||
//arc.nextArc);
|
||||
|
||||
long pos = getNodeAddress(arc.nextArc);
|
||||
long pos = arc.nextArc;
|
||||
in.setPosition(pos);
|
||||
|
||||
final byte b = in.readByte();
|
||||
|
@ -1038,7 +929,7 @@ public final class FST<T> implements Accountable {
|
|||
in.readVInt();
|
||||
|
||||
// Skip bytesPerArc:
|
||||
if (packed || version >= VERSION_VINT_TARGET) {
|
||||
if (version >= VERSION_VINT_TARGET) {
|
||||
in.readVInt();
|
||||
} else {
|
||||
in.readInt();
|
||||
|
@ -1107,41 +998,18 @@ public final class FST<T> implements Accountable {
|
|||
arc.nextArc = in.getPosition();
|
||||
// TODO: would be nice to make this lazy -- maybe
|
||||
// caller doesn't need the target and is scanning arcs...
|
||||
if (nodeAddress == null) {
|
||||
if (!arc.flag(BIT_LAST_ARC)) {
|
||||
if (arc.bytesPerArc == 0) {
|
||||
// must scan
|
||||
seekToNextNode(in);
|
||||
} else {
|
||||
in.setPosition(arc.posArcsStart);
|
||||
in.skipBytes(arc.bytesPerArc * arc.numArcs);
|
||||
}
|
||||
}
|
||||
arc.target = in.getPosition();
|
||||
} else {
|
||||
arc.target = arc.node - 1;
|
||||
assert arc.target > 0;
|
||||
}
|
||||
} else {
|
||||
if (packed) {
|
||||
final long pos = in.getPosition();
|
||||
final long code = in.readVLong();
|
||||
if (arc.flag(BIT_TARGET_DELTA)) {
|
||||
// Address is delta-coded from current address:
|
||||
arc.target = pos + code;
|
||||
//System.out.println(" delta pos=" + pos + " delta=" + code + " target=" + arc.target);
|
||||
} else if (code < nodeRefToAddress.size()) {
|
||||
// Deref
|
||||
arc.target = nodeRefToAddress.get((int) code);
|
||||
//System.out.println(" deref code=" + code + " target=" + arc.target);
|
||||
if (!arc.flag(BIT_LAST_ARC)) {
|
||||
if (arc.bytesPerArc == 0) {
|
||||
// must scan
|
||||
seekToNextNode(in);
|
||||
} else {
|
||||
// Absolute
|
||||
arc.target = code;
|
||||
//System.out.println(" abs code=" + code);
|
||||
in.setPosition(arc.posArcsStart);
|
||||
in.skipBytes(arc.bytesPerArc * arc.numArcs);
|
||||
}
|
||||
} else {
|
||||
arc.target = readUnpackedNodeTarget(in);
|
||||
}
|
||||
arc.target = in.getPosition();
|
||||
} else {
|
||||
arc.target = readUnpackedNodeTarget(in);
|
||||
arc.nextArc = in.getPosition();
|
||||
}
|
||||
return arc;
|
||||
|
@ -1228,7 +1096,7 @@ public final class FST<T> implements Accountable {
|
|||
return null;
|
||||
}
|
||||
|
||||
in.setPosition(getNodeAddress(follow.target));
|
||||
in.setPosition(follow.target);
|
||||
|
||||
arc.node = follow.target;
|
||||
|
||||
|
@ -1237,7 +1105,7 @@ public final class FST<T> implements Accountable {
|
|||
if (in.readByte() == ARCS_AS_FIXED_ARRAY) {
|
||||
// Arcs are full array; do binary search:
|
||||
arc.numArcs = in.readVInt();
|
||||
if (packed || version >= VERSION_VINT_TARGET) {
|
||||
if (version >= VERSION_VINT_TARGET) {
|
||||
arc.bytesPerArc = in.readVInt();
|
||||
} else {
|
||||
arc.bytesPerArc = in.readInt();
|
||||
|
@ -1303,11 +1171,7 @@ public final class FST<T> implements Accountable {
|
|||
}
|
||||
|
||||
if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) {
|
||||
if (packed) {
|
||||
in.readVLong();
|
||||
} else {
|
||||
readUnpackedNodeTarget(in);
|
||||
}
|
||||
readUnpackedNodeTarget(in);
|
||||
}
|
||||
|
||||
if (flag(flags, BIT_LAST_ARC)) {
|
||||
|
@ -1340,18 +1204,10 @@ public final class FST<T> implements Accountable {
|
|||
/** Returns a {@link BytesReader} for this FST, positioned at
|
||||
* position 0. */
|
||||
public BytesReader getBytesReader() {
|
||||
if (packed) {
|
||||
if (bytesArray != null) {
|
||||
return new ForwardBytesReader(bytesArray);
|
||||
} else {
|
||||
return bytes.getForwardReader();
|
||||
}
|
||||
if (bytesArray != null) {
|
||||
return new ReverseBytesReader(bytesArray);
|
||||
} else {
|
||||
if (bytesArray != null) {
|
||||
return new ReverseBytesReader(bytesArray);
|
||||
} else {
|
||||
return bytes.getReverseReader();
|
||||
}
|
||||
return bytes.getReverseReader();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1476,395 +1332,4 @@ public final class FST<T> implements Accountable {
|
|||
}
|
||||
*/
|
||||
|
||||
// Creates a packed FST
|
||||
private FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) {
|
||||
version = VERSION_CURRENT;
|
||||
packed = true;
|
||||
this.inputType = inputType;
|
||||
bytesArray = null;
|
||||
bytes = new BytesStore(bytesPageBits);
|
||||
this.outputs = outputs;
|
||||
}
|
||||
|
||||
/** Expert: creates an FST by packing this one. This
|
||||
* process requires substantial additional RAM (currently
|
||||
* up to ~8 bytes per node depending on
|
||||
* <code>acceptableOverheadRatio</code>), but then should
|
||||
* produce a smaller FST.
|
||||
*
|
||||
* <p>The implementation of this method uses ideas from
|
||||
* <a target="_blank" href="http://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf">Smaller Representation of Finite State Automata</a>,
|
||||
* which describes techniques to reduce the size of a FST.
|
||||
* However, this is not a strict implementation of the
|
||||
* algorithms described in this paper.
|
||||
*/
|
||||
FST<T> pack(Builder<T> builder, int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException {
|
||||
|
||||
// NOTE: maxDerefNodes is intentionally int: we cannot
|
||||
// support > 2.1B deref nodes
|
||||
|
||||
// TODO: other things to try
|
||||
// - renumber the nodes to get more next / better locality?
|
||||
// - allow multiple input labels on an arc, so
|
||||
// singular chain of inputs can take one arc (on
|
||||
// wikipedia terms this could save another ~6%)
|
||||
// - in the ord case, the output '1' is presumably
|
||||
// very common (after NO_OUTPUT)... maybe use a bit
|
||||
// for it..?
|
||||
// - use spare bits in flags.... for top few labels /
|
||||
// outputs / targets
|
||||
|
||||
if (nodeAddress == null) {
|
||||
throw new IllegalArgumentException("this FST was not built with willPackFST=true");
|
||||
}
|
||||
|
||||
T NO_OUTPUT = outputs.getNoOutput();
|
||||
|
||||
Arc<T> arc = new Arc<>();
|
||||
|
||||
final BytesReader r = getBytesReader();
|
||||
|
||||
final int topN = Math.min(maxDerefNodes, inCounts.size());
|
||||
|
||||
// Find top nodes with highest number of incoming arcs:
|
||||
NodeQueue q = new NodeQueue(topN);
|
||||
|
||||
// TODO: we could use more RAM efficient selection algo here...
|
||||
NodeAndInCount bottom = null;
|
||||
for(int node=0; node<inCounts.size(); node++) {
|
||||
if (inCounts.get(node) >= minInCountDeref) {
|
||||
if (bottom == null) {
|
||||
q.add(new NodeAndInCount(node, (int) inCounts.get(node)));
|
||||
if (q.size() == topN) {
|
||||
bottom = q.top();
|
||||
}
|
||||
} else if (inCounts.get(node) > bottom.count) {
|
||||
q.insertWithOverflow(new NodeAndInCount(node, (int) inCounts.get(node)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Free up RAM:
|
||||
inCounts = null;
|
||||
|
||||
final Map<Integer,Integer> topNodeMap = new HashMap<>();
|
||||
for(int downTo=q.size()-1;downTo>=0;downTo--) {
|
||||
NodeAndInCount n = q.pop();
|
||||
topNodeMap.put(n.node, downTo);
|
||||
//System.out.println("map node=" + n.node + " inCount=" + n.count + " to newID=" + downTo);
|
||||
}
|
||||
|
||||
// +1 because node ords start at 1 (0 is reserved as stop node):
|
||||
final GrowableWriter newNodeAddress = new GrowableWriter(
|
||||
PackedInts.bitsRequired(builder.bytes.getPosition()), (int) (1 + builder.nodeCount), acceptableOverheadRatio);
|
||||
|
||||
// Fill initial coarse guess:
|
||||
for(int node=1;node<=builder.nodeCount;node++) {
|
||||
newNodeAddress.set(node, 1 + builder.bytes.getPosition() - nodeAddress.get(node));
|
||||
}
|
||||
|
||||
int absCount;
|
||||
int deltaCount;
|
||||
int topCount;
|
||||
int nextCount;
|
||||
|
||||
FST<T> fst;
|
||||
|
||||
// Iterate until we converge:
|
||||
while(true) {
|
||||
|
||||
//System.out.println("\nITER");
|
||||
boolean changed = false;
|
||||
|
||||
// for assert:
|
||||
boolean negDelta = false;
|
||||
|
||||
fst = new FST<>(inputType, outputs, builder.bytes.getBlockBits());
|
||||
|
||||
final BytesStore writer = fst.bytes;
|
||||
|
||||
// Skip 0 byte since 0 is reserved target:
|
||||
writer.writeByte((byte) 0);
|
||||
|
||||
absCount = deltaCount = topCount = nextCount = 0;
|
||||
|
||||
int changedCount = 0;
|
||||
|
||||
long addressError = 0;
|
||||
|
||||
//int totWasted = 0;
|
||||
|
||||
// Since we re-reverse the bytes, we now write the
|
||||
// nodes backwards, so that BIT_TARGET_NEXT is
|
||||
// unchanged:
|
||||
for(int node=(int) builder.nodeCount;node>=1;node--) {
|
||||
final long address = writer.getPosition();
|
||||
|
||||
//System.out.println(" node: " + node + " address=" + address);
|
||||
if (address != newNodeAddress.get(node)) {
|
||||
addressError = address - newNodeAddress.get(node);
|
||||
//System.out.println(" change: " + (address - newNodeAddress[node]));
|
||||
changed = true;
|
||||
newNodeAddress.set(node, address);
|
||||
changedCount++;
|
||||
}
|
||||
|
||||
int nodeArcCount = 0;
|
||||
int bytesPerArc = 0;
|
||||
|
||||
boolean retry = false;
|
||||
|
||||
// for assert:
|
||||
boolean anyNegDelta = false;
|
||||
|
||||
// Retry loop: possibly iterate more than once, if
|
||||
// this is an array'd node and bytesPerArc changes:
|
||||
writeNode:
|
||||
while(true) { // retry writing this node
|
||||
|
||||
//System.out.println(" cycle: retry");
|
||||
readFirstRealTargetArc(node, arc, r);
|
||||
|
||||
final boolean useArcArray = arc.bytesPerArc != 0;
|
||||
if (useArcArray) {
|
||||
// Write false first arc:
|
||||
if (bytesPerArc == 0) {
|
||||
bytesPerArc = arc.bytesPerArc;
|
||||
}
|
||||
writer.writeByte(ARCS_AS_FIXED_ARRAY);
|
||||
writer.writeVInt(arc.numArcs);
|
||||
writer.writeVInt(bytesPerArc);
|
||||
//System.out.println("node " + node + ": " + arc.numArcs + " arcs");
|
||||
}
|
||||
|
||||
int maxBytesPerArc = 0;
|
||||
//int wasted = 0;
|
||||
while(true) { // iterate over all arcs for this node
|
||||
//System.out.println(" cycle next arc");
|
||||
|
||||
final long arcStartPos = writer.getPosition();
|
||||
nodeArcCount++;
|
||||
|
||||
byte flags = 0;
|
||||
|
||||
if (arc.isLast()) {
|
||||
flags += BIT_LAST_ARC;
|
||||
}
|
||||
/*
|
||||
if (!useArcArray && nodeUpto < nodes.length-1 && arc.target == nodes[nodeUpto+1]) {
|
||||
flags += BIT_TARGET_NEXT;
|
||||
}
|
||||
*/
|
||||
if (!useArcArray && node != 1 && arc.target == node-1) {
|
||||
flags += BIT_TARGET_NEXT;
|
||||
if (!retry) {
|
||||
nextCount++;
|
||||
}
|
||||
}
|
||||
if (arc.isFinal()) {
|
||||
flags += BIT_FINAL_ARC;
|
||||
if (arc.nextFinalOutput != NO_OUTPUT) {
|
||||
flags += BIT_ARC_HAS_FINAL_OUTPUT;
|
||||
}
|
||||
} else {
|
||||
assert arc.nextFinalOutput == NO_OUTPUT;
|
||||
}
|
||||
if (!targetHasArcs(arc)) {
|
||||
flags += BIT_STOP_NODE;
|
||||
}
|
||||
|
||||
if (arc.output != NO_OUTPUT) {
|
||||
flags += BIT_ARC_HAS_OUTPUT;
|
||||
}
|
||||
|
||||
final long absPtr;
|
||||
final boolean doWriteTarget = targetHasArcs(arc) && (flags & BIT_TARGET_NEXT) == 0;
|
||||
if (doWriteTarget) {
|
||||
|
||||
final Integer ptr = topNodeMap.get(arc.target);
|
||||
if (ptr != null) {
|
||||
absPtr = ptr;
|
||||
} else {
|
||||
absPtr = topNodeMap.size() + newNodeAddress.get((int) arc.target) + addressError;
|
||||
}
|
||||
|
||||
long delta = newNodeAddress.get((int) arc.target) + addressError - writer.getPosition() - 2;
|
||||
if (delta < 0) {
|
||||
//System.out.println("neg: " + delta);
|
||||
anyNegDelta = true;
|
||||
delta = 0;
|
||||
}
|
||||
|
||||
if (delta < absPtr) {
|
||||
flags |= BIT_TARGET_DELTA;
|
||||
}
|
||||
} else {
|
||||
absPtr = 0;
|
||||
}
|
||||
|
||||
assert flags != ARCS_AS_FIXED_ARRAY;
|
||||
writer.writeByte(flags);
|
||||
|
||||
fst.writeLabel(writer, arc.label);
|
||||
|
||||
if (arc.output != NO_OUTPUT) {
|
||||
outputs.write(arc.output, writer);
|
||||
}
|
||||
if (arc.nextFinalOutput != NO_OUTPUT) {
|
||||
outputs.writeFinalOutput(arc.nextFinalOutput, writer);
|
||||
}
|
||||
|
||||
if (doWriteTarget) {
|
||||
|
||||
long delta = newNodeAddress.get((int) arc.target) + addressError - writer.getPosition();
|
||||
if (delta < 0) {
|
||||
anyNegDelta = true;
|
||||
//System.out.println("neg: " + delta);
|
||||
delta = 0;
|
||||
}
|
||||
|
||||
if (flag(flags, BIT_TARGET_DELTA)) {
|
||||
//System.out.println(" delta");
|
||||
writer.writeVLong(delta);
|
||||
if (!retry) {
|
||||
deltaCount++;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
if (ptr != null) {
|
||||
System.out.println(" deref");
|
||||
} else {
|
||||
System.out.println(" abs");
|
||||
}
|
||||
*/
|
||||
writer.writeVLong(absPtr);
|
||||
if (!retry) {
|
||||
if (absPtr >= topNodeMap.size()) {
|
||||
absCount++;
|
||||
} else {
|
||||
topCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (useArcArray) {
|
||||
final int arcBytes = (int) (writer.getPosition() - arcStartPos);
|
||||
//System.out.println(" " + arcBytes + " bytes");
|
||||
maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes);
|
||||
// NOTE: this may in fact go "backwards", if
|
||||
// somehow (rarely, possibly never) we use
|
||||
// more bytesPerArc in this rewrite than the
|
||||
// incoming FST did... but in this case we
|
||||
// will retry (below) so it's OK to ovewrite
|
||||
// bytes:
|
||||
//wasted += bytesPerArc - arcBytes;
|
||||
writer.skipBytes((int) (arcStartPos + bytesPerArc - writer.getPosition()));
|
||||
}
|
||||
|
||||
if (arc.isLast()) {
|
||||
break;
|
||||
}
|
||||
|
||||
readNextRealArc(arc, r);
|
||||
}
|
||||
|
||||
if (useArcArray) {
|
||||
if (maxBytesPerArc == bytesPerArc || (retry && maxBytesPerArc <= bytesPerArc)) {
|
||||
// converged
|
||||
//System.out.println(" bba=" + bytesPerArc + " wasted=" + wasted);
|
||||
//totWasted += wasted;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
//System.out.println(" retry this node maxBytesPerArc=" + maxBytesPerArc + " vs " + bytesPerArc);
|
||||
|
||||
// Retry:
|
||||
bytesPerArc = maxBytesPerArc;
|
||||
writer.truncate(address);
|
||||
nodeArcCount = 0;
|
||||
retry = true;
|
||||
anyNegDelta = false;
|
||||
}
|
||||
|
||||
negDelta |= anyNegDelta;
|
||||
}
|
||||
|
||||
if (!changed) {
|
||||
// We don't renumber the nodes (just reverse their
|
||||
// order) so nodes should only point forward to
|
||||
// other nodes because we only produce acyclic FSTs
|
||||
// w/ nodes only pointing "forwards":
|
||||
assert !negDelta;
|
||||
//System.out.println("TOT wasted=" + totWasted);
|
||||
// Converged!
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
long maxAddress = 0;
|
||||
for (long key : topNodeMap.keySet()) {
|
||||
maxAddress = Math.max(maxAddress, newNodeAddress.get((int) key));
|
||||
}
|
||||
|
||||
PackedInts.Mutable nodeRefToAddressIn = PackedInts.getMutable(topNodeMap.size(),
|
||||
PackedInts.bitsRequired(maxAddress), acceptableOverheadRatio);
|
||||
for(Map.Entry<Integer,Integer> ent : topNodeMap.entrySet()) {
|
||||
nodeRefToAddressIn.set(ent.getValue(), newNodeAddress.get(ent.getKey()));
|
||||
}
|
||||
fst.nodeRefToAddress = nodeRefToAddressIn;
|
||||
|
||||
fst.startNode = newNodeAddress.get((int) startNode);
|
||||
//System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode);
|
||||
|
||||
if (emptyOutput != null) {
|
||||
fst.setEmptyOutput(emptyOutput);
|
||||
}
|
||||
|
||||
fst.bytes.finish();
|
||||
fst.cacheRootArcs();
|
||||
|
||||
//final int size = fst.sizeInBytes();
|
||||
//System.out.println("nextCount=" + nextCount + " topCount=" + topCount + " deltaCount=" + deltaCount + " absCount=" + absCount);
|
||||
|
||||
return fst;
|
||||
}
|
||||
|
||||
private static class NodeAndInCount implements Comparable<NodeAndInCount> {
|
||||
final int node;
|
||||
final int count;
|
||||
|
||||
public NodeAndInCount(int node, int count) {
|
||||
this.node = node;
|
||||
this.count = count;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(NodeAndInCount other) {
|
||||
if (count > other.count) {
|
||||
return 1;
|
||||
} else if (count < other.count) {
|
||||
return -1;
|
||||
} else {
|
||||
// Tie-break: smaller node compares as greater than
|
||||
return other.node - node;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static class NodeQueue extends PriorityQueue<NodeAndInCount> {
|
||||
public NodeQueue(int topN) {
|
||||
super(topN, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean lessThan(NodeAndInCount a, NodeAndInCount b) {
|
||||
final int cmp = a.compareTo(b);
|
||||
assert cmp != 0;
|
||||
return cmp < 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,7 +24,6 @@
|
|||
* <li>Fast and low memory overhead construction of the minimal FST
|
||||
* (but inputs must be provided in sorted order)</li>
|
||||
* <li>Low object overhead and quick deserialization (byte[] representation)</li>
|
||||
* <li>Optional two-pass compression: {@link org.apache.lucene.util.fst.FST#pack FST.pack()}</li>
|
||||
* <li>{@link org.apache.lucene.util.fst.Util#getByOutput Lookup-by-output} when the
|
||||
* outputs are in sorted order (e.g., ordinals or file pointers)</li>
|
||||
* <li>Pluggable {@link org.apache.lucene.util.fst.Outputs Outputs} representation</li>
|
||||
|
|
|
@ -29,7 +29,6 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TimeUnits;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
import org.junit.Ignore;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
|
||||
|
@ -47,16 +46,14 @@ public class Test2BFST extends LuceneTestCase {
|
|||
|
||||
Directory dir = new MMapDirectory(createTempDir("2BFST"));
|
||||
|
||||
for(int doPackIter=0;doPackIter<2;doPackIter++) {
|
||||
boolean doPack = doPackIter == 1;
|
||||
|
||||
for(int iter=0;iter<1;iter++) {
|
||||
// Build FST w/ NoOutputs and stop when nodeCount > 2.2B
|
||||
if (!doPack) {
|
||||
{
|
||||
System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
|
||||
Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||
Object NO_OUTPUT = outputs.getNoOutput();
|
||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
|
||||
doPack, PackedInts.COMPACT, true, 15);
|
||||
true, 15);
|
||||
|
||||
int count = 0;
|
||||
Random r = new Random(seed);
|
||||
|
@ -135,10 +132,10 @@ public class Test2BFST extends LuceneTestCase {
|
|||
// Build FST w/ ByteSequenceOutputs and stop when FST
|
||||
// size = 3GB
|
||||
{
|
||||
System.out.println("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes");
|
||||
System.out.println("\nTEST: 3 GB size; outputs=bytes");
|
||||
Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
|
||||
final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
|
||||
doPack, PackedInts.COMPACT, true, 15);
|
||||
true, 15);
|
||||
|
||||
byte[] outputBytes = new byte[20];
|
||||
BytesRef output = new BytesRef(outputBytes);
|
||||
|
@ -212,10 +209,10 @@ public class Test2BFST extends LuceneTestCase {
|
|||
// Build FST w/ PositiveIntOutputs and stop when FST
|
||||
// size = 3GB
|
||||
{
|
||||
System.out.println("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long");
|
||||
System.out.println("\nTEST: 3 GB size; outputs=long");
|
||||
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
|
||||
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
|
||||
doPack, PackedInts.COMPACT, true, 15);
|
||||
true, 15);
|
||||
|
||||
long output = 1;
|
||||
|
||||
|
|
|
@ -76,7 +76,6 @@ import org.apache.lucene.util.fst.FST.Arc;
|
|||
import org.apache.lucene.util.fst.FST.BytesReader;
|
||||
import org.apache.lucene.util.fst.PairOutputs.Pair;
|
||||
import org.apache.lucene.util.fst.Util.Result;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
import static org.apache.lucene.util.fst.FSTTester.getRandomString;
|
||||
import static org.apache.lucene.util.fst.FSTTester.simpleRandomString;
|
||||
|
@ -328,9 +327,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
writer.close();
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
|
||||
final boolean doRewrite = random().nextBoolean();
|
||||
|
||||
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, doRewrite, PackedInts.DEFAULT, true, 15);
|
||||
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
||||
|
||||
boolean storeOrd = random().nextBoolean();
|
||||
if (VERBOSE) {
|
||||
|
@ -464,16 +461,14 @@ public class TestFSTs extends LuceneTestCase {
|
|||
private int inputMode;
|
||||
private final Outputs<T> outputs;
|
||||
private final Builder<T> builder;
|
||||
private final boolean doPack;
|
||||
|
||||
public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean doPack, boolean noArcArrays) {
|
||||
public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean noArcArrays) {
|
||||
this.dirOut = dirOut;
|
||||
this.wordsFileIn = wordsFileIn;
|
||||
this.inputMode = inputMode;
|
||||
this.outputs = outputs;
|
||||
this.doPack = doPack;
|
||||
|
||||
builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, doPack, PackedInts.DEFAULT, !noArcArrays, 15);
|
||||
builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15);
|
||||
}
|
||||
|
||||
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
|
||||
|
@ -622,7 +617,6 @@ public class TestFSTs extends LuceneTestCase {
|
|||
boolean storeOrds = false;
|
||||
boolean storeDocFreqs = false;
|
||||
boolean verify = true;
|
||||
boolean doPack = false;
|
||||
boolean noArcArrays = false;
|
||||
Path wordsFileIn = null;
|
||||
Path dirOut = null;
|
||||
|
@ -647,8 +641,6 @@ public class TestFSTs extends LuceneTestCase {
|
|||
storeOrds = true;
|
||||
} else if (args[idx].equals("-noverify")) {
|
||||
verify = false;
|
||||
} else if (args[idx].equals("-pack")) {
|
||||
doPack = true;
|
||||
} else if (args[idx].startsWith("-")) {
|
||||
System.err.println("Unrecognized option: " + args[idx]);
|
||||
System.exit(-1);
|
||||
|
@ -677,7 +669,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton();
|
||||
final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton();
|
||||
final PairOutputs<Long,Long> outputs = new PairOutputs<>(o1, o2);
|
||||
new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
|
||||
new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
|
||||
Random rand;
|
||||
@Override
|
||||
public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
|
||||
|
@ -691,7 +683,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
} else if (storeOrds) {
|
||||
// Store only ords
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
|
||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
|
||||
@Override
|
||||
public Long getOutput(IntsRef input, int ord) {
|
||||
return (long) ord;
|
||||
|
@ -700,7 +692,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
} else if (storeDocFreqs) {
|
||||
// Store only docFreq
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
|
||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
|
||||
Random rand;
|
||||
@Override
|
||||
public Long getOutput(IntsRef input, int ord) {
|
||||
|
@ -714,7 +706,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
// Store nothing
|
||||
final NoOutputs outputs = NoOutputs.getSingleton();
|
||||
final Object NO_OUTPUT = outputs.getNoOutput();
|
||||
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
|
||||
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
|
||||
@Override
|
||||
public Object getOutput(IntsRef input, int ord) {
|
||||
return NO_OUTPUT;
|
||||
|
@ -1118,7 +1110,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
public void testFinalOutputOnEndState() throws Exception {
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
|
||||
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, random().nextBoolean(), PackedInts.DEFAULT, true, 15);
|
||||
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
||||
builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
|
||||
builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
|
||||
final FST<Long> fst = builder.finish();
|
||||
|
@ -1132,8 +1124,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
|
||||
public void testInternalFinalState() throws Exception {
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
final boolean willRewrite = random().nextBoolean();
|
||||
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, willRewrite, PackedInts.DEFAULT, true, 15);
|
||||
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
||||
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
|
||||
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
|
||||
final FST<Long> fst = builder.finish();
|
||||
|
|
|
@ -50,7 +50,6 @@ import org.apache.lucene.util.fst.PairOutputs.Pair;
|
|||
import org.apache.lucene.util.fst.PairOutputs;
|
||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/*
|
||||
TODO:
|
||||
|
@ -354,8 +353,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
|
|||
|
||||
final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
|
||||
0, 0, true, false, Integer.MAX_VALUE,
|
||||
FST_OUTPUTS, false,
|
||||
PackedInts.COMPACT, true, 15);
|
||||
FST_OUTPUTS, true, 15);
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" compile index for prefix=" + prefix);
|
||||
//}
|
||||
|
|
|
@ -26,7 +26,6 @@ import org.apache.lucene.util.BytesRefBuilder;
|
|||
import org.apache.lucene.util.BytesRefIterator;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
import org.apache.lucene.util.fst.*;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* Finite state automata based implementation of "autocomplete" functionality.
|
||||
|
@ -237,8 +236,7 @@ public class FSTCompletionBuilder {
|
|||
final Object empty = outputs.getNoOutput();
|
||||
final Builder<Object> builder = new Builder<>(
|
||||
FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
|
||||
shareMaxTailLength, outputs, false,
|
||||
PackedInts.DEFAULT, true, 15);
|
||||
shareMaxTailLength, outputs, true, 15);
|
||||
|
||||
BytesRefBuilder scratch = new BytesRefBuilder();
|
||||
BytesRef entry;
|
||||
|
|
|
@ -40,7 +40,6 @@ import org.apache.lucene.util.IntsRefBuilder;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
|
@ -273,25 +272,14 @@ public class FSTTester<T> {
|
|||
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
|
||||
}
|
||||
|
||||
final boolean willRewrite = random.nextBoolean();
|
||||
|
||||
final Builder<T> builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
|
||||
prune1, prune2,
|
||||
prune1==0 && prune2==0,
|
||||
allowRandomSuffixSharing ? random.nextBoolean() : true,
|
||||
allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
|
||||
outputs,
|
||||
willRewrite,
|
||||
PackedInts.DEFAULT,
|
||||
true,
|
||||
15);
|
||||
if (LuceneTestCase.VERBOSE) {
|
||||
if (willRewrite) {
|
||||
System.out.println("TEST: packed FST");
|
||||
} else {
|
||||
System.out.println("TEST: non-packed FST");
|
||||
}
|
||||
}
|
||||
|
||||
for(InputOutput<T> pair : pairs) {
|
||||
if (pair.output instanceof List) {
|
||||
|
@ -306,7 +294,7 @@ public class FSTTester<T> {
|
|||
}
|
||||
FST<T> fst = builder.finish();
|
||||
|
||||
if (random.nextBoolean() && fst != null && !willRewrite) {
|
||||
if (random.nextBoolean() && fst != null) {
|
||||
IOContext context = LuceneTestCase.newIOContext(random);
|
||||
IndexOutput out = dir.createOutput("fst.bin", context);
|
||||
fst.save(out);
|
||||
|
|
Loading…
Reference in New Issue