mirror of https://github.com/apache/lucene.git
LUCENE-3725: add optional packing to FSTs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1237500 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2e5be2f75c
commit
d1165b1972
|
@ -806,6 +806,9 @@ New Features
|
||||||
|
|
||||||
* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
|
* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
|
||||||
markup. (Steve Rowe)
|
markup. (Steve Rowe)
|
||||||
|
|
||||||
|
* LUCENE-3725: Added optional packing to FST building; this uses extra
|
||||||
|
RAM during building but results in a smaller FST. (Mike McCandless)
|
||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
|
|
|
@ -398,7 +398,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
||||||
final long indexStartFP;
|
final long indexStartFP;
|
||||||
final long rootBlockFP;
|
final long rootBlockFP;
|
||||||
final BytesRef rootCode;
|
final BytesRef rootCode;
|
||||||
private FST<BytesRef> index;
|
private final FST<BytesRef> index;
|
||||||
|
|
||||||
//private boolean DEBUG;
|
//private boolean DEBUG;
|
||||||
|
|
||||||
|
@ -433,6 +433,8 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
||||||
w.close();
|
w.close();
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
} else {
|
||||||
|
index = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -495,6 +497,8 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
||||||
|
|
||||||
private final BytesRef term = new BytesRef();
|
private final BytesRef term = new BytesRef();
|
||||||
|
|
||||||
|
private final FST.BytesReader fstReader;
|
||||||
|
|
||||||
// TODO: can we share this with the frame in STE?
|
// TODO: can we share this with the frame in STE?
|
||||||
private final class Frame {
|
private final class Frame {
|
||||||
final int ord;
|
final int ord;
|
||||||
|
@ -755,6 +759,12 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
||||||
arcs[arcIdx] = new FST.Arc<BytesRef>();
|
arcs[arcIdx] = new FST.Arc<BytesRef>();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (index == null) {
|
||||||
|
fstReader = null;
|
||||||
|
} else {
|
||||||
|
fstReader = index.getBytesReader(0);
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: if the automaton is "smallish" we really
|
// TODO: if the automaton is "smallish" we really
|
||||||
// should use the terms index to seek at least to
|
// should use the terms index to seek at least to
|
||||||
// the initial term and likely to subsequent terms
|
// the initial term and likely to subsequent terms
|
||||||
|
@ -842,7 +852,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
||||||
// TODO: we could be more efficient for the next()
|
// TODO: we could be more efficient for the next()
|
||||||
// case by using current arc as starting point,
|
// case by using current arc as starting point,
|
||||||
// passed to findTargetArc
|
// passed to findTargetArc
|
||||||
arc = index.findTargetArc(target, arc, getArc(1+idx));
|
arc = index.findTargetArc(target, arc, getArc(1+idx), fstReader);
|
||||||
assert arc != null;
|
assert arc != null;
|
||||||
output = fstOutputs.add(output, arc.output);
|
output = fstOutputs.add(output, arc.output);
|
||||||
idx++;
|
idx++;
|
||||||
|
@ -1186,6 +1196,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
||||||
private boolean eof;
|
private boolean eof;
|
||||||
|
|
||||||
final BytesRef term = new BytesRef();
|
final BytesRef term = new BytesRef();
|
||||||
|
private final FST.BytesReader fstReader;
|
||||||
|
|
||||||
@SuppressWarnings("unchecked") private FST.Arc<BytesRef>[] arcs = new FST.Arc[1];
|
@SuppressWarnings("unchecked") private FST.Arc<BytesRef>[] arcs = new FST.Arc[1];
|
||||||
|
|
||||||
|
@ -1196,6 +1207,12 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
||||||
// Used to hold seek by TermState, or cached seek
|
// Used to hold seek by TermState, or cached seek
|
||||||
staticFrame = new Frame(-1);
|
staticFrame = new Frame(-1);
|
||||||
|
|
||||||
|
if (index == null) {
|
||||||
|
fstReader = null;
|
||||||
|
} else {
|
||||||
|
fstReader = index.getBytesReader(0);
|
||||||
|
}
|
||||||
|
|
||||||
// Init w/ root block; don't use index since it may
|
// Init w/ root block; don't use index since it may
|
||||||
// not (and need not) have been loaded
|
// not (and need not) have been loaded
|
||||||
for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
|
for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
|
||||||
|
@ -1581,7 +1598,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
||||||
|
|
||||||
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
|
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
|
||||||
|
|
||||||
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto));
|
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);
|
||||||
|
|
||||||
if (nextArc == null) {
|
if (nextArc == null) {
|
||||||
|
|
||||||
|
@ -1838,7 +1855,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
||||||
|
|
||||||
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
|
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
|
||||||
|
|
||||||
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto));
|
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);
|
||||||
|
|
||||||
if (nextArc == null) {
|
if (nextArc == null) {
|
||||||
|
|
||||||
|
|
|
@ -288,7 +288,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
||||||
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||||
final Builder<BytesRef> indexBuilder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1,
|
final Builder<BytesRef> indexBuilder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1,
|
||||||
0, 0, true, false, Integer.MAX_VALUE,
|
0, 0, true, false, Integer.MAX_VALUE,
|
||||||
outputs, null);
|
outputs, null, false);
|
||||||
//if (DEBUG) {
|
//if (DEBUG) {
|
||||||
// System.out.println(" compile index for prefix=" + prefix);
|
// System.out.println(" compile index for prefix=" + prefix);
|
||||||
//}
|
//}
|
||||||
|
@ -831,7 +831,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
||||||
0, 0, true,
|
0, 0, true,
|
||||||
true, Integer.MAX_VALUE,
|
true, Integer.MAX_VALUE,
|
||||||
noOutputs,
|
noOutputs,
|
||||||
new FindBlocks());
|
new FindBlocks(), false);
|
||||||
|
|
||||||
postingsWriter.setField(fieldInfo);
|
postingsWriter.setField(fieldInfo);
|
||||||
}
|
}
|
||||||
|
|
|
@ -229,7 +229,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||||
////System.out.println("VGW: field=" + fieldInfo.name);
|
////System.out.println("VGW: field=" + fieldInfo.name);
|
||||||
|
|
||||||
// Always put empty string in
|
// Always put empty string in
|
||||||
fstBuilder.add(new IntsRef(), fstOutputs.get(termsFilePointer));
|
fstBuilder.add(new IntsRef(), termsFilePointer);
|
||||||
startTermsFilePointer = termsFilePointer;
|
startTermsFilePointer = termsFilePointer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -260,7 +260,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||||
final int lengthSave = text.length;
|
final int lengthSave = text.length;
|
||||||
text.length = indexedTermPrefixLength(lastTerm, text);
|
text.length = indexedTermPrefixLength(lastTerm, text);
|
||||||
try {
|
try {
|
||||||
fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), fstOutputs.get(termsFilePointer));
|
fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
|
||||||
} finally {
|
} finally {
|
||||||
text.length = lengthSave;
|
text.length = lengthSave;
|
||||||
}
|
}
|
||||||
|
|
|
@ -521,9 +521,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
private void loadTerms() throws IOException {
|
private void loadTerms() throws IOException {
|
||||||
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
|
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
|
||||||
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
|
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
|
||||||
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
|
final PairOutputs<Long,Long> outputsInner = new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs);
|
||||||
new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
|
final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
|
||||||
new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
|
outputsInner);
|
||||||
|
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
||||||
in.seek(termsStart);
|
in.seek(termsStart);
|
||||||
final BytesRef lastTerm = new BytesRef(10);
|
final BytesRef lastTerm = new BytesRef(10);
|
||||||
|
@ -536,9 +537,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
SimpleTextUtil.readLine(in, scratch);
|
SimpleTextUtil.readLine(in, scratch);
|
||||||
if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
|
if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
|
||||||
if (lastDocsStart != -1) {
|
if (lastDocsStart != -1) {
|
||||||
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
|
b.add(Util.toIntsRef(lastTerm, scratchIntsRef),
|
||||||
new PairOutputs.Pair<Long,Long>((long) docFreq,
|
outputs.newPair(lastDocsStart,
|
||||||
posIntOutputs.get(totalTermFreq))));
|
outputsInner.newPair((long) docFreq, totalTermFreq)));
|
||||||
sumTotalTermFreq += totalTermFreq;
|
sumTotalTermFreq += totalTermFreq;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -553,9 +554,8 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
|
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
|
||||||
} else if (StringHelper.startsWith(scratch, TERM)) {
|
} else if (StringHelper.startsWith(scratch, TERM)) {
|
||||||
if (lastDocsStart != -1) {
|
if (lastDocsStart != -1) {
|
||||||
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
|
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair(lastDocsStart,
|
||||||
new PairOutputs.Pair<Long,Long>((long) docFreq,
|
outputsInner.newPair((long) docFreq, totalTermFreq)));
|
||||||
posIntOutputs.get(totalTermFreq))));
|
|
||||||
}
|
}
|
||||||
lastDocsStart = in.getFilePointer();
|
lastDocsStart = in.getFilePointer();
|
||||||
final int len = scratch.length - TERM.length;
|
final int len = scratch.length - TERM.length;
|
||||||
|
|
|
@ -95,7 +95,7 @@ public final class FixedBitSet extends DocIdSet implements Bits {
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean get(int index) {
|
public boolean get(int index) {
|
||||||
assert index >= 0 && index < numBits;
|
assert index >= 0 && index < numBits: "index=" + index;
|
||||||
int i = index >> 6; // div 64
|
int i = index >> 6; // div 64
|
||||||
// signed shift will keep a negative index and force an
|
// signed shift will keep a negative index and force an
|
||||||
// array-index-out-of-bounds-exception, removing the need for an explicit check.
|
// array-index-out-of-bounds-exception, removing the need for an explicit check.
|
||||||
|
|
|
@ -588,7 +588,7 @@ public final class UnicodeUtil {
|
||||||
out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
|
out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
|
||||||
offset += 2;
|
offset += 2;
|
||||||
} else {
|
} else {
|
||||||
assert b < 0xf8;
|
assert b < 0xf8: "b=" + b;
|
||||||
int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
|
int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
|
||||||
offset += 3;
|
offset += 3;
|
||||||
if (ch < UNI_MAX_BMP) {
|
if (ch < UNI_MAX_BMP) {
|
||||||
|
|
|
@ -17,15 +17,15 @@ package org.apache.lucene.util.fst;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
|
||||||
import org.apache.lucene.util.RamUsageEstimator;
|
|
||||||
import org.apache.lucene.util.IntsRef;
|
|
||||||
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
|
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds a compact FST (maps an IntsRef term to an arbitrary
|
* Builds a minimal FST (maps an IntsRef term to an arbitrary
|
||||||
* output) from pre-sorted terms with outputs (the FST
|
* output) from pre-sorted terms with outputs (the FST
|
||||||
* becomes an FSA if you use NoOutputs). The FST is written
|
* becomes an FSA if you use NoOutputs). The FST is written
|
||||||
* on-the-fly into a compact serialized format byte array, which can
|
* on-the-fly into a compact serialized format byte array, which can
|
||||||
|
@ -35,12 +35,6 @@ import java.io.IOException;
|
||||||
* <p>NOTE: The algorithm is described at
|
* <p>NOTE: The algorithm is described at
|
||||||
* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698</p>
|
* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698</p>
|
||||||
*
|
*
|
||||||
* If your outputs are ByteSequenceOutput then the final FST
|
|
||||||
* will be minimal, but if you use PositiveIntOutput then
|
|
||||||
* it's only "near minimal". For example, aa/0, aab/1, bbb/2
|
|
||||||
* will produce 6 states when a 5 state fst is also
|
|
||||||
* possible.
|
|
||||||
*
|
|
||||||
* The parameterized type T is the output type. See the
|
* The parameterized type T is the output type. See the
|
||||||
* subclasses of {@link Outputs}.
|
* subclasses of {@link Outputs}.
|
||||||
*
|
*
|
||||||
|
@ -52,7 +46,7 @@ public class Builder<T> {
|
||||||
private final FST<T> fst;
|
private final FST<T> fst;
|
||||||
private final T NO_OUTPUT;
|
private final T NO_OUTPUT;
|
||||||
|
|
||||||
// private static final boolean DEBUG = false;
|
// private static final boolean DEBUG = true;
|
||||||
|
|
||||||
// simplistic pruning: we prune node (and all following
|
// simplistic pruning: we prune node (and all following
|
||||||
// nodes) if less than this number of terms go through it:
|
// nodes) if less than this number of terms go through it:
|
||||||
|
@ -88,7 +82,7 @@ public class Builder<T> {
|
||||||
* pruning options turned off.
|
* pruning options turned off.
|
||||||
*/
|
*/
|
||||||
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
|
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
|
||||||
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null);
|
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -127,16 +121,20 @@ public class Builder<T> {
|
||||||
* @param outputs The output type for each input sequence. Applies only if building an FST. For
|
* @param outputs The output type for each input sequence. Applies only if building an FST. For
|
||||||
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
|
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
|
||||||
* singleton output object.
|
* singleton output object.
|
||||||
|
*
|
||||||
|
* @param willPackFST Pass true if you will rewrite (compact) the FST before saving. This
|
||||||
|
* causes the FST to create additional data structures internally to facilitate rewriting, but
|
||||||
|
* it means the resulting FST cannot be saved: it must first be rewritten using {@link FST#FST(FST,int[])}
|
||||||
*/
|
*/
|
||||||
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
|
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
|
||||||
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
|
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
|
||||||
FreezeTail<T> freezeTail) {
|
FreezeTail<T> freezeTail, boolean willPackFST) {
|
||||||
this.minSuffixCount1 = minSuffixCount1;
|
this.minSuffixCount1 = minSuffixCount1;
|
||||||
this.minSuffixCount2 = minSuffixCount2;
|
this.minSuffixCount2 = minSuffixCount2;
|
||||||
this.freezeTail = freezeTail;
|
this.freezeTail = freezeTail;
|
||||||
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
|
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
|
||||||
this.shareMaxTailLength = shareMaxTailLength;
|
this.shareMaxTailLength = shareMaxTailLength;
|
||||||
fst = new FST<T>(inputType, outputs);
|
fst = new FST<T>(inputType, outputs, willPackFST);
|
||||||
if (doShareSuffix) {
|
if (doShareSuffix) {
|
||||||
dedupHash = new NodeHash<T>(fst);
|
dedupHash = new NodeHash<T>(fst);
|
||||||
} else {
|
} else {
|
||||||
|
@ -170,23 +168,23 @@ public class Builder<T> {
|
||||||
fst.setAllowArrayArcs(b);
|
fst.setAllowArrayArcs(b);
|
||||||
}
|
}
|
||||||
|
|
||||||
private CompiledNode compileNode(UnCompiledNode<T> n, int tailLength) throws IOException {
|
private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throws IOException {
|
||||||
final int address;
|
final int node;
|
||||||
if (dedupHash != null && (doShareNonSingletonNodes || n.numArcs <= 1) && tailLength <= shareMaxTailLength) {
|
if (dedupHash != null && (doShareNonSingletonNodes || nodeIn.numArcs <= 1) && tailLength <= shareMaxTailLength) {
|
||||||
if (n.numArcs == 0) {
|
if (nodeIn.numArcs == 0) {
|
||||||
address = fst.addNode(n);
|
node = fst.addNode(nodeIn);
|
||||||
} else {
|
} else {
|
||||||
address = dedupHash.add(n);
|
node = dedupHash.add(nodeIn);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
address = fst.addNode(n);
|
node = fst.addNode(nodeIn);
|
||||||
}
|
}
|
||||||
assert address != -2;
|
assert node != -2;
|
||||||
|
|
||||||
n.clear();
|
nodeIn.clear();
|
||||||
|
|
||||||
final CompiledNode fn = new CompiledNode();
|
final CompiledNode fn = new CompiledNode();
|
||||||
fn.address = address;
|
fn.node = node;
|
||||||
return fn;
|
return fn;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -319,6 +317,11 @@ public class Builder<T> {
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
// De-dup NO_OUTPUT since it must be a singleton:
|
||||||
|
if (output.equals(NO_OUTPUT)) {
|
||||||
|
output = NO_OUTPUT;
|
||||||
|
}
|
||||||
|
|
||||||
assert lastInput.length == 0 || input.compareTo(lastInput) >= 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input;
|
assert lastInput.length == 0 || input.compareTo(lastInput) >= 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input;
|
||||||
assert validOutput(output);
|
assert validOutput(output);
|
||||||
|
|
||||||
|
@ -443,7 +446,7 @@ public class Builder<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output);
|
//if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output);
|
||||||
fst.finish(compileNode(root, lastInput.length).address);
|
fst.finish(compileNode(root, lastInput.length).node);
|
||||||
|
|
||||||
return fst;
|
return fst;
|
||||||
}
|
}
|
||||||
|
@ -480,7 +483,7 @@ public class Builder<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
static final class CompiledNode implements Node {
|
static final class CompiledNode implements Node {
|
||||||
int address;
|
int node;
|
||||||
public boolean isCompiled() {
|
public boolean isCompiled() {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -560,7 +563,7 @@ public class Builder<T> {
|
||||||
final Arc<T> arc = arcs[numArcs-1];
|
final Arc<T> arc = arcs[numArcs-1];
|
||||||
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;
|
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;
|
||||||
arc.target = target;
|
arc.target = target;
|
||||||
//assert target.address != -2;
|
//assert target.node != -2;
|
||||||
arc.nextFinalOutput = nextFinalOutput;
|
arc.nextFinalOutput = nextFinalOutput;
|
||||||
arc.isFinal = isFinal;
|
arc.isFinal = isFinal;
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -151,7 +151,8 @@ abstract class FSTEnum<T> {
|
||||||
boolean found = false;
|
boolean found = false;
|
||||||
while (low <= high) {
|
while (low <= high) {
|
||||||
mid = (low + high) >>> 1;
|
mid = (low + high) >>> 1;
|
||||||
in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
|
in.pos = arc.posArcsStart;
|
||||||
|
in.skip(arc.bytesPerArc*mid+1);
|
||||||
final int midLabel = fst.readLabel(in);
|
final int midLabel = fst.readLabel(in);
|
||||||
final int cmp = midLabel - targetLabel;
|
final int cmp = midLabel - targetLabel;
|
||||||
//System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
|
//System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
|
||||||
|
@ -275,7 +276,7 @@ abstract class FSTEnum<T> {
|
||||||
|
|
||||||
// Now scan forward, matching the new suffix of the target
|
// Now scan forward, matching the new suffix of the target
|
||||||
while(true) {
|
while(true) {
|
||||||
//System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast());
|
//System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast() + " bba=" + arc.bytesPerArc);
|
||||||
|
|
||||||
if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
|
if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
|
||||||
// Arcs are fixed array -- use binary search to find
|
// Arcs are fixed array -- use binary search to find
|
||||||
|
@ -289,15 +290,16 @@ abstract class FSTEnum<T> {
|
||||||
boolean found = false;
|
boolean found = false;
|
||||||
while (low <= high) {
|
while (low <= high) {
|
||||||
mid = (low + high) >>> 1;
|
mid = (low + high) >>> 1;
|
||||||
in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
|
in.pos = arc.posArcsStart;
|
||||||
|
in.skip(arc.bytesPerArc*mid+1);
|
||||||
final int midLabel = fst.readLabel(in);
|
final int midLabel = fst.readLabel(in);
|
||||||
final int cmp = midLabel - targetLabel;
|
final int cmp = midLabel - targetLabel;
|
||||||
//System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
|
//System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
|
||||||
if (cmp < 0)
|
if (cmp < 0) {
|
||||||
low = mid + 1;
|
low = mid + 1;
|
||||||
else if (cmp > 0)
|
} else if (cmp > 0) {
|
||||||
high = mid - 1;
|
high = mid - 1;
|
||||||
else {
|
} else {
|
||||||
found = true;
|
found = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -430,9 +432,11 @@ abstract class FSTEnum<T> {
|
||||||
FST.Arc<T> arc = getArc(upto-1);
|
FST.Arc<T> arc = getArc(upto-1);
|
||||||
int targetLabel = getTargetLabel();
|
int targetLabel = getTargetLabel();
|
||||||
|
|
||||||
|
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||||
|
|
||||||
while(true) {
|
while(true) {
|
||||||
//System.out.println(" cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel));
|
//System.out.println(" cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel));
|
||||||
final FST.Arc<T> nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto));
|
final FST.Arc<T> nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto), fstReader);
|
||||||
if (nextArc == null) {
|
if (nextArc == null) {
|
||||||
// short circuit
|
// short circuit
|
||||||
//upto--;
|
//upto--;
|
||||||
|
|
|
@ -35,7 +35,7 @@ final class NodeHash<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address, FST.BytesReader in) throws IOException {
|
private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address, FST.BytesReader in) throws IOException {
|
||||||
fst.readFirstRealArc(address, scratchArc, in);
|
fst.readFirstRealTargetArc(address, scratchArc, in);
|
||||||
if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) {
|
if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -43,7 +43,7 @@ final class NodeHash<T> {
|
||||||
final Builder.Arc<T> arc = node.arcs[arcUpto];
|
final Builder.Arc<T> arc = node.arcs[arcUpto];
|
||||||
if (arc.label != scratchArc.label ||
|
if (arc.label != scratchArc.label ||
|
||||||
!arc.output.equals(scratchArc.output) ||
|
!arc.output.equals(scratchArc.output) ||
|
||||||
((Builder.CompiledNode) arc.target).address != scratchArc.target ||
|
((Builder.CompiledNode) arc.target).node != scratchArc.target ||
|
||||||
!arc.nextFinalOutput.equals(scratchArc.nextFinalOutput) ||
|
!arc.nextFinalOutput.equals(scratchArc.nextFinalOutput) ||
|
||||||
arc.isFinal != scratchArc.isFinal()) {
|
arc.isFinal != scratchArc.isFinal()) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -71,9 +71,9 @@ final class NodeHash<T> {
|
||||||
// TODO: maybe if number of arcs is high we can safely subsample?
|
// TODO: maybe if number of arcs is high we can safely subsample?
|
||||||
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
|
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
|
||||||
final Builder.Arc<T> arc = node.arcs[arcIdx];
|
final Builder.Arc<T> arc = node.arcs[arcIdx];
|
||||||
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).address + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
|
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
|
||||||
h = PRIME * h + arc.label;
|
h = PRIME * h + arc.label;
|
||||||
h = PRIME * h + ((Builder.CompiledNode) arc.target).address;
|
h = PRIME * h + ((Builder.CompiledNode) arc.target).node;
|
||||||
h = PRIME * h + arc.output.hashCode();
|
h = PRIME * h + arc.output.hashCode();
|
||||||
h = PRIME * h + arc.nextFinalOutput.hashCode();
|
h = PRIME * h + arc.nextFinalOutput.hashCode();
|
||||||
if (arc.isFinal) {
|
if (arc.isFinal) {
|
||||||
|
@ -88,9 +88,9 @@ final class NodeHash<T> {
|
||||||
private int hash(int node) throws IOException {
|
private int hash(int node) throws IOException {
|
||||||
final int PRIME = 31;
|
final int PRIME = 31;
|
||||||
final FST.BytesReader in = fst.getBytesReader(0);
|
final FST.BytesReader in = fst.getBytesReader(0);
|
||||||
//System.out.println("hash frozen");
|
//System.out.println("hash frozen node=" + node);
|
||||||
int h = 0;
|
int h = 0;
|
||||||
fst.readFirstRealArc(node, scratchArc, in);
|
fst.readFirstRealTargetArc(node, scratchArc, in);
|
||||||
while(true) {
|
while(true) {
|
||||||
//System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal());
|
//System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal());
|
||||||
h = PRIME * h + scratchArc.label;
|
h = PRIME * h + scratchArc.label;
|
||||||
|
@ -109,26 +109,26 @@ final class NodeHash<T> {
|
||||||
return h & Integer.MAX_VALUE;
|
return h & Integer.MAX_VALUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int add(Builder.UnCompiledNode<T> node) throws IOException {
|
public int add(Builder.UnCompiledNode<T> nodeIn) throws IOException {
|
||||||
// System.out.println("hash: add count=" + count + " vs " + table.length);
|
// System.out.println("hash: add count=" + count + " vs " + table.length);
|
||||||
final FST.BytesReader in = fst.getBytesReader(0);
|
final FST.BytesReader in = fst.getBytesReader(0);
|
||||||
final int h = hash(node);
|
final int h = hash(nodeIn);
|
||||||
int pos = h & mask;
|
int pos = h & mask;
|
||||||
int c = 0;
|
int c = 0;
|
||||||
while(true) {
|
while(true) {
|
||||||
final int v = table[pos];
|
final int v = table[pos];
|
||||||
if (v == 0) {
|
if (v == 0) {
|
||||||
// freeze & add
|
// freeze & add
|
||||||
final int address = fst.addNode(node);
|
final int node = fst.addNode(nodeIn);
|
||||||
//System.out.println(" now freeze addr=" + address);
|
//System.out.println(" now freeze node=" + node);
|
||||||
assert hash(address) == h : "frozenHash=" + hash(address) + " vs h=" + h;
|
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
|
||||||
count++;
|
count++;
|
||||||
table[pos] = address;
|
table[pos] = node;
|
||||||
if (table.length < 2*count) {
|
if (table.length < 2*count) {
|
||||||
rehash();
|
rehash();
|
||||||
}
|
}
|
||||||
return address;
|
return node;
|
||||||
} else if (nodesEqual(node, v, in)) {
|
} else if (nodesEqual(nodeIn, v, in)) {
|
||||||
// same node is already here
|
// same node is already here
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,6 +26,10 @@ import org.apache.lucene.store.DataOutput;
|
||||||
* Represents the outputs for an FST, providing the basic
|
* Represents the outputs for an FST, providing the basic
|
||||||
* algebra needed for the FST.
|
* algebra needed for the FST.
|
||||||
*
|
*
|
||||||
|
* <p>Note that any operation that returns NO_OUTPUT must
|
||||||
|
* return the same singleton object from {@link
|
||||||
|
* #getNoOutput}.</p>
|
||||||
|
*
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -56,6 +60,8 @@ public abstract class Outputs<T> {
|
||||||
|
|
||||||
public abstract String outputToString(T output);
|
public abstract String outputToString(T output);
|
||||||
|
|
||||||
|
// TODO: maybe make valid(T output) public...? for asserts
|
||||||
|
|
||||||
public T merge(T first, T second) {
|
public T merge(T first, T second) {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,8 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
|
||||||
public final A output1;
|
public final A output1;
|
||||||
public final B output2;
|
public final B output2;
|
||||||
|
|
||||||
public Pair(A output1, B output2) {
|
// use newPair
|
||||||
|
private Pair(A output1, B output2) {
|
||||||
this.output1 = output1;
|
this.output1 = output1;
|
||||||
this.output2 = output2;
|
this.output2 = output2;
|
||||||
}
|
}
|
||||||
|
@ -66,35 +67,79 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
|
||||||
this.outputs2 = outputs2;
|
this.outputs2 = outputs2;
|
||||||
NO_OUTPUT = new Pair<A,B>(outputs1.getNoOutput(), outputs2.getNoOutput());
|
NO_OUTPUT = new Pair<A,B>(outputs1.getNoOutput(), outputs2.getNoOutput());
|
||||||
}
|
}
|
||||||
|
|
||||||
public Pair<A,B> get(A output1, B output2) {
|
/** Create a new Pair */
|
||||||
if (output1 == outputs1.getNoOutput() && output2 == outputs2.getNoOutput()) {
|
public Pair<A,B> newPair(A a, B b) {
|
||||||
|
if (a.equals(outputs1.getNoOutput())) {
|
||||||
|
a = outputs1.getNoOutput();
|
||||||
|
}
|
||||||
|
if (b.equals(outputs2.getNoOutput())) {
|
||||||
|
b = outputs2.getNoOutput();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (a == outputs1.getNoOutput() && b == outputs2.getNoOutput()) {
|
||||||
return NO_OUTPUT;
|
return NO_OUTPUT;
|
||||||
} else {
|
} else {
|
||||||
return new Pair<A,B>(output1, output2);
|
final Pair<A,B> p = new Pair<A,B>(a, b);
|
||||||
|
assert valid(p);
|
||||||
|
return p;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// for assert
|
||||||
|
private boolean valid(Pair<A,B> pair) {
|
||||||
|
final boolean noOutput1 = pair.output1.equals(outputs1.getNoOutput());
|
||||||
|
final boolean noOutput2 = pair.output2.equals(outputs2.getNoOutput());
|
||||||
|
|
||||||
|
if (noOutput1 && pair.output1 != outputs1.getNoOutput()) {
|
||||||
|
System.out.println("invalid0");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (noOutput2 && pair.output2 != outputs2.getNoOutput()) {
|
||||||
|
System.out.println("invalid1");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (noOutput1 && noOutput2) {
|
||||||
|
if (pair != NO_OUTPUT) {
|
||||||
|
System.out.println("invalid2");
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Pair<A,B> common(Pair<A,B> pair1, Pair<A,B> pair2) {
|
public Pair<A,B> common(Pair<A,B> pair1, Pair<A,B> pair2) {
|
||||||
return get(outputs1.common(pair1.output1, pair2.output1),
|
assert valid(pair1);
|
||||||
outputs2.common(pair1.output2, pair2.output2));
|
assert valid(pair2);
|
||||||
|
return newPair(outputs1.common(pair1.output1, pair2.output1),
|
||||||
|
outputs2.common(pair1.output2, pair2.output2));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Pair<A,B> subtract(Pair<A,B> output, Pair<A,B> inc) {
|
public Pair<A,B> subtract(Pair<A,B> output, Pair<A,B> inc) {
|
||||||
return get(outputs1.subtract(output.output1, inc.output1),
|
assert valid(output);
|
||||||
outputs2.subtract(output.output2, inc.output2));
|
assert valid(inc);
|
||||||
|
return newPair(outputs1.subtract(output.output1, inc.output1),
|
||||||
|
outputs2.subtract(output.output2, inc.output2));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Pair<A,B> add(Pair<A,B> prefix, Pair<A,B> output) {
|
public Pair<A,B> add(Pair<A,B> prefix, Pair<A,B> output) {
|
||||||
return get(outputs1.add(prefix.output1, output.output1),
|
assert valid(prefix);
|
||||||
outputs2.add(prefix.output2, output.output2));
|
assert valid(output);
|
||||||
|
return newPair(outputs1.add(prefix.output1, output.output1),
|
||||||
|
outputs2.add(prefix.output2, output.output2));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void write(Pair<A,B> output, DataOutput writer) throws IOException {
|
public void write(Pair<A,B> output, DataOutput writer) throws IOException {
|
||||||
|
assert valid(output);
|
||||||
outputs1.write(output.output1, writer);
|
outputs1.write(output.output1, writer);
|
||||||
outputs2.write(output.output2, writer);
|
outputs2.write(output.output2, writer);
|
||||||
}
|
}
|
||||||
|
@ -103,7 +148,7 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
|
||||||
public Pair<A,B> read(DataInput in) throws IOException {
|
public Pair<A,B> read(DataInput in) throws IOException {
|
||||||
A output1 = outputs1.read(in);
|
A output1 = outputs1.read(in);
|
||||||
B output2 = outputs2.read(in);
|
B output2 = outputs2.read(in);
|
||||||
return get(output1, output2);
|
return newPair(output1, output2);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -113,6 +158,12 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String outputToString(Pair<A,B> output) {
|
public String outputToString(Pair<A,B> output) {
|
||||||
|
assert valid(output);
|
||||||
return "<pair:" + outputs1.outputToString(output.output1) + "," + outputs2.outputToString(output.output2) + ">";
|
return "<pair:" + outputs1.outputToString(output.output1) + "," + outputs2.outputToString(output.output2) + ">";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "PairOutputs<" + outputs1 + "," + outputs2 + ">";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,10 +25,7 @@ import org.apache.lucene.store.DataOutput;
|
||||||
/**
|
/**
|
||||||
* Output is a long, for each input term. NOTE: the
|
* Output is a long, for each input term. NOTE: the
|
||||||
* resulting FST is not guaranteed to be minimal! See
|
* resulting FST is not guaranteed to be minimal! See
|
||||||
* {@link Builder}. You must use {@link #get} to obtain the
|
* {@link Builder}.
|
||||||
* output for a given long value -- do not use autoboxing
|
|
||||||
* nor create your own Long instance (the value 0
|
|
||||||
* must map to the {@link #getNoOutput} singleton).
|
|
||||||
*
|
*
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
|
@ -50,14 +47,6 @@ public final class PositiveIntOutputs extends Outputs<Long> {
|
||||||
return doShare ? singletonShare : singletonNoShare;
|
return doShare ? singletonShare : singletonNoShare;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Long get(long v) {
|
|
||||||
if (v == 0) {
|
|
||||||
return NO_OUTPUT;
|
|
||||||
} else {
|
|
||||||
return Long.valueOf(v);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Long common(Long output1, Long output2) {
|
public Long common(Long output1, Long output2) {
|
||||||
assert valid(output1);
|
assert valid(output1);
|
||||||
|
|
|
@ -37,23 +37,21 @@ public final class Util {
|
||||||
// TODO: would be nice not to alloc this on every lookup
|
// TODO: would be nice not to alloc this on every lookup
|
||||||
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
|
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
|
||||||
|
|
||||||
|
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||||
|
|
||||||
// Accumulate output as we go
|
// Accumulate output as we go
|
||||||
final T NO_OUTPUT = fst.outputs.getNoOutput();
|
T output = fst.outputs.getNoOutput();
|
||||||
T output = NO_OUTPUT;
|
|
||||||
for(int i=0;i<input.length;i++) {
|
for(int i=0;i<input.length;i++) {
|
||||||
if (fst.findTargetArc(input.ints[input.offset + i], arc, arc) == null) {
|
if (fst.findTargetArc(input.ints[input.offset + i], arc, arc, fstReader) == null) {
|
||||||
return null;
|
return null;
|
||||||
} else if (arc.output != NO_OUTPUT) {
|
|
||||||
output = fst.outputs.add(output, arc.output);
|
|
||||||
}
|
}
|
||||||
|
output = fst.outputs.add(output, arc.output);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
|
if (arc.isFinal()) {
|
||||||
return null;
|
return fst.outputs.add(output, arc.nextFinalOutput);
|
||||||
} else if (arc.output != NO_OUTPUT) {
|
|
||||||
return fst.outputs.add(output, arc.output);
|
|
||||||
} else {
|
} else {
|
||||||
return output;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -64,26 +62,24 @@ public final class Util {
|
||||||
public static<T> T get(FST<T> fst, BytesRef input) throws IOException {
|
public static<T> T get(FST<T> fst, BytesRef input) throws IOException {
|
||||||
assert fst.inputType == FST.INPUT_TYPE.BYTE1;
|
assert fst.inputType == FST.INPUT_TYPE.BYTE1;
|
||||||
|
|
||||||
|
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||||
|
|
||||||
// TODO: would be nice not to alloc this on every lookup
|
// TODO: would be nice not to alloc this on every lookup
|
||||||
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
|
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
|
||||||
|
|
||||||
// Accumulate output as we go
|
// Accumulate output as we go
|
||||||
final T NO_OUTPUT = fst.outputs.getNoOutput();
|
T output = fst.outputs.getNoOutput();
|
||||||
T output = NO_OUTPUT;
|
|
||||||
for(int i=0;i<input.length;i++) {
|
for(int i=0;i<input.length;i++) {
|
||||||
if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc) == null) {
|
if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc, fstReader) == null) {
|
||||||
return null;
|
return null;
|
||||||
} else if (arc.output != NO_OUTPUT) {
|
|
||||||
output = fst.outputs.add(output, arc.output);
|
|
||||||
}
|
}
|
||||||
|
output = fst.outputs.add(output, arc.output);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
|
if (arc.isFinal()) {
|
||||||
return null;
|
return fst.outputs.add(output, arc.nextFinalOutput);
|
||||||
} else if (arc.output != NO_OUTPUT) {
|
|
||||||
return fst.outputs.add(output, arc.output);
|
|
||||||
} else {
|
} else {
|
||||||
return output;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -142,7 +138,7 @@ public final class Util {
|
||||||
result.grow(1+upto);
|
result.grow(1+upto);
|
||||||
}
|
}
|
||||||
|
|
||||||
fst.readFirstRealArc(arc.target, arc, in);
|
fst.readFirstRealTargetArc(arc.target, arc, in);
|
||||||
|
|
||||||
FST.Arc<Long> prevArc = null;
|
FST.Arc<Long> prevArc = null;
|
||||||
|
|
||||||
|
@ -238,6 +234,7 @@ public final class Util {
|
||||||
// A queue of transitions to consider when processing the next level.
|
// A queue of transitions to consider when processing the next level.
|
||||||
final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>();
|
final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>();
|
||||||
nextLevelQueue.add(startArc);
|
nextLevelQueue.add(startArc);
|
||||||
|
//System.out.println("toDot: startArc: " + startArc);
|
||||||
|
|
||||||
// A list of states on the same level (for ranking).
|
// A list of states on the same level (for ranking).
|
||||||
final List<Integer> sameLevelStates = new ArrayList<Integer>();
|
final List<Integer> sameLevelStates = new ArrayList<Integer>();
|
||||||
|
@ -289,8 +286,11 @@ public final class Util {
|
||||||
|
|
||||||
int level = 0;
|
int level = 0;
|
||||||
|
|
||||||
|
final FST.BytesReader r = fst.getBytesReader(0);
|
||||||
|
|
||||||
while (!nextLevelQueue.isEmpty()) {
|
while (!nextLevelQueue.isEmpty()) {
|
||||||
// we could double buffer here, but it doesn't matter probably.
|
// we could double buffer here, but it doesn't matter probably.
|
||||||
|
//System.out.println("next level=" + level);
|
||||||
thisLevelQueue.addAll(nextLevelQueue);
|
thisLevelQueue.addAll(nextLevelQueue);
|
||||||
nextLevelQueue.clear();
|
nextLevelQueue.clear();
|
||||||
|
|
||||||
|
@ -298,19 +298,19 @@ public final class Util {
|
||||||
out.write("\n // Transitions and states at level: " + level + "\n");
|
out.write("\n // Transitions and states at level: " + level + "\n");
|
||||||
while (!thisLevelQueue.isEmpty()) {
|
while (!thisLevelQueue.isEmpty()) {
|
||||||
final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
|
final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
|
||||||
|
//System.out.println(" pop: " + arc);
|
||||||
if (fst.targetHasArcs(arc)) {
|
if (fst.targetHasArcs(arc)) {
|
||||||
// scan all arcs
|
// scan all target arcs
|
||||||
|
//System.out.println(" readFirstTarget...");
|
||||||
final int node = arc.target;
|
final int node = arc.target;
|
||||||
fst.readFirstTargetArc(arc, arc);
|
|
||||||
|
|
||||||
if (arc.label == FST.END_LABEL) {
|
fst.readFirstRealTargetArc(arc.target, arc, r);
|
||||||
// Skip it -- prior recursion took this into account already
|
|
||||||
assert !arc.isLast();
|
//System.out.println(" firstTarget: " + arc);
|
||||||
fst.readNextArc(arc);
|
|
||||||
}
|
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
|
|
||||||
|
//System.out.println(" cycle arc=" + arc);
|
||||||
// Emit the unseen state and add it to the queue for the next level.
|
// Emit the unseen state and add it to the queue for the next level.
|
||||||
if (arc.target >= 0 && !seen.get(arc.target)) {
|
if (arc.target >= 0 && !seen.get(arc.target)) {
|
||||||
|
|
||||||
|
@ -329,7 +329,7 @@ public final class Util {
|
||||||
if (fst.isExpandedTarget(arc)) {
|
if (fst.isExpandedTarget(arc)) {
|
||||||
stateColor = expandedNodeColor;
|
stateColor = expandedNodeColor;
|
||||||
} else {
|
} else {
|
||||||
stateColor = null;
|
stateColor = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
final String finalOutput;
|
final String finalOutput;
|
||||||
|
@ -339,7 +339,9 @@ public final class Util {
|
||||||
finalOutput = "";
|
finalOutput = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
emitDotState(out, Integer.toString(arc.target), arc.isFinal() ? finalStateShape : stateShape, stateColor, finalOutput);
|
emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, finalOutput);
|
||||||
|
// To see the node address, use this instead:
|
||||||
|
//emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target));
|
||||||
seen.set(arc.target);
|
seen.set(arc.target);
|
||||||
nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
|
nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
|
||||||
sameLevelStates.add(arc.target);
|
sameLevelStates.add(arc.target);
|
||||||
|
@ -362,14 +364,22 @@ public final class Util {
|
||||||
outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]";
|
outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final String arcColor;
|
||||||
|
if (arc.flag(FST.BIT_TARGET_NEXT)) {
|
||||||
|
arcColor = "red";
|
||||||
|
} else {
|
||||||
|
arcColor = "black";
|
||||||
|
}
|
||||||
|
|
||||||
assert arc.label != FST.END_LABEL;
|
assert arc.label != FST.END_LABEL;
|
||||||
out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"]\n");
|
out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "" ) + " color=\"" + arcColor + "\"]\n");
|
||||||
|
|
||||||
// Break the loop if we're on the last arc of this state.
|
// Break the loop if we're on the last arc of this state.
|
||||||
if (arc.isLast()) {
|
if (arc.isLast()) {
|
||||||
|
//System.out.println(" break");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
fst.readNextArc(arc);
|
fst.readNextRealArc(arc, r);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -89,11 +89,11 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
return br;
|
return br;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static IntsRef toIntsRef(String s, int inputMode) {
|
static IntsRef toIntsRef(String s, int inputMode) {
|
||||||
return toIntsRef(s, inputMode, new IntsRef(10));
|
return toIntsRef(s, inputMode, new IntsRef(10));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
|
static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
|
||||||
if (inputMode == 0) {
|
if (inputMode == 0) {
|
||||||
// utf8
|
// utf8
|
||||||
return toIntsRef(new BytesRef(s), ir);
|
return toIntsRef(new BytesRef(s), ir);
|
||||||
|
@ -103,7 +103,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
|
static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
|
||||||
final int charLength = s.length();
|
final int charLength = s.length();
|
||||||
int charIdx = 0;
|
int charIdx = 0;
|
||||||
int intIdx = 0;
|
int intIdx = 0;
|
||||||
|
@ -120,7 +120,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
return ir;
|
return ir;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
|
static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
|
||||||
if (br.length > ir.ints.length) {
|
if (br.length > ir.ints.length) {
|
||||||
ir.grow(br.length);
|
ir.grow(br.length);
|
||||||
}
|
}
|
||||||
|
@ -172,7 +172,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||||
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms2.length);
|
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms2.length);
|
||||||
for(int idx=0;idx<terms2.length;idx++) {
|
for(int idx=0;idx<terms2.length;idx++) {
|
||||||
pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
|
pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], (long) idx));
|
||||||
}
|
}
|
||||||
final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false);
|
final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false);
|
||||||
assertNotNull(fst);
|
assertNotNull(fst);
|
||||||
|
@ -230,7 +230,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||||
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
|
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
|
||||||
for(int idx=0;idx<terms.length;idx++) {
|
for(int idx=0;idx<terms.length;idx++) {
|
||||||
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx)));
|
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], (long) idx));
|
||||||
}
|
}
|
||||||
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest();
|
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest();
|
||||||
}
|
}
|
||||||
|
@ -244,7 +244,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
for(int idx=0;idx<terms.length;idx++) {
|
for(int idx=0;idx<terms.length;idx++) {
|
||||||
final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
|
final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
|
||||||
lastOutput = value;
|
lastOutput = value;
|
||||||
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value)));
|
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], value));
|
||||||
}
|
}
|
||||||
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, doShare).doTest();
|
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, doShare).doTest();
|
||||||
}
|
}
|
||||||
|
@ -254,7 +254,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
|
||||||
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
|
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
|
||||||
for(int idx=0;idx<terms.length;idx++) {
|
for(int idx=0;idx<terms.length;idx++) {
|
||||||
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE));
|
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], random.nextLong() & Long.MAX_VALUE));
|
||||||
}
|
}
|
||||||
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, false).doTest();
|
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, false).doTest();
|
||||||
}
|
}
|
||||||
|
@ -270,8 +270,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
|
final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
|
||||||
lastOutput = value;
|
lastOutput = value;
|
||||||
pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx],
|
pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx],
|
||||||
outputs.get(o1.get(idx),
|
outputs.newPair((long) idx, value)));
|
||||||
o2.get(value))));
|
|
||||||
}
|
}
|
||||||
new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs, false).doTest();
|
new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs, false).doTest();
|
||||||
}
|
}
|
||||||
|
@ -393,6 +392,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
|
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
|
||||||
final T NO_OUTPUT = fst.outputs.getNoOutput();
|
final T NO_OUTPUT = fst.outputs.getNoOutput();
|
||||||
T output = NO_OUTPUT;
|
T output = NO_OUTPUT;
|
||||||
|
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||||
|
|
||||||
for(int i=0;i<=term.length;i++) {
|
for(int i=0;i<=term.length;i++) {
|
||||||
final int label;
|
final int label;
|
||||||
|
@ -401,8 +401,9 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
} else {
|
} else {
|
||||||
label = term.ints[term.offset+i];
|
label = term.ints[term.offset+i];
|
||||||
}
|
}
|
||||||
//System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
|
// System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
|
||||||
if (fst.findTargetArc(label, arc, arc) == null) {
|
if (fst.findTargetArc(label, arc, arc, fstReader) == null) {
|
||||||
|
// System.out.println(" not found");
|
||||||
if (prefixLength != null) {
|
if (prefixLength != null) {
|
||||||
prefixLength[0] = i;
|
prefixLength[0] = i;
|
||||||
return output;
|
return output;
|
||||||
|
@ -462,16 +463,19 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
|
|
||||||
FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
|
FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2);
|
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final boolean willRewrite = random.nextBoolean();
|
||||||
|
|
||||||
final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
|
final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
|
||||||
prune1, prune2,
|
prune1, prune2,
|
||||||
prune1==0 && prune2==0,
|
prune1==0 && prune2==0,
|
||||||
allowRandomSuffixSharing ? random.nextBoolean() : true,
|
allowRandomSuffixSharing ? random.nextBoolean() : true,
|
||||||
allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
|
allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
|
||||||
outputs,
|
outputs,
|
||||||
null);
|
null,
|
||||||
|
willRewrite);
|
||||||
|
|
||||||
for(InputOutput<T> pair : pairs) {
|
for(InputOutput<T> pair : pairs) {
|
||||||
if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
|
if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
|
||||||
|
@ -486,7 +490,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
FST<T> fst = builder.finish();
|
FST<T> fst = builder.finish();
|
||||||
|
|
||||||
if (random.nextBoolean() && fst != null) {
|
if (random.nextBoolean() && fst != null && !willRewrite) {
|
||||||
TestFSTs t = new TestFSTs();
|
TestFSTs t = new TestFSTs();
|
||||||
IOContext context = t.newIOContext(random);
|
IOContext context = t.newIOContext(random);
|
||||||
IndexOutput out = dir.createOutput("fst.bin", context);
|
IndexOutput out = dir.createOutput("fst.bin", context);
|
||||||
|
@ -522,6 +526,21 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
verifyPruned(inputMode, fst, prune1, prune2);
|
verifyPruned(inputMode, fst, prune1, prune2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (willRewrite && fst != null) {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: now rewrite");
|
||||||
|
}
|
||||||
|
final FST<T> packed =fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000));
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: now verify packed FST");
|
||||||
|
}
|
||||||
|
if (prune1 == 0 && prune2 == 0) {
|
||||||
|
verifyUnPruned(inputMode, packed);
|
||||||
|
} else {
|
||||||
|
verifyPruned(inputMode, packed, prune1, prune2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return fst;
|
return fst;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -638,7 +657,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
num = atLeast(100);
|
num = atLeast(100);
|
||||||
for(int iter=0;iter<num;iter++) {
|
for(int iter=0;iter<num;iter++) {
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println("TEST: iter=" + iter);
|
System.out.println(" iter=" + iter);
|
||||||
}
|
}
|
||||||
if (random.nextBoolean()) {
|
if (random.nextBoolean()) {
|
||||||
// seek to term that doesn't exist:
|
// seek to term that doesn't exist:
|
||||||
|
@ -866,7 +885,15 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
prefixes.put(IntsRef.deepCopyOf(scratch), cmo);
|
prefixes.put(IntsRef.deepCopyOf(scratch), cmo);
|
||||||
} else {
|
} else {
|
||||||
cmo.count++;
|
cmo.count++;
|
||||||
cmo.output = outputs.common(cmo.output, pair.output);
|
T output1 = cmo.output;
|
||||||
|
if (output1.equals(outputs.getNoOutput())) {
|
||||||
|
output1 = outputs.getNoOutput();
|
||||||
|
}
|
||||||
|
T output2 = pair.output;
|
||||||
|
if (output2.equals(outputs.getNoOutput())) {
|
||||||
|
output2 = outputs.getNoOutput();
|
||||||
|
}
|
||||||
|
cmo.output = outputs.common(output1, output2);
|
||||||
}
|
}
|
||||||
if (idx == pair.input.length) {
|
if (idx == pair.input.length) {
|
||||||
cmo.isFinal = true;
|
cmo.isFinal = true;
|
||||||
|
@ -992,7 +1019,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
|
|
||||||
public void testRandomWords() throws IOException {
|
public void testRandomWords() throws IOException {
|
||||||
testRandomWords(1000, atLeast(2));
|
testRandomWords(1000, atLeast(2));
|
||||||
//testRandomWords(20, 100);
|
//testRandomWords(100, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
String inputModeToString(int mode) {
|
String inputModeToString(int mode) {
|
||||||
|
@ -1055,50 +1082,6 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// NOTE: this test shows a case where our current builder
|
|
||||||
// fails to produce minimal FST:
|
|
||||||
/*
|
|
||||||
public void test3() throws Exception {
|
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
|
||||||
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
|
||||||
IntsRef scratchIntsRef = new IntsRef();
|
|
||||||
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.get(0));
|
|
||||||
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), 1L);
|
|
||||||
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), 2L);
|
|
||||||
final FST<Long> fst = builder.finish();
|
|
||||||
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
|
|
||||||
// NOTE: we produce 7 nodes today
|
|
||||||
assertEquals(6, fst.getNodeCount());
|
|
||||||
// NOTE: we produce 8 arcs today
|
|
||||||
assertEquals(7, fst.getNodeCount());
|
|
||||||
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
|
|
||||||
//Util.toDot(fst, w, false, false);
|
|
||||||
//w.close();
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
// NOTE: this test shows a case where our current builder
|
|
||||||
// fails to produce minimal FST:
|
|
||||||
/*
|
|
||||||
public void test4() throws Exception {
|
|
||||||
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
|
||||||
Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
|
|
||||||
IntsRef scratchIntsRef = new IntsRef();
|
|
||||||
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.getNoOutput());
|
|
||||||
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), new BytesRef("1"));
|
|
||||||
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), new BytesRef("11"));
|
|
||||||
final FST<BytesRef> fst = builder.finish();
|
|
||||||
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
|
|
||||||
// NOTE: we produce 7 nodes today
|
|
||||||
assertEquals(6, fst.getNodeCount());
|
|
||||||
// NOTE: we produce 8 arcs today
|
|
||||||
assertEquals(7, fst.getNodeCount());
|
|
||||||
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
|
|
||||||
//Util.toDot(fst, w, false, false);
|
|
||||||
//w.close();
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
// Build FST for all unique terms in the test line docs
|
// Build FST for all unique terms in the test line docs
|
||||||
// file, up until a time limit
|
// file, up until a time limit
|
||||||
public void testRealTerms() throws Exception {
|
public void testRealTerms() throws Exception {
|
||||||
|
@ -1126,7 +1109,10 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
IndexReader r = IndexReader.open(writer, true);
|
IndexReader r = IndexReader.open(writer, true);
|
||||||
writer.close();
|
writer.close();
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
|
||||||
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
|
||||||
|
final boolean doRewrite = random.nextBoolean();
|
||||||
|
|
||||||
|
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doRewrite);
|
||||||
|
|
||||||
boolean storeOrd = random.nextBoolean();
|
boolean storeOrd = random.nextBoolean();
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
|
@ -1162,59 +1148,69 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
} else {
|
} else {
|
||||||
output = termsEnum.docFreq();
|
output = termsEnum.docFreq();
|
||||||
}
|
}
|
||||||
builder.add(Util.toIntsRef(term, scratchIntsRef), outputs.get(output));
|
builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
|
||||||
ord++;
|
ord++;
|
||||||
if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
|
if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
|
||||||
System.out.println(ord + " terms...");
|
System.out.println(ord + " terms...");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
final FST<Long> fst = builder.finish();
|
FST<Long> fst = builder.finish();
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes");
|
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ord > 0) {
|
if (ord > 0) {
|
||||||
// Now confirm BytesRefFSTEnum and TermsEnum act the
|
for(int rewriteIter=0;rewriteIter<2;rewriteIter++) {
|
||||||
// same:
|
if (rewriteIter == 1) {
|
||||||
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
|
if (doRewrite) {
|
||||||
int num = atLeast(1000);
|
// Verify again, with packed FST:
|
||||||
for(int iter=0;iter<num;iter++) {
|
fst = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000));
|
||||||
final BytesRef randomTerm = new BytesRef(getRandomString());
|
} else {
|
||||||
|
break;
|
||||||
if (VERBOSE) {
|
}
|
||||||
System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
|
|
||||||
}
|
}
|
||||||
|
// Now confirm BytesRefFSTEnum and TermsEnum act the
|
||||||
|
// same:
|
||||||
|
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
|
||||||
|
int num = atLeast(1000);
|
||||||
|
for(int iter=0;iter<num;iter++) {
|
||||||
|
final BytesRef randomTerm = new BytesRef(getRandomString());
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
|
||||||
|
}
|
||||||
|
|
||||||
final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
|
final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
|
||||||
final BytesRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(randomTerm);
|
final BytesRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(randomTerm);
|
||||||
|
|
||||||
if (seekResult == TermsEnum.SeekStatus.END) {
|
if (seekResult == TermsEnum.SeekStatus.END) {
|
||||||
assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
|
assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
|
||||||
} else {
|
} else {
|
||||||
assertSame(termsEnum, fstEnum, storeOrd);
|
assertSame(termsEnum, fstEnum, storeOrd);
|
||||||
for(int nextIter=0;nextIter<10;nextIter++) {
|
for(int nextIter=0;nextIter<10;nextIter++) {
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: next");
|
|
||||||
if (storeOrd) {
|
|
||||||
System.out.println(" ord=" + termsEnum.ord());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (termsEnum.next() != null) {
|
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println(" term=" + termsEnum.term().utf8ToString());
|
System.out.println("TEST: next");
|
||||||
|
if (storeOrd) {
|
||||||
|
System.out.println(" ord=" + termsEnum.ord());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
assertNotNull(fstEnum.next());
|
if (termsEnum.next() != null) {
|
||||||
assertSame(termsEnum, fstEnum, storeOrd);
|
if (VERBOSE) {
|
||||||
} else {
|
System.out.println(" term=" + termsEnum.term().utf8ToString());
|
||||||
if (VERBOSE) {
|
}
|
||||||
System.out.println(" end!");
|
assertNotNull(fstEnum.next());
|
||||||
|
assertSame(termsEnum, fstEnum, storeOrd);
|
||||||
|
} else {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" end!");
|
||||||
|
}
|
||||||
|
BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
|
||||||
|
if (nextResult != null) {
|
||||||
|
System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
|
||||||
|
fail();
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
|
|
||||||
if (nextResult != null) {
|
|
||||||
System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
|
|
||||||
fail();
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1248,14 +1244,17 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
private int inputMode;
|
private int inputMode;
|
||||||
private final Outputs<T> outputs;
|
private final Outputs<T> outputs;
|
||||||
private final Builder<T> builder;
|
private final Builder<T> builder;
|
||||||
|
private final boolean doPack;
|
||||||
|
|
||||||
public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs) {
|
public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean doPack, boolean noArcArrays) {
|
||||||
this.dirOut = dirOut;
|
this.dirOut = dirOut;
|
||||||
this.wordsFileIn = wordsFileIn;
|
this.wordsFileIn = wordsFileIn;
|
||||||
this.inputMode = inputMode;
|
this.inputMode = inputMode;
|
||||||
this.outputs = outputs;
|
this.outputs = outputs;
|
||||||
|
this.doPack = doPack;
|
||||||
builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null);
|
|
||||||
|
builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null, doPack);
|
||||||
|
builder.setAllowArrayArcs(!noArcArrays);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
|
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
|
||||||
|
@ -1287,14 +1286,15 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
assert builder.getTermCount() == ord;
|
assert builder.getTermCount() == ord;
|
||||||
final FST<T> fst = builder.finish();
|
FST<T> fst = builder.finish();
|
||||||
if (fst == null) {
|
if (fst == null) {
|
||||||
System.out.println("FST was fully pruned!");
|
System.out.println("FST was fully pruned!");
|
||||||
System.exit(0);
|
System.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dirOut == null)
|
if (dirOut == null) {
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
|
System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
|
||||||
if (fst.getNodeCount() < 100) {
|
if (fst.getNodeCount() < 100) {
|
||||||
|
@ -1304,12 +1304,17 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
System.out.println("Wrote FST to out.dot");
|
System.out.println("Wrote FST to out.dot");
|
||||||
}
|
}
|
||||||
|
|
||||||
Directory dir = FSDirectory.open(new File(dirOut));
|
if (doPack) {
|
||||||
IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT);
|
System.out.println("Pack...");
|
||||||
fst.save(out);
|
fst = fst.pack(4, 100000000);
|
||||||
out.close();
|
System.out.println("New size " + fst.sizeInBytes() + " bytes");
|
||||||
|
} else {
|
||||||
System.out.println("Saved FST to fst.bin.");
|
Directory dir = FSDirectory.open(new File(dirOut));
|
||||||
|
IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT);
|
||||||
|
fst.save(out);
|
||||||
|
out.close();
|
||||||
|
System.out.println("Saved FST to fst.bin.");
|
||||||
|
}
|
||||||
|
|
||||||
if (!verify) {
|
if (!verify) {
|
||||||
return;
|
return;
|
||||||
|
@ -1317,45 +1322,50 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
|
|
||||||
System.out.println("\nNow verify...");
|
System.out.println("\nNow verify...");
|
||||||
|
|
||||||
is.close();
|
|
||||||
is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
|
|
||||||
|
|
||||||
ord = 0;
|
|
||||||
tStart = System.currentTimeMillis();
|
|
||||||
while(true) {
|
while(true) {
|
||||||
String w = is.readLine();
|
is.close();
|
||||||
if (w == null) {
|
is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
|
||||||
break;
|
|
||||||
}
|
ord = 0;
|
||||||
toIntsRef(w, inputMode, intsRef);
|
tStart = System.currentTimeMillis();
|
||||||
T expected = getOutput(intsRef, ord);
|
while(true) {
|
||||||
T actual = Util.get(fst, intsRef);
|
String w = is.readLine();
|
||||||
if (actual == null) {
|
if (w == null) {
|
||||||
throw new RuntimeException("unexpected null output on input=" + w);
|
break;
|
||||||
}
|
}
|
||||||
if (!actual.equals(expected)) {
|
toIntsRef(w, inputMode, intsRef);
|
||||||
throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
|
T expected = getOutput(intsRef, ord);
|
||||||
|
T actual = Util.get(fst, intsRef);
|
||||||
|
if (actual == null) {
|
||||||
|
throw new RuntimeException("unexpected null output on input=" + w);
|
||||||
|
}
|
||||||
|
if (!actual.equals(expected)) {
|
||||||
|
throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
|
||||||
|
}
|
||||||
|
|
||||||
|
ord++;
|
||||||
|
if (ord % 500000 == 0) {
|
||||||
|
System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
|
||||||
|
}
|
||||||
|
if (ord >= limit) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ord++;
|
double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
|
||||||
if (ord % 500000 == 0) {
|
System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
|
||||||
System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
|
|
||||||
}
|
// NOTE: comment out to profile lookup...
|
||||||
if (ord >= limit) {
|
break;
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
|
|
||||||
System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
|
|
||||||
|
|
||||||
} finally {
|
} finally {
|
||||||
is.close();
|
is.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out
|
// java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
|
||||||
public static void main(String[] args) throws IOException {
|
public static void main(String[] args) throws IOException {
|
||||||
int prune = 0;
|
int prune = 0;
|
||||||
int limit = Integer.MAX_VALUE;
|
int limit = Integer.MAX_VALUE;
|
||||||
|
@ -1363,7 +1373,8 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
boolean storeOrds = false;
|
boolean storeOrds = false;
|
||||||
boolean storeDocFreqs = false;
|
boolean storeDocFreqs = false;
|
||||||
boolean verify = true;
|
boolean verify = true;
|
||||||
|
boolean doPack = false;
|
||||||
|
boolean noArcArrays = false;
|
||||||
String wordsFileIn = null;
|
String wordsFileIn = null;
|
||||||
String dirOut = null;
|
String dirOut = null;
|
||||||
|
|
||||||
|
@ -1381,10 +1392,14 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
inputMode = 1;
|
inputMode = 1;
|
||||||
} else if (args[idx].equals("-docFreq")) {
|
} else if (args[idx].equals("-docFreq")) {
|
||||||
storeDocFreqs = true;
|
storeDocFreqs = true;
|
||||||
|
} else if (args[idx].equals("-noArcArrays")) {
|
||||||
|
noArcArrays = true;
|
||||||
} else if (args[idx].equals("-ords")) {
|
} else if (args[idx].equals("-ords")) {
|
||||||
storeOrds = true;
|
storeOrds = true;
|
||||||
} else if (args[idx].equals("-noverify")) {
|
} else if (args[idx].equals("-noverify")) {
|
||||||
verify = false;
|
verify = false;
|
||||||
|
} else if (args[idx].equals("-pack")) {
|
||||||
|
doPack = true;
|
||||||
} else if (args[idx].startsWith("-")) {
|
} else if (args[idx].startsWith("-")) {
|
||||||
System.err.println("Unrecognized option: " + args[idx]);
|
System.err.println("Unrecognized option: " + args[idx]);
|
||||||
System.exit(-1);
|
System.exit(-1);
|
||||||
|
@ -1413,44 +1428,44 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true);
|
final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true);
|
||||||
final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false);
|
final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false);
|
||||||
final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2);
|
final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2);
|
||||||
new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs) {
|
new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
|
||||||
Random rand;
|
Random rand;
|
||||||
@Override
|
@Override
|
||||||
public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
|
public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
|
||||||
if (ord == 0) {
|
if (ord == 0) {
|
||||||
rand = new Random(17);
|
rand = new Random(17);
|
||||||
}
|
}
|
||||||
return new PairOutputs.Pair<Long,Long>(o1.get(ord),
|
return outputs.newPair((long) ord,
|
||||||
o2.get(_TestUtil.nextInt(rand, 1, 5000)));
|
(long) _TestUtil.nextInt(rand, 1, 5000));
|
||||||
}
|
}
|
||||||
}.run(limit, verify);
|
}.run(limit, verify);
|
||||||
} else if (storeOrds) {
|
} else if (storeOrds) {
|
||||||
// Store only ords
|
// Store only ords
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
|
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
|
||||||
@Override
|
@Override
|
||||||
public Long getOutput(IntsRef input, int ord) {
|
public Long getOutput(IntsRef input, int ord) {
|
||||||
return outputs.get(ord);
|
return (long) ord;
|
||||||
}
|
}
|
||||||
}.run(limit, verify);
|
}.run(limit, verify);
|
||||||
} else if (storeDocFreqs) {
|
} else if (storeDocFreqs) {
|
||||||
// Store only docFreq
|
// Store only docFreq
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
|
||||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
|
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
|
||||||
Random rand;
|
Random rand;
|
||||||
@Override
|
@Override
|
||||||
public Long getOutput(IntsRef input, int ord) {
|
public Long getOutput(IntsRef input, int ord) {
|
||||||
if (ord == 0) {
|
if (ord == 0) {
|
||||||
rand = new Random(17);
|
rand = new Random(17);
|
||||||
}
|
}
|
||||||
return outputs.get(_TestUtil.nextInt(rand, 1, 5000));
|
return (long) _TestUtil.nextInt(rand, 1, 5000);
|
||||||
}
|
}
|
||||||
}.run(limit, verify);
|
}.run(limit, verify);
|
||||||
} else {
|
} else {
|
||||||
// Store nothing
|
// Store nothing
|
||||||
final NoOutputs outputs = NoOutputs.getSingleton();
|
final NoOutputs outputs = NoOutputs.getSingleton();
|
||||||
final Object NO_OUTPUT = outputs.getNoOutput();
|
final Object NO_OUTPUT = outputs.getNoOutput();
|
||||||
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs) {
|
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
|
||||||
@Override
|
@Override
|
||||||
public Object getOutput(IntsRef input, int ord) {
|
public Object getOutput(IntsRef input, int ord) {
|
||||||
return NO_OUTPUT;
|
return NO_OUTPUT;
|
||||||
|
@ -1468,6 +1483,46 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
|
assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
public void testTrivial() throws Exception {
|
||||||
|
|
||||||
|
// Get outputs -- passing true means FST will share
|
||||||
|
// (delta code) the outputs. This should result in
|
||||||
|
// smaller FST if the outputs grow monotonically. But
|
||||||
|
// if numbers are "random", false should give smaller
|
||||||
|
// final size:
|
||||||
|
final NoOutputs outputs = NoOutputs.getSingleton();
|
||||||
|
|
||||||
|
String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
|
||||||
|
|
||||||
|
final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE1,
|
||||||
|
0, 0,
|
||||||
|
true,
|
||||||
|
true,
|
||||||
|
Integer.MAX_VALUE,
|
||||||
|
outputs,
|
||||||
|
null,
|
||||||
|
true);
|
||||||
|
Arrays.sort(strings);
|
||||||
|
final IntsRef scratch = new IntsRef();
|
||||||
|
for(String s : strings) {
|
||||||
|
builder.add(Util.toIntsRef(new BytesRef(s), scratch), outputs.getNoOutput());
|
||||||
|
}
|
||||||
|
final FST<Object> fst = builder.finish();
|
||||||
|
System.out.println("DOT before rewrite");
|
||||||
|
Writer w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/before.dot"));
|
||||||
|
Util.toDot(fst, w, false, false);
|
||||||
|
w.close();
|
||||||
|
|
||||||
|
final FST<Object> rewrite = new FST<Object>(fst, 1, 100);
|
||||||
|
|
||||||
|
System.out.println("DOT after rewrite");
|
||||||
|
w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/after.dot"));
|
||||||
|
Util.toDot(rewrite, w, false, false);
|
||||||
|
w.close();
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
public void testSimple() throws Exception {
|
public void testSimple() throws Exception {
|
||||||
|
|
||||||
// Get outputs -- passing true means FST will share
|
// Get outputs -- passing true means FST will share
|
||||||
|
@ -1484,9 +1539,9 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
final BytesRef b = new BytesRef("b");
|
final BytesRef b = new BytesRef("b");
|
||||||
final BytesRef c = new BytesRef("c");
|
final BytesRef c = new BytesRef("c");
|
||||||
|
|
||||||
builder.add(Util.toIntsRef(a, new IntsRef()), outputs.get(17));
|
builder.add(Util.toIntsRef(a, new IntsRef()), 17L);
|
||||||
builder.add(Util.toIntsRef(b, new IntsRef()), outputs.get(42));
|
builder.add(Util.toIntsRef(b, new IntsRef()), 42L);
|
||||||
builder.add(Util.toIntsRef(c, new IntsRef()), outputs.get(13824324872317238L));
|
builder.add(Util.toIntsRef(c, new IntsRef()), 13824324872317238L);
|
||||||
|
|
||||||
final FST<Long> fst = builder.finish();
|
final FST<Long> fst = builder.finish();
|
||||||
|
|
||||||
|
@ -1795,11 +1850,11 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
public void testFinalOutputOnEndState() throws Exception {
|
public void testFinalOutputOnEndState() throws Exception {
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||||
|
|
||||||
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null);
|
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null, random.nextBoolean());
|
||||||
builder.add(Util.toUTF32("stat", new IntsRef()), outputs.get(17));
|
builder.add(Util.toUTF32("stat", new IntsRef()), 17L);
|
||||||
builder.add(Util.toUTF32("station", new IntsRef()), outputs.get(10));
|
builder.add(Util.toUTF32("station", new IntsRef()), 10L);
|
||||||
final FST<Long> fst = builder.finish();
|
final FST<Long> fst = builder.finish();
|
||||||
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
|
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
|
||||||
StringWriter w = new StringWriter();
|
StringWriter w = new StringWriter();
|
||||||
Util.toDot(fst, w, false, false);
|
Util.toDot(fst, w, false, false);
|
||||||
w.close();
|
w.close();
|
||||||
|
@ -1809,8 +1864,8 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
|
|
||||||
public void testInternalFinalState() throws Exception {
|
public void testInternalFinalState() throws Exception {
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||||
|
final boolean willRewrite = random.nextBoolean();
|
||||||
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null);
|
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite);
|
||||||
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput());
|
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput());
|
||||||
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput());
|
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput());
|
||||||
final FST<Long> fst = builder.finish();
|
final FST<Long> fst = builder.finish();
|
||||||
|
@ -1819,17 +1874,23 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
Util.toDot(fst, w, false, false);
|
Util.toDot(fst, w, false, false);
|
||||||
w.close();
|
w.close();
|
||||||
//System.out.println(w.toString());
|
//System.out.println(w.toString());
|
||||||
assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1);
|
final String expected;
|
||||||
|
if (willRewrite) {
|
||||||
|
expected = "4 -> 3 [label=\"t\" style=\"bold\"";
|
||||||
|
} else {
|
||||||
|
expected = "8 -> 6 [label=\"t\" style=\"bold\"";
|
||||||
|
}
|
||||||
|
assertTrue(w.toString().indexOf(expected) != -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Make sure raw FST can differentiate between final vs
|
// Make sure raw FST can differentiate between final vs
|
||||||
// non-final end nodes
|
// non-final end nodes
|
||||||
public void testNonFinalStopNodes() throws Exception {
|
public void testNonFinalStopNode() throws Exception {
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||||
final Long nothing = outputs.getNoOutput();
|
final Long nothing = outputs.getNoOutput();
|
||||||
final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs, false);
|
||||||
|
|
||||||
final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0);
|
final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0);
|
||||||
|
|
||||||
|
@ -1839,8 +1900,8 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
node.isFinal = true;
|
node.isFinal = true;
|
||||||
rootNode.addArc('a', node);
|
rootNode.addArc('a', node);
|
||||||
final Builder.CompiledNode frozen = new Builder.CompiledNode();
|
final Builder.CompiledNode frozen = new Builder.CompiledNode();
|
||||||
frozen.address = fst.addNode(node);
|
frozen.node = fst.addNode(node);
|
||||||
rootNode.arcs[0].nextFinalOutput = outputs.get(17);
|
rootNode.arcs[0].nextFinalOutput = 17L;
|
||||||
rootNode.arcs[0].isFinal = true;
|
rootNode.arcs[0].isFinal = true;
|
||||||
rootNode.arcs[0].output = nothing;
|
rootNode.arcs[0].output = nothing;
|
||||||
rootNode.arcs[0].target = frozen;
|
rootNode.arcs[0].target = frozen;
|
||||||
|
@ -1851,13 +1912,18 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
|
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
|
||||||
rootNode.addArc('b', node);
|
rootNode.addArc('b', node);
|
||||||
final Builder.CompiledNode frozen = new Builder.CompiledNode();
|
final Builder.CompiledNode frozen = new Builder.CompiledNode();
|
||||||
frozen.address = fst.addNode(node);
|
frozen.node = fst.addNode(node);
|
||||||
rootNode.arcs[1].nextFinalOutput = nothing;
|
rootNode.arcs[1].nextFinalOutput = nothing;
|
||||||
rootNode.arcs[1].output = outputs.get(42);
|
rootNode.arcs[1].output = 42L;
|
||||||
rootNode.arcs[1].target = frozen;
|
rootNode.arcs[1].target = frozen;
|
||||||
}
|
}
|
||||||
|
|
||||||
fst.finish(fst.addNode(rootNode));
|
fst.finish(fst.addNode(rootNode));
|
||||||
|
|
||||||
|
StringWriter w = new StringWriter();
|
||||||
|
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
|
||||||
|
Util.toDot(fst, w, false, false);
|
||||||
|
w.close();
|
||||||
|
|
||||||
checkStopNodes(fst, outputs);
|
checkStopNodes(fst, outputs);
|
||||||
|
|
||||||
|
|
|
@ -226,6 +226,9 @@ public final class SynonymFilter extends TokenFilter {
|
||||||
|
|
||||||
private final FST<BytesRef> fst;
|
private final FST<BytesRef> fst;
|
||||||
|
|
||||||
|
private final FST.BytesReader fstReader;
|
||||||
|
|
||||||
|
|
||||||
private final BytesRef scratchBytes = new BytesRef();
|
private final BytesRef scratchBytes = new BytesRef();
|
||||||
private final CharsRef scratchChars = new CharsRef();
|
private final CharsRef scratchChars = new CharsRef();
|
||||||
|
|
||||||
|
@ -241,7 +244,7 @@ public final class SynonymFilter extends TokenFilter {
|
||||||
this.synonyms = synonyms;
|
this.synonyms = synonyms;
|
||||||
this.ignoreCase = ignoreCase;
|
this.ignoreCase = ignoreCase;
|
||||||
this.fst = synonyms.fst;
|
this.fst = synonyms.fst;
|
||||||
|
this.fstReader = fst.getBytesReader(0);
|
||||||
if (fst == null) {
|
if (fst == null) {
|
||||||
throw new IllegalArgumentException("fst must be non-null");
|
throw new IllegalArgumentException("fst must be non-null");
|
||||||
}
|
}
|
||||||
|
@ -366,7 +369,7 @@ public final class SynonymFilter extends TokenFilter {
|
||||||
int bufUpto = 0;
|
int bufUpto = 0;
|
||||||
while(bufUpto < bufferLen) {
|
while(bufUpto < bufferLen) {
|
||||||
final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
|
final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
|
||||||
if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) {
|
if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
|
||||||
//System.out.println(" stop");
|
//System.out.println(" stop");
|
||||||
break byToken;
|
break byToken;
|
||||||
}
|
}
|
||||||
|
@ -388,7 +391,7 @@ public final class SynonymFilter extends TokenFilter {
|
||||||
|
|
||||||
// See if the FST wants to continue matching (ie, needs to
|
// See if the FST wants to continue matching (ie, needs to
|
||||||
// see the next input token):
|
// see the next input token):
|
||||||
if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) {
|
if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
|
||||||
// No further rules can match here; we're done
|
// No further rules can match here; we're done
|
||||||
// searching for matching rules starting at the
|
// searching for matching rules starting at the
|
||||||
// current input position.
|
// current input position.
|
||||||
|
|
|
@ -47,16 +47,17 @@ public final class TokenInfoFST {
|
||||||
FST.Arc<Long> firstArc = new FST.Arc<Long>();
|
FST.Arc<Long> firstArc = new FST.Arc<Long>();
|
||||||
fst.getFirstArc(firstArc);
|
fst.getFirstArc(firstArc);
|
||||||
FST.Arc<Long> arc = new FST.Arc<Long>();
|
FST.Arc<Long> arc = new FST.Arc<Long>();
|
||||||
|
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||||
// TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs)
|
// TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs)
|
||||||
for (int i = 0; i < rootCache.length; i++) {
|
for (int i = 0; i < rootCache.length; i++) {
|
||||||
if (fst.findTargetArc(0x3040 + i, firstArc, arc) != null) {
|
if (fst.findTargetArc(0x3040 + i, firstArc, arc, fstReader) != null) {
|
||||||
rootCache[i] = new FST.Arc<Long>().copyFrom(arc);
|
rootCache[i] = new FST.Arc<Long>().copyFrom(arc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return rootCache;
|
return rootCache;
|
||||||
}
|
}
|
||||||
|
|
||||||
public FST.Arc<Long> findTargetArc(int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache) throws IOException {
|
public FST.Arc<Long> findTargetArc(int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache, FST.BytesReader fstReader) throws IOException {
|
||||||
if (useCache && ch >= 0x3040 && ch <= cacheCeiling) {
|
if (useCache && ch >= 0x3040 && ch <= cacheCeiling) {
|
||||||
assert ch != FST.END_LABEL;
|
assert ch != FST.END_LABEL;
|
||||||
final Arc<Long> result = rootCache[ch - 0x3040];
|
final Arc<Long> result = rootCache[ch - 0x3040];
|
||||||
|
@ -67,13 +68,17 @@ public final class TokenInfoFST {
|
||||||
return arc;
|
return arc;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
return fst.findTargetArc(ch, follow, arc);
|
return fst.findTargetArc(ch, follow, arc, fstReader);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Arc<Long> getFirstArc(FST.Arc<Long> arc) {
|
public Arc<Long> getFirstArc(FST.Arc<Long> arc) {
|
||||||
return fst.getFirstArc(arc);
|
return fst.getFirstArc(arc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public FST.BytesReader getBytesReader(int pos) {
|
||||||
|
return fst.getBytesReader(pos);
|
||||||
|
}
|
||||||
|
|
||||||
/** @lucene.internal for testing only */
|
/** @lucene.internal for testing only */
|
||||||
FST<Long> getInternalFST() {
|
FST<Long> getInternalFST() {
|
||||||
|
|
|
@ -113,7 +113,7 @@ public final class UserDictionary implements Dictionary {
|
||||||
for (int i = 0; i < token.length(); i++) {
|
for (int i = 0; i < token.length(); i++) {
|
||||||
scratch.ints[i] = (int) token.charAt(i);
|
scratch.ints[i] = (int) token.charAt(i);
|
||||||
}
|
}
|
||||||
fstBuilder.add(scratch, fstOutput.get(ord));
|
fstBuilder.add(scratch, ord);
|
||||||
segmentations.add(wordIdAndLength);
|
segmentations.add(wordIdAndLength);
|
||||||
ord++;
|
ord++;
|
||||||
}
|
}
|
||||||
|
@ -134,6 +134,8 @@ public final class UserDictionary implements Dictionary {
|
||||||
TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
|
TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
|
||||||
boolean found = false; // true if we found any results
|
boolean found = false; // true if we found any results
|
||||||
|
|
||||||
|
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||||
|
|
||||||
FST.Arc<Long> arc = new FST.Arc<Long>();
|
FST.Arc<Long> arc = new FST.Arc<Long>();
|
||||||
int end = off + len;
|
int end = off + len;
|
||||||
for (int startOffset = off; startOffset < end; startOffset++) {
|
for (int startOffset = off; startOffset < end; startOffset++) {
|
||||||
|
@ -142,7 +144,7 @@ public final class UserDictionary implements Dictionary {
|
||||||
int remaining = end - startOffset;
|
int remaining = end - startOffset;
|
||||||
for (int i = 0; i < remaining; i++) {
|
for (int i = 0; i < remaining; i++) {
|
||||||
int ch = chars[startOffset+i];
|
int ch = chars[startOffset+i];
|
||||||
if (fst.findTargetArc(ch, arc, arc, i == 0) == null) {
|
if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
|
||||||
break; // continue to next position
|
break; // continue to next position
|
||||||
}
|
}
|
||||||
output += arc.output.intValue();
|
output += arc.output.intValue();
|
||||||
|
|
|
@ -35,7 +35,7 @@ import org.apache.lucene.util.fst.FST;
|
||||||
public class Viterbi {
|
public class Viterbi {
|
||||||
|
|
||||||
private final TokenInfoFST fst;
|
private final TokenInfoFST fst;
|
||||||
|
|
||||||
private final TokenInfoDictionary dictionary;
|
private final TokenInfoDictionary dictionary;
|
||||||
|
|
||||||
private final UnknownDictionary unkDictionary;
|
private final UnknownDictionary unkDictionary;
|
||||||
|
@ -214,6 +214,8 @@ public class Viterbi {
|
||||||
ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
|
ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
|
||||||
addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
|
addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
|
||||||
|
|
||||||
|
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||||
|
|
||||||
// Process user dictionary;
|
// Process user dictionary;
|
||||||
if (useUserDictionary) {
|
if (useUserDictionary) {
|
||||||
processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
|
processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
|
||||||
|
@ -238,7 +240,7 @@ public class Viterbi {
|
||||||
for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
|
for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
|
||||||
int ch = text[suffixStart + endIndex - 1];
|
int ch = text[suffixStart + endIndex - 1];
|
||||||
|
|
||||||
if (fst.findTargetArc(ch, arc, arc, endIndex == 1) == null) {
|
if (fst.findTargetArc(ch, arc, arc, endIndex == 1, fstReader) == null) {
|
||||||
break; // continue to next position
|
break; // continue to next position
|
||||||
}
|
}
|
||||||
output += arc.output.intValue();
|
output += arc.output.intValue();
|
||||||
|
|
Binary file not shown.
|
@ -131,7 +131,7 @@ public class TokenInfoDictionaryBuilder {
|
||||||
System.out.println(" encode...");
|
System.out.println(" encode...");
|
||||||
|
|
||||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true);
|
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true);
|
||||||
Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true);
|
||||||
IntsRef scratch = new IntsRef();
|
IntsRef scratch = new IntsRef();
|
||||||
long ord = -1; // first ord will be 0
|
long ord = -1; // first ord will be 0
|
||||||
String lastValue = null;
|
String lastValue = null;
|
||||||
|
@ -155,13 +155,14 @@ public class TokenInfoDictionaryBuilder {
|
||||||
for (int i = 0; i < token.length(); i++) {
|
for (int i = 0; i < token.length(); i++) {
|
||||||
scratch.ints[i] = (int) token.charAt(i);
|
scratch.ints[i] = (int) token.charAt(i);
|
||||||
}
|
}
|
||||||
fstBuilder.add(scratch, fstOutput.get(ord));
|
fstBuilder.add(scratch, ord);
|
||||||
}
|
}
|
||||||
dictionary.addMapping((int)ord, offset);
|
dictionary.addMapping((int)ord, offset);
|
||||||
offset = next;
|
offset = next;
|
||||||
}
|
}
|
||||||
|
|
||||||
FST<Long> fst = fstBuilder.finish();
|
final FST<Long> fst = fstBuilder.finish().pack(2, 100000);
|
||||||
|
|
||||||
System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... ");
|
System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... ");
|
||||||
dictionary.setFST(fst);
|
dictionary.setFST(fst);
|
||||||
System.out.println(" done");
|
System.out.println(" done");
|
||||||
|
|
|
@ -329,8 +329,11 @@ public class FSTCompletion {
|
||||||
private boolean descendWithPrefix(Arc<Object> arc, BytesRef utf8)
|
private boolean descendWithPrefix(Arc<Object> arc, BytesRef utf8)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
final int max = utf8.offset + utf8.length;
|
final int max = utf8.offset + utf8.length;
|
||||||
|
// Cannot save as instance var since multiple threads
|
||||||
|
// can use FSTCompletion at once...
|
||||||
|
final FST.BytesReader fstReader = automaton.getBytesReader(0);
|
||||||
for (int i = utf8.offset; i < max; i++) {
|
for (int i = utf8.offset; i < max; i++) {
|
||||||
if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc) == null) {
|
if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc, fstReader) == null) {
|
||||||
// No matching prefixes, return an empty result.
|
// No matching prefixes, return an empty result.
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -234,7 +234,7 @@ public class FSTCompletionBuilder {
|
||||||
final Object empty = outputs.getNoOutput();
|
final Object empty = outputs.getNoOutput();
|
||||||
final Builder<Object> builder = new Builder<Object>(
|
final Builder<Object> builder = new Builder<Object>(
|
||||||
FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
|
FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
|
||||||
shareMaxTailLength, outputs, null);
|
shareMaxTailLength, outputs, null, false);
|
||||||
|
|
||||||
BytesRef scratch = new BytesRef();
|
BytesRef scratch = new BytesRef();
|
||||||
final IntsRef scratchIntsRef = new IntsRef();
|
final IntsRef scratchIntsRef = new IntsRef();
|
||||||
|
|
Loading…
Reference in New Issue