mirror of https://github.com/apache/lucene.git
LUCENE-3725: add optional packing to FSTs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1237500 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2e5be2f75c
commit
d1165b1972
|
@ -806,6 +806,9 @@ New Features
|
|||
|
||||
* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
|
||||
markup. (Steve Rowe)
|
||||
|
||||
* LUCENE-3725: Added optional packing to FST building; this uses extra
|
||||
RAM during building but results in a smaller FST. (Mike McCandless)
|
||||
|
||||
Bug fixes
|
||||
|
||||
|
|
|
@ -398,7 +398,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
final long indexStartFP;
|
||||
final long rootBlockFP;
|
||||
final BytesRef rootCode;
|
||||
private FST<BytesRef> index;
|
||||
private final FST<BytesRef> index;
|
||||
|
||||
//private boolean DEBUG;
|
||||
|
||||
|
@ -433,6 +433,8 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
w.close();
|
||||
}
|
||||
*/
|
||||
} else {
|
||||
index = null;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -495,6 +497,8 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
|
||||
private final BytesRef term = new BytesRef();
|
||||
|
||||
private final FST.BytesReader fstReader;
|
||||
|
||||
// TODO: can we share this with the frame in STE?
|
||||
private final class Frame {
|
||||
final int ord;
|
||||
|
@ -755,6 +759,12 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
arcs[arcIdx] = new FST.Arc<BytesRef>();
|
||||
}
|
||||
|
||||
if (index == null) {
|
||||
fstReader = null;
|
||||
} else {
|
||||
fstReader = index.getBytesReader(0);
|
||||
}
|
||||
|
||||
// TODO: if the automaton is "smallish" we really
|
||||
// should use the terms index to seek at least to
|
||||
// the initial term and likely to subsequent terms
|
||||
|
@ -842,7 +852,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
// TODO: we could be more efficient for the next()
|
||||
// case by using current arc as starting point,
|
||||
// passed to findTargetArc
|
||||
arc = index.findTargetArc(target, arc, getArc(1+idx));
|
||||
arc = index.findTargetArc(target, arc, getArc(1+idx), fstReader);
|
||||
assert arc != null;
|
||||
output = fstOutputs.add(output, arc.output);
|
||||
idx++;
|
||||
|
@ -1186,6 +1196,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
private boolean eof;
|
||||
|
||||
final BytesRef term = new BytesRef();
|
||||
private final FST.BytesReader fstReader;
|
||||
|
||||
@SuppressWarnings("unchecked") private FST.Arc<BytesRef>[] arcs = new FST.Arc[1];
|
||||
|
||||
|
@ -1196,6 +1207,12 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
// Used to hold seek by TermState, or cached seek
|
||||
staticFrame = new Frame(-1);
|
||||
|
||||
if (index == null) {
|
||||
fstReader = null;
|
||||
} else {
|
||||
fstReader = index.getBytesReader(0);
|
||||
}
|
||||
|
||||
// Init w/ root block; don't use index since it may
|
||||
// not (and need not) have been loaded
|
||||
for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
|
||||
|
@ -1581,7 +1598,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
|
||||
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
|
||||
|
||||
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto));
|
||||
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);
|
||||
|
||||
if (nextArc == null) {
|
||||
|
||||
|
@ -1838,7 +1855,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
|
||||
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
|
||||
|
||||
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto));
|
||||
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);
|
||||
|
||||
if (nextArc == null) {
|
||||
|
||||
|
|
|
@ -288,7 +288,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||
final Builder<BytesRef> indexBuilder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1,
|
||||
0, 0, true, false, Integer.MAX_VALUE,
|
||||
outputs, null);
|
||||
outputs, null, false);
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" compile index for prefix=" + prefix);
|
||||
//}
|
||||
|
@ -831,7 +831,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
0, 0, true,
|
||||
true, Integer.MAX_VALUE,
|
||||
noOutputs,
|
||||
new FindBlocks());
|
||||
new FindBlocks(), false);
|
||||
|
||||
postingsWriter.setField(fieldInfo);
|
||||
}
|
||||
|
|
|
@ -229,7 +229,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
////System.out.println("VGW: field=" + fieldInfo.name);
|
||||
|
||||
// Always put empty string in
|
||||
fstBuilder.add(new IntsRef(), fstOutputs.get(termsFilePointer));
|
||||
fstBuilder.add(new IntsRef(), termsFilePointer);
|
||||
startTermsFilePointer = termsFilePointer;
|
||||
}
|
||||
|
||||
|
@ -260,7 +260,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
final int lengthSave = text.length;
|
||||
text.length = indexedTermPrefixLength(lastTerm, text);
|
||||
try {
|
||||
fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), fstOutputs.get(termsFilePointer));
|
||||
fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
|
||||
} finally {
|
||||
text.length = lengthSave;
|
||||
}
|
||||
|
|
|
@ -521,9 +521,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
private void loadTerms() throws IOException {
|
||||
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
|
||||
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
|
||||
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
|
||||
new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
|
||||
new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
|
||||
final PairOutputs<Long,Long> outputsInner = new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs);
|
||||
final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
|
||||
outputsInner);
|
||||
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
||||
in.seek(termsStart);
|
||||
final BytesRef lastTerm = new BytesRef(10);
|
||||
|
@ -536,9 +537,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
SimpleTextUtil.readLine(in, scratch);
|
||||
if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
|
||||
if (lastDocsStart != -1) {
|
||||
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
|
||||
new PairOutputs.Pair<Long,Long>((long) docFreq,
|
||||
posIntOutputs.get(totalTermFreq))));
|
||||
b.add(Util.toIntsRef(lastTerm, scratchIntsRef),
|
||||
outputs.newPair(lastDocsStart,
|
||||
outputsInner.newPair((long) docFreq, totalTermFreq)));
|
||||
sumTotalTermFreq += totalTermFreq;
|
||||
}
|
||||
break;
|
||||
|
@ -553,9 +554,8 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
|
||||
} else if (StringHelper.startsWith(scratch, TERM)) {
|
||||
if (lastDocsStart != -1) {
|
||||
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
|
||||
new PairOutputs.Pair<Long,Long>((long) docFreq,
|
||||
posIntOutputs.get(totalTermFreq))));
|
||||
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair(lastDocsStart,
|
||||
outputsInner.newPair((long) docFreq, totalTermFreq)));
|
||||
}
|
||||
lastDocsStart = in.getFilePointer();
|
||||
final int len = scratch.length - TERM.length;
|
||||
|
|
|
@ -95,7 +95,7 @@ public final class FixedBitSet extends DocIdSet implements Bits {
|
|||
}
|
||||
|
||||
public boolean get(int index) {
|
||||
assert index >= 0 && index < numBits;
|
||||
assert index >= 0 && index < numBits: "index=" + index;
|
||||
int i = index >> 6; // div 64
|
||||
// signed shift will keep a negative index and force an
|
||||
// array-index-out-of-bounds-exception, removing the need for an explicit check.
|
||||
|
|
|
@ -588,7 +588,7 @@ public final class UnicodeUtil {
|
|||
out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
|
||||
offset += 2;
|
||||
} else {
|
||||
assert b < 0xf8;
|
||||
assert b < 0xf8: "b=" + b;
|
||||
int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
|
||||
offset += 3;
|
||||
if (ch < UNI_MAX_BMP) {
|
||||
|
|
|
@ -17,15 +17,15 @@ package org.apache.lucene.util.fst;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
|
||||
|
||||
/**
|
||||
* Builds a compact FST (maps an IntsRef term to an arbitrary
|
||||
* Builds a minimal FST (maps an IntsRef term to an arbitrary
|
||||
* output) from pre-sorted terms with outputs (the FST
|
||||
* becomes an FSA if you use NoOutputs). The FST is written
|
||||
* on-the-fly into a compact serialized format byte array, which can
|
||||
|
@ -35,12 +35,6 @@ import java.io.IOException;
|
|||
* <p>NOTE: The algorithm is described at
|
||||
* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698</p>
|
||||
*
|
||||
* If your outputs are ByteSequenceOutput then the final FST
|
||||
* will be minimal, but if you use PositiveIntOutput then
|
||||
* it's only "near minimal". For example, aa/0, aab/1, bbb/2
|
||||
* will produce 6 states when a 5 state fst is also
|
||||
* possible.
|
||||
*
|
||||
* The parameterized type T is the output type. See the
|
||||
* subclasses of {@link Outputs}.
|
||||
*
|
||||
|
@ -52,7 +46,7 @@ public class Builder<T> {
|
|||
private final FST<T> fst;
|
||||
private final T NO_OUTPUT;
|
||||
|
||||
// private static final boolean DEBUG = false;
|
||||
// private static final boolean DEBUG = true;
|
||||
|
||||
// simplistic pruning: we prune node (and all following
|
||||
// nodes) if less than this number of terms go through it:
|
||||
|
@ -88,7 +82,7 @@ public class Builder<T> {
|
|||
* pruning options turned off.
|
||||
*/
|
||||
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
|
||||
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null);
|
||||
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -127,16 +121,20 @@ public class Builder<T> {
|
|||
* @param outputs The output type for each input sequence. Applies only if building an FST. For
|
||||
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
|
||||
* singleton output object.
|
||||
*
|
||||
* @param willPackFST Pass true if you will rewrite (compact) the FST before saving. This
|
||||
* causes the FST to create additional data structures internally to facilitate rewriting, but
|
||||
* it means the resulting FST cannot be saved: it must first be rewritten using {@link FST#FST(FST,int[])}
|
||||
*/
|
||||
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
|
||||
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
|
||||
FreezeTail<T> freezeTail) {
|
||||
FreezeTail<T> freezeTail, boolean willPackFST) {
|
||||
this.minSuffixCount1 = minSuffixCount1;
|
||||
this.minSuffixCount2 = minSuffixCount2;
|
||||
this.freezeTail = freezeTail;
|
||||
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
|
||||
this.shareMaxTailLength = shareMaxTailLength;
|
||||
fst = new FST<T>(inputType, outputs);
|
||||
fst = new FST<T>(inputType, outputs, willPackFST);
|
||||
if (doShareSuffix) {
|
||||
dedupHash = new NodeHash<T>(fst);
|
||||
} else {
|
||||
|
@ -170,23 +168,23 @@ public class Builder<T> {
|
|||
fst.setAllowArrayArcs(b);
|
||||
}
|
||||
|
||||
private CompiledNode compileNode(UnCompiledNode<T> n, int tailLength) throws IOException {
|
||||
final int address;
|
||||
if (dedupHash != null && (doShareNonSingletonNodes || n.numArcs <= 1) && tailLength <= shareMaxTailLength) {
|
||||
if (n.numArcs == 0) {
|
||||
address = fst.addNode(n);
|
||||
private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throws IOException {
|
||||
final int node;
|
||||
if (dedupHash != null && (doShareNonSingletonNodes || nodeIn.numArcs <= 1) && tailLength <= shareMaxTailLength) {
|
||||
if (nodeIn.numArcs == 0) {
|
||||
node = fst.addNode(nodeIn);
|
||||
} else {
|
||||
address = dedupHash.add(n);
|
||||
node = dedupHash.add(nodeIn);
|
||||
}
|
||||
} else {
|
||||
address = fst.addNode(n);
|
||||
node = fst.addNode(nodeIn);
|
||||
}
|
||||
assert address != -2;
|
||||
assert node != -2;
|
||||
|
||||
n.clear();
|
||||
nodeIn.clear();
|
||||
|
||||
final CompiledNode fn = new CompiledNode();
|
||||
fn.address = address;
|
||||
fn.node = node;
|
||||
return fn;
|
||||
}
|
||||
|
||||
|
@ -319,6 +317,11 @@ public class Builder<T> {
|
|||
}
|
||||
*/
|
||||
|
||||
// De-dup NO_OUTPUT since it must be a singleton:
|
||||
if (output.equals(NO_OUTPUT)) {
|
||||
output = NO_OUTPUT;
|
||||
}
|
||||
|
||||
assert lastInput.length == 0 || input.compareTo(lastInput) >= 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input;
|
||||
assert validOutput(output);
|
||||
|
||||
|
@ -443,7 +446,7 @@ public class Builder<T> {
|
|||
}
|
||||
}
|
||||
//if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output);
|
||||
fst.finish(compileNode(root, lastInput.length).address);
|
||||
fst.finish(compileNode(root, lastInput.length).node);
|
||||
|
||||
return fst;
|
||||
}
|
||||
|
@ -480,7 +483,7 @@ public class Builder<T> {
|
|||
}
|
||||
|
||||
static final class CompiledNode implements Node {
|
||||
int address;
|
||||
int node;
|
||||
public boolean isCompiled() {
|
||||
return true;
|
||||
}
|
||||
|
@ -560,7 +563,7 @@ public class Builder<T> {
|
|||
final Arc<T> arc = arcs[numArcs-1];
|
||||
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;
|
||||
arc.target = target;
|
||||
//assert target.address != -2;
|
||||
//assert target.node != -2;
|
||||
arc.nextFinalOutput = nextFinalOutput;
|
||||
arc.isFinal = isFinal;
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -151,7 +151,8 @@ abstract class FSTEnum<T> {
|
|||
boolean found = false;
|
||||
while (low <= high) {
|
||||
mid = (low + high) >>> 1;
|
||||
in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
|
||||
in.pos = arc.posArcsStart;
|
||||
in.skip(arc.bytesPerArc*mid+1);
|
||||
final int midLabel = fst.readLabel(in);
|
||||
final int cmp = midLabel - targetLabel;
|
||||
//System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
|
||||
|
@ -275,7 +276,7 @@ abstract class FSTEnum<T> {
|
|||
|
||||
// Now scan forward, matching the new suffix of the target
|
||||
while(true) {
|
||||
//System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast());
|
||||
//System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast() + " bba=" + arc.bytesPerArc);
|
||||
|
||||
if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
|
||||
// Arcs are fixed array -- use binary search to find
|
||||
|
@ -289,15 +290,16 @@ abstract class FSTEnum<T> {
|
|||
boolean found = false;
|
||||
while (low <= high) {
|
||||
mid = (low + high) >>> 1;
|
||||
in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
|
||||
in.pos = arc.posArcsStart;
|
||||
in.skip(arc.bytesPerArc*mid+1);
|
||||
final int midLabel = fst.readLabel(in);
|
||||
final int cmp = midLabel - targetLabel;
|
||||
//System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
|
||||
if (cmp < 0)
|
||||
if (cmp < 0) {
|
||||
low = mid + 1;
|
||||
else if (cmp > 0)
|
||||
} else if (cmp > 0) {
|
||||
high = mid - 1;
|
||||
else {
|
||||
} else {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
|
@ -430,9 +432,11 @@ abstract class FSTEnum<T> {
|
|||
FST.Arc<T> arc = getArc(upto-1);
|
||||
int targetLabel = getTargetLabel();
|
||||
|
||||
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||
|
||||
while(true) {
|
||||
//System.out.println(" cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel));
|
||||
final FST.Arc<T> nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto));
|
||||
final FST.Arc<T> nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto), fstReader);
|
||||
if (nextArc == null) {
|
||||
// short circuit
|
||||
//upto--;
|
||||
|
|
|
@ -35,7 +35,7 @@ final class NodeHash<T> {
|
|||
}
|
||||
|
||||
private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address, FST.BytesReader in) throws IOException {
|
||||
fst.readFirstRealArc(address, scratchArc, in);
|
||||
fst.readFirstRealTargetArc(address, scratchArc, in);
|
||||
if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) {
|
||||
return false;
|
||||
}
|
||||
|
@ -43,7 +43,7 @@ final class NodeHash<T> {
|
|||
final Builder.Arc<T> arc = node.arcs[arcUpto];
|
||||
if (arc.label != scratchArc.label ||
|
||||
!arc.output.equals(scratchArc.output) ||
|
||||
((Builder.CompiledNode) arc.target).address != scratchArc.target ||
|
||||
((Builder.CompiledNode) arc.target).node != scratchArc.target ||
|
||||
!arc.nextFinalOutput.equals(scratchArc.nextFinalOutput) ||
|
||||
arc.isFinal != scratchArc.isFinal()) {
|
||||
return false;
|
||||
|
@ -71,9 +71,9 @@ final class NodeHash<T> {
|
|||
// TODO: maybe if number of arcs is high we can safely subsample?
|
||||
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
|
||||
final Builder.Arc<T> arc = node.arcs[arcIdx];
|
||||
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).address + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
|
||||
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
|
||||
h = PRIME * h + arc.label;
|
||||
h = PRIME * h + ((Builder.CompiledNode) arc.target).address;
|
||||
h = PRIME * h + ((Builder.CompiledNode) arc.target).node;
|
||||
h = PRIME * h + arc.output.hashCode();
|
||||
h = PRIME * h + arc.nextFinalOutput.hashCode();
|
||||
if (arc.isFinal) {
|
||||
|
@ -88,9 +88,9 @@ final class NodeHash<T> {
|
|||
private int hash(int node) throws IOException {
|
||||
final int PRIME = 31;
|
||||
final FST.BytesReader in = fst.getBytesReader(0);
|
||||
//System.out.println("hash frozen");
|
||||
//System.out.println("hash frozen node=" + node);
|
||||
int h = 0;
|
||||
fst.readFirstRealArc(node, scratchArc, in);
|
||||
fst.readFirstRealTargetArc(node, scratchArc, in);
|
||||
while(true) {
|
||||
//System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal());
|
||||
h = PRIME * h + scratchArc.label;
|
||||
|
@ -109,26 +109,26 @@ final class NodeHash<T> {
|
|||
return h & Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
public int add(Builder.UnCompiledNode<T> node) throws IOException {
|
||||
public int add(Builder.UnCompiledNode<T> nodeIn) throws IOException {
|
||||
// System.out.println("hash: add count=" + count + " vs " + table.length);
|
||||
final FST.BytesReader in = fst.getBytesReader(0);
|
||||
final int h = hash(node);
|
||||
final int h = hash(nodeIn);
|
||||
int pos = h & mask;
|
||||
int c = 0;
|
||||
while(true) {
|
||||
final int v = table[pos];
|
||||
if (v == 0) {
|
||||
// freeze & add
|
||||
final int address = fst.addNode(node);
|
||||
//System.out.println(" now freeze addr=" + address);
|
||||
assert hash(address) == h : "frozenHash=" + hash(address) + " vs h=" + h;
|
||||
final int node = fst.addNode(nodeIn);
|
||||
//System.out.println(" now freeze node=" + node);
|
||||
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
|
||||
count++;
|
||||
table[pos] = address;
|
||||
table[pos] = node;
|
||||
if (table.length < 2*count) {
|
||||
rehash();
|
||||
}
|
||||
return address;
|
||||
} else if (nodesEqual(node, v, in)) {
|
||||
return node;
|
||||
} else if (nodesEqual(nodeIn, v, in)) {
|
||||
// same node is already here
|
||||
return v;
|
||||
}
|
||||
|
|
|
@ -26,6 +26,10 @@ import org.apache.lucene.store.DataOutput;
|
|||
* Represents the outputs for an FST, providing the basic
|
||||
* algebra needed for the FST.
|
||||
*
|
||||
* <p>Note that any operation that returns NO_OUTPUT must
|
||||
* return the same singleton object from {@link
|
||||
* #getNoOutput}.</p>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
||||
|
@ -56,6 +60,8 @@ public abstract class Outputs<T> {
|
|||
|
||||
public abstract String outputToString(T output);
|
||||
|
||||
// TODO: maybe make valid(T output) public...? for asserts
|
||||
|
||||
public T merge(T first, T second) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
|
|
@ -38,7 +38,8 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
|
|||
public final A output1;
|
||||
public final B output2;
|
||||
|
||||
public Pair(A output1, B output2) {
|
||||
// use newPair
|
||||
private Pair(A output1, B output2) {
|
||||
this.output1 = output1;
|
||||
this.output2 = output2;
|
||||
}
|
||||
|
@ -66,35 +67,79 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
|
|||
this.outputs2 = outputs2;
|
||||
NO_OUTPUT = new Pair<A,B>(outputs1.getNoOutput(), outputs2.getNoOutput());
|
||||
}
|
||||
|
||||
public Pair<A,B> get(A output1, B output2) {
|
||||
if (output1 == outputs1.getNoOutput() && output2 == outputs2.getNoOutput()) {
|
||||
|
||||
/** Create a new Pair */
|
||||
public Pair<A,B> newPair(A a, B b) {
|
||||
if (a.equals(outputs1.getNoOutput())) {
|
||||
a = outputs1.getNoOutput();
|
||||
}
|
||||
if (b.equals(outputs2.getNoOutput())) {
|
||||
b = outputs2.getNoOutput();
|
||||
}
|
||||
|
||||
if (a == outputs1.getNoOutput() && b == outputs2.getNoOutput()) {
|
||||
return NO_OUTPUT;
|
||||
} else {
|
||||
return new Pair<A,B>(output1, output2);
|
||||
final Pair<A,B> p = new Pair<A,B>(a, b);
|
||||
assert valid(p);
|
||||
return p;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// for assert
|
||||
private boolean valid(Pair<A,B> pair) {
|
||||
final boolean noOutput1 = pair.output1.equals(outputs1.getNoOutput());
|
||||
final boolean noOutput2 = pair.output2.equals(outputs2.getNoOutput());
|
||||
|
||||
if (noOutput1 && pair.output1 != outputs1.getNoOutput()) {
|
||||
System.out.println("invalid0");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (noOutput2 && pair.output2 != outputs2.getNoOutput()) {
|
||||
System.out.println("invalid1");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (noOutput1 && noOutput2) {
|
||||
if (pair != NO_OUTPUT) {
|
||||
System.out.println("invalid2");
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Pair<A,B> common(Pair<A,B> pair1, Pair<A,B> pair2) {
|
||||
return get(outputs1.common(pair1.output1, pair2.output1),
|
||||
outputs2.common(pair1.output2, pair2.output2));
|
||||
assert valid(pair1);
|
||||
assert valid(pair2);
|
||||
return newPair(outputs1.common(pair1.output1, pair2.output1),
|
||||
outputs2.common(pair1.output2, pair2.output2));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Pair<A,B> subtract(Pair<A,B> output, Pair<A,B> inc) {
|
||||
return get(outputs1.subtract(output.output1, inc.output1),
|
||||
outputs2.subtract(output.output2, inc.output2));
|
||||
assert valid(output);
|
||||
assert valid(inc);
|
||||
return newPair(outputs1.subtract(output.output1, inc.output1),
|
||||
outputs2.subtract(output.output2, inc.output2));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Pair<A,B> add(Pair<A,B> prefix, Pair<A,B> output) {
|
||||
return get(outputs1.add(prefix.output1, output.output1),
|
||||
outputs2.add(prefix.output2, output.output2));
|
||||
assert valid(prefix);
|
||||
assert valid(output);
|
||||
return newPair(outputs1.add(prefix.output1, output.output1),
|
||||
outputs2.add(prefix.output2, output.output2));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Pair<A,B> output, DataOutput writer) throws IOException {
|
||||
assert valid(output);
|
||||
outputs1.write(output.output1, writer);
|
||||
outputs2.write(output.output2, writer);
|
||||
}
|
||||
|
@ -103,7 +148,7 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
|
|||
public Pair<A,B> read(DataInput in) throws IOException {
|
||||
A output1 = outputs1.read(in);
|
||||
B output2 = outputs2.read(in);
|
||||
return get(output1, output2);
|
||||
return newPair(output1, output2);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -113,6 +158,12 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
|
|||
|
||||
@Override
|
||||
public String outputToString(Pair<A,B> output) {
|
||||
assert valid(output);
|
||||
return "<pair:" + outputs1.outputToString(output.output1) + "," + outputs2.outputToString(output.output2) + ">";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "PairOutputs<" + outputs1 + "," + outputs2 + ">";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,10 +25,7 @@ import org.apache.lucene.store.DataOutput;
|
|||
/**
|
||||
* Output is a long, for each input term. NOTE: the
|
||||
* resulting FST is not guaranteed to be minimal! See
|
||||
* {@link Builder}. You must use {@link #get} to obtain the
|
||||
* output for a given long value -- do not use autoboxing
|
||||
* nor create your own Long instance (the value 0
|
||||
* must map to the {@link #getNoOutput} singleton).
|
||||
* {@link Builder}.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
@ -50,14 +47,6 @@ public final class PositiveIntOutputs extends Outputs<Long> {
|
|||
return doShare ? singletonShare : singletonNoShare;
|
||||
}
|
||||
|
||||
public Long get(long v) {
|
||||
if (v == 0) {
|
||||
return NO_OUTPUT;
|
||||
} else {
|
||||
return Long.valueOf(v);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long common(Long output1, Long output2) {
|
||||
assert valid(output1);
|
||||
|
|
|
@ -37,23 +37,21 @@ public final class Util {
|
|||
// TODO: would be nice not to alloc this on every lookup
|
||||
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
|
||||
|
||||
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||
|
||||
// Accumulate output as we go
|
||||
final T NO_OUTPUT = fst.outputs.getNoOutput();
|
||||
T output = NO_OUTPUT;
|
||||
T output = fst.outputs.getNoOutput();
|
||||
for(int i=0;i<input.length;i++) {
|
||||
if (fst.findTargetArc(input.ints[input.offset + i], arc, arc) == null) {
|
||||
if (fst.findTargetArc(input.ints[input.offset + i], arc, arc, fstReader) == null) {
|
||||
return null;
|
||||
} else if (arc.output != NO_OUTPUT) {
|
||||
output = fst.outputs.add(output, arc.output);
|
||||
}
|
||||
output = fst.outputs.add(output, arc.output);
|
||||
}
|
||||
|
||||
if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
|
||||
return null;
|
||||
} else if (arc.output != NO_OUTPUT) {
|
||||
return fst.outputs.add(output, arc.output);
|
||||
if (arc.isFinal()) {
|
||||
return fst.outputs.add(output, arc.nextFinalOutput);
|
||||
} else {
|
||||
return output;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -64,26 +62,24 @@ public final class Util {
|
|||
public static<T> T get(FST<T> fst, BytesRef input) throws IOException {
|
||||
assert fst.inputType == FST.INPUT_TYPE.BYTE1;
|
||||
|
||||
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||
|
||||
// TODO: would be nice not to alloc this on every lookup
|
||||
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
|
||||
|
||||
// Accumulate output as we go
|
||||
final T NO_OUTPUT = fst.outputs.getNoOutput();
|
||||
T output = NO_OUTPUT;
|
||||
T output = fst.outputs.getNoOutput();
|
||||
for(int i=0;i<input.length;i++) {
|
||||
if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc) == null) {
|
||||
if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc, fstReader) == null) {
|
||||
return null;
|
||||
} else if (arc.output != NO_OUTPUT) {
|
||||
output = fst.outputs.add(output, arc.output);
|
||||
}
|
||||
output = fst.outputs.add(output, arc.output);
|
||||
}
|
||||
|
||||
if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
|
||||
return null;
|
||||
} else if (arc.output != NO_OUTPUT) {
|
||||
return fst.outputs.add(output, arc.output);
|
||||
if (arc.isFinal()) {
|
||||
return fst.outputs.add(output, arc.nextFinalOutput);
|
||||
} else {
|
||||
return output;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -142,7 +138,7 @@ public final class Util {
|
|||
result.grow(1+upto);
|
||||
}
|
||||
|
||||
fst.readFirstRealArc(arc.target, arc, in);
|
||||
fst.readFirstRealTargetArc(arc.target, arc, in);
|
||||
|
||||
FST.Arc<Long> prevArc = null;
|
||||
|
||||
|
@ -238,6 +234,7 @@ public final class Util {
|
|||
// A queue of transitions to consider when processing the next level.
|
||||
final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>();
|
||||
nextLevelQueue.add(startArc);
|
||||
//System.out.println("toDot: startArc: " + startArc);
|
||||
|
||||
// A list of states on the same level (for ranking).
|
||||
final List<Integer> sameLevelStates = new ArrayList<Integer>();
|
||||
|
@ -289,8 +286,11 @@ public final class Util {
|
|||
|
||||
int level = 0;
|
||||
|
||||
final FST.BytesReader r = fst.getBytesReader(0);
|
||||
|
||||
while (!nextLevelQueue.isEmpty()) {
|
||||
// we could double buffer here, but it doesn't matter probably.
|
||||
//System.out.println("next level=" + level);
|
||||
thisLevelQueue.addAll(nextLevelQueue);
|
||||
nextLevelQueue.clear();
|
||||
|
||||
|
@ -298,19 +298,19 @@ public final class Util {
|
|||
out.write("\n // Transitions and states at level: " + level + "\n");
|
||||
while (!thisLevelQueue.isEmpty()) {
|
||||
final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
|
||||
//System.out.println(" pop: " + arc);
|
||||
if (fst.targetHasArcs(arc)) {
|
||||
// scan all arcs
|
||||
// scan all target arcs
|
||||
//System.out.println(" readFirstTarget...");
|
||||
final int node = arc.target;
|
||||
fst.readFirstTargetArc(arc, arc);
|
||||
|
||||
if (arc.label == FST.END_LABEL) {
|
||||
// Skip it -- prior recursion took this into account already
|
||||
assert !arc.isLast();
|
||||
fst.readNextArc(arc);
|
||||
}
|
||||
fst.readFirstRealTargetArc(arc.target, arc, r);
|
||||
|
||||
//System.out.println(" firstTarget: " + arc);
|
||||
|
||||
while (true) {
|
||||
|
||||
//System.out.println(" cycle arc=" + arc);
|
||||
// Emit the unseen state and add it to the queue for the next level.
|
||||
if (arc.target >= 0 && !seen.get(arc.target)) {
|
||||
|
||||
|
@ -329,7 +329,7 @@ public final class Util {
|
|||
if (fst.isExpandedTarget(arc)) {
|
||||
stateColor = expandedNodeColor;
|
||||
} else {
|
||||
stateColor = null;
|
||||
stateColor = null;
|
||||
}
|
||||
|
||||
final String finalOutput;
|
||||
|
@ -339,7 +339,9 @@ public final class Util {
|
|||
finalOutput = "";
|
||||
}
|
||||
|
||||
emitDotState(out, Integer.toString(arc.target), arc.isFinal() ? finalStateShape : stateShape, stateColor, finalOutput);
|
||||
emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, finalOutput);
|
||||
// To see the node address, use this instead:
|
||||
//emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target));
|
||||
seen.set(arc.target);
|
||||
nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
|
||||
sameLevelStates.add(arc.target);
|
||||
|
@ -362,14 +364,22 @@ public final class Util {
|
|||
outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]";
|
||||
}
|
||||
|
||||
final String arcColor;
|
||||
if (arc.flag(FST.BIT_TARGET_NEXT)) {
|
||||
arcColor = "red";
|
||||
} else {
|
||||
arcColor = "black";
|
||||
}
|
||||
|
||||
assert arc.label != FST.END_LABEL;
|
||||
out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"]\n");
|
||||
out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "" ) + " color=\"" + arcColor + "\"]\n");
|
||||
|
||||
// Break the loop if we're on the last arc of this state.
|
||||
if (arc.isLast()) {
|
||||
//System.out.println(" break");
|
||||
break;
|
||||
}
|
||||
fst.readNextArc(arc);
|
||||
fst.readNextRealArc(arc, r);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -89,11 +89,11 @@ public class TestFSTs extends LuceneTestCase {
|
|||
return br;
|
||||
}
|
||||
|
||||
private static IntsRef toIntsRef(String s, int inputMode) {
|
||||
static IntsRef toIntsRef(String s, int inputMode) {
|
||||
return toIntsRef(s, inputMode, new IntsRef(10));
|
||||
}
|
||||
|
||||
private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
|
||||
static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
|
||||
if (inputMode == 0) {
|
||||
// utf8
|
||||
return toIntsRef(new BytesRef(s), ir);
|
||||
|
@ -103,7 +103,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
private static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
|
||||
static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
|
||||
final int charLength = s.length();
|
||||
int charIdx = 0;
|
||||
int intIdx = 0;
|
||||
|
@ -120,7 +120,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
return ir;
|
||||
}
|
||||
|
||||
private static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
|
||||
static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
|
||||
if (br.length > ir.ints.length) {
|
||||
ir.grow(br.length);
|
||||
}
|
||||
|
@ -172,7 +172,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms2.length);
|
||||
for(int idx=0;idx<terms2.length;idx++) {
|
||||
pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
|
||||
pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], (long) idx));
|
||||
}
|
||||
final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false);
|
||||
assertNotNull(fst);
|
||||
|
@ -230,7 +230,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
|
||||
for(int idx=0;idx<terms.length;idx++) {
|
||||
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx)));
|
||||
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], (long) idx));
|
||||
}
|
||||
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest();
|
||||
}
|
||||
|
@ -244,7 +244,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
for(int idx=0;idx<terms.length;idx++) {
|
||||
final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
|
||||
lastOutput = value;
|
||||
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value)));
|
||||
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], value));
|
||||
}
|
||||
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, doShare).doTest();
|
||||
}
|
||||
|
@ -254,7 +254,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
|
||||
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
|
||||
for(int idx=0;idx<terms.length;idx++) {
|
||||
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE));
|
||||
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], random.nextLong() & Long.MAX_VALUE));
|
||||
}
|
||||
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, false).doTest();
|
||||
}
|
||||
|
@ -270,8 +270,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
|
||||
lastOutput = value;
|
||||
pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx],
|
||||
outputs.get(o1.get(idx),
|
||||
o2.get(value))));
|
||||
outputs.newPair((long) idx, value)));
|
||||
}
|
||||
new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs, false).doTest();
|
||||
}
|
||||
|
@ -393,6 +392,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
|
||||
final T NO_OUTPUT = fst.outputs.getNoOutput();
|
||||
T output = NO_OUTPUT;
|
||||
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||
|
||||
for(int i=0;i<=term.length;i++) {
|
||||
final int label;
|
||||
|
@ -401,8 +401,9 @@ public class TestFSTs extends LuceneTestCase {
|
|||
} else {
|
||||
label = term.ints[term.offset+i];
|
||||
}
|
||||
//System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
|
||||
if (fst.findTargetArc(label, arc, arc) == null) {
|
||||
// System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
|
||||
if (fst.findTargetArc(label, arc, arc, fstReader) == null) {
|
||||
// System.out.println(" not found");
|
||||
if (prefixLength != null) {
|
||||
prefixLength[0] = i;
|
||||
return output;
|
||||
|
@ -462,16 +463,19 @@ public class TestFSTs extends LuceneTestCase {
|
|||
|
||||
FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2);
|
||||
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
|
||||
}
|
||||
|
||||
final boolean willRewrite = random.nextBoolean();
|
||||
|
||||
final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
|
||||
prune1, prune2,
|
||||
prune1==0 && prune2==0,
|
||||
allowRandomSuffixSharing ? random.nextBoolean() : true,
|
||||
allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
|
||||
outputs,
|
||||
null);
|
||||
null,
|
||||
willRewrite);
|
||||
|
||||
for(InputOutput<T> pair : pairs) {
|
||||
if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
|
||||
|
@ -486,7 +490,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
}
|
||||
FST<T> fst = builder.finish();
|
||||
|
||||
if (random.nextBoolean() && fst != null) {
|
||||
if (random.nextBoolean() && fst != null && !willRewrite) {
|
||||
TestFSTs t = new TestFSTs();
|
||||
IOContext context = t.newIOContext(random);
|
||||
IndexOutput out = dir.createOutput("fst.bin", context);
|
||||
|
@ -522,6 +526,21 @@ public class TestFSTs extends LuceneTestCase {
|
|||
verifyPruned(inputMode, fst, prune1, prune2);
|
||||
}
|
||||
|
||||
if (willRewrite && fst != null) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: now rewrite");
|
||||
}
|
||||
final FST<T> packed =fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000));
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: now verify packed FST");
|
||||
}
|
||||
if (prune1 == 0 && prune2 == 0) {
|
||||
verifyUnPruned(inputMode, packed);
|
||||
} else {
|
||||
verifyPruned(inputMode, packed, prune1, prune2);
|
||||
}
|
||||
}
|
||||
|
||||
return fst;
|
||||
}
|
||||
|
||||
|
@ -638,7 +657,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
num = atLeast(100);
|
||||
for(int iter=0;iter<num;iter++) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: iter=" + iter);
|
||||
System.out.println(" iter=" + iter);
|
||||
}
|
||||
if (random.nextBoolean()) {
|
||||
// seek to term that doesn't exist:
|
||||
|
@ -866,7 +885,15 @@ public class TestFSTs extends LuceneTestCase {
|
|||
prefixes.put(IntsRef.deepCopyOf(scratch), cmo);
|
||||
} else {
|
||||
cmo.count++;
|
||||
cmo.output = outputs.common(cmo.output, pair.output);
|
||||
T output1 = cmo.output;
|
||||
if (output1.equals(outputs.getNoOutput())) {
|
||||
output1 = outputs.getNoOutput();
|
||||
}
|
||||
T output2 = pair.output;
|
||||
if (output2.equals(outputs.getNoOutput())) {
|
||||
output2 = outputs.getNoOutput();
|
||||
}
|
||||
cmo.output = outputs.common(output1, output2);
|
||||
}
|
||||
if (idx == pair.input.length) {
|
||||
cmo.isFinal = true;
|
||||
|
@ -992,7 +1019,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
|
||||
public void testRandomWords() throws IOException {
|
||||
testRandomWords(1000, atLeast(2));
|
||||
//testRandomWords(20, 100);
|
||||
//testRandomWords(100, 1);
|
||||
}
|
||||
|
||||
String inputModeToString(int mode) {
|
||||
|
@ -1055,50 +1082,6 @@ public class TestFSTs extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
// NOTE: this test shows a case where our current builder
|
||||
// fails to produce minimal FST:
|
||||
/*
|
||||
public void test3() throws Exception {
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
IntsRef scratchIntsRef = new IntsRef();
|
||||
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.get(0));
|
||||
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), 1L);
|
||||
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), 2L);
|
||||
final FST<Long> fst = builder.finish();
|
||||
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
|
||||
// NOTE: we produce 7 nodes today
|
||||
assertEquals(6, fst.getNodeCount());
|
||||
// NOTE: we produce 8 arcs today
|
||||
assertEquals(7, fst.getNodeCount());
|
||||
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
|
||||
//Util.toDot(fst, w, false, false);
|
||||
//w.close();
|
||||
}
|
||||
*/
|
||||
|
||||
// NOTE: this test shows a case where our current builder
|
||||
// fails to produce minimal FST:
|
||||
/*
|
||||
public void test4() throws Exception {
|
||||
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||
Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
IntsRef scratchIntsRef = new IntsRef();
|
||||
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.getNoOutput());
|
||||
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), new BytesRef("1"));
|
||||
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), new BytesRef("11"));
|
||||
final FST<BytesRef> fst = builder.finish();
|
||||
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
|
||||
// NOTE: we produce 7 nodes today
|
||||
assertEquals(6, fst.getNodeCount());
|
||||
// NOTE: we produce 8 arcs today
|
||||
assertEquals(7, fst.getNodeCount());
|
||||
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
|
||||
//Util.toDot(fst, w, false, false);
|
||||
//w.close();
|
||||
}
|
||||
*/
|
||||
|
||||
// Build FST for all unique terms in the test line docs
|
||||
// file, up until a time limit
|
||||
public void testRealTerms() throws Exception {
|
||||
|
@ -1126,7 +1109,10 @@ public class TestFSTs extends LuceneTestCase {
|
|||
IndexReader r = IndexReader.open(writer, true);
|
||||
writer.close();
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
|
||||
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
|
||||
final boolean doRewrite = random.nextBoolean();
|
||||
|
||||
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doRewrite);
|
||||
|
||||
boolean storeOrd = random.nextBoolean();
|
||||
if (VERBOSE) {
|
||||
|
@ -1162,59 +1148,69 @@ public class TestFSTs extends LuceneTestCase {
|
|||
} else {
|
||||
output = termsEnum.docFreq();
|
||||
}
|
||||
builder.add(Util.toIntsRef(term, scratchIntsRef), outputs.get(output));
|
||||
builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
|
||||
ord++;
|
||||
if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
|
||||
System.out.println(ord + " terms...");
|
||||
}
|
||||
}
|
||||
final FST<Long> fst = builder.finish();
|
||||
FST<Long> fst = builder.finish();
|
||||
if (VERBOSE) {
|
||||
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes");
|
||||
}
|
||||
|
||||
if (ord > 0) {
|
||||
// Now confirm BytesRefFSTEnum and TermsEnum act the
|
||||
// same:
|
||||
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
|
||||
int num = atLeast(1000);
|
||||
for(int iter=0;iter<num;iter++) {
|
||||
final BytesRef randomTerm = new BytesRef(getRandomString());
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
|
||||
for(int rewriteIter=0;rewriteIter<2;rewriteIter++) {
|
||||
if (rewriteIter == 1) {
|
||||
if (doRewrite) {
|
||||
// Verify again, with packed FST:
|
||||
fst = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Now confirm BytesRefFSTEnum and TermsEnum act the
|
||||
// same:
|
||||
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
|
||||
int num = atLeast(1000);
|
||||
for(int iter=0;iter<num;iter++) {
|
||||
final BytesRef randomTerm = new BytesRef(getRandomString());
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
|
||||
}
|
||||
|
||||
final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
|
||||
final BytesRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(randomTerm);
|
||||
final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
|
||||
final BytesRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(randomTerm);
|
||||
|
||||
if (seekResult == TermsEnum.SeekStatus.END) {
|
||||
assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
|
||||
} else {
|
||||
assertSame(termsEnum, fstEnum, storeOrd);
|
||||
for(int nextIter=0;nextIter<10;nextIter++) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: next");
|
||||
if (storeOrd) {
|
||||
System.out.println(" ord=" + termsEnum.ord());
|
||||
}
|
||||
}
|
||||
if (termsEnum.next() != null) {
|
||||
if (seekResult == TermsEnum.SeekStatus.END) {
|
||||
assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
|
||||
} else {
|
||||
assertSame(termsEnum, fstEnum, storeOrd);
|
||||
for(int nextIter=0;nextIter<10;nextIter++) {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" term=" + termsEnum.term().utf8ToString());
|
||||
System.out.println("TEST: next");
|
||||
if (storeOrd) {
|
||||
System.out.println(" ord=" + termsEnum.ord());
|
||||
}
|
||||
}
|
||||
assertNotNull(fstEnum.next());
|
||||
assertSame(termsEnum, fstEnum, storeOrd);
|
||||
} else {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" end!");
|
||||
if (termsEnum.next() != null) {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" term=" + termsEnum.term().utf8ToString());
|
||||
}
|
||||
assertNotNull(fstEnum.next());
|
||||
assertSame(termsEnum, fstEnum, storeOrd);
|
||||
} else {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" end!");
|
||||
}
|
||||
BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
|
||||
if (nextResult != null) {
|
||||
System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
|
||||
fail();
|
||||
}
|
||||
break;
|
||||
}
|
||||
BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
|
||||
if (nextResult != null) {
|
||||
System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
|
||||
fail();
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1248,14 +1244,17 @@ public class TestFSTs extends LuceneTestCase {
|
|||
private int inputMode;
|
||||
private final Outputs<T> outputs;
|
||||
private final Builder<T> builder;
|
||||
private final boolean doPack;
|
||||
|
||||
public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs) {
|
||||
public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean doPack, boolean noArcArrays) {
|
||||
this.dirOut = dirOut;
|
||||
this.wordsFileIn = wordsFileIn;
|
||||
this.inputMode = inputMode;
|
||||
this.outputs = outputs;
|
||||
|
||||
builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null);
|
||||
this.doPack = doPack;
|
||||
|
||||
builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null, doPack);
|
||||
builder.setAllowArrayArcs(!noArcArrays);
|
||||
}
|
||||
|
||||
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
|
||||
|
@ -1287,14 +1286,15 @@ public class TestFSTs extends LuceneTestCase {
|
|||
}
|
||||
|
||||
assert builder.getTermCount() == ord;
|
||||
final FST<T> fst = builder.finish();
|
||||
FST<T> fst = builder.finish();
|
||||
if (fst == null) {
|
||||
System.out.println("FST was fully pruned!");
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
if (dirOut == null)
|
||||
if (dirOut == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
|
||||
if (fst.getNodeCount() < 100) {
|
||||
|
@ -1304,12 +1304,17 @@ public class TestFSTs extends LuceneTestCase {
|
|||
System.out.println("Wrote FST to out.dot");
|
||||
}
|
||||
|
||||
Directory dir = FSDirectory.open(new File(dirOut));
|
||||
IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT);
|
||||
fst.save(out);
|
||||
out.close();
|
||||
|
||||
System.out.println("Saved FST to fst.bin.");
|
||||
if (doPack) {
|
||||
System.out.println("Pack...");
|
||||
fst = fst.pack(4, 100000000);
|
||||
System.out.println("New size " + fst.sizeInBytes() + " bytes");
|
||||
} else {
|
||||
Directory dir = FSDirectory.open(new File(dirOut));
|
||||
IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT);
|
||||
fst.save(out);
|
||||
out.close();
|
||||
System.out.println("Saved FST to fst.bin.");
|
||||
}
|
||||
|
||||
if (!verify) {
|
||||
return;
|
||||
|
@ -1317,45 +1322,50 @@ public class TestFSTs extends LuceneTestCase {
|
|||
|
||||
System.out.println("\nNow verify...");
|
||||
|
||||
is.close();
|
||||
is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
|
||||
|
||||
ord = 0;
|
||||
tStart = System.currentTimeMillis();
|
||||
while(true) {
|
||||
String w = is.readLine();
|
||||
if (w == null) {
|
||||
break;
|
||||
}
|
||||
toIntsRef(w, inputMode, intsRef);
|
||||
T expected = getOutput(intsRef, ord);
|
||||
T actual = Util.get(fst, intsRef);
|
||||
if (actual == null) {
|
||||
throw new RuntimeException("unexpected null output on input=" + w);
|
||||
}
|
||||
if (!actual.equals(expected)) {
|
||||
throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
|
||||
is.close();
|
||||
is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
|
||||
|
||||
ord = 0;
|
||||
tStart = System.currentTimeMillis();
|
||||
while(true) {
|
||||
String w = is.readLine();
|
||||
if (w == null) {
|
||||
break;
|
||||
}
|
||||
toIntsRef(w, inputMode, intsRef);
|
||||
T expected = getOutput(intsRef, ord);
|
||||
T actual = Util.get(fst, intsRef);
|
||||
if (actual == null) {
|
||||
throw new RuntimeException("unexpected null output on input=" + w);
|
||||
}
|
||||
if (!actual.equals(expected)) {
|
||||
throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
|
||||
}
|
||||
|
||||
ord++;
|
||||
if (ord % 500000 == 0) {
|
||||
System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
|
||||
}
|
||||
if (ord >= limit) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
ord++;
|
||||
if (ord % 500000 == 0) {
|
||||
System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
|
||||
}
|
||||
if (ord >= limit) {
|
||||
break;
|
||||
}
|
||||
double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
|
||||
System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
|
||||
|
||||
// NOTE: comment out to profile lookup...
|
||||
break;
|
||||
}
|
||||
|
||||
double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
|
||||
System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
|
||||
|
||||
} finally {
|
||||
is.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out
|
||||
// java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
|
||||
public static void main(String[] args) throws IOException {
|
||||
int prune = 0;
|
||||
int limit = Integer.MAX_VALUE;
|
||||
|
@ -1363,7 +1373,8 @@ public class TestFSTs extends LuceneTestCase {
|
|||
boolean storeOrds = false;
|
||||
boolean storeDocFreqs = false;
|
||||
boolean verify = true;
|
||||
|
||||
boolean doPack = false;
|
||||
boolean noArcArrays = false;
|
||||
String wordsFileIn = null;
|
||||
String dirOut = null;
|
||||
|
||||
|
@ -1381,10 +1392,14 @@ public class TestFSTs extends LuceneTestCase {
|
|||
inputMode = 1;
|
||||
} else if (args[idx].equals("-docFreq")) {
|
||||
storeDocFreqs = true;
|
||||
} else if (args[idx].equals("-noArcArrays")) {
|
||||
noArcArrays = true;
|
||||
} else if (args[idx].equals("-ords")) {
|
||||
storeOrds = true;
|
||||
} else if (args[idx].equals("-noverify")) {
|
||||
verify = false;
|
||||
} else if (args[idx].equals("-pack")) {
|
||||
doPack = true;
|
||||
} else if (args[idx].startsWith("-")) {
|
||||
System.err.println("Unrecognized option: " + args[idx]);
|
||||
System.exit(-1);
|
||||
|
@ -1413,44 +1428,44 @@ public class TestFSTs extends LuceneTestCase {
|
|||
final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true);
|
||||
final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false);
|
||||
final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2);
|
||||
new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs) {
|
||||
new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
|
||||
Random rand;
|
||||
@Override
|
||||
public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
|
||||
if (ord == 0) {
|
||||
rand = new Random(17);
|
||||
}
|
||||
return new PairOutputs.Pair<Long,Long>(o1.get(ord),
|
||||
o2.get(_TestUtil.nextInt(rand, 1, 5000)));
|
||||
return outputs.newPair((long) ord,
|
||||
(long) _TestUtil.nextInt(rand, 1, 5000));
|
||||
}
|
||||
}.run(limit, verify);
|
||||
} else if (storeOrds) {
|
||||
// Store only ords
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
|
||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
|
||||
@Override
|
||||
public Long getOutput(IntsRef input, int ord) {
|
||||
return outputs.get(ord);
|
||||
return (long) ord;
|
||||
}
|
||||
}.run(limit, verify);
|
||||
} else if (storeDocFreqs) {
|
||||
// Store only docFreq
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
|
||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
|
||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
|
||||
Random rand;
|
||||
@Override
|
||||
public Long getOutput(IntsRef input, int ord) {
|
||||
if (ord == 0) {
|
||||
rand = new Random(17);
|
||||
}
|
||||
return outputs.get(_TestUtil.nextInt(rand, 1, 5000));
|
||||
return (long) _TestUtil.nextInt(rand, 1, 5000);
|
||||
}
|
||||
}.run(limit, verify);
|
||||
} else {
|
||||
// Store nothing
|
||||
final NoOutputs outputs = NoOutputs.getSingleton();
|
||||
final Object NO_OUTPUT = outputs.getNoOutput();
|
||||
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs) {
|
||||
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
|
||||
@Override
|
||||
public Object getOutput(IntsRef input, int ord) {
|
||||
return NO_OUTPUT;
|
||||
|
@ -1468,6 +1483,46 @@ public class TestFSTs extends LuceneTestCase {
|
|||
assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
|
||||
}
|
||||
|
||||
/*
|
||||
public void testTrivial() throws Exception {
|
||||
|
||||
// Get outputs -- passing true means FST will share
|
||||
// (delta code) the outputs. This should result in
|
||||
// smaller FST if the outputs grow monotonically. But
|
||||
// if numbers are "random", false should give smaller
|
||||
// final size:
|
||||
final NoOutputs outputs = NoOutputs.getSingleton();
|
||||
|
||||
String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
|
||||
|
||||
final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE1,
|
||||
0, 0,
|
||||
true,
|
||||
true,
|
||||
Integer.MAX_VALUE,
|
||||
outputs,
|
||||
null,
|
||||
true);
|
||||
Arrays.sort(strings);
|
||||
final IntsRef scratch = new IntsRef();
|
||||
for(String s : strings) {
|
||||
builder.add(Util.toIntsRef(new BytesRef(s), scratch), outputs.getNoOutput());
|
||||
}
|
||||
final FST<Object> fst = builder.finish();
|
||||
System.out.println("DOT before rewrite");
|
||||
Writer w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/before.dot"));
|
||||
Util.toDot(fst, w, false, false);
|
||||
w.close();
|
||||
|
||||
final FST<Object> rewrite = new FST<Object>(fst, 1, 100);
|
||||
|
||||
System.out.println("DOT after rewrite");
|
||||
w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/after.dot"));
|
||||
Util.toDot(rewrite, w, false, false);
|
||||
w.close();
|
||||
}
|
||||
*/
|
||||
|
||||
public void testSimple() throws Exception {
|
||||
|
||||
// Get outputs -- passing true means FST will share
|
||||
|
@ -1484,9 +1539,9 @@ public class TestFSTs extends LuceneTestCase {
|
|||
final BytesRef b = new BytesRef("b");
|
||||
final BytesRef c = new BytesRef("c");
|
||||
|
||||
builder.add(Util.toIntsRef(a, new IntsRef()), outputs.get(17));
|
||||
builder.add(Util.toIntsRef(b, new IntsRef()), outputs.get(42));
|
||||
builder.add(Util.toIntsRef(c, new IntsRef()), outputs.get(13824324872317238L));
|
||||
builder.add(Util.toIntsRef(a, new IntsRef()), 17L);
|
||||
builder.add(Util.toIntsRef(b, new IntsRef()), 42L);
|
||||
builder.add(Util.toIntsRef(c, new IntsRef()), 13824324872317238L);
|
||||
|
||||
final FST<Long> fst = builder.finish();
|
||||
|
||||
|
@ -1795,11 +1850,11 @@ public class TestFSTs extends LuceneTestCase {
|
|||
public void testFinalOutputOnEndState() throws Exception {
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||
|
||||
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null);
|
||||
builder.add(Util.toUTF32("stat", new IntsRef()), outputs.get(17));
|
||||
builder.add(Util.toUTF32("station", new IntsRef()), outputs.get(10));
|
||||
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null, random.nextBoolean());
|
||||
builder.add(Util.toUTF32("stat", new IntsRef()), 17L);
|
||||
builder.add(Util.toUTF32("station", new IntsRef()), 10L);
|
||||
final FST<Long> fst = builder.finish();
|
||||
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
|
||||
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
|
||||
StringWriter w = new StringWriter();
|
||||
Util.toDot(fst, w, false, false);
|
||||
w.close();
|
||||
|
@ -1809,8 +1864,8 @@ public class TestFSTs extends LuceneTestCase {
|
|||
|
||||
public void testInternalFinalState() throws Exception {
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||
|
||||
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null);
|
||||
final boolean willRewrite = random.nextBoolean();
|
||||
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite);
|
||||
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput());
|
||||
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput());
|
||||
final FST<Long> fst = builder.finish();
|
||||
|
@ -1819,17 +1874,23 @@ public class TestFSTs extends LuceneTestCase {
|
|||
Util.toDot(fst, w, false, false);
|
||||
w.close();
|
||||
//System.out.println(w.toString());
|
||||
assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1);
|
||||
final String expected;
|
||||
if (willRewrite) {
|
||||
expected = "4 -> 3 [label=\"t\" style=\"bold\"";
|
||||
} else {
|
||||
expected = "8 -> 6 [label=\"t\" style=\"bold\"";
|
||||
}
|
||||
assertTrue(w.toString().indexOf(expected) != -1);
|
||||
}
|
||||
|
||||
// Make sure raw FST can differentiate between final vs
|
||||
// non-final end nodes
|
||||
public void testNonFinalStopNodes() throws Exception {
|
||||
public void testNonFinalStopNode() throws Exception {
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||
final Long nothing = outputs.getNoOutput();
|
||||
final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
|
||||
final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs, false);
|
||||
|
||||
final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0);
|
||||
|
||||
|
@ -1839,8 +1900,8 @@ public class TestFSTs extends LuceneTestCase {
|
|||
node.isFinal = true;
|
||||
rootNode.addArc('a', node);
|
||||
final Builder.CompiledNode frozen = new Builder.CompiledNode();
|
||||
frozen.address = fst.addNode(node);
|
||||
rootNode.arcs[0].nextFinalOutput = outputs.get(17);
|
||||
frozen.node = fst.addNode(node);
|
||||
rootNode.arcs[0].nextFinalOutput = 17L;
|
||||
rootNode.arcs[0].isFinal = true;
|
||||
rootNode.arcs[0].output = nothing;
|
||||
rootNode.arcs[0].target = frozen;
|
||||
|
@ -1851,13 +1912,18 @@ public class TestFSTs extends LuceneTestCase {
|
|||
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
|
||||
rootNode.addArc('b', node);
|
||||
final Builder.CompiledNode frozen = new Builder.CompiledNode();
|
||||
frozen.address = fst.addNode(node);
|
||||
frozen.node = fst.addNode(node);
|
||||
rootNode.arcs[1].nextFinalOutput = nothing;
|
||||
rootNode.arcs[1].output = outputs.get(42);
|
||||
rootNode.arcs[1].output = 42L;
|
||||
rootNode.arcs[1].target = frozen;
|
||||
}
|
||||
|
||||
fst.finish(fst.addNode(rootNode));
|
||||
|
||||
StringWriter w = new StringWriter();
|
||||
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
|
||||
Util.toDot(fst, w, false, false);
|
||||
w.close();
|
||||
|
||||
checkStopNodes(fst, outputs);
|
||||
|
||||
|
|
|
@ -226,6 +226,9 @@ public final class SynonymFilter extends TokenFilter {
|
|||
|
||||
private final FST<BytesRef> fst;
|
||||
|
||||
private final FST.BytesReader fstReader;
|
||||
|
||||
|
||||
private final BytesRef scratchBytes = new BytesRef();
|
||||
private final CharsRef scratchChars = new CharsRef();
|
||||
|
||||
|
@ -241,7 +244,7 @@ public final class SynonymFilter extends TokenFilter {
|
|||
this.synonyms = synonyms;
|
||||
this.ignoreCase = ignoreCase;
|
||||
this.fst = synonyms.fst;
|
||||
|
||||
this.fstReader = fst.getBytesReader(0);
|
||||
if (fst == null) {
|
||||
throw new IllegalArgumentException("fst must be non-null");
|
||||
}
|
||||
|
@ -366,7 +369,7 @@ public final class SynonymFilter extends TokenFilter {
|
|||
int bufUpto = 0;
|
||||
while(bufUpto < bufferLen) {
|
||||
final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
|
||||
if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) {
|
||||
if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
|
||||
//System.out.println(" stop");
|
||||
break byToken;
|
||||
}
|
||||
|
@ -388,7 +391,7 @@ public final class SynonymFilter extends TokenFilter {
|
|||
|
||||
// See if the FST wants to continue matching (ie, needs to
|
||||
// see the next input token):
|
||||
if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) {
|
||||
if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
|
||||
// No further rules can match here; we're done
|
||||
// searching for matching rules starting at the
|
||||
// current input position.
|
||||
|
|
|
@ -47,16 +47,17 @@ public final class TokenInfoFST {
|
|||
FST.Arc<Long> firstArc = new FST.Arc<Long>();
|
||||
fst.getFirstArc(firstArc);
|
||||
FST.Arc<Long> arc = new FST.Arc<Long>();
|
||||
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||
// TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs)
|
||||
for (int i = 0; i < rootCache.length; i++) {
|
||||
if (fst.findTargetArc(0x3040 + i, firstArc, arc) != null) {
|
||||
if (fst.findTargetArc(0x3040 + i, firstArc, arc, fstReader) != null) {
|
||||
rootCache[i] = new FST.Arc<Long>().copyFrom(arc);
|
||||
}
|
||||
}
|
||||
return rootCache;
|
||||
}
|
||||
|
||||
public FST.Arc<Long> findTargetArc(int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache) throws IOException {
|
||||
public FST.Arc<Long> findTargetArc(int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache, FST.BytesReader fstReader) throws IOException {
|
||||
if (useCache && ch >= 0x3040 && ch <= cacheCeiling) {
|
||||
assert ch != FST.END_LABEL;
|
||||
final Arc<Long> result = rootCache[ch - 0x3040];
|
||||
|
@ -67,13 +68,17 @@ public final class TokenInfoFST {
|
|||
return arc;
|
||||
}
|
||||
} else {
|
||||
return fst.findTargetArc(ch, follow, arc);
|
||||
return fst.findTargetArc(ch, follow, arc, fstReader);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Thin pass-through to the wrapped {@code fst}: forwards {@code arc} to
 * {@code fst.getFirstArc(arc)} and hands back that call's result unchanged.
 *
 * @param arc arc instance passed straight to the underlying FST
 *            (presumably filled in by that call -- confirm against FST)
 * @return whatever {@code fst.getFirstArc(arc)} returns
 */
public Arc<Long> getFirstArc(FST.Arc<Long> arc) {
|
||||
return fst.getFirstArc(arc);
|
||||
}
|
||||
|
||||
/**
 * Thin pass-through to the wrapped {@code fst}: returns the result of
 * {@code fst.getBytesReader(pos)} so callers can obtain a reader without
 * touching the underlying FST directly.
 *
 * @param pos value forwarded unchanged to {@code fst.getBytesReader}
 *            (presumably a starting position; callers visible in this
 *            change pass 0 -- confirm semantics against FST)
 * @return the {@code FST.BytesReader} produced by the underlying FST
 */
public FST.BytesReader getBytesReader(int pos) {
|
||||
return fst.getBytesReader(pos);
|
||||
}
|
||||
|
||||
/** @lucene.internal for testing only */
|
||||
FST<Long> getInternalFST() {
|
||||
|
|
|
@ -113,7 +113,7 @@ public final class UserDictionary implements Dictionary {
|
|||
for (int i = 0; i < token.length(); i++) {
|
||||
scratch.ints[i] = (int) token.charAt(i);
|
||||
}
|
||||
fstBuilder.add(scratch, fstOutput.get(ord));
|
||||
fstBuilder.add(scratch, ord);
|
||||
segmentations.add(wordIdAndLength);
|
||||
ord++;
|
||||
}
|
||||
|
@ -134,6 +134,8 @@ public final class UserDictionary implements Dictionary {
|
|||
TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
|
||||
boolean found = false; // true if we found any results
|
||||
|
||||
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||
|
||||
FST.Arc<Long> arc = new FST.Arc<Long>();
|
||||
int end = off + len;
|
||||
for (int startOffset = off; startOffset < end; startOffset++) {
|
||||
|
@ -142,7 +144,7 @@ public final class UserDictionary implements Dictionary {
|
|||
int remaining = end - startOffset;
|
||||
for (int i = 0; i < remaining; i++) {
|
||||
int ch = chars[startOffset+i];
|
||||
if (fst.findTargetArc(ch, arc, arc, i == 0) == null) {
|
||||
if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
|
||||
break; // continue to next position
|
||||
}
|
||||
output += arc.output.intValue();
|
||||
|
|
|
@ -35,7 +35,7 @@ import org.apache.lucene.util.fst.FST;
|
|||
public class Viterbi {
|
||||
|
||||
private final TokenInfoFST fst;
|
||||
|
||||
|
||||
private final TokenInfoDictionary dictionary;
|
||||
|
||||
private final UnknownDictionary unkDictionary;
|
||||
|
@ -214,6 +214,8 @@ public class Viterbi {
|
|||
ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
|
||||
addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
|
||||
|
||||
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||
|
||||
// Process user dictionary;
|
||||
if (useUserDictionary) {
|
||||
processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
|
||||
|
@ -238,7 +240,7 @@ public class Viterbi {
|
|||
for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
|
||||
int ch = text[suffixStart + endIndex - 1];
|
||||
|
||||
if (fst.findTargetArc(ch, arc, arc, endIndex == 1) == null) {
|
||||
if (fst.findTargetArc(ch, arc, arc, endIndex == 1, fstReader) == null) {
|
||||
break; // continue to next position
|
||||
}
|
||||
output += arc.output.intValue();
|
||||
|
|
Binary file not shown.
|
@ -131,7 +131,7 @@ public class TokenInfoDictionaryBuilder {
|
|||
System.out.println(" encode...");
|
||||
|
||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true);
|
||||
Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
||||
Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true);
|
||||
IntsRef scratch = new IntsRef();
|
||||
long ord = -1; // first ord will be 0
|
||||
String lastValue = null;
|
||||
|
@ -155,13 +155,14 @@ public class TokenInfoDictionaryBuilder {
|
|||
for (int i = 0; i < token.length(); i++) {
|
||||
scratch.ints[i] = (int) token.charAt(i);
|
||||
}
|
||||
fstBuilder.add(scratch, fstOutput.get(ord));
|
||||
fstBuilder.add(scratch, ord);
|
||||
}
|
||||
dictionary.addMapping((int)ord, offset);
|
||||
offset = next;
|
||||
}
|
||||
|
||||
FST<Long> fst = fstBuilder.finish();
|
||||
final FST<Long> fst = fstBuilder.finish().pack(2, 100000);
|
||||
|
||||
System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... ");
|
||||
dictionary.setFST(fst);
|
||||
System.out.println(" done");
|
||||
|
|
|
@ -329,8 +329,11 @@ public class FSTCompletion {
|
|||
private boolean descendWithPrefix(Arc<Object> arc, BytesRef utf8)
|
||||
throws IOException {
|
||||
final int max = utf8.offset + utf8.length;
|
||||
// Cannot save as instance var since multiple threads
|
||||
// can use FSTCompletion at once...
|
||||
final FST.BytesReader fstReader = automaton.getBytesReader(0);
|
||||
for (int i = utf8.offset; i < max; i++) {
|
||||
if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc) == null) {
|
||||
if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc, fstReader) == null) {
|
||||
// No matching prefixes, return an empty result.
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -234,7 +234,7 @@ public class FSTCompletionBuilder {
|
|||
final Object empty = outputs.getNoOutput();
|
||||
final Builder<Object> builder = new Builder<Object>(
|
||||
FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
|
||||
shareMaxTailLength, outputs, null);
|
||||
shareMaxTailLength, outputs, null, false);
|
||||
|
||||
BytesRef scratch = new BytesRef();
|
||||
final IntsRef scratchIntsRef = new IntsRef();
|
||||
|
|
Loading…
Reference in New Issue