LUCENE-3725: add optional packing to FSTs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1237500 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-01-29 22:48:45 +00:00
parent 2e5be2f75c
commit d1165b1972
24 changed files with 1361 additions and 422 deletions

View File

@ -806,6 +806,9 @@ New Features
* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML * LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
markup. (Steve Rowe) markup. (Steve Rowe)
* LUCENE-3725: Added optional packing to FST building; this uses extra
RAM during building but results in a smaller FST. (Mike McCandless)
Bug fixes Bug fixes

View File

@ -398,7 +398,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
final long indexStartFP; final long indexStartFP;
final long rootBlockFP; final long rootBlockFP;
final BytesRef rootCode; final BytesRef rootCode;
private FST<BytesRef> index; private final FST<BytesRef> index;
//private boolean DEBUG; //private boolean DEBUG;
@ -433,6 +433,8 @@ public class BlockTreeTermsReader extends FieldsProducer {
w.close(); w.close();
} }
*/ */
} else {
index = null;
} }
} }
@ -495,6 +497,8 @@ public class BlockTreeTermsReader extends FieldsProducer {
private final BytesRef term = new BytesRef(); private final BytesRef term = new BytesRef();
private final FST.BytesReader fstReader;
// TODO: can we share this with the frame in STE? // TODO: can we share this with the frame in STE?
private final class Frame { private final class Frame {
final int ord; final int ord;
@ -755,6 +759,12 @@ public class BlockTreeTermsReader extends FieldsProducer {
arcs[arcIdx] = new FST.Arc<BytesRef>(); arcs[arcIdx] = new FST.Arc<BytesRef>();
} }
if (index == null) {
fstReader = null;
} else {
fstReader = index.getBytesReader(0);
}
// TODO: if the automaton is "smallish" we really // TODO: if the automaton is "smallish" we really
// should use the terms index to seek at least to // should use the terms index to seek at least to
// the initial term and likely to subsequent terms // the initial term and likely to subsequent terms
@ -842,7 +852,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
// TODO: we could be more efficient for the next() // TODO: we could be more efficient for the next()
// case by using current arc as starting point, // case by using current arc as starting point,
// passed to findTargetArc // passed to findTargetArc
arc = index.findTargetArc(target, arc, getArc(1+idx)); arc = index.findTargetArc(target, arc, getArc(1+idx), fstReader);
assert arc != null; assert arc != null;
output = fstOutputs.add(output, arc.output); output = fstOutputs.add(output, arc.output);
idx++; idx++;
@ -1186,6 +1196,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
private boolean eof; private boolean eof;
final BytesRef term = new BytesRef(); final BytesRef term = new BytesRef();
private final FST.BytesReader fstReader;
@SuppressWarnings("unchecked") private FST.Arc<BytesRef>[] arcs = new FST.Arc[1]; @SuppressWarnings("unchecked") private FST.Arc<BytesRef>[] arcs = new FST.Arc[1];
@ -1196,6 +1207,12 @@ public class BlockTreeTermsReader extends FieldsProducer {
// Used to hold seek by TermState, or cached seek // Used to hold seek by TermState, or cached seek
staticFrame = new Frame(-1); staticFrame = new Frame(-1);
if (index == null) {
fstReader = null;
} else {
fstReader = index.getBytesReader(0);
}
// Init w/ root block; don't use index since it may // Init w/ root block; don't use index since it may
// not (and need not) have been loaded // not (and need not) have been loaded
for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) { for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
@ -1581,7 +1598,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto)); final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);
if (nextArc == null) { if (nextArc == null) {
@ -1838,7 +1855,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto)); final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);
if (nextArc == null) { if (nextArc == null) {

View File

@ -288,7 +288,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> indexBuilder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, final Builder<BytesRef> indexBuilder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE, 0, 0, true, false, Integer.MAX_VALUE,
outputs, null); outputs, null, false);
//if (DEBUG) { //if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix); // System.out.println(" compile index for prefix=" + prefix);
//} //}
@ -831,7 +831,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
0, 0, true, 0, 0, true,
true, Integer.MAX_VALUE, true, Integer.MAX_VALUE,
noOutputs, noOutputs,
new FindBlocks()); new FindBlocks(), false);
postingsWriter.setField(fieldInfo); postingsWriter.setField(fieldInfo);
} }

View File

@ -229,7 +229,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
////System.out.println("VGW: field=" + fieldInfo.name); ////System.out.println("VGW: field=" + fieldInfo.name);
// Always put empty string in // Always put empty string in
fstBuilder.add(new IntsRef(), fstOutputs.get(termsFilePointer)); fstBuilder.add(new IntsRef(), termsFilePointer);
startTermsFilePointer = termsFilePointer; startTermsFilePointer = termsFilePointer;
} }
@ -260,7 +260,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
final int lengthSave = text.length; final int lengthSave = text.length;
text.length = indexedTermPrefixLength(lastTerm, text); text.length = indexedTermPrefixLength(lastTerm, text);
try { try {
fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), fstOutputs.get(termsFilePointer)); fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
} finally { } finally {
text.length = lengthSave; text.length = lengthSave;
} }

View File

@ -521,9 +521,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
private void loadTerms() throws IOException { private void loadTerms() throws IOException {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b; final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1, final PairOutputs<Long,Long> outputsInner = new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs);
new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs, final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs))); outputsInner);
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1, outputs);
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart); in.seek(termsStart);
final BytesRef lastTerm = new BytesRef(10); final BytesRef lastTerm = new BytesRef(10);
@ -536,9 +537,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
SimpleTextUtil.readLine(in, scratch); SimpleTextUtil.readLine(in, scratch);
if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) { if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
if (lastDocsStart != -1) { if (lastDocsStart != -1) {
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart, b.add(Util.toIntsRef(lastTerm, scratchIntsRef),
new PairOutputs.Pair<Long,Long>((long) docFreq, outputs.newPair(lastDocsStart,
posIntOutputs.get(totalTermFreq)))); outputsInner.newPair((long) docFreq, totalTermFreq)));
sumTotalTermFreq += totalTermFreq; sumTotalTermFreq += totalTermFreq;
} }
break; break;
@ -553,9 +554,8 @@ class SimpleTextFieldsReader extends FieldsProducer {
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
} else if (StringHelper.startsWith(scratch, TERM)) { } else if (StringHelper.startsWith(scratch, TERM)) {
if (lastDocsStart != -1) { if (lastDocsStart != -1) {
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart, b.add(Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair(lastDocsStart,
new PairOutputs.Pair<Long,Long>((long) docFreq, outputsInner.newPair((long) docFreq, totalTermFreq)));
posIntOutputs.get(totalTermFreq))));
} }
lastDocsStart = in.getFilePointer(); lastDocsStart = in.getFilePointer();
final int len = scratch.length - TERM.length; final int len = scratch.length - TERM.length;

View File

@ -95,7 +95,7 @@ public final class FixedBitSet extends DocIdSet implements Bits {
} }
public boolean get(int index) { public boolean get(int index) {
assert index >= 0 && index < numBits; assert index >= 0 && index < numBits: "index=" + index;
int i = index >> 6; // div 64 int i = index >> 6; // div 64
// signed shift will keep a negative index and force an // signed shift will keep a negative index and force an
// array-index-out-of-bounds-exception, removing the need for an explicit check. // array-index-out-of-bounds-exception, removing the need for an explicit check.

View File

@ -588,7 +588,7 @@ public final class UnicodeUtil {
out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f)); out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
offset += 2; offset += 2;
} else { } else {
assert b < 0xf8; assert b < 0xf8: "b=" + b;
int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f); int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
offset += 3; offset += 3;
if (ch < UNI_MAX_BMP) { if (ch < UNI_MAX_BMP) {

View File

@ -17,15 +17,15 @@ package org.apache.lucene.util.fst;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
/** /**
* Builds a compact FST (maps an IntsRef term to an arbitrary * Builds a minimal FST (maps an IntsRef term to an arbitrary
* output) from pre-sorted terms with outputs (the FST * output) from pre-sorted terms with outputs (the FST
* becomes an FSA if you use NoOutputs). The FST is written * becomes an FSA if you use NoOutputs). The FST is written
* on-the-fly into a compact serialized format byte array, which can * on-the-fly into a compact serialized format byte array, which can
@ -35,12 +35,6 @@ import java.io.IOException;
* <p>NOTE: The algorithm is described at * <p>NOTE: The algorithm is described at
* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698</p> * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698</p>
* *
* If your outputs are ByteSequenceOutput then the final FST
* will be minimal, but if you use PositiveIntOutput then
* it's only "near minimal". For example, aa/0, aab/1, bbb/2
* will produce 6 states when a 5 state fst is also
* possible.
*
* The parameterized type T is the output type. See the * The parameterized type T is the output type. See the
* subclasses of {@link Outputs}. * subclasses of {@link Outputs}.
* *
@ -52,7 +46,7 @@ public class Builder<T> {
private final FST<T> fst; private final FST<T> fst;
private final T NO_OUTPUT; private final T NO_OUTPUT;
// private static final boolean DEBUG = false; // private static final boolean DEBUG = true;
// simplistic pruning: we prune node (and all following // simplistic pruning: we prune node (and all following
// nodes) if less than this number of terms go through it: // nodes) if less than this number of terms go through it:
@ -88,7 +82,7 @@ public class Builder<T> {
* pruning options turned off. * pruning options turned off.
*/ */
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) { public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null); this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false);
} }
/** /**
@ -127,16 +121,20 @@ public class Builder<T> {
* @param outputs The output type for each input sequence. Applies only if building an FST. For * @param outputs The output type for each input sequence. Applies only if building an FST. For
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the * FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
* singleton output object. * singleton output object.
*
* @param willPackFST Pass true if you will rewrite (compact) the FST before saving. This
* causes the FST to create additional data structures internally to facilitate rewriting, but
* it means the resulting FST cannot be saved: it must first be rewritten using {@link FST#FST(FST,int[])}.
*/ */
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs, boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
FreezeTail<T> freezeTail) { FreezeTail<T> freezeTail, boolean willPackFST) {
this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2; this.minSuffixCount2 = minSuffixCount2;
this.freezeTail = freezeTail; this.freezeTail = freezeTail;
this.doShareNonSingletonNodes = doShareNonSingletonNodes; this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength; this.shareMaxTailLength = shareMaxTailLength;
fst = new FST<T>(inputType, outputs); fst = new FST<T>(inputType, outputs, willPackFST);
if (doShareSuffix) { if (doShareSuffix) {
dedupHash = new NodeHash<T>(fst); dedupHash = new NodeHash<T>(fst);
} else { } else {
@ -170,23 +168,23 @@ public class Builder<T> {
fst.setAllowArrayArcs(b); fst.setAllowArrayArcs(b);
} }
private CompiledNode compileNode(UnCompiledNode<T> n, int tailLength) throws IOException { private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throws IOException {
final int address; final int node;
if (dedupHash != null && (doShareNonSingletonNodes || n.numArcs <= 1) && tailLength <= shareMaxTailLength) { if (dedupHash != null && (doShareNonSingletonNodes || nodeIn.numArcs <= 1) && tailLength <= shareMaxTailLength) {
if (n.numArcs == 0) { if (nodeIn.numArcs == 0) {
address = fst.addNode(n); node = fst.addNode(nodeIn);
} else { } else {
address = dedupHash.add(n); node = dedupHash.add(nodeIn);
} }
} else { } else {
address = fst.addNode(n); node = fst.addNode(nodeIn);
} }
assert address != -2; assert node != -2;
n.clear(); nodeIn.clear();
final CompiledNode fn = new CompiledNode(); final CompiledNode fn = new CompiledNode();
fn.address = address; fn.node = node;
return fn; return fn;
} }
@ -319,6 +317,11 @@ public class Builder<T> {
} }
*/ */
// De-dup NO_OUTPUT since it must be a singleton:
if (output.equals(NO_OUTPUT)) {
output = NO_OUTPUT;
}
assert lastInput.length == 0 || input.compareTo(lastInput) >= 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input; assert lastInput.length == 0 || input.compareTo(lastInput) >= 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input;
assert validOutput(output); assert validOutput(output);
@ -443,7 +446,7 @@ public class Builder<T> {
} }
} }
//if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output); //if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output);
fst.finish(compileNode(root, lastInput.length).address); fst.finish(compileNode(root, lastInput.length).node);
return fst; return fst;
} }
@ -480,7 +483,7 @@ public class Builder<T> {
} }
static final class CompiledNode implements Node { static final class CompiledNode implements Node {
int address; int node;
public boolean isCompiled() { public boolean isCompiled() {
return true; return true;
} }
@ -560,7 +563,7 @@ public class Builder<T> {
final Arc<T> arc = arcs[numArcs-1]; final Arc<T> arc = arcs[numArcs-1];
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch; assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;
arc.target = target; arc.target = target;
//assert target.address != -2; //assert target.node != -2;
arc.nextFinalOutput = nextFinalOutput; arc.nextFinalOutput = nextFinalOutput;
arc.isFinal = isFinal; arc.isFinal = isFinal;
} }

File diff suppressed because it is too large Load Diff

View File

@ -151,7 +151,8 @@ abstract class FSTEnum<T> {
boolean found = false; boolean found = false;
while (low <= high) { while (low <= high) {
mid = (low + high) >>> 1; mid = (low + high) >>> 1;
in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; in.pos = arc.posArcsStart;
in.skip(arc.bytesPerArc*mid+1);
final int midLabel = fst.readLabel(in); final int midLabel = fst.readLabel(in);
final int cmp = midLabel - targetLabel; final int cmp = midLabel - targetLabel;
//System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
@ -275,7 +276,7 @@ abstract class FSTEnum<T> {
// Now scan forward, matching the new suffix of the target // Now scan forward, matching the new suffix of the target
while(true) { while(true) {
//System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast()); //System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast() + " bba=" + arc.bytesPerArc);
if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) { if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
// Arcs are fixed array -- use binary search to find // Arcs are fixed array -- use binary search to find
@ -289,15 +290,16 @@ abstract class FSTEnum<T> {
boolean found = false; boolean found = false;
while (low <= high) { while (low <= high) {
mid = (low + high) >>> 1; mid = (low + high) >>> 1;
in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; in.pos = arc.posArcsStart;
in.skip(arc.bytesPerArc*mid+1);
final int midLabel = fst.readLabel(in); final int midLabel = fst.readLabel(in);
final int cmp = midLabel - targetLabel; final int cmp = midLabel - targetLabel;
//System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
if (cmp < 0) if (cmp < 0) {
low = mid + 1; low = mid + 1;
else if (cmp > 0) } else if (cmp > 0) {
high = mid - 1; high = mid - 1;
else { } else {
found = true; found = true;
break; break;
} }
@ -430,9 +432,11 @@ abstract class FSTEnum<T> {
FST.Arc<T> arc = getArc(upto-1); FST.Arc<T> arc = getArc(upto-1);
int targetLabel = getTargetLabel(); int targetLabel = getTargetLabel();
final FST.BytesReader fstReader = fst.getBytesReader(0);
while(true) { while(true) {
//System.out.println(" cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel)); //System.out.println(" cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel));
final FST.Arc<T> nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto)); final FST.Arc<T> nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto), fstReader);
if (nextArc == null) { if (nextArc == null) {
// short circuit // short circuit
//upto--; //upto--;

View File

@ -35,7 +35,7 @@ final class NodeHash<T> {
} }
private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address, FST.BytesReader in) throws IOException { private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address, FST.BytesReader in) throws IOException {
fst.readFirstRealArc(address, scratchArc, in); fst.readFirstRealTargetArc(address, scratchArc, in);
if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) { if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) {
return false; return false;
} }
@ -43,7 +43,7 @@ final class NodeHash<T> {
final Builder.Arc<T> arc = node.arcs[arcUpto]; final Builder.Arc<T> arc = node.arcs[arcUpto];
if (arc.label != scratchArc.label || if (arc.label != scratchArc.label ||
!arc.output.equals(scratchArc.output) || !arc.output.equals(scratchArc.output) ||
((Builder.CompiledNode) arc.target).address != scratchArc.target || ((Builder.CompiledNode) arc.target).node != scratchArc.target ||
!arc.nextFinalOutput.equals(scratchArc.nextFinalOutput) || !arc.nextFinalOutput.equals(scratchArc.nextFinalOutput) ||
arc.isFinal != scratchArc.isFinal()) { arc.isFinal != scratchArc.isFinal()) {
return false; return false;
@ -71,9 +71,9 @@ final class NodeHash<T> {
// TODO: maybe if number of arcs is high we can safely subsample? // TODO: maybe if number of arcs is high we can safely subsample?
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) { for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
final Builder.Arc<T> arc = node.arcs[arcIdx]; final Builder.Arc<T> arc = node.arcs[arcIdx];
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).address + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal); //System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
h = PRIME * h + arc.label; h = PRIME * h + arc.label;
h = PRIME * h + ((Builder.CompiledNode) arc.target).address; h = PRIME * h + ((Builder.CompiledNode) arc.target).node;
h = PRIME * h + arc.output.hashCode(); h = PRIME * h + arc.output.hashCode();
h = PRIME * h + arc.nextFinalOutput.hashCode(); h = PRIME * h + arc.nextFinalOutput.hashCode();
if (arc.isFinal) { if (arc.isFinal) {
@ -88,9 +88,9 @@ final class NodeHash<T> {
private int hash(int node) throws IOException { private int hash(int node) throws IOException {
final int PRIME = 31; final int PRIME = 31;
final FST.BytesReader in = fst.getBytesReader(0); final FST.BytesReader in = fst.getBytesReader(0);
//System.out.println("hash frozen"); //System.out.println("hash frozen node=" + node);
int h = 0; int h = 0;
fst.readFirstRealArc(node, scratchArc, in); fst.readFirstRealTargetArc(node, scratchArc, in);
while(true) { while(true) {
//System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal()); //System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal());
h = PRIME * h + scratchArc.label; h = PRIME * h + scratchArc.label;
@ -109,26 +109,26 @@ final class NodeHash<T> {
return h & Integer.MAX_VALUE; return h & Integer.MAX_VALUE;
} }
public int add(Builder.UnCompiledNode<T> node) throws IOException { public int add(Builder.UnCompiledNode<T> nodeIn) throws IOException {
// System.out.println("hash: add count=" + count + " vs " + table.length); // System.out.println("hash: add count=" + count + " vs " + table.length);
final FST.BytesReader in = fst.getBytesReader(0); final FST.BytesReader in = fst.getBytesReader(0);
final int h = hash(node); final int h = hash(nodeIn);
int pos = h & mask; int pos = h & mask;
int c = 0; int c = 0;
while(true) { while(true) {
final int v = table[pos]; final int v = table[pos];
if (v == 0) { if (v == 0) {
// freeze & add // freeze & add
final int address = fst.addNode(node); final int node = fst.addNode(nodeIn);
//System.out.println(" now freeze addr=" + address); //System.out.println(" now freeze node=" + node);
assert hash(address) == h : "frozenHash=" + hash(address) + " vs h=" + h; assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
count++; count++;
table[pos] = address; table[pos] = node;
if (table.length < 2*count) { if (table.length < 2*count) {
rehash(); rehash();
} }
return address; return node;
} else if (nodesEqual(node, v, in)) { } else if (nodesEqual(nodeIn, v, in)) {
// same node is already here // same node is already here
return v; return v;
} }

View File

@ -26,6 +26,10 @@ import org.apache.lucene.store.DataOutput;
* Represents the outputs for an FST, providing the basic * Represents the outputs for an FST, providing the basic
* algebra needed for the FST. * algebra needed for the FST.
* *
* <p>Note that any operation that returns NO_OUTPUT must
* return the same singleton object from {@link
* #getNoOutput}.</p>
*
* @lucene.experimental * @lucene.experimental
*/ */
@ -56,6 +60,8 @@ public abstract class Outputs<T> {
public abstract String outputToString(T output); public abstract String outputToString(T output);
// TODO: maybe make valid(T output) public...? for asserts
public T merge(T first, T second) { public T merge(T first, T second) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }

View File

@ -38,7 +38,8 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
public final A output1; public final A output1;
public final B output2; public final B output2;
public Pair(A output1, B output2) { // use newPair
private Pair(A output1, B output2) {
this.output1 = output1; this.output1 = output1;
this.output2 = output2; this.output2 = output2;
} }
@ -66,35 +67,79 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
this.outputs2 = outputs2; this.outputs2 = outputs2;
NO_OUTPUT = new Pair<A,B>(outputs1.getNoOutput(), outputs2.getNoOutput()); NO_OUTPUT = new Pair<A,B>(outputs1.getNoOutput(), outputs2.getNoOutput());
} }
public Pair<A,B> get(A output1, B output2) { /** Create a new Pair */
if (output1 == outputs1.getNoOutput() && output2 == outputs2.getNoOutput()) { public Pair<A,B> newPair(A a, B b) {
if (a.equals(outputs1.getNoOutput())) {
a = outputs1.getNoOutput();
}
if (b.equals(outputs2.getNoOutput())) {
b = outputs2.getNoOutput();
}
if (a == outputs1.getNoOutput() && b == outputs2.getNoOutput()) {
return NO_OUTPUT; return NO_OUTPUT;
} else { } else {
return new Pair<A,B>(output1, output2); final Pair<A,B> p = new Pair<A,B>(a, b);
assert valid(p);
return p;
} }
} }
// Used only from assert statements: checks that a Pair honors the
// NO_OUTPUT singleton convention.  Returns false (after printing a short
// "invalidN" tag to System.out identifying which check failed) when:
//   invalid0 - output1 equals() outputs1's no-output value but is not that same instance
//   invalid1 - output2 equals() outputs2's no-output value but is not that same instance
//   invalid2 - both components are no-output but the pair is not this class's NO_OUTPUT
// Returns true otherwise.
private boolean valid(Pair<A,B> pair) {
// Value equality against each component's no-output value
final boolean noOutput1 = pair.output1.equals(outputs1.getNoOutput());
final boolean noOutput2 = pair.output2.equals(outputs2.getNoOutput());
// A component that is equal to no-output must also be identical (==) to it
if (noOutput1 && pair.output1 != outputs1.getNoOutput()) {
System.out.println("invalid0");
return false;
}
if (noOutput2 && pair.output2 != outputs2.getNoOutput()) {
System.out.println("invalid1");
return false;
}
// When both components are no-output, the pair itself must be the NO_OUTPUT instance
if (noOutput1 && noOutput2) {
if (pair != NO_OUTPUT) {
System.out.println("invalid2");
return false;
} else {
return true;
}
} else {
return true;
}
}
@Override @Override
public Pair<A,B> common(Pair<A,B> pair1, Pair<A,B> pair2) { public Pair<A,B> common(Pair<A,B> pair1, Pair<A,B> pair2) {
return get(outputs1.common(pair1.output1, pair2.output1), assert valid(pair1);
outputs2.common(pair1.output2, pair2.output2)); assert valid(pair2);
return newPair(outputs1.common(pair1.output1, pair2.output1),
outputs2.common(pair1.output2, pair2.output2));
} }
@Override @Override
public Pair<A,B> subtract(Pair<A,B> output, Pair<A,B> inc) { public Pair<A,B> subtract(Pair<A,B> output, Pair<A,B> inc) {
return get(outputs1.subtract(output.output1, inc.output1), assert valid(output);
outputs2.subtract(output.output2, inc.output2)); assert valid(inc);
return newPair(outputs1.subtract(output.output1, inc.output1),
outputs2.subtract(output.output2, inc.output2));
} }
@Override @Override
public Pair<A,B> add(Pair<A,B> prefix, Pair<A,B> output) { public Pair<A,B> add(Pair<A,B> prefix, Pair<A,B> output) {
return get(outputs1.add(prefix.output1, output.output1), assert valid(prefix);
outputs2.add(prefix.output2, output.output2)); assert valid(output);
return newPair(outputs1.add(prefix.output1, output.output1),
outputs2.add(prefix.output2, output.output2));
} }
@Override @Override
public void write(Pair<A,B> output, DataOutput writer) throws IOException { public void write(Pair<A,B> output, DataOutput writer) throws IOException {
assert valid(output);
outputs1.write(output.output1, writer); outputs1.write(output.output1, writer);
outputs2.write(output.output2, writer); outputs2.write(output.output2, writer);
} }
@ -103,7 +148,7 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
public Pair<A,B> read(DataInput in) throws IOException { public Pair<A,B> read(DataInput in) throws IOException {
A output1 = outputs1.read(in); A output1 = outputs1.read(in);
B output2 = outputs2.read(in); B output2 = outputs2.read(in);
return get(output1, output2); return newPair(output1, output2);
} }
@Override @Override
@ -113,6 +158,12 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
@Override @Override
public String outputToString(Pair<A,B> output) { public String outputToString(Pair<A,B> output) {
assert valid(output);
return "<pair:" + outputs1.outputToString(output.output1) + "," + outputs2.outputToString(output.output2) + ">"; return "<pair:" + outputs1.outputToString(output.output1) + "," + outputs2.outputToString(output.output2) + ">";
} }
// Describes this PairOutputs by the string forms of its two component Outputs,
// formatted as "PairOutputs<outputs1,outputs2>".
@Override
public String toString() {
return "PairOutputs<" + outputs1 + "," + outputs2 + ">";
}
} }

View File

@ -25,10 +25,7 @@ import org.apache.lucene.store.DataOutput;
/** /**
* Output is a long, for each input term. NOTE: the * Output is a long, for each input term. NOTE: the
* resulting FST is not guaranteed to be minimal! See * resulting FST is not guaranteed to be minimal! See
* {@link Builder}. You must use {@link #get} to obtain the * {@link Builder}.
* output for a given long value -- do not use autoboxing
* nor create your own Long instance (the value 0
* must map to the {@link #getNoOutput} singleton).
* *
* @lucene.experimental * @lucene.experimental
*/ */
@ -50,14 +47,6 @@ public final class PositiveIntOutputs extends Outputs<Long> {
return doShare ? singletonShare : singletonNoShare; return doShare ? singletonShare : singletonNoShare;
} }
public Long get(long v) {
if (v == 0) {
return NO_OUTPUT;
} else {
return Long.valueOf(v);
}
}
@Override @Override
public Long common(Long output1, Long output2) { public Long common(Long output1, Long output2) {
assert valid(output1); assert valid(output1);

View File

@ -37,23 +37,21 @@ public final class Util {
// TODO: would be nice not to alloc this on every lookup // TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
final FST.BytesReader fstReader = fst.getBytesReader(0);
// Accumulate output as we go // Accumulate output as we go
final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
for(int i=0;i<input.length;i++) { for(int i=0;i<input.length;i++) {
if (fst.findTargetArc(input.ints[input.offset + i], arc, arc) == null) { if (fst.findTargetArc(input.ints[input.offset + i], arc, arc, fstReader) == null) {
return null; return null;
} else if (arc.output != NO_OUTPUT) {
output = fst.outputs.add(output, arc.output);
} }
output = fst.outputs.add(output, arc.output);
} }
if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) { if (arc.isFinal()) {
return null; return fst.outputs.add(output, arc.nextFinalOutput);
} else if (arc.output != NO_OUTPUT) {
return fst.outputs.add(output, arc.output);
} else { } else {
return output; return null;
} }
} }
@ -64,26 +62,24 @@ public final class Util {
public static<T> T get(FST<T> fst, BytesRef input) throws IOException { public static<T> T get(FST<T> fst, BytesRef input) throws IOException {
assert fst.inputType == FST.INPUT_TYPE.BYTE1; assert fst.inputType == FST.INPUT_TYPE.BYTE1;
final FST.BytesReader fstReader = fst.getBytesReader(0);
// TODO: would be nice not to alloc this on every lookup // TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
// Accumulate output as we go // Accumulate output as we go
final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
for(int i=0;i<input.length;i++) { for(int i=0;i<input.length;i++) {
if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc) == null) { if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc, fstReader) == null) {
return null; return null;
} else if (arc.output != NO_OUTPUT) {
output = fst.outputs.add(output, arc.output);
} }
output = fst.outputs.add(output, arc.output);
} }
if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) { if (arc.isFinal()) {
return null; return fst.outputs.add(output, arc.nextFinalOutput);
} else if (arc.output != NO_OUTPUT) {
return fst.outputs.add(output, arc.output);
} else { } else {
return output; return null;
} }
} }
@ -142,7 +138,7 @@ public final class Util {
result.grow(1+upto); result.grow(1+upto);
} }
fst.readFirstRealArc(arc.target, arc, in); fst.readFirstRealTargetArc(arc.target, arc, in);
FST.Arc<Long> prevArc = null; FST.Arc<Long> prevArc = null;
@ -238,6 +234,7 @@ public final class Util {
// A queue of transitions to consider when processing the next level. // A queue of transitions to consider when processing the next level.
final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>(); final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>();
nextLevelQueue.add(startArc); nextLevelQueue.add(startArc);
//System.out.println("toDot: startArc: " + startArc);
// A list of states on the same level (for ranking). // A list of states on the same level (for ranking).
final List<Integer> sameLevelStates = new ArrayList<Integer>(); final List<Integer> sameLevelStates = new ArrayList<Integer>();
@ -289,8 +286,11 @@ public final class Util {
int level = 0; int level = 0;
final FST.BytesReader r = fst.getBytesReader(0);
while (!nextLevelQueue.isEmpty()) { while (!nextLevelQueue.isEmpty()) {
// we could double buffer here, but it doesn't matter probably. // we could double buffer here, but it doesn't matter probably.
//System.out.println("next level=" + level);
thisLevelQueue.addAll(nextLevelQueue); thisLevelQueue.addAll(nextLevelQueue);
nextLevelQueue.clear(); nextLevelQueue.clear();
@ -298,19 +298,19 @@ public final class Util {
out.write("\n // Transitions and states at level: " + level + "\n"); out.write("\n // Transitions and states at level: " + level + "\n");
while (!thisLevelQueue.isEmpty()) { while (!thisLevelQueue.isEmpty()) {
final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1); final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
//System.out.println(" pop: " + arc);
if (fst.targetHasArcs(arc)) { if (fst.targetHasArcs(arc)) {
// scan all arcs // scan all target arcs
//System.out.println(" readFirstTarget...");
final int node = arc.target; final int node = arc.target;
fst.readFirstTargetArc(arc, arc);
if (arc.label == FST.END_LABEL) { fst.readFirstRealTargetArc(arc.target, arc, r);
// Skip it -- prior recursion took this into account already
assert !arc.isLast(); //System.out.println(" firstTarget: " + arc);
fst.readNextArc(arc);
}
while (true) { while (true) {
//System.out.println(" cycle arc=" + arc);
// Emit the unseen state and add it to the queue for the next level. // Emit the unseen state and add it to the queue for the next level.
if (arc.target >= 0 && !seen.get(arc.target)) { if (arc.target >= 0 && !seen.get(arc.target)) {
@ -329,7 +329,7 @@ public final class Util {
if (fst.isExpandedTarget(arc)) { if (fst.isExpandedTarget(arc)) {
stateColor = expandedNodeColor; stateColor = expandedNodeColor;
} else { } else {
stateColor = null; stateColor = null;
} }
final String finalOutput; final String finalOutput;
@ -339,7 +339,9 @@ public final class Util {
finalOutput = ""; finalOutput = "";
} }
emitDotState(out, Integer.toString(arc.target), arc.isFinal() ? finalStateShape : stateShape, stateColor, finalOutput); emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, finalOutput);
// To see the node address, use this instead:
//emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target));
seen.set(arc.target); seen.set(arc.target);
nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc)); nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
sameLevelStates.add(arc.target); sameLevelStates.add(arc.target);
@ -362,14 +364,22 @@ public final class Util {
outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]"; outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]";
} }
final String arcColor;
if (arc.flag(FST.BIT_TARGET_NEXT)) {
arcColor = "red";
} else {
arcColor = "black";
}
assert arc.label != FST.END_LABEL; assert arc.label != FST.END_LABEL;
out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"]\n"); out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "" ) + " color=\"" + arcColor + "\"]\n");
// Break the loop if we're on the last arc of this state. // Break the loop if we're on the last arc of this state.
if (arc.isLast()) { if (arc.isLast()) {
//System.out.println(" break");
break; break;
} }
fst.readNextArc(arc); fst.readNextRealArc(arc, r);
} }
} }
} }

View File

@ -89,11 +89,11 @@ public class TestFSTs extends LuceneTestCase {
return br; return br;
} }
private static IntsRef toIntsRef(String s, int inputMode) { static IntsRef toIntsRef(String s, int inputMode) {
return toIntsRef(s, inputMode, new IntsRef(10)); return toIntsRef(s, inputMode, new IntsRef(10));
} }
private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) { static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
if (inputMode == 0) { if (inputMode == 0) {
// utf8 // utf8
return toIntsRef(new BytesRef(s), ir); return toIntsRef(new BytesRef(s), ir);
@ -103,7 +103,7 @@ public class TestFSTs extends LuceneTestCase {
} }
} }
private static IntsRef toIntsRefUTF32(String s, IntsRef ir) { static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
final int charLength = s.length(); final int charLength = s.length();
int charIdx = 0; int charIdx = 0;
int intIdx = 0; int intIdx = 0;
@ -120,7 +120,7 @@ public class TestFSTs extends LuceneTestCase {
return ir; return ir;
} }
private static IntsRef toIntsRef(BytesRef br, IntsRef ir) { static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
if (br.length > ir.ints.length) { if (br.length > ir.ints.length) {
ir.grow(br.length); ir.grow(br.length);
} }
@ -172,7 +172,7 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms2.length); final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms2.length);
for(int idx=0;idx<terms2.length;idx++) { for(int idx=0;idx<terms2.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx))); pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], (long) idx));
} }
final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false); final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false);
assertNotNull(fst); assertNotNull(fst);
@ -230,7 +230,7 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length); final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
for(int idx=0;idx<terms.length;idx++) { for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx))); pairs.add(new FSTTester.InputOutput<Long>(terms[idx], (long) idx));
} }
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest(); new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest();
} }
@ -244,7 +244,7 @@ public class TestFSTs extends LuceneTestCase {
for(int idx=0;idx<terms.length;idx++) { for(int idx=0;idx<terms.length;idx++) {
final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000); final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
lastOutput = value; lastOutput = value;
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value))); pairs.add(new FSTTester.InputOutput<Long>(terms[idx], value));
} }
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, doShare).doTest(); new FSTTester<Long>(random, dir, inputMode, pairs, outputs, doShare).doTest();
} }
@ -254,7 +254,7 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length); final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
for(int idx=0;idx<terms.length;idx++) { for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE)); pairs.add(new FSTTester.InputOutput<Long>(terms[idx], random.nextLong() & Long.MAX_VALUE));
} }
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, false).doTest(); new FSTTester<Long>(random, dir, inputMode, pairs, outputs, false).doTest();
} }
@ -270,8 +270,7 @@ public class TestFSTs extends LuceneTestCase {
final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000); final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
lastOutput = value; lastOutput = value;
pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx], pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx],
outputs.get(o1.get(idx), outputs.newPair((long) idx, value)));
o2.get(value))));
} }
new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs, false).doTest(); new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs, false).doTest();
} }
@ -393,6 +392,7 @@ public class TestFSTs extends LuceneTestCase {
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
final T NO_OUTPUT = fst.outputs.getNoOutput(); final T NO_OUTPUT = fst.outputs.getNoOutput();
T output = NO_OUTPUT; T output = NO_OUTPUT;
final FST.BytesReader fstReader = fst.getBytesReader(0);
for(int i=0;i<=term.length;i++) { for(int i=0;i<=term.length;i++) {
final int label; final int label;
@ -401,8 +401,9 @@ public class TestFSTs extends LuceneTestCase {
} else { } else {
label = term.ints[term.offset+i]; label = term.ints[term.offset+i];
} }
//System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal()); // System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
if (fst.findTargetArc(label, arc, arc) == null) { if (fst.findTargetArc(label, arc, arc, fstReader) == null) {
// System.out.println(" not found");
if (prefixLength != null) { if (prefixLength != null) {
prefixLength[0] = i; prefixLength[0] = i;
return output; return output;
@ -462,16 +463,19 @@ public class TestFSTs extends LuceneTestCase {
FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException { FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
if (VERBOSE) { if (VERBOSE) {
System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2); System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
} }
final boolean willRewrite = random.nextBoolean();
final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2, prune1, prune2,
prune1==0 && prune2==0, prune1==0 && prune2==0,
allowRandomSuffixSharing ? random.nextBoolean() : true, allowRandomSuffixSharing ? random.nextBoolean() : true,
allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE, allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
outputs, outputs,
null); null,
willRewrite);
for(InputOutput<T> pair : pairs) { for(InputOutput<T> pair : pairs) {
if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) { if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
@ -486,7 +490,7 @@ public class TestFSTs extends LuceneTestCase {
} }
FST<T> fst = builder.finish(); FST<T> fst = builder.finish();
if (random.nextBoolean() && fst != null) { if (random.nextBoolean() && fst != null && !willRewrite) {
TestFSTs t = new TestFSTs(); TestFSTs t = new TestFSTs();
IOContext context = t.newIOContext(random); IOContext context = t.newIOContext(random);
IndexOutput out = dir.createOutput("fst.bin", context); IndexOutput out = dir.createOutput("fst.bin", context);
@ -522,6 +526,21 @@ public class TestFSTs extends LuceneTestCase {
verifyPruned(inputMode, fst, prune1, prune2); verifyPruned(inputMode, fst, prune1, prune2);
} }
if (willRewrite && fst != null) {
if (VERBOSE) {
System.out.println("TEST: now rewrite");
}
final FST<T> packed =fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000));
if (VERBOSE) {
System.out.println("TEST: now verify packed FST");
}
if (prune1 == 0 && prune2 == 0) {
verifyUnPruned(inputMode, packed);
} else {
verifyPruned(inputMode, packed, prune1, prune2);
}
}
return fst; return fst;
} }
@ -638,7 +657,7 @@ public class TestFSTs extends LuceneTestCase {
num = atLeast(100); num = atLeast(100);
for(int iter=0;iter<num;iter++) { for(int iter=0;iter<num;iter++) {
if (VERBOSE) { if (VERBOSE) {
System.out.println("TEST: iter=" + iter); System.out.println(" iter=" + iter);
} }
if (random.nextBoolean()) { if (random.nextBoolean()) {
// seek to term that doesn't exist: // seek to term that doesn't exist:
@ -866,7 +885,15 @@ public class TestFSTs extends LuceneTestCase {
prefixes.put(IntsRef.deepCopyOf(scratch), cmo); prefixes.put(IntsRef.deepCopyOf(scratch), cmo);
} else { } else {
cmo.count++; cmo.count++;
cmo.output = outputs.common(cmo.output, pair.output); T output1 = cmo.output;
if (output1.equals(outputs.getNoOutput())) {
output1 = outputs.getNoOutput();
}
T output2 = pair.output;
if (output2.equals(outputs.getNoOutput())) {
output2 = outputs.getNoOutput();
}
cmo.output = outputs.common(output1, output2);
} }
if (idx == pair.input.length) { if (idx == pair.input.length) {
cmo.isFinal = true; cmo.isFinal = true;
@ -992,7 +1019,7 @@ public class TestFSTs extends LuceneTestCase {
public void testRandomWords() throws IOException { public void testRandomWords() throws IOException {
testRandomWords(1000, atLeast(2)); testRandomWords(1000, atLeast(2));
//testRandomWords(20, 100); //testRandomWords(100, 1);
} }
String inputModeToString(int mode) { String inputModeToString(int mode) {
@ -1055,50 +1082,6 @@ public class TestFSTs extends LuceneTestCase {
} }
} }
// NOTE: this test shows a case where our current builder
// fails to produce minimal FST:
/*
public void test3() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRef scratchIntsRef = new IntsRef();
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.get(0));
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), 1L);
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), 2L);
final FST<Long> fst = builder.finish();
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
// NOTE: we produce 7 nodes today
assertEquals(6, fst.getNodeCount());
// NOTE: we produce 8 arcs today
assertEquals(7, fst.getNodeCount());
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
//Util.toDot(fst, w, false, false);
//w.close();
}
*/
// NOTE: this test shows a case where our current builder
// fails to produce minimal FST:
/*
public void test4() throws Exception {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRef scratchIntsRef = new IntsRef();
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), new BytesRef("1"));
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), new BytesRef("11"));
final FST<BytesRef> fst = builder.finish();
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
// NOTE: we produce 7 nodes today
assertEquals(6, fst.getNodeCount());
// NOTE: we produce 8 arcs today
assertEquals(7, fst.getNodeCount());
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
//Util.toDot(fst, w, false, false);
//w.close();
}
*/
// Build FST for all unique terms in the test line docs // Build FST for all unique terms in the test line docs
// file, up until a time limit // file, up until a time limit
public void testRealTerms() throws Exception { public void testRealTerms() throws Exception {
@ -1126,7 +1109,10 @@ public class TestFSTs extends LuceneTestCase {
IndexReader r = IndexReader.open(writer, true); IndexReader r = IndexReader.open(writer, true);
writer.close(); writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final boolean doRewrite = random.nextBoolean();
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doRewrite);
boolean storeOrd = random.nextBoolean(); boolean storeOrd = random.nextBoolean();
if (VERBOSE) { if (VERBOSE) {
@ -1162,59 +1148,69 @@ public class TestFSTs extends LuceneTestCase {
} else { } else {
output = termsEnum.docFreq(); output = termsEnum.docFreq();
} }
builder.add(Util.toIntsRef(term, scratchIntsRef), outputs.get(output)); builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
ord++; ord++;
if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) { if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
System.out.println(ord + " terms..."); System.out.println(ord + " terms...");
} }
} }
final FST<Long> fst = builder.finish(); FST<Long> fst = builder.finish();
if (VERBOSE) { if (VERBOSE) {
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes"); System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes");
} }
if (ord > 0) { if (ord > 0) {
// Now confirm BytesRefFSTEnum and TermsEnum act the for(int rewriteIter=0;rewriteIter<2;rewriteIter++) {
// same: if (rewriteIter == 1) {
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst); if (doRewrite) {
int num = atLeast(1000); // Verify again, with packed FST:
for(int iter=0;iter<num;iter++) { fst = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000));
final BytesRef randomTerm = new BytesRef(getRandomString()); } else {
break;
if (VERBOSE) { }
System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
} }
// Now confirm BytesRefFSTEnum and TermsEnum act the
// same:
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
int num = atLeast(1000);
for(int iter=0;iter<num;iter++) {
final BytesRef randomTerm = new BytesRef(getRandomString());
if (VERBOSE) {
System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
}
final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm); final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
final BytesRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(randomTerm); final BytesRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(randomTerm);
if (seekResult == TermsEnum.SeekStatus.END) { if (seekResult == TermsEnum.SeekStatus.END) {
assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult); assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
} else { } else {
assertSame(termsEnum, fstEnum, storeOrd); assertSame(termsEnum, fstEnum, storeOrd);
for(int nextIter=0;nextIter<10;nextIter++) { for(int nextIter=0;nextIter<10;nextIter++) {
if (VERBOSE) {
System.out.println("TEST: next");
if (storeOrd) {
System.out.println(" ord=" + termsEnum.ord());
}
}
if (termsEnum.next() != null) {
if (VERBOSE) { if (VERBOSE) {
System.out.println(" term=" + termsEnum.term().utf8ToString()); System.out.println("TEST: next");
if (storeOrd) {
System.out.println(" ord=" + termsEnum.ord());
}
} }
assertNotNull(fstEnum.next()); if (termsEnum.next() != null) {
assertSame(termsEnum, fstEnum, storeOrd); if (VERBOSE) {
} else { System.out.println(" term=" + termsEnum.term().utf8ToString());
if (VERBOSE) { }
System.out.println(" end!"); assertNotNull(fstEnum.next());
assertSame(termsEnum, fstEnum, storeOrd);
} else {
if (VERBOSE) {
System.out.println(" end!");
}
BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
if (nextResult != null) {
System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
fail();
}
break;
} }
BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
if (nextResult != null) {
System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
fail();
}
break;
} }
} }
} }
@ -1248,14 +1244,17 @@ public class TestFSTs extends LuceneTestCase {
private int inputMode; private int inputMode;
private final Outputs<T> outputs; private final Outputs<T> outputs;
private final Builder<T> builder; private final Builder<T> builder;
private final boolean doPack;
public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs) { public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean doPack, boolean noArcArrays) {
this.dirOut = dirOut; this.dirOut = dirOut;
this.wordsFileIn = wordsFileIn; this.wordsFileIn = wordsFileIn;
this.inputMode = inputMode; this.inputMode = inputMode;
this.outputs = outputs; this.outputs = outputs;
this.doPack = doPack;
builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null);
builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null, doPack);
builder.setAllowArrayArcs(!noArcArrays);
} }
protected abstract T getOutput(IntsRef input, int ord) throws IOException; protected abstract T getOutput(IntsRef input, int ord) throws IOException;
@ -1287,14 +1286,15 @@ public class TestFSTs extends LuceneTestCase {
} }
assert builder.getTermCount() == ord; assert builder.getTermCount() == ord;
final FST<T> fst = builder.finish(); FST<T> fst = builder.finish();
if (fst == null) { if (fst == null) {
System.out.println("FST was fully pruned!"); System.out.println("FST was fully pruned!");
System.exit(0); System.exit(0);
} }
if (dirOut == null) if (dirOut == null) {
return; return;
}
System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes()); System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
if (fst.getNodeCount() < 100) { if (fst.getNodeCount() < 100) {
@ -1304,12 +1304,17 @@ public class TestFSTs extends LuceneTestCase {
System.out.println("Wrote FST to out.dot"); System.out.println("Wrote FST to out.dot");
} }
Directory dir = FSDirectory.open(new File(dirOut)); if (doPack) {
IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT); System.out.println("Pack...");
fst.save(out); fst = fst.pack(4, 100000000);
out.close(); System.out.println("New size " + fst.sizeInBytes() + " bytes");
} else {
System.out.println("Saved FST to fst.bin."); Directory dir = FSDirectory.open(new File(dirOut));
IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT);
fst.save(out);
out.close();
System.out.println("Saved FST to fst.bin.");
}
if (!verify) { if (!verify) {
return; return;
@ -1317,45 +1322,50 @@ public class TestFSTs extends LuceneTestCase {
System.out.println("\nNow verify..."); System.out.println("\nNow verify...");
is.close();
is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
ord = 0;
tStart = System.currentTimeMillis();
while(true) { while(true) {
String w = is.readLine(); is.close();
if (w == null) { is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
break;
} ord = 0;
toIntsRef(w, inputMode, intsRef); tStart = System.currentTimeMillis();
T expected = getOutput(intsRef, ord); while(true) {
T actual = Util.get(fst, intsRef); String w = is.readLine();
if (actual == null) { if (w == null) {
throw new RuntimeException("unexpected null output on input=" + w); break;
} }
if (!actual.equals(expected)) { toIntsRef(w, inputMode, intsRef);
throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w); T expected = getOutput(intsRef, ord);
T actual = Util.get(fst, intsRef);
if (actual == null) {
throw new RuntimeException("unexpected null output on input=" + w);
}
if (!actual.equals(expected)) {
throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
}
ord++;
if (ord % 500000 == 0) {
System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
}
if (ord >= limit) {
break;
}
} }
ord++; double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
if (ord % 500000 == 0) { System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
} // NOTE: comment out to profile lookup...
if (ord >= limit) { break;
break;
}
} }
double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
} finally { } finally {
is.close(); is.close();
} }
} }
} }
// java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out // java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
public static void main(String[] args) throws IOException { public static void main(String[] args) throws IOException {
int prune = 0; int prune = 0;
int limit = Integer.MAX_VALUE; int limit = Integer.MAX_VALUE;
@ -1363,7 +1373,8 @@ public class TestFSTs extends LuceneTestCase {
boolean storeOrds = false; boolean storeOrds = false;
boolean storeDocFreqs = false; boolean storeDocFreqs = false;
boolean verify = true; boolean verify = true;
boolean doPack = false;
boolean noArcArrays = false;
String wordsFileIn = null; String wordsFileIn = null;
String dirOut = null; String dirOut = null;
@ -1381,10 +1392,14 @@ public class TestFSTs extends LuceneTestCase {
inputMode = 1; inputMode = 1;
} else if (args[idx].equals("-docFreq")) { } else if (args[idx].equals("-docFreq")) {
storeDocFreqs = true; storeDocFreqs = true;
} else if (args[idx].equals("-noArcArrays")) {
noArcArrays = true;
} else if (args[idx].equals("-ords")) { } else if (args[idx].equals("-ords")) {
storeOrds = true; storeOrds = true;
} else if (args[idx].equals("-noverify")) { } else if (args[idx].equals("-noverify")) {
verify = false; verify = false;
} else if (args[idx].equals("-pack")) {
doPack = true;
} else if (args[idx].startsWith("-")) { } else if (args[idx].startsWith("-")) {
System.err.println("Unrecognized option: " + args[idx]); System.err.println("Unrecognized option: " + args[idx]);
System.exit(-1); System.exit(-1);
@ -1413,44 +1428,44 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true); final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true);
final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false); final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false);
final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2); final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2);
new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs) { new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
Random rand; Random rand;
@Override @Override
public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) { public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
if (ord == 0) { if (ord == 0) {
rand = new Random(17); rand = new Random(17);
} }
return new PairOutputs.Pair<Long,Long>(o1.get(ord), return outputs.newPair((long) ord,
o2.get(_TestUtil.nextInt(rand, 1, 5000))); (long) _TestUtil.nextInt(rand, 1, 5000));
} }
}.run(limit, verify); }.run(limit, verify);
} else if (storeOrds) { } else if (storeOrds) {
// Store only ords // Store only ords
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) { new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
@Override @Override
public Long getOutput(IntsRef input, int ord) { public Long getOutput(IntsRef input, int ord) {
return outputs.get(ord); return (long) ord;
} }
}.run(limit, verify); }.run(limit, verify);
} else if (storeDocFreqs) { } else if (storeDocFreqs) {
// Store only docFreq // Store only docFreq
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) { new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
Random rand; Random rand;
@Override @Override
public Long getOutput(IntsRef input, int ord) { public Long getOutput(IntsRef input, int ord) {
if (ord == 0) { if (ord == 0) {
rand = new Random(17); rand = new Random(17);
} }
return outputs.get(_TestUtil.nextInt(rand, 1, 5000)); return (long) _TestUtil.nextInt(rand, 1, 5000);
} }
}.run(limit, verify); }.run(limit, verify);
} else { } else {
// Store nothing // Store nothing
final NoOutputs outputs = NoOutputs.getSingleton(); final NoOutputs outputs = NoOutputs.getSingleton();
final Object NO_OUTPUT = outputs.getNoOutput(); final Object NO_OUTPUT = outputs.getNoOutput();
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs) { new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
@Override @Override
public Object getOutput(IntsRef input, int ord) { public Object getOutput(IntsRef input, int ord) {
return NO_OUTPUT; return NO_OUTPUT;
@ -1468,6 +1483,46 @@ public class TestFSTs extends LuceneTestCase {
assertNull(fstEnum.seekCeil(new BytesRef("foobaz"))); assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
} }
/*
public void testTrivial() throws Exception {
// Use NoOutputs: this FST carries no per-term output
// values, only the set of input strings. (Unlike
// PositiveIntOutputs, where passing true makes the FST
// share (delta code) the outputs, there is no sharing
// flag to choose here.)
final NoOutputs outputs = NoOutputs.getSingleton();
String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE1,
0, 0,
true,
true,
Integer.MAX_VALUE,
outputs,
null,
true);
Arrays.sort(strings);
final IntsRef scratch = new IntsRef();
for(String s : strings) {
builder.add(Util.toIntsRef(new BytesRef(s), scratch), outputs.getNoOutput());
}
final FST<Object> fst = builder.finish();
System.out.println("DOT before rewrite");
Writer w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/before.dot"));
Util.toDot(fst, w, false, false);
w.close();
final FST<Object> rewrite = new FST<Object>(fst, 1, 100);
System.out.println("DOT after rewrite");
w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/after.dot"));
Util.toDot(rewrite, w, false, false);
w.close();
}
*/
public void testSimple() throws Exception { public void testSimple() throws Exception {
// Get outputs -- passing true means FST will share // Get outputs -- passing true means FST will share
@ -1484,9 +1539,9 @@ public class TestFSTs extends LuceneTestCase {
final BytesRef b = new BytesRef("b"); final BytesRef b = new BytesRef("b");
final BytesRef c = new BytesRef("c"); final BytesRef c = new BytesRef("c");
builder.add(Util.toIntsRef(a, new IntsRef()), outputs.get(17)); builder.add(Util.toIntsRef(a, new IntsRef()), 17L);
builder.add(Util.toIntsRef(b, new IntsRef()), outputs.get(42)); builder.add(Util.toIntsRef(b, new IntsRef()), 42L);
builder.add(Util.toIntsRef(c, new IntsRef()), outputs.get(13824324872317238L)); builder.add(Util.toIntsRef(c, new IntsRef()), 13824324872317238L);
final FST<Long> fst = builder.finish(); final FST<Long> fst = builder.finish();
@ -1795,11 +1850,11 @@ public class TestFSTs extends LuceneTestCase {
public void testFinalOutputOnEndState() throws Exception { public void testFinalOutputOnEndState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null); final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null, random.nextBoolean());
builder.add(Util.toUTF32("stat", new IntsRef()), outputs.get(17)); builder.add(Util.toUTF32("stat", new IntsRef()), 17L);
builder.add(Util.toUTF32("station", new IntsRef()), outputs.get(10)); builder.add(Util.toUTF32("station", new IntsRef()), 10L);
final FST<Long> fst = builder.finish(); final FST<Long> fst = builder.finish();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
StringWriter w = new StringWriter(); StringWriter w = new StringWriter();
Util.toDot(fst, w, false, false); Util.toDot(fst, w, false, false);
w.close(); w.close();
@ -1809,8 +1864,8 @@ public class TestFSTs extends LuceneTestCase {
public void testInternalFinalState() throws Exception { public void testInternalFinalState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final boolean willRewrite = random.nextBoolean();
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null); final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite);
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput()); builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput()); builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput());
final FST<Long> fst = builder.finish(); final FST<Long> fst = builder.finish();
@ -1819,17 +1874,23 @@ public class TestFSTs extends LuceneTestCase {
Util.toDot(fst, w, false, false); Util.toDot(fst, w, false, false);
w.close(); w.close();
//System.out.println(w.toString()); //System.out.println(w.toString());
assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1); final String expected;
if (willRewrite) {
expected = "4 -> 3 [label=\"t\" style=\"bold\"";
} else {
expected = "8 -> 6 [label=\"t\" style=\"bold\"";
}
assertTrue(w.toString().indexOf(expected) != -1);
} }
// Make sure raw FST can differentiate between final vs // Make sure raw FST can differentiate between final vs
// non-final end nodes // non-final end nodes
public void testNonFinalStopNodes() throws Exception { public void testNonFinalStopNode() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final Long nothing = outputs.getNoOutput(); final Long nothing = outputs.getNoOutput();
final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs); final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs, false);
final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0); final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0);
@ -1839,8 +1900,8 @@ public class TestFSTs extends LuceneTestCase {
node.isFinal = true; node.isFinal = true;
rootNode.addArc('a', node); rootNode.addArc('a', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode(); final Builder.CompiledNode frozen = new Builder.CompiledNode();
frozen.address = fst.addNode(node); frozen.node = fst.addNode(node);
rootNode.arcs[0].nextFinalOutput = outputs.get(17); rootNode.arcs[0].nextFinalOutput = 17L;
rootNode.arcs[0].isFinal = true; rootNode.arcs[0].isFinal = true;
rootNode.arcs[0].output = nothing; rootNode.arcs[0].output = nothing;
rootNode.arcs[0].target = frozen; rootNode.arcs[0].target = frozen;
@ -1851,13 +1912,18 @@ public class TestFSTs extends LuceneTestCase {
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0); final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
rootNode.addArc('b', node); rootNode.addArc('b', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode(); final Builder.CompiledNode frozen = new Builder.CompiledNode();
frozen.address = fst.addNode(node); frozen.node = fst.addNode(node);
rootNode.arcs[1].nextFinalOutput = nothing; rootNode.arcs[1].nextFinalOutput = nothing;
rootNode.arcs[1].output = outputs.get(42); rootNode.arcs[1].output = 42L;
rootNode.arcs[1].target = frozen; rootNode.arcs[1].target = frozen;
} }
fst.finish(fst.addNode(rootNode)); fst.finish(fst.addNode(rootNode));
StringWriter w = new StringWriter();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
Util.toDot(fst, w, false, false);
w.close();
checkStopNodes(fst, outputs); checkStopNodes(fst, outputs);

View File

@ -226,6 +226,9 @@ public final class SynonymFilter extends TokenFilter {
private final FST<BytesRef> fst; private final FST<BytesRef> fst;
private final FST.BytesReader fstReader;
private final BytesRef scratchBytes = new BytesRef(); private final BytesRef scratchBytes = new BytesRef();
private final CharsRef scratchChars = new CharsRef(); private final CharsRef scratchChars = new CharsRef();
@ -241,7 +244,7 @@ public final class SynonymFilter extends TokenFilter {
this.synonyms = synonyms; this.synonyms = synonyms;
this.ignoreCase = ignoreCase; this.ignoreCase = ignoreCase;
this.fst = synonyms.fst; this.fst = synonyms.fst;
// Validate before dereferencing: calling fst.getBytesReader(0) on a null
// fst would throw NullPointerException and make the intended
// IllegalArgumentException below unreachable.
if (fst == null) { if (fst == null) {
throw new IllegalArgumentException("fst must be non-null"); throw new IllegalArgumentException("fst must be non-null");
} }
// Safe now that fst is known to be non-null.
this.fstReader = fst.getBytesReader(0);
@ -366,7 +369,7 @@ public final class SynonymFilter extends TokenFilter {
int bufUpto = 0; int bufUpto = 0;
while(bufUpto < bufferLen) { while(bufUpto < bufferLen) {
final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen); final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) { if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
//System.out.println(" stop"); //System.out.println(" stop");
break byToken; break byToken;
} }
@ -388,7 +391,7 @@ public final class SynonymFilter extends TokenFilter {
// See if the FST wants to continue matching (ie, needs to // See if the FST wants to continue matching (ie, needs to
// see the next input token): // see the next input token):
if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) { if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
// No further rules can match here; we're done // No further rules can match here; we're done
// searching for matching rules starting at the // searching for matching rules starting at the
// current input position. // current input position.

View File

@ -47,16 +47,17 @@ public final class TokenInfoFST {
FST.Arc<Long> firstArc = new FST.Arc<Long>(); FST.Arc<Long> firstArc = new FST.Arc<Long>();
fst.getFirstArc(firstArc); fst.getFirstArc(firstArc);
FST.Arc<Long> arc = new FST.Arc<Long>(); FST.Arc<Long> arc = new FST.Arc<Long>();
final FST.BytesReader fstReader = fst.getBytesReader(0);
// TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs) // TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs)
for (int i = 0; i < rootCache.length; i++) { for (int i = 0; i < rootCache.length; i++) {
if (fst.findTargetArc(0x3040 + i, firstArc, arc) != null) { if (fst.findTargetArc(0x3040 + i, firstArc, arc, fstReader) != null) {
rootCache[i] = new FST.Arc<Long>().copyFrom(arc); rootCache[i] = new FST.Arc<Long>().copyFrom(arc);
} }
} }
return rootCache; return rootCache;
} }
public FST.Arc<Long> findTargetArc(int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache) throws IOException { public FST.Arc<Long> findTargetArc(int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache, FST.BytesReader fstReader) throws IOException {
if (useCache && ch >= 0x3040 && ch <= cacheCeiling) { if (useCache && ch >= 0x3040 && ch <= cacheCeiling) {
assert ch != FST.END_LABEL; assert ch != FST.END_LABEL;
final Arc<Long> result = rootCache[ch - 0x3040]; final Arc<Long> result = rootCache[ch - 0x3040];
@ -67,13 +68,17 @@ public final class TokenInfoFST {
return arc; return arc;
} }
} else { } else {
return fst.findTargetArc(ch, follow, arc); return fst.findTargetArc(ch, follow, arc, fstReader);
} }
} }
/** Delegates to the wrapped FST's {@code getFirstArc(arc)} and returns its result. */
public Arc<Long> getFirstArc(FST.Arc<Long> arc) { public Arc<Long> getFirstArc(FST.Arc<Long> arc) {
return fst.getFirstArc(arc); return fst.getFirstArc(arc);
} }
/**
 * Returns the {@link FST.BytesReader} obtained from the wrapped FST by
 * calling {@code fst.getBytesReader(pos)}; {@code pos} is passed through
 * unchanged.
 */
public FST.BytesReader getBytesReader(int pos) {
return fst.getBytesReader(pos);
}
/** @lucene.internal for testing only */ /** @lucene.internal for testing only */
FST<Long> getInternalFST() { FST<Long> getInternalFST() {

View File

@ -113,7 +113,7 @@ public final class UserDictionary implements Dictionary {
for (int i = 0; i < token.length(); i++) { for (int i = 0; i < token.length(); i++) {
scratch.ints[i] = (int) token.charAt(i); scratch.ints[i] = (int) token.charAt(i);
} }
fstBuilder.add(scratch, fstOutput.get(ord)); fstBuilder.add(scratch, ord);
segmentations.add(wordIdAndLength); segmentations.add(wordIdAndLength);
ord++; ord++;
} }
@ -134,6 +134,8 @@ public final class UserDictionary implements Dictionary {
TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...] TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
boolean found = false; // true if we found any results boolean found = false; // true if we found any results
final FST.BytesReader fstReader = fst.getBytesReader(0);
FST.Arc<Long> arc = new FST.Arc<Long>(); FST.Arc<Long> arc = new FST.Arc<Long>();
int end = off + len; int end = off + len;
for (int startOffset = off; startOffset < end; startOffset++) { for (int startOffset = off; startOffset < end; startOffset++) {
@ -142,7 +144,7 @@ public final class UserDictionary implements Dictionary {
int remaining = end - startOffset; int remaining = end - startOffset;
for (int i = 0; i < remaining; i++) { for (int i = 0; i < remaining; i++) {
int ch = chars[startOffset+i]; int ch = chars[startOffset+i];
if (fst.findTargetArc(ch, arc, arc, i == 0) == null) { if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
break; // continue to next position break; // continue to next position
} }
output += arc.output.intValue(); output += arc.output.intValue();

View File

@ -35,7 +35,7 @@ import org.apache.lucene.util.fst.FST;
public class Viterbi { public class Viterbi {
private final TokenInfoFST fst; private final TokenInfoFST fst;
private final TokenInfoDictionary dictionary; private final TokenInfoDictionary dictionary;
private final UnknownDictionary unkDictionary; private final UnknownDictionary unkDictionary;
@ -214,6 +214,8 @@ public class Viterbi {
ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN); ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr); addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
final FST.BytesReader fstReader = fst.getBytesReader(0);
// Process user dictionary; // Process user dictionary;
if (useUserDictionary) { if (useUserDictionary) {
processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr); processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
@ -238,7 +240,7 @@ public class Viterbi {
for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) { for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
int ch = text[suffixStart + endIndex - 1]; int ch = text[suffixStart + endIndex - 1];
if (fst.findTargetArc(ch, arc, arc, endIndex == 1) == null) { if (fst.findTargetArc(ch, arc, arc, endIndex == 1, fstReader) == null) {
break; // continue to next position break; // continue to next position
} }
output += arc.output.intValue(); output += arc.output.intValue();

View File

@ -131,7 +131,7 @@ public class TokenInfoDictionaryBuilder {
System.out.println(" encode..."); System.out.println(" encode...");
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true);
Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, fstOutput); Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true);
IntsRef scratch = new IntsRef(); IntsRef scratch = new IntsRef();
long ord = -1; // first ord will be 0 long ord = -1; // first ord will be 0
String lastValue = null; String lastValue = null;
@ -155,13 +155,14 @@ public class TokenInfoDictionaryBuilder {
for (int i = 0; i < token.length(); i++) { for (int i = 0; i < token.length(); i++) {
scratch.ints[i] = (int) token.charAt(i); scratch.ints[i] = (int) token.charAt(i);
} }
fstBuilder.add(scratch, fstOutput.get(ord)); fstBuilder.add(scratch, ord);
} }
dictionary.addMapping((int)ord, offset); dictionary.addMapping((int)ord, offset);
offset = next; offset = next;
} }
FST<Long> fst = fstBuilder.finish(); final FST<Long> fst = fstBuilder.finish().pack(2, 100000);
System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... "); System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... ");
dictionary.setFST(fst); dictionary.setFST(fst);
System.out.println(" done"); System.out.println(" done");

View File

@ -329,8 +329,11 @@ public class FSTCompletion {
private boolean descendWithPrefix(Arc<Object> arc, BytesRef utf8) private boolean descendWithPrefix(Arc<Object> arc, BytesRef utf8)
throws IOException { throws IOException {
final int max = utf8.offset + utf8.length; final int max = utf8.offset + utf8.length;
// Cannot save as instance var since multiple threads
// can use FSTCompletion at once...
final FST.BytesReader fstReader = automaton.getBytesReader(0);
for (int i = utf8.offset; i < max; i++) { for (int i = utf8.offset; i < max; i++) {
if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc) == null) { if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc, fstReader) == null) {
// No matching prefixes, return an empty result. // No matching prefixes, return an empty result.
return false; return false;
} }

View File

@ -234,7 +234,7 @@ public class FSTCompletionBuilder {
final Object empty = outputs.getNoOutput(); final Object empty = outputs.getNoOutput();
final Builder<Object> builder = new Builder<Object>( final Builder<Object> builder = new Builder<Object>(
FST.INPUT_TYPE.BYTE1, 0, 0, true, true, FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
shareMaxTailLength, outputs, null); shareMaxTailLength, outputs, null, false);
BytesRef scratch = new BytesRef(); BytesRef scratch = new BytesRef();
final IntsRef scratchIntsRef = new IntsRef(); final IntsRef scratchIntsRef = new IntsRef();