LUCENE-3725: add optional packing to FSTs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1237500 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-01-29 22:48:45 +00:00
parent 2e5be2f75c
commit d1165b1972
24 changed files with 1361 additions and 422 deletions

View File

@ -807,6 +807,9 @@ New Features
* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
markup. (Steve Rowe)
* LUCENE-3725: Added optional packing to FST building; this uses extra
RAM during building but results in a smaller FST. (Mike McCandless)
Bug fixes
* LUCENE-3595: Fixed FieldCacheRangeFilter and FieldCacheTermsFilter

View File

@ -398,7 +398,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
final long indexStartFP;
final long rootBlockFP;
final BytesRef rootCode;
private FST<BytesRef> index;
private final FST<BytesRef> index;
//private boolean DEBUG;
@ -433,6 +433,8 @@ public class BlockTreeTermsReader extends FieldsProducer {
w.close();
}
*/
} else {
index = null;
}
}
@ -495,6 +497,8 @@ public class BlockTreeTermsReader extends FieldsProducer {
private final BytesRef term = new BytesRef();
private final FST.BytesReader fstReader;
// TODO: can we share this with the frame in STE?
private final class Frame {
final int ord;
@ -755,6 +759,12 @@ public class BlockTreeTermsReader extends FieldsProducer {
arcs[arcIdx] = new FST.Arc<BytesRef>();
}
if (index == null) {
fstReader = null;
} else {
fstReader = index.getBytesReader(0);
}
// TODO: if the automaton is "smallish" we really
// should use the terms index to seek at least to
// the initial term and likely to subsequent terms
@ -842,7 +852,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
// TODO: we could be more efficient for the next()
// case by using current arc as starting point,
// passed to findTargetArc
arc = index.findTargetArc(target, arc, getArc(1+idx));
arc = index.findTargetArc(target, arc, getArc(1+idx), fstReader);
assert arc != null;
output = fstOutputs.add(output, arc.output);
idx++;
@ -1186,6 +1196,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
private boolean eof;
final BytesRef term = new BytesRef();
private final FST.BytesReader fstReader;
@SuppressWarnings("unchecked") private FST.Arc<BytesRef>[] arcs = new FST.Arc[1];
@ -1196,6 +1207,12 @@ public class BlockTreeTermsReader extends FieldsProducer {
// Used to hold seek by TermState, or cached seek
staticFrame = new Frame(-1);
if (index == null) {
fstReader = null;
} else {
fstReader = index.getBytesReader(0);
}
// Init w/ root block; don't use index since it may
// not (and need not) have been loaded
for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
@ -1581,7 +1598,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto));
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);
if (nextArc == null) {
@ -1838,7 +1855,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto));
final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);
if (nextArc == null) {

View File

@ -288,7 +288,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> indexBuilder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
outputs, null);
outputs, null, false);
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}
@ -831,7 +831,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
0, 0, true,
true, Integer.MAX_VALUE,
noOutputs,
new FindBlocks());
new FindBlocks(), false);
postingsWriter.setField(fieldInfo);
}

View File

@ -229,7 +229,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
////System.out.println("VGW: field=" + fieldInfo.name);
// Always put empty string in
fstBuilder.add(new IntsRef(), fstOutputs.get(termsFilePointer));
fstBuilder.add(new IntsRef(), termsFilePointer);
startTermsFilePointer = termsFilePointer;
}
@ -260,7 +260,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
final int lengthSave = text.length;
text.length = indexedTermPrefixLength(lastTerm, text);
try {
fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), fstOutputs.get(termsFilePointer));
fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
} finally {
text.length = lengthSave;
}

View File

@ -521,9 +521,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
private void loadTerms() throws IOException {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
final PairOutputs<Long,Long> outputsInner = new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs);
final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
outputsInner);
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1, outputs);
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRef lastTerm = new BytesRef(10);
@ -536,9 +537,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
SimpleTextUtil.readLine(in, scratch);
if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
if (lastDocsStart != -1) {
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
new PairOutputs.Pair<Long,Long>((long) docFreq,
posIntOutputs.get(totalTermFreq))));
b.add(Util.toIntsRef(lastTerm, scratchIntsRef),
outputs.newPair(lastDocsStart,
outputsInner.newPair((long) docFreq, totalTermFreq)));
sumTotalTermFreq += totalTermFreq;
}
break;
@ -553,9 +554,8 @@ class SimpleTextFieldsReader extends FieldsProducer {
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
} else if (StringHelper.startsWith(scratch, TERM)) {
if (lastDocsStart != -1) {
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
new PairOutputs.Pair<Long,Long>((long) docFreq,
posIntOutputs.get(totalTermFreq))));
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair(lastDocsStart,
outputsInner.newPair((long) docFreq, totalTermFreq)));
}
lastDocsStart = in.getFilePointer();
final int len = scratch.length - TERM.length;

View File

@ -95,7 +95,7 @@ public final class FixedBitSet extends DocIdSet implements Bits {
}
public boolean get(int index) {
assert index >= 0 && index < numBits;
assert index >= 0 && index < numBits: "index=" + index;
int i = index >> 6; // div 64
// signed shift will keep a negative index and force an
// array-index-out-of-bounds-exception, removing the need for an explicit check.

View File

@ -588,7 +588,7 @@ public final class UnicodeUtil {
out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
offset += 2;
} else {
assert b < 0xf8;
assert b < 0xf8: "b=" + b;
int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
offset += 3;
if (ch < UNI_MAX_BMP) {

View File

@ -17,15 +17,15 @@ package org.apache.lucene.util.fst;
* limitations under the License.
*/
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
import java.io.IOException;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
/**
* Builds a compact FST (maps an IntsRef term to an arbitrary
* Builds a minimal FST (maps an IntsRef term to an arbitrary
* output) from pre-sorted terms with outputs (the FST
* becomes an FSA if you use NoOutputs). The FST is written
* on-the-fly into a compact serialized format byte array, which can
@ -35,12 +35,6 @@ import java.io.IOException;
* <p>NOTE: The algorithm is described at
* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698</p>
*
* If your outputs are ByteSequenceOutput then the final FST
* will be minimal, but if you use PositiveIntOutput then
* it's only "near minimal". For example, aa/0, aab/1, bbb/2
* will produce 6 states when a 5 state fst is also
* possible.
*
* The parameterized type T is the output type. See the
* subclasses of {@link Outputs}.
*
@ -52,7 +46,7 @@ public class Builder<T> {
private final FST<T> fst;
private final T NO_OUTPUT;
// private static final boolean DEBUG = false;
// private static final boolean DEBUG = true;
// simplistic pruning: we prune node (and all following
// nodes) if less than this number of terms go through it:
@ -88,7 +82,7 @@ public class Builder<T> {
* pruning options turned off.
*/
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null);
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false);
}
/**
@ -127,16 +121,20 @@ public class Builder<T> {
* @param outputs The output type for each input sequence. Applies only if building an FST. For
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
* singleton output object.
*
* @param willPackFST Pass true if you will rewrite (compact) the FST before saving. This
* causes the FST to create additional data structures internally to facilitate rewriting, but
* it means the resulting FST cannot be saved: it must first be rewritten using {@link FST#FST(FST,int[])}
*/
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
FreezeTail<T> freezeTail) {
FreezeTail<T> freezeTail, boolean willPackFST) {
this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2;
this.freezeTail = freezeTail;
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength;
fst = new FST<T>(inputType, outputs);
fst = new FST<T>(inputType, outputs, willPackFST);
if (doShareSuffix) {
dedupHash = new NodeHash<T>(fst);
} else {
@ -170,23 +168,23 @@ public class Builder<T> {
fst.setAllowArrayArcs(b);
}
private CompiledNode compileNode(UnCompiledNode<T> n, int tailLength) throws IOException {
final int address;
if (dedupHash != null && (doShareNonSingletonNodes || n.numArcs <= 1) && tailLength <= shareMaxTailLength) {
if (n.numArcs == 0) {
address = fst.addNode(n);
private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throws IOException {
final int node;
if (dedupHash != null && (doShareNonSingletonNodes || nodeIn.numArcs <= 1) && tailLength <= shareMaxTailLength) {
if (nodeIn.numArcs == 0) {
node = fst.addNode(nodeIn);
} else {
address = dedupHash.add(n);
node = dedupHash.add(nodeIn);
}
} else {
address = fst.addNode(n);
node = fst.addNode(nodeIn);
}
assert address != -2;
assert node != -2;
n.clear();
nodeIn.clear();
final CompiledNode fn = new CompiledNode();
fn.address = address;
fn.node = node;
return fn;
}
@ -319,6 +317,11 @@ public class Builder<T> {
}
*/
// De-dup NO_OUTPUT since it must be a singleton:
if (output.equals(NO_OUTPUT)) {
output = NO_OUTPUT;
}
assert lastInput.length == 0 || input.compareTo(lastInput) >= 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input;
assert validOutput(output);
@ -443,7 +446,7 @@ public class Builder<T> {
}
}
//if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output);
fst.finish(compileNode(root, lastInput.length).address);
fst.finish(compileNode(root, lastInput.length).node);
return fst;
}
@ -480,7 +483,7 @@ public class Builder<T> {
}
static final class CompiledNode implements Node {
int address;
int node;
public boolean isCompiled() {
return true;
}
@ -560,7 +563,7 @@ public class Builder<T> {
final Arc<T> arc = arcs[numArcs-1];
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;
arc.target = target;
//assert target.address != -2;
//assert target.node != -2;
arc.nextFinalOutput = nextFinalOutput;
arc.isFinal = isFinal;
}

File diff suppressed because it is too large Load Diff

View File

@ -151,7 +151,8 @@ abstract class FSTEnum<T> {
boolean found = false;
while (low <= high) {
mid = (low + high) >>> 1;
in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
in.pos = arc.posArcsStart;
in.skip(arc.bytesPerArc*mid+1);
final int midLabel = fst.readLabel(in);
final int cmp = midLabel - targetLabel;
//System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
@ -275,7 +276,7 @@ abstract class FSTEnum<T> {
// Now scan forward, matching the new suffix of the target
while(true) {
//System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast());
//System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast() + " bba=" + arc.bytesPerArc);
if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
// Arcs are fixed array -- use binary search to find
@ -289,15 +290,16 @@ abstract class FSTEnum<T> {
boolean found = false;
while (low <= high) {
mid = (low + high) >>> 1;
in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
in.pos = arc.posArcsStart;
in.skip(arc.bytesPerArc*mid+1);
final int midLabel = fst.readLabel(in);
final int cmp = midLabel - targetLabel;
//System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
if (cmp < 0)
if (cmp < 0) {
low = mid + 1;
else if (cmp > 0)
} else if (cmp > 0) {
high = mid - 1;
else {
} else {
found = true;
break;
}
@ -430,9 +432,11 @@ abstract class FSTEnum<T> {
FST.Arc<T> arc = getArc(upto-1);
int targetLabel = getTargetLabel();
final FST.BytesReader fstReader = fst.getBytesReader(0);
while(true) {
//System.out.println(" cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel));
final FST.Arc<T> nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto));
final FST.Arc<T> nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto), fstReader);
if (nextArc == null) {
// short circuit
//upto--;

View File

@ -35,7 +35,7 @@ final class NodeHash<T> {
}
private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address, FST.BytesReader in) throws IOException {
fst.readFirstRealArc(address, scratchArc, in);
fst.readFirstRealTargetArc(address, scratchArc, in);
if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) {
return false;
}
@ -43,7 +43,7 @@ final class NodeHash<T> {
final Builder.Arc<T> arc = node.arcs[arcUpto];
if (arc.label != scratchArc.label ||
!arc.output.equals(scratchArc.output) ||
((Builder.CompiledNode) arc.target).address != scratchArc.target ||
((Builder.CompiledNode) arc.target).node != scratchArc.target ||
!arc.nextFinalOutput.equals(scratchArc.nextFinalOutput) ||
arc.isFinal != scratchArc.isFinal()) {
return false;
@ -71,9 +71,9 @@ final class NodeHash<T> {
// TODO: maybe if number of arcs is high we can safely subsample?
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
final Builder.Arc<T> arc = node.arcs[arcIdx];
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).address + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
h = PRIME * h + arc.label;
h = PRIME * h + ((Builder.CompiledNode) arc.target).address;
h = PRIME * h + ((Builder.CompiledNode) arc.target).node;
h = PRIME * h + arc.output.hashCode();
h = PRIME * h + arc.nextFinalOutput.hashCode();
if (arc.isFinal) {
@ -88,9 +88,9 @@ final class NodeHash<T> {
private int hash(int node) throws IOException {
final int PRIME = 31;
final FST.BytesReader in = fst.getBytesReader(0);
//System.out.println("hash frozen");
//System.out.println("hash frozen node=" + node);
int h = 0;
fst.readFirstRealArc(node, scratchArc, in);
fst.readFirstRealTargetArc(node, scratchArc, in);
while(true) {
//System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal());
h = PRIME * h + scratchArc.label;
@ -109,26 +109,26 @@ final class NodeHash<T> {
return h & Integer.MAX_VALUE;
}
public int add(Builder.UnCompiledNode<T> node) throws IOException {
public int add(Builder.UnCompiledNode<T> nodeIn) throws IOException {
// System.out.println("hash: add count=" + count + " vs " + table.length);
final FST.BytesReader in = fst.getBytesReader(0);
final int h = hash(node);
final int h = hash(nodeIn);
int pos = h & mask;
int c = 0;
while(true) {
final int v = table[pos];
if (v == 0) {
// freeze & add
final int address = fst.addNode(node);
//System.out.println(" now freeze addr=" + address);
assert hash(address) == h : "frozenHash=" + hash(address) + " vs h=" + h;
final int node = fst.addNode(nodeIn);
//System.out.println(" now freeze node=" + node);
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
count++;
table[pos] = address;
table[pos] = node;
if (table.length < 2*count) {
rehash();
}
return address;
} else if (nodesEqual(node, v, in)) {
return node;
} else if (nodesEqual(nodeIn, v, in)) {
// same node is already here
return v;
}

View File

@ -26,6 +26,10 @@ import org.apache.lucene.store.DataOutput;
* Represents the outputs for an FST, providing the basic
* algebra needed for the FST.
*
* <p>Note that any operation that returns NO_OUTPUT must
* return the same singleton object from {@link
* #getNoOutput}.</p>
*
* @lucene.experimental
*/
@ -56,6 +60,8 @@ public abstract class Outputs<T> {
public abstract String outputToString(T output);
// TODO: maybe make valid(T output) public...? for asserts
public T merge(T first, T second) {
throw new UnsupportedOperationException();
}

View File

@ -38,7 +38,8 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
public final A output1;
public final B output2;
public Pair(A output1, B output2) {
// use newPair
private Pair(A output1, B output2) {
this.output1 = output1;
this.output2 = output2;
}
@ -67,34 +68,78 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
NO_OUTPUT = new Pair<A,B>(outputs1.getNoOutput(), outputs2.getNoOutput());
}
public Pair<A,B> get(A output1, B output2) {
if (output1 == outputs1.getNoOutput() && output2 == outputs2.getNoOutput()) {
/** Create a new Pair */
public Pair<A,B> newPair(A a, B b) {
if (a.equals(outputs1.getNoOutput())) {
a = outputs1.getNoOutput();
}
if (b.equals(outputs2.getNoOutput())) {
b = outputs2.getNoOutput();
}
if (a == outputs1.getNoOutput() && b == outputs2.getNoOutput()) {
return NO_OUTPUT;
} else {
return new Pair<A,B>(output1, output2);
final Pair<A,B> p = new Pair<A,B>(a, b);
assert valid(p);
return p;
}
}
// for assert
private boolean valid(Pair<A,B> pair) {
final boolean noOutput1 = pair.output1.equals(outputs1.getNoOutput());
final boolean noOutput2 = pair.output2.equals(outputs2.getNoOutput());
if (noOutput1 && pair.output1 != outputs1.getNoOutput()) {
System.out.println("invalid0");
return false;
}
if (noOutput2 && pair.output2 != outputs2.getNoOutput()) {
System.out.println("invalid1");
return false;
}
if (noOutput1 && noOutput2) {
if (pair != NO_OUTPUT) {
System.out.println("invalid2");
return false;
} else {
return true;
}
} else {
return true;
}
}
@Override
public Pair<A,B> common(Pair<A,B> pair1, Pair<A,B> pair2) {
return get(outputs1.common(pair1.output1, pair2.output1),
assert valid(pair1);
assert valid(pair2);
return newPair(outputs1.common(pair1.output1, pair2.output1),
outputs2.common(pair1.output2, pair2.output2));
}
@Override
public Pair<A,B> subtract(Pair<A,B> output, Pair<A,B> inc) {
return get(outputs1.subtract(output.output1, inc.output1),
assert valid(output);
assert valid(inc);
return newPair(outputs1.subtract(output.output1, inc.output1),
outputs2.subtract(output.output2, inc.output2));
}
@Override
public Pair<A,B> add(Pair<A,B> prefix, Pair<A,B> output) {
return get(outputs1.add(prefix.output1, output.output1),
assert valid(prefix);
assert valid(output);
return newPair(outputs1.add(prefix.output1, output.output1),
outputs2.add(prefix.output2, output.output2));
}
@Override
public void write(Pair<A,B> output, DataOutput writer) throws IOException {
assert valid(output);
outputs1.write(output.output1, writer);
outputs2.write(output.output2, writer);
}
@ -103,7 +148,7 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
public Pair<A,B> read(DataInput in) throws IOException {
A output1 = outputs1.read(in);
B output2 = outputs2.read(in);
return get(output1, output2);
return newPair(output1, output2);
}
@Override
@ -113,6 +158,12 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
@Override
public String outputToString(Pair<A,B> output) {
assert valid(output);
return "<pair:" + outputs1.outputToString(output.output1) + "," + outputs2.outputToString(output.output2) + ">";
}
@Override
public String toString() {
return "PairOutputs<" + outputs1 + "," + outputs2 + ">";
}
}

View File

@ -25,10 +25,7 @@ import org.apache.lucene.store.DataOutput;
/**
* Output is a long, for each input term. NOTE: the
* resulting FST is not guaranteed to be minimal! See
* {@link Builder}. You must use {@link #get} to obtain the
* output for a given long value -- do not use autoboxing
* nor create your own Long instance (the value 0
* must map to the {@link #getNoOutput} singleton).
* {@link Builder}.
*
* @lucene.experimental
*/
@ -50,14 +47,6 @@ public final class PositiveIntOutputs extends Outputs<Long> {
return doShare ? singletonShare : singletonNoShare;
}
public Long get(long v) {
if (v == 0) {
return NO_OUTPUT;
} else {
return Long.valueOf(v);
}
}
@Override
public Long common(Long output1, Long output2) {
assert valid(output1);

View File

@ -37,23 +37,21 @@ public final class Util {
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
final FST.BytesReader fstReader = fst.getBytesReader(0);
// Accumulate output as we go
final T NO_OUTPUT = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
T output = fst.outputs.getNoOutput();
for(int i=0;i<input.length;i++) {
if (fst.findTargetArc(input.ints[input.offset + i], arc, arc) == null) {
if (fst.findTargetArc(input.ints[input.offset + i], arc, arc, fstReader) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
}
output = fst.outputs.add(output, arc.output);
}
}
if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
return fst.outputs.add(output, arc.output);
if (arc.isFinal()) {
return fst.outputs.add(output, arc.nextFinalOutput);
} else {
return output;
return null;
}
}
@ -64,26 +62,24 @@ public final class Util {
public static<T> T get(FST<T> fst, BytesRef input) throws IOException {
assert fst.inputType == FST.INPUT_TYPE.BYTE1;
final FST.BytesReader fstReader = fst.getBytesReader(0);
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
// Accumulate output as we go
final T NO_OUTPUT = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
T output = fst.outputs.getNoOutput();
for(int i=0;i<input.length;i++) {
if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc) == null) {
if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc, fstReader) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
}
output = fst.outputs.add(output, arc.output);
}
}
if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
return fst.outputs.add(output, arc.output);
if (arc.isFinal()) {
return fst.outputs.add(output, arc.nextFinalOutput);
} else {
return output;
return null;
}
}
@ -142,7 +138,7 @@ public final class Util {
result.grow(1+upto);
}
fst.readFirstRealArc(arc.target, arc, in);
fst.readFirstRealTargetArc(arc.target, arc, in);
FST.Arc<Long> prevArc = null;
@ -238,6 +234,7 @@ public final class Util {
// A queue of transitions to consider when processing the next level.
final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>();
nextLevelQueue.add(startArc);
//System.out.println("toDot: startArc: " + startArc);
// A list of states on the same level (for ranking).
final List<Integer> sameLevelStates = new ArrayList<Integer>();
@ -289,8 +286,11 @@ public final class Util {
int level = 0;
final FST.BytesReader r = fst.getBytesReader(0);
while (!nextLevelQueue.isEmpty()) {
// we could double buffer here, but it doesn't matter probably.
//System.out.println("next level=" + level);
thisLevelQueue.addAll(nextLevelQueue);
nextLevelQueue.clear();
@ -298,19 +298,19 @@ public final class Util {
out.write("\n // Transitions and states at level: " + level + "\n");
while (!thisLevelQueue.isEmpty()) {
final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
//System.out.println(" pop: " + arc);
if (fst.targetHasArcs(arc)) {
// scan all arcs
// scan all target arcs
//System.out.println(" readFirstTarget...");
final int node = arc.target;
fst.readFirstTargetArc(arc, arc);
if (arc.label == FST.END_LABEL) {
// Skip it -- prior recursion took this into account already
assert !arc.isLast();
fst.readNextArc(arc);
}
fst.readFirstRealTargetArc(arc.target, arc, r);
//System.out.println(" firstTarget: " + arc);
while (true) {
//System.out.println(" cycle arc=" + arc);
// Emit the unseen state and add it to the queue for the next level.
if (arc.target >= 0 && !seen.get(arc.target)) {
@ -339,7 +339,9 @@ public final class Util {
finalOutput = "";
}
emitDotState(out, Integer.toString(arc.target), arc.isFinal() ? finalStateShape : stateShape, stateColor, finalOutput);
emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, finalOutput);
// To see the node address, use this instead:
//emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target));
seen.set(arc.target);
nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
sameLevelStates.add(arc.target);
@ -362,14 +364,22 @@ public final class Util {
outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]";
}
final String arcColor;
if (arc.flag(FST.BIT_TARGET_NEXT)) {
arcColor = "red";
} else {
arcColor = "black";
}
assert arc.label != FST.END_LABEL;
out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"]\n");
out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "" ) + " color=\"" + arcColor + "\"]\n");
// Break the loop if we're on the last arc of this state.
if (arc.isLast()) {
//System.out.println(" break");
break;
}
fst.readNextArc(arc);
fst.readNextRealArc(arc, r);
}
}
}

View File

@ -89,11 +89,11 @@ public class TestFSTs extends LuceneTestCase {
return br;
}
private static IntsRef toIntsRef(String s, int inputMode) {
static IntsRef toIntsRef(String s, int inputMode) {
return toIntsRef(s, inputMode, new IntsRef(10));
}
private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
if (inputMode == 0) {
// utf8
return toIntsRef(new BytesRef(s), ir);
@ -103,7 +103,7 @@ public class TestFSTs extends LuceneTestCase {
}
}
private static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
final int charLength = s.length();
int charIdx = 0;
int intIdx = 0;
@ -120,7 +120,7 @@ public class TestFSTs extends LuceneTestCase {
return ir;
}
private static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
if (br.length > ir.ints.length) {
ir.grow(br.length);
}
@ -172,7 +172,7 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms2.length);
for(int idx=0;idx<terms2.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], (long) idx));
}
final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false);
assertNotNull(fst);
@ -230,7 +230,7 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx)));
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], (long) idx));
}
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest();
}
@ -244,7 +244,7 @@ public class TestFSTs extends LuceneTestCase {
for(int idx=0;idx<terms.length;idx++) {
final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
lastOutput = value;
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value)));
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], value));
}
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, doShare).doTest();
}
@ -254,7 +254,7 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE));
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], random.nextLong() & Long.MAX_VALUE));
}
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, false).doTest();
}
@ -270,8 +270,7 @@ public class TestFSTs extends LuceneTestCase {
final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
lastOutput = value;
pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx],
outputs.get(o1.get(idx),
o2.get(value))));
outputs.newPair((long) idx, value)));
}
new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs, false).doTest();
}
@ -393,6 +392,7 @@ public class TestFSTs extends LuceneTestCase {
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
final T NO_OUTPUT = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
final FST.BytesReader fstReader = fst.getBytesReader(0);
for(int i=0;i<=term.length;i++) {
final int label;
@ -402,7 +402,8 @@ public class TestFSTs extends LuceneTestCase {
label = term.ints[term.offset+i];
}
// System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
if (fst.findTargetArc(label, arc, arc) == null) {
if (fst.findTargetArc(label, arc, arc, fstReader) == null) {
// System.out.println(" not found");
if (prefixLength != null) {
prefixLength[0] = i;
return output;
@ -462,16 +463,19 @@ public class TestFSTs extends LuceneTestCase {
FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
if (VERBOSE) {
System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2);
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
}
final boolean willRewrite = random.nextBoolean();
final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2,
prune1==0 && prune2==0,
allowRandomSuffixSharing ? random.nextBoolean() : true,
allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
outputs,
null);
null,
willRewrite);
for(InputOutput<T> pair : pairs) {
if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
@ -486,7 +490,7 @@ public class TestFSTs extends LuceneTestCase {
}
FST<T> fst = builder.finish();
if (random.nextBoolean() && fst != null) {
if (random.nextBoolean() && fst != null && !willRewrite) {
TestFSTs t = new TestFSTs();
IOContext context = t.newIOContext(random);
IndexOutput out = dir.createOutput("fst.bin", context);
@ -522,6 +526,21 @@ public class TestFSTs extends LuceneTestCase {
verifyPruned(inputMode, fst, prune1, prune2);
}
if (willRewrite && fst != null) {
if (VERBOSE) {
System.out.println("TEST: now rewrite");
}
final FST<T> packed =fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000));
if (VERBOSE) {
System.out.println("TEST: now verify packed FST");
}
if (prune1 == 0 && prune2 == 0) {
verifyUnPruned(inputMode, packed);
} else {
verifyPruned(inputMode, packed, prune1, prune2);
}
}
return fst;
}
@ -638,7 +657,7 @@ public class TestFSTs extends LuceneTestCase {
num = atLeast(100);
for(int iter=0;iter<num;iter++) {
if (VERBOSE) {
System.out.println("TEST: iter=" + iter);
System.out.println(" iter=" + iter);
}
if (random.nextBoolean()) {
// seek to term that doesn't exist:
@ -866,7 +885,15 @@ public class TestFSTs extends LuceneTestCase {
prefixes.put(IntsRef.deepCopyOf(scratch), cmo);
} else {
cmo.count++;
cmo.output = outputs.common(cmo.output, pair.output);
T output1 = cmo.output;
if (output1.equals(outputs.getNoOutput())) {
output1 = outputs.getNoOutput();
}
T output2 = pair.output;
if (output2.equals(outputs.getNoOutput())) {
output2 = outputs.getNoOutput();
}
cmo.output = outputs.common(output1, output2);
}
if (idx == pair.input.length) {
cmo.isFinal = true;
@ -992,7 +1019,7 @@ public class TestFSTs extends LuceneTestCase {
public void testRandomWords() throws IOException {
testRandomWords(1000, atLeast(2));
//testRandomWords(20, 100);
//testRandomWords(100, 1);
}
String inputModeToString(int mode) {
@ -1055,50 +1082,6 @@ public class TestFSTs extends LuceneTestCase {
}
}
// NOTE: this test shows a case where our current builder
// fails to produce minimal FST:
/*
public void test3() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRef scratchIntsRef = new IntsRef();
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.get(0));
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), 1L);
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), 2L);
final FST<Long> fst = builder.finish();
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
// NOTE: we produce 7 nodes today
assertEquals(6, fst.getNodeCount());
// NOTE: we produce 8 arcs today
assertEquals(7, fst.getNodeCount());
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
//Util.toDot(fst, w, false, false);
//w.close();
}
*/
// NOTE: this test shows a case where our current builder
// fails to produce minimal FST:
/*
public void test4() throws Exception {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRef scratchIntsRef = new IntsRef();
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), new BytesRef("1"));
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), new BytesRef("11"));
final FST<BytesRef> fst = builder.finish();
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
// NOTE: we produce 7 nodes today
assertEquals(6, fst.getNodeCount());
// NOTE: we produce 8 arcs today
assertEquals(7, fst.getNodeCount());
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
//Util.toDot(fst, w, false, false);
//w.close();
}
*/
// Build FST for all unique terms in the test line docs
// file, up until a time limit
public void testRealTerms() throws Exception {
@ -1126,7 +1109,10 @@ public class TestFSTs extends LuceneTestCase {
IndexReader r = IndexReader.open(writer, true);
writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final boolean doRewrite = random.nextBoolean();
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doRewrite);
boolean storeOrd = random.nextBoolean();
if (VERBOSE) {
@ -1162,18 +1148,27 @@ public class TestFSTs extends LuceneTestCase {
} else {
output = termsEnum.docFreq();
}
builder.add(Util.toIntsRef(term, scratchIntsRef), outputs.get(output));
builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
ord++;
if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
System.out.println(ord + " terms...");
}
}
final FST<Long> fst = builder.finish();
FST<Long> fst = builder.finish();
if (VERBOSE) {
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes");
}
if (ord > 0) {
for(int rewriteIter=0;rewriteIter<2;rewriteIter++) {
if (rewriteIter == 1) {
if (doRewrite) {
// Verify again, with packed FST:
fst = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000));
} else {
break;
}
}
// Now confirm BytesRefFSTEnum and TermsEnum act the
// same:
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
@ -1221,6 +1216,7 @@ public class TestFSTs extends LuceneTestCase {
}
}
}
}
r.close();
dir.close();
@ -1248,14 +1244,17 @@ public class TestFSTs extends LuceneTestCase {
private int inputMode;
private final Outputs<T> outputs;
private final Builder<T> builder;
private final boolean doPack;
public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs) {
public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean doPack, boolean noArcArrays) {
this.dirOut = dirOut;
this.wordsFileIn = wordsFileIn;
this.inputMode = inputMode;
this.outputs = outputs;
this.doPack = doPack;
builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null);
builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null, doPack);
builder.setAllowArrayArcs(!noArcArrays);
}
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
@ -1287,14 +1286,15 @@ public class TestFSTs extends LuceneTestCase {
}
assert builder.getTermCount() == ord;
final FST<T> fst = builder.finish();
FST<T> fst = builder.finish();
if (fst == null) {
System.out.println("FST was fully pruned!");
System.exit(0);
}
if (dirOut == null)
if (dirOut == null) {
return;
}
System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
if (fst.getNodeCount() < 100) {
@ -1304,12 +1304,17 @@ public class TestFSTs extends LuceneTestCase {
System.out.println("Wrote FST to out.dot");
}
if (doPack) {
System.out.println("Pack...");
fst = fst.pack(4, 100000000);
System.out.println("New size " + fst.sizeInBytes() + " bytes");
} else {
Directory dir = FSDirectory.open(new File(dirOut));
IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT);
fst.save(out);
out.close();
System.out.println("Saved FST to fst.bin.");
}
if (!verify) {
return;
@ -1317,6 +1322,7 @@ public class TestFSTs extends LuceneTestCase {
System.out.println("\nNow verify...");
while(true) {
is.close();
is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
@ -1349,13 +1355,17 @@ public class TestFSTs extends LuceneTestCase {
double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
// NOTE: comment out to profile lookup...
break;
}
} finally {
is.close();
}
}
}
// java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out
// java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
public static void main(String[] args) throws IOException {
int prune = 0;
int limit = Integer.MAX_VALUE;
@ -1363,7 +1373,8 @@ public class TestFSTs extends LuceneTestCase {
boolean storeOrds = false;
boolean storeDocFreqs = false;
boolean verify = true;
boolean doPack = false;
boolean noArcArrays = false;
String wordsFileIn = null;
String dirOut = null;
@ -1381,10 +1392,14 @@ public class TestFSTs extends LuceneTestCase {
inputMode = 1;
} else if (args[idx].equals("-docFreq")) {
storeDocFreqs = true;
} else if (args[idx].equals("-noArcArrays")) {
noArcArrays = true;
} else if (args[idx].equals("-ords")) {
storeOrds = true;
} else if (args[idx].equals("-noverify")) {
verify = false;
} else if (args[idx].equals("-pack")) {
doPack = true;
} else if (args[idx].startsWith("-")) {
System.err.println("Unrecognized option: " + args[idx]);
System.exit(-1);
@ -1413,44 +1428,44 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true);
final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false);
final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2);
new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs) {
new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
Random rand;
@Override
public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
if (ord == 0) {
rand = new Random(17);
}
return new PairOutputs.Pair<Long,Long>(o1.get(ord),
o2.get(_TestUtil.nextInt(rand, 1, 5000)));
return outputs.newPair((long) ord,
(long) _TestUtil.nextInt(rand, 1, 5000));
}
}.run(limit, verify);
} else if (storeOrds) {
// Store only ords
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
@Override
public Long getOutput(IntsRef input, int ord) {
return outputs.get(ord);
return (long) ord;
}
}.run(limit, verify);
} else if (storeDocFreqs) {
// Store only docFreq
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
Random rand;
@Override
public Long getOutput(IntsRef input, int ord) {
if (ord == 0) {
rand = new Random(17);
}
return outputs.get(_TestUtil.nextInt(rand, 1, 5000));
return (long) _TestUtil.nextInt(rand, 1, 5000);
}
}.run(limit, verify);
} else {
// Store nothing
final NoOutputs outputs = NoOutputs.getSingleton();
final Object NO_OUTPUT = outputs.getNoOutput();
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs) {
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
@Override
public Object getOutput(IntsRef input, int ord) {
return NO_OUTPUT;
@ -1468,6 +1483,46 @@ public class TestFSTs extends LuceneTestCase {
assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
}
/*
public void testTrivial() throws Exception {
// Get outputs -- passing true means FST will share
// (delta code) the outputs. This should result in
// smaller FST if the outputs grow monotonically. But
// if numbers are "random", false should give smaller
// final size:
final NoOutputs outputs = NoOutputs.getSingleton();
String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE1,
0, 0,
true,
true,
Integer.MAX_VALUE,
outputs,
null,
true);
Arrays.sort(strings);
final IntsRef scratch = new IntsRef();
for(String s : strings) {
builder.add(Util.toIntsRef(new BytesRef(s), scratch), outputs.getNoOutput());
}
final FST<Object> fst = builder.finish();
System.out.println("DOT before rewrite");
Writer w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/before.dot"));
Util.toDot(fst, w, false, false);
w.close();
final FST<Object> rewrite = new FST<Object>(fst, 1, 100);
System.out.println("DOT after rewrite");
w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/after.dot"));
Util.toDot(rewrite, w, false, false);
w.close();
}
*/
public void testSimple() throws Exception {
// Get outputs -- passing true means FST will share
@ -1484,9 +1539,9 @@ public class TestFSTs extends LuceneTestCase {
final BytesRef b = new BytesRef("b");
final BytesRef c = new BytesRef("c");
builder.add(Util.toIntsRef(a, new IntsRef()), outputs.get(17));
builder.add(Util.toIntsRef(b, new IntsRef()), outputs.get(42));
builder.add(Util.toIntsRef(c, new IntsRef()), outputs.get(13824324872317238L));
builder.add(Util.toIntsRef(a, new IntsRef()), 17L);
builder.add(Util.toIntsRef(b, new IntsRef()), 42L);
builder.add(Util.toIntsRef(c, new IntsRef()), 13824324872317238L);
final FST<Long> fst = builder.finish();
@ -1795,11 +1850,11 @@ public class TestFSTs extends LuceneTestCase {
public void testFinalOutputOnEndState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null);
builder.add(Util.toUTF32("stat", new IntsRef()), outputs.get(17));
builder.add(Util.toUTF32("station", new IntsRef()), outputs.get(10));
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null, random.nextBoolean());
builder.add(Util.toUTF32("stat", new IntsRef()), 17L);
builder.add(Util.toUTF32("station", new IntsRef()), 10L);
final FST<Long> fst = builder.finish();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
StringWriter w = new StringWriter();
Util.toDot(fst, w, false, false);
w.close();
@ -1809,8 +1864,8 @@ public class TestFSTs extends LuceneTestCase {
public void testInternalFinalState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null);
final boolean willRewrite = random.nextBoolean();
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite);
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput());
final FST<Long> fst = builder.finish();
@ -1819,17 +1874,23 @@ public class TestFSTs extends LuceneTestCase {
Util.toDot(fst, w, false, false);
w.close();
//System.out.println(w.toString());
assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1);
final String expected;
if (willRewrite) {
expected = "4 -> 3 [label=\"t\" style=\"bold\"";
} else {
expected = "8 -> 6 [label=\"t\" style=\"bold\"";
}
assertTrue(w.toString().indexOf(expected) != -1);
}
// Make sure raw FST can differentiate between final vs
// non-final end nodes
public void testNonFinalStopNodes() throws Exception {
public void testNonFinalStopNode() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final Long nothing = outputs.getNoOutput();
final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs, false);
final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0);
@ -1839,8 +1900,8 @@ public class TestFSTs extends LuceneTestCase {
node.isFinal = true;
rootNode.addArc('a', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode();
frozen.address = fst.addNode(node);
rootNode.arcs[0].nextFinalOutput = outputs.get(17);
frozen.node = fst.addNode(node);
rootNode.arcs[0].nextFinalOutput = 17L;
rootNode.arcs[0].isFinal = true;
rootNode.arcs[0].output = nothing;
rootNode.arcs[0].target = frozen;
@ -1851,14 +1912,19 @@ public class TestFSTs extends LuceneTestCase {
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
rootNode.addArc('b', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode();
frozen.address = fst.addNode(node);
frozen.node = fst.addNode(node);
rootNode.arcs[1].nextFinalOutput = nothing;
rootNode.arcs[1].output = outputs.get(42);
rootNode.arcs[1].output = 42L;
rootNode.arcs[1].target = frozen;
}
fst.finish(fst.addNode(rootNode));
StringWriter w = new StringWriter();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
Util.toDot(fst, w, false, false);
w.close();
checkStopNodes(fst, outputs);
// Make sure it still works after save/load:

View File

@ -226,6 +226,9 @@ public final class SynonymFilter extends TokenFilter {
private final FST<BytesRef> fst;
private final FST.BytesReader fstReader;
private final BytesRef scratchBytes = new BytesRef();
private final CharsRef scratchChars = new CharsRef();
@ -241,7 +244,7 @@ public final class SynonymFilter extends TokenFilter {
this.synonyms = synonyms;
this.ignoreCase = ignoreCase;
this.fst = synonyms.fst;
this.fstReader = fst.getBytesReader(0);
if (fst == null) {
throw new IllegalArgumentException("fst must be non-null");
}
@ -366,7 +369,7 @@ public final class SynonymFilter extends TokenFilter {
int bufUpto = 0;
while(bufUpto < bufferLen) {
final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) {
if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
//System.out.println(" stop");
break byToken;
}
@ -388,7 +391,7 @@ public final class SynonymFilter extends TokenFilter {
// See if the FST wants to continue matching (ie, needs to
// see the next input token):
if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) {
if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
// No further rules can match here; we're done
// searching for matching rules starting at the
// current input position.

View File

@ -47,16 +47,17 @@ public final class TokenInfoFST {
FST.Arc<Long> firstArc = new FST.Arc<Long>();
fst.getFirstArc(firstArc);
FST.Arc<Long> arc = new FST.Arc<Long>();
final FST.BytesReader fstReader = fst.getBytesReader(0);
// TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs)
for (int i = 0; i < rootCache.length; i++) {
if (fst.findTargetArc(0x3040 + i, firstArc, arc) != null) {
if (fst.findTargetArc(0x3040 + i, firstArc, arc, fstReader) != null) {
rootCache[i] = new FST.Arc<Long>().copyFrom(arc);
}
}
return rootCache;
}
public FST.Arc<Long> findTargetArc(int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache) throws IOException {
public FST.Arc<Long> findTargetArc(int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache, FST.BytesReader fstReader) throws IOException {
if (useCache && ch >= 0x3040 && ch <= cacheCeiling) {
assert ch != FST.END_LABEL;
final Arc<Long> result = rootCache[ch - 0x3040];
@ -67,7 +68,7 @@ public final class TokenInfoFST {
return arc;
}
} else {
return fst.findTargetArc(ch, follow, arc);
return fst.findTargetArc(ch, follow, arc, fstReader);
}
}
@ -75,6 +76,10 @@ public final class TokenInfoFST {
return fst.getFirstArc(arc);
}
public FST.BytesReader getBytesReader(int pos) {
return fst.getBytesReader(pos);
}
/** @lucene.internal for testing only */
FST<Long> getInternalFST() {
return fst;

View File

@ -113,7 +113,7 @@ public final class UserDictionary implements Dictionary {
for (int i = 0; i < token.length(); i++) {
scratch.ints[i] = (int) token.charAt(i);
}
fstBuilder.add(scratch, fstOutput.get(ord));
fstBuilder.add(scratch, ord);
segmentations.add(wordIdAndLength);
ord++;
}
@ -134,6 +134,8 @@ public final class UserDictionary implements Dictionary {
TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
boolean found = false; // true if we found any results
final FST.BytesReader fstReader = fst.getBytesReader(0);
FST.Arc<Long> arc = new FST.Arc<Long>();
int end = off + len;
for (int startOffset = off; startOffset < end; startOffset++) {
@ -142,7 +144,7 @@ public final class UserDictionary implements Dictionary {
int remaining = end - startOffset;
for (int i = 0; i < remaining; i++) {
int ch = chars[startOffset+i];
if (fst.findTargetArc(ch, arc, arc, i == 0) == null) {
if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
break; // continue to next position
}
output += arc.output.intValue();

View File

@ -214,6 +214,8 @@ public class Viterbi {
ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
final FST.BytesReader fstReader = fst.getBytesReader(0);
// Process user dictionary;
if (useUserDictionary) {
processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
@ -238,7 +240,7 @@ public class Viterbi {
for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
int ch = text[suffixStart + endIndex - 1];
if (fst.findTargetArc(ch, arc, arc, endIndex == 1) == null) {
if (fst.findTargetArc(ch, arc, arc, endIndex == 1, fstReader) == null) {
break; // continue to next position
}
output += arc.output.intValue();

View File

@ -131,7 +131,7 @@ public class TokenInfoDictionaryBuilder {
System.out.println(" encode...");
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true);
Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, fstOutput);
Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true);
IntsRef scratch = new IntsRef();
long ord = -1; // first ord will be 0
String lastValue = null;
@ -155,13 +155,14 @@ public class TokenInfoDictionaryBuilder {
for (int i = 0; i < token.length(); i++) {
scratch.ints[i] = (int) token.charAt(i);
}
fstBuilder.add(scratch, fstOutput.get(ord));
fstBuilder.add(scratch, ord);
}
dictionary.addMapping((int)ord, offset);
offset = next;
}
FST<Long> fst = fstBuilder.finish();
final FST<Long> fst = fstBuilder.finish().pack(2, 100000);
System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... ");
dictionary.setFST(fst);
System.out.println(" done");

View File

@ -329,8 +329,11 @@ public class FSTCompletion {
private boolean descendWithPrefix(Arc<Object> arc, BytesRef utf8)
throws IOException {
final int max = utf8.offset + utf8.length;
// Cannot save as instance var since multiple threads
// can use FSTCompletion at once...
final FST.BytesReader fstReader = automaton.getBytesReader(0);
for (int i = utf8.offset; i < max; i++) {
if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc) == null) {
if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc, fstReader) == null) {
// No matching prefixes, return an empty result.
return false;
}

View File

@ -234,7 +234,7 @@ public class FSTCompletionBuilder {
final Object empty = outputs.getNoOutput();
final Builder<Object> builder = new Builder<Object>(
FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
shareMaxTailLength, outputs, null);
shareMaxTailLength, outputs, null, false);
BytesRef scratch = new BytesRef();
final IntsRef scratchIntsRef = new IntsRef();