LUCENE-3017: allow FST to distinguish final vs non-final no-arcs-leaving nodes

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1091502 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-04-12 17:11:19 +00:00
parent b559afee35
commit 904b8a4b77
3 changed files with 98 additions and 5 deletions

View File

@ -180,7 +180,13 @@ public class Builder<T> {
compileAllTargets(node);
}
final T nextFinalOutput = node.output;
final boolean isFinal = node.isFinal;
// We "fake" the node as being final if it has no
// outgoing arcs; in theory we could leave it
// as non-final (the FST can represent this), but
// FSTEnum, Util, etc., have trouble w/ non-final
// dead-end states:
final boolean isFinal = node.isFinal || node.numArcs == 0;
if (doCompile) {
// this node makes it and we now compile it. first,
@ -268,6 +274,7 @@ public class Builder<T> {
// 'finalness' is stored on the incoming arc, not on
// the node
frontier[0].inputCount++;
frontier[0].isFinal = true;
fst.setEmptyOutput(output);
return;
}
@ -388,6 +395,10 @@ public class Builder<T> {
if (!arc.target.isCompiled()) {
// not yet compiled
@SuppressWarnings("unchecked") final UnCompiledNode<T> n = (UnCompiledNode<T>) arc.target;
if (n.numArcs == 0) {
//System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label);
arc.isFinal = n.isFinal = true;
}
arc.target = compileNode(n);
}
}

View File

@ -25,6 +25,10 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.automaton.fst.Builder.UnCompiledNode;
// NOTE: while the FST is able to represent a non-final
// dead-end state (NON_FINAL_END_NODE=0), the layers above
// (FSTEnum, Util) have problems with this!!
/** Represents an FST using a compact byte[] format.
* <p> The format is similar to what's used by Morfologik
* (http://sourceforge.net/projects/morfologik).
@ -214,6 +218,9 @@ public class FST<T> {
}
void finish(int startNode) {
if (startNode == FINAL_END_NODE && emptyOutput != null) {
startNode = 0;
}
if (this.startNode != -1) {
throw new IllegalStateException("already finished");
}
@ -253,6 +260,8 @@ public class FST<T> {
throw new IllegalStateException("call finish first");
}
CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT);
// TODO: really we should encode this as an arc, arriving
// to the root node, instead of special casing here:
if (emptyOutput != null) {
out.writeByte((byte) 1);
out.writeVInt(emptyOutputBytes.length);
@ -466,7 +475,9 @@ public class FST<T> {
arc.nextFinalOutput = emptyOutput;
} else {
arc.flags = BIT_LAST_ARC;
arc.nextFinalOutput = NO_OUTPUT;
}
arc.output = NO_OUTPUT;
// If there are no nodes, ie, the FST only accepts the
// empty string, then startNode is 0, and then readFirstTargetArc
@ -583,12 +594,11 @@ public class FST<T> {
* expanded array format.
*/
boolean isExpandedTarget(Arc<T> follow) throws IOException {
if (follow.isFinal()) {
if (!targetHasArcs(follow)) {
return false;
} else {
final BytesReader in = getBytesReader(follow.target);
final byte b = in.readByte();
return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0;
}
}
@ -667,8 +677,11 @@ public class FST<T> {
}
if (arc.flag(BIT_STOP_NODE)) {
arc.target = FINAL_END_NODE;
arc.flags |= BIT_FINAL_ARC;
if (arc.flag(BIT_FINAL_ARC)) {
arc.target = FINAL_END_NODE;
} else {
arc.target = NON_FINAL_END_NODE;
}
arc.nextArc = in.pos;
} else if (arc.flag(BIT_TARGET_NEXT)) {
arc.nextArc = in.pos;

View File

@ -1421,4 +1421,73 @@ public class TestFSTs extends LuceneTestCase {
FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<Object>());
s.verifyStateAndBelow(fst, arc, 1);
}
// Make sure the raw FST can differentiate between final vs
// non-final end nodes.
//
// The FST is assembled by hand here: nodes are handed to
// fst.addNode directly and the resulting addresses are wired
// into the root's arcs. The Builder instance is only used as
// the constructor argument for the UnCompiledNodes. The same
// checks are run on the in-memory FST and on a copy that has
// been saved to and re-loaded from a Directory.
public void testNonFinalStopNodes() throws Exception {
  final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
  final Long nothing = outputs.getNoOutput();
  final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);

  final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs);

  final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0);

  // Add final stop node: arc 'a' is marked final and carries
  // its value (17) as the final output.
  {
    final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
    node.isFinal = true;
    rootNode.addArc('a', node);
    final Builder.CompiledNode frozen = new Builder.CompiledNode();
    frozen.address = fst.addNode(node);
    rootNode.arcs[0].nextFinalOutput = outputs.get(17);
    rootNode.arcs[0].isFinal = true;
    rootNode.arcs[0].output = nothing;
    rootNode.arcs[0].target = frozen;
  }

  // Add non-final stop node: neither the node nor arc 'b' is
  // flagged final here; the value (42) is carried as the
  // arc's regular output. checkStopNodes expects this arc to
  // read back as non-final.
  {
    final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
    rootNode.addArc('b', node);
    final Builder.CompiledNode frozen = new Builder.CompiledNode();
    frozen.address = fst.addNode(node);
    rootNode.arcs[1].nextFinalOutput = nothing;
    rootNode.arcs[1].output = outputs.get(42);
    rootNode.arcs[1].target = frozen;
  }

  fst.finish(fst.addNode(rootNode));

  checkStopNodes(fst, outputs);

  // Make sure it still works after save/load. Every resource
  // is released in a finally block so that a failing save,
  // load or assertion does not leave the output, the input or
  // the directory open.
  final Directory dir = newDirectory();
  try {
    final IndexOutput out = dir.createOutput("fst");
    try {
      fst.save(out);
    } finally {
      out.close();
    }

    final IndexInput in = dir.openInput("fst");
    try {
      final FST<Long> fst2 = new FST<Long>(in, outputs);
      checkStopNodes(fst2, outputs);
    } finally {
      in.close();
    }
  } finally {
    dir.close();
  }
}
// Verifies the two arcs leaving the root of the given FST:
// 'a' must be a final arc whose final output is 17, and 'b'
// must be a non-final arc whose output is 42. The arc
// returned by getFirstArc must carry no output at all.
private void checkStopNodes(FST<Long> fst, PositiveIntOutputs outputs) throws Exception {
  final Long noOutput = outputs.getNoOutput();

  // Arc leading into the root: both outputs must be empty.
  final FST.Arc<Long> rootArc = fst.getFirstArc(new FST.Arc<Long>());
  assertEquals(noOutput, rootArc.output);
  assertEquals(noOutput, rootArc.nextFinalOutput);

  // First arc out of the root: label 'a', final, output 17.
  final FST.Arc<Long> scratch = new FST.Arc<Long>();
  final FST.Arc<Long> arcA = fst.readFirstTargetArc(rootArc, scratch);
  assertEquals('a', arcA.label);
  assertEquals(17, arcA.nextFinalOutput.longValue());
  assertTrue(arcA.isFinal());

  // Second arc out of the root: label 'b', non-final, output 42.
  final FST.Arc<Long> arcB = fst.readNextArc(arcA);
  assertEquals('b', arcB.label);
  assertFalse(arcB.isFinal());
  assertEquals(42, arcB.output.longValue());
}
}