mirror of https://github.com/apache/lucene.git
LUCENE-3017: allow FST to distinguish final vs non-final no-arcs-leaving nodes
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1091502 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b559afee35
commit
904b8a4b77
|
@ -180,7 +180,13 @@ public class Builder<T> {
|
|||
compileAllTargets(node);
|
||||
}
|
||||
final T nextFinalOutput = node.output;
|
||||
final boolean isFinal = node.isFinal;
|
||||
|
||||
// We "fake" the node as being final if it has no
|
||||
// outgoing arcs; in theory we could leave it
|
||||
// as non-final (the FST can represent this), but
|
||||
// FSTEnum, Util, etc., have trouble w/ non-final
|
||||
// dead-end states:
|
||||
final boolean isFinal = node.isFinal || node.numArcs == 0;
|
||||
|
||||
if (doCompile) {
|
||||
// this node makes it and we now compile it. first,
|
||||
|
@ -268,6 +274,7 @@ public class Builder<T> {
|
|||
// 'finalness' is stored on the incoming arc, not on
|
||||
// the node
|
||||
frontier[0].inputCount++;
|
||||
frontier[0].isFinal = true;
|
||||
fst.setEmptyOutput(output);
|
||||
return;
|
||||
}
|
||||
|
@ -388,6 +395,10 @@ public class Builder<T> {
|
|||
if (!arc.target.isCompiled()) {
|
||||
// not yet compiled
|
||||
@SuppressWarnings("unchecked") final UnCompiledNode<T> n = (UnCompiledNode<T>) arc.target;
|
||||
if (n.numArcs == 0) {
|
||||
//System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label);
|
||||
arc.isFinal = n.isFinal = true;
|
||||
}
|
||||
arc.target = compileNode(n);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,6 +25,10 @@ import org.apache.lucene.util.ArrayUtil;
|
|||
import org.apache.lucene.util.CodecUtil;
|
||||
import org.apache.lucene.util.automaton.fst.Builder.UnCompiledNode;
|
||||
|
||||
// NOTE: while the FST is able to represent a non-final
|
||||
// dead-end state (NON_FINAL_END_NODE=0), the layres above
|
||||
// (FSTEnum, Util) have problems with this!!
|
||||
|
||||
/** Represents an FST using a compact byte[] format.
|
||||
* <p> The format is similar to what's used by Morfologik
|
||||
* (http://sourceforge.net/projects/morfologik).
|
||||
|
@ -214,6 +218,9 @@ public class FST<T> {
|
|||
}
|
||||
|
||||
void finish(int startNode) {
|
||||
if (startNode == FINAL_END_NODE && emptyOutput != null) {
|
||||
startNode = 0;
|
||||
}
|
||||
if (this.startNode != -1) {
|
||||
throw new IllegalStateException("already finished");
|
||||
}
|
||||
|
@ -253,6 +260,8 @@ public class FST<T> {
|
|||
throw new IllegalStateException("call finish first");
|
||||
}
|
||||
CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT);
|
||||
// TODO: really we should encode this as an arc, arriving
|
||||
// to the root node, instead of special casing here:
|
||||
if (emptyOutput != null) {
|
||||
out.writeByte((byte) 1);
|
||||
out.writeVInt(emptyOutputBytes.length);
|
||||
|
@ -466,7 +475,9 @@ public class FST<T> {
|
|||
arc.nextFinalOutput = emptyOutput;
|
||||
} else {
|
||||
arc.flags = BIT_LAST_ARC;
|
||||
arc.nextFinalOutput = NO_OUTPUT;
|
||||
}
|
||||
arc.output = NO_OUTPUT;
|
||||
|
||||
// If there are no nodes, ie, the FST only accepts the
|
||||
// empty string, then startNode is 0, and then readFirstTargetArc
|
||||
|
@ -583,12 +594,11 @@ public class FST<T> {
|
|||
* expanded array format.
|
||||
*/
|
||||
boolean isExpandedTarget(Arc<T> follow) throws IOException {
|
||||
if (follow.isFinal()) {
|
||||
if (!targetHasArcs(follow)) {
|
||||
return false;
|
||||
} else {
|
||||
final BytesReader in = getBytesReader(follow.target);
|
||||
final byte b = in.readByte();
|
||||
|
||||
return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0;
|
||||
}
|
||||
}
|
||||
|
@ -667,8 +677,11 @@ public class FST<T> {
|
|||
}
|
||||
|
||||
if (arc.flag(BIT_STOP_NODE)) {
|
||||
arc.target = FINAL_END_NODE;
|
||||
arc.flags |= BIT_FINAL_ARC;
|
||||
if (arc.flag(BIT_FINAL_ARC)) {
|
||||
arc.target = FINAL_END_NODE;
|
||||
} else {
|
||||
arc.target = NON_FINAL_END_NODE;
|
||||
}
|
||||
arc.nextArc = in.pos;
|
||||
} else if (arc.flag(BIT_TARGET_NEXT)) {
|
||||
arc.nextArc = in.pos;
|
||||
|
|
|
@ -1421,4 +1421,73 @@ public class TestFSTs extends LuceneTestCase {
|
|||
FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<Object>());
|
||||
s.verifyStateAndBelow(fst, arc, 1);
|
||||
}
|
||||
|
||||
// Make sure raw FST can differentiate between final vs
|
||||
// non-final end nodes
|
||||
public void testNonFinalStopNodes() throws Exception {
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||
final Long nothing = outputs.getNoOutput();
|
||||
final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
|
||||
|
||||
final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
|
||||
final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0);
|
||||
|
||||
// Add final stop node
|
||||
{
|
||||
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
|
||||
node.isFinal = true;
|
||||
rootNode.addArc('a', node);
|
||||
final Builder.CompiledNode frozen = new Builder.CompiledNode();
|
||||
frozen.address = fst.addNode(node);
|
||||
rootNode.arcs[0].nextFinalOutput = outputs.get(17);
|
||||
rootNode.arcs[0].isFinal = true;
|
||||
rootNode.arcs[0].output = nothing;
|
||||
rootNode.arcs[0].target = frozen;
|
||||
}
|
||||
|
||||
// Add non-final stop node
|
||||
{
|
||||
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
|
||||
rootNode.addArc('b', node);
|
||||
final Builder.CompiledNode frozen = new Builder.CompiledNode();
|
||||
frozen.address = fst.addNode(node);
|
||||
rootNode.arcs[1].nextFinalOutput = nothing;
|
||||
rootNode.arcs[1].output = outputs.get(42);
|
||||
rootNode.arcs[1].target = frozen;
|
||||
}
|
||||
|
||||
fst.finish(fst.addNode(rootNode));
|
||||
|
||||
checkStopNodes(fst, outputs);
|
||||
|
||||
// Make sure it still works after save/load:
|
||||
Directory dir = newDirectory();
|
||||
IndexOutput out = dir.createOutput("fst");
|
||||
fst.save(out);
|
||||
out.close();
|
||||
|
||||
IndexInput in = dir.openInput("fst");
|
||||
final FST<Long> fst2 = new FST<Long>(in, outputs);
|
||||
checkStopNodes(fst2, outputs);
|
||||
in.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
private void checkStopNodes(FST<Long> fst, PositiveIntOutputs outputs) throws Exception {
|
||||
final Long nothing = outputs.getNoOutput();
|
||||
FST.Arc<Long> startArc = fst.getFirstArc(new FST.Arc<Long>());
|
||||
assertEquals(nothing, startArc.output);
|
||||
assertEquals(nothing, startArc.nextFinalOutput);
|
||||
|
||||
FST.Arc<Long> arc = fst.readFirstTargetArc(startArc, new FST.Arc<Long>());
|
||||
assertEquals('a', arc.label);
|
||||
assertEquals(17, arc.nextFinalOutput.longValue());
|
||||
assertTrue(arc.isFinal());
|
||||
|
||||
arc = fst.readNextArc(arc);
|
||||
assertEquals('b', arc.label);
|
||||
assertFalse(arc.isFinal());
|
||||
assertEquals(42, arc.output.longValue());
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue