From 904b8a4b77b138cd6a8755dada8e2cca317fa0c8 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Tue, 12 Apr 2011 17:11:19 +0000 Subject: [PATCH] LUCENE-3017: allow FST to distinguish final vs non-final no-arcs-leaving nodes git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1091502 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/util/automaton/fst/Builder.java | 13 +++- .../apache/lucene/util/automaton/fst/FST.java | 21 ++++-- .../lucene/util/automaton/fst/TestFSTs.java | 69 +++++++++++++++++++ 3 files changed, 98 insertions(+), 5 deletions(-) diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java index 7b93fdd3d1f..fed8cd21098 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java @@ -180,7 +180,13 @@ public class Builder { compileAllTargets(node); } final T nextFinalOutput = node.output; - final boolean isFinal = node.isFinal; + + // We "fake" the node as being final if it has no + // outgoing arcs; in theory we could leave it + // as non-final (the FST can represent this), but + // FSTEnum, Util, etc., have trouble w/ non-final + // dead-end states: + final boolean isFinal = node.isFinal || node.numArcs == 0; if (doCompile) { // this node makes it and we now compile it. first, @@ -268,6 +274,7 @@ public class Builder { // 'finalness' is stored on the incoming arc, not on // the node frontier[0].inputCount++; + frontier[0].isFinal = true; fst.setEmptyOutput(output); return; } @@ -388,6 +395,10 @@ public class Builder { if (!arc.target.isCompiled()) { // not yet compiled @SuppressWarnings("unchecked") final UnCompiledNode n = (UnCompiledNode) arc.target; + if (n.numArcs == 0) { + //System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label); + arc.isFinal = n.isFinal = true; + } arc.target = compileNode(n); } } diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java index b1eabb44c9d..dde66270873 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java @@ -25,6 +25,10 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.automaton.fst.Builder.UnCompiledNode; +// NOTE: while the FST is able to represent a non-final +// dead-end state (NON_FINAL_END_NODE=0), the layres above +// (FSTEnum, Util) have problems with this!! + /** Represents an FST using a compact byte[] format. *

The format is similar to what's used by Morfologik * (http://sourceforge.net/projects/morfologik). @@ -214,6 +218,9 @@ public class FST { } void finish(int startNode) { + if (startNode == FINAL_END_NODE && emptyOutput != null) { + startNode = 0; + } if (this.startNode != -1) { throw new IllegalStateException("already finished"); } @@ -253,6 +260,8 @@ public class FST { throw new IllegalStateException("call finish first"); } CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT); + // TODO: really we should encode this as an arc, arriving + // to the root node, instead of special casing here: if (emptyOutput != null) { out.writeByte((byte) 1); out.writeVInt(emptyOutputBytes.length); @@ -466,7 +475,9 @@ public class FST { arc.nextFinalOutput = emptyOutput; } else { arc.flags = BIT_LAST_ARC; + arc.nextFinalOutput = NO_OUTPUT; } + arc.output = NO_OUTPUT; // If there are no nodes, ie, the FST only accepts the // empty string, then startNode is 0, and then readFirstTargetArc @@ -583,12 +594,11 @@ public class FST { * expanded array format. */ boolean isExpandedTarget(Arc follow) throws IOException { - if (follow.isFinal()) { + if (!targetHasArcs(follow)) { return false; } else { final BytesReader in = getBytesReader(follow.target); final byte b = in.readByte(); - return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0; } } @@ -667,8 +677,11 @@ public class FST { } if (arc.flag(BIT_STOP_NODE)) { - arc.target = FINAL_END_NODE; - arc.flags |= BIT_FINAL_ARC; + if (arc.flag(BIT_FINAL_ARC)) { + arc.target = FINAL_END_NODE; + } else { + arc.target = NON_FINAL_END_NODE; + } arc.nextArc = in.pos; } else if (arc.flag(BIT_TARGET_NEXT)) { arc.nextArc = in.pos; diff --git a/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java b/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java index f9b5c677b85..b440d5ac7ea 100644 --- a/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java +++ b/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java @@ -1421,4 +1421,73 @@ public class TestFSTs extends LuceneTestCase { FST.Arc arc = fst.getFirstArc(new FST.Arc()); s.verifyStateAndBelow(fst, arc, 1); } + + // Make sure raw FST can differentiate between final vs + // non-final end nodes + public void testNonFinalStopNodes() throws Exception { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + final Long nothing = outputs.getNoOutput(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + + final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs); + + final Builder.UnCompiledNode rootNode = new Builder.UnCompiledNode(b, 0); + + // Add final stop node + { + final Builder.UnCompiledNode node = new Builder.UnCompiledNode(b, 0); + node.isFinal = true; + rootNode.addArc('a', node); + final Builder.CompiledNode frozen = new Builder.CompiledNode(); + frozen.address = fst.addNode(node); + rootNode.arcs[0].nextFinalOutput = outputs.get(17); + rootNode.arcs[0].isFinal = true; + rootNode.arcs[0].output = nothing; + rootNode.arcs[0].target = frozen; + } + + // Add non-final stop node + { + final Builder.UnCompiledNode node = new Builder.UnCompiledNode(b, 0); + rootNode.addArc('b', node); + final Builder.CompiledNode frozen = new Builder.CompiledNode(); + frozen.address = fst.addNode(node); + rootNode.arcs[1].nextFinalOutput = nothing; + rootNode.arcs[1].output = outputs.get(42); + rootNode.arcs[1].target = frozen; + } + + fst.finish(fst.addNode(rootNode)); + + checkStopNodes(fst, outputs); + + // Make sure it still works after save/load: + Directory dir = newDirectory(); + IndexOutput out = dir.createOutput("fst"); + fst.save(out); + out.close(); + + IndexInput in = dir.openInput("fst"); + final FST fst2 = new FST(in, outputs); + checkStopNodes(fst2, outputs); + in.close(); + dir.close(); + } + + private void checkStopNodes(FST fst, PositiveIntOutputs outputs) throws Exception { + final Long nothing = outputs.getNoOutput(); + FST.Arc startArc = fst.getFirstArc(new FST.Arc()); + assertEquals(nothing, startArc.output); + assertEquals(nothing, startArc.nextFinalOutput); + + FST.Arc arc = fst.readFirstTargetArc(startArc, new FST.Arc()); + assertEquals('a', arc.label); + assertEquals(17, arc.nextFinalOutput.longValue()); + assertTrue(arc.isFinal()); + + arc = fst.readNextArc(arc); + assertEquals('b', arc.label); + assertFalse(arc.isFinal()); + assertEquals(42, arc.output.longValue()); + } }