mirror of https://github.com/apache/lucene.git
LUCENE-2933: Two-stage state expansion criterion for the FST (distance from root and child count).
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1073653 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e5309e652b
commit
62f728df42
|
@ -83,7 +83,7 @@ public class Builder<T> {
|
||||||
@SuppressWarnings("unchecked") final UnCompiledNode<T>[] f = (UnCompiledNode<T>[]) new UnCompiledNode[10];
|
@SuppressWarnings("unchecked") final UnCompiledNode<T>[] f = (UnCompiledNode<T>[]) new UnCompiledNode[10];
|
||||||
frontier = f;
|
frontier = f;
|
||||||
for(int idx=0;idx<frontier.length;idx++) {
|
for(int idx=0;idx<frontier.length;idx++) {
|
||||||
frontier[idx] = new UnCompiledNode<T>(this);
|
frontier[idx] = new UnCompiledNode<T>(this, idx);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -201,7 +201,7 @@ public class Builder<T> {
|
||||||
// undecided on whether to prune it. later, it
|
// undecided on whether to prune it. later, it
|
||||||
// will be either compiled or pruned, so we must
|
// will be either compiled or pruned, so we must
|
||||||
// allocate a new node:
|
// allocate a new node:
|
||||||
frontier[idx] = new UnCompiledNode<T>(this);
|
frontier[idx] = new UnCompiledNode<T>(this, idx);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -292,7 +292,7 @@ public class Builder<T> {
|
||||||
new UnCompiledNode[ArrayUtil.oversize(input.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
new UnCompiledNode[ArrayUtil.oversize(input.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||||
System.arraycopy(frontier, 0, next, 0, frontier.length);
|
System.arraycopy(frontier, 0, next, 0, frontier.length);
|
||||||
for(int idx=frontier.length;idx<next.length;idx++) {
|
for(int idx=frontier.length;idx<next.length;idx++) {
|
||||||
next[idx] = new UnCompiledNode<T>(this);
|
next[idx] = new UnCompiledNode<T>(this, idx);
|
||||||
}
|
}
|
||||||
frontier = next;
|
frontier = next;
|
||||||
}
|
}
|
||||||
|
@ -424,12 +424,22 @@ public class Builder<T> {
|
||||||
boolean isFinal;
|
boolean isFinal;
|
||||||
int inputCount;
|
int inputCount;
|
||||||
|
|
||||||
|
/** This node's depth, starting from the automaton root. */
|
||||||
|
final int depth;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param depth
|
||||||
|
* The node's depth starting from the automaton root. Needed for
|
||||||
|
* LUCENE-2934 (node expansion based on conditions other than the
|
||||||
|
* fanout size).
|
||||||
|
*/
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
public UnCompiledNode(Builder<T> owner) {
|
public UnCompiledNode(Builder<T> owner, int depth) {
|
||||||
this.owner = owner;
|
this.owner = owner;
|
||||||
arcs = (Arc<T>[]) new Arc[1];
|
arcs = (Arc<T>[]) new Arc[1];
|
||||||
arcs[0] = new Arc<T>();
|
arcs[0] = new Arc<T>();
|
||||||
output = owner.NO_OUTPUT;
|
output = owner.NO_OUTPUT;
|
||||||
|
this.depth = depth;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isCompiled() {
|
public boolean isCompiled() {
|
||||||
|
@ -441,6 +451,9 @@ public class Builder<T> {
|
||||||
isFinal = false;
|
isFinal = false;
|
||||||
output = owner.NO_OUTPUT;
|
output = owner.NO_OUTPUT;
|
||||||
inputCount = 0;
|
inputCount = 0;
|
||||||
|
|
||||||
|
// We don't clear the depth here because it never changes
|
||||||
|
// for nodes on the frontier (even when reused).
|
||||||
}
|
}
|
||||||
|
|
||||||
public T getLastOutput(int labelToMatch) {
|
public T getLastOutput(int labelToMatch) {
|
||||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.store.IndexInput;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.CodecUtil;
|
import org.apache.lucene.util.CodecUtil;
|
||||||
|
import org.apache.lucene.util.automaton.fst.Builder.UnCompiledNode;
|
||||||
|
|
||||||
/** Represents an FST using a compact byte[] format.
|
/** Represents an FST using a compact byte[] format.
|
||||||
* <p> The format is similar to what's used by Morfologik
|
* <p> The format is similar to what's used by Morfologik
|
||||||
|
@ -47,11 +48,21 @@ public class FST<T> {
|
||||||
// this when number of arcs is > NUM_ARCS_ARRAY:
|
// this when number of arcs is > NUM_ARCS_ARRAY:
|
||||||
private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6;
|
private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6;
|
||||||
|
|
||||||
// If the node has >= this number of arcs, the arcs are
|
/**
|
||||||
// stored as a fixed array. Fixed array consumes more RAM
|
* @see #shouldExpand(UnCompiledNode)
|
||||||
// but enables binary search on the arcs (instead of
|
*/
|
||||||
// linear scan) on lookup by arc label:
|
final static int FIXED_ARRAY_SHALLOW_DISTANCE = 3; // 0 => only root node.
|
||||||
private final static int NUM_ARCS_FIXED_ARRAY = 10;
|
|
||||||
|
/**
|
||||||
|
* @see #shouldExpand(UnCompiledNode)
|
||||||
|
*/
|
||||||
|
final static int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see #shouldExpand(UnCompiledNode)
|
||||||
|
*/
|
||||||
|
final static int FIXED_ARRAY_NUM_ARCS_DEEP = 10;
|
||||||
|
|
||||||
private int[] bytesPerArc = new int[0];
|
private int[] bytesPerArc = new int[0];
|
||||||
|
|
||||||
// Increment version to change it
|
// Increment version to change it
|
||||||
|
@ -315,7 +326,7 @@ public class FST<T> {
|
||||||
int startAddress = writer.posWrite;
|
int startAddress = writer.posWrite;
|
||||||
//System.out.println(" startAddr=" + startAddress);
|
//System.out.println(" startAddr=" + startAddress);
|
||||||
|
|
||||||
final boolean doFixedArray = node.numArcs >= NUM_ARCS_FIXED_ARRAY;
|
final boolean doFixedArray = shouldExpand(node);
|
||||||
final int fixedArrayStart;
|
final int fixedArrayStart;
|
||||||
if (doFixedArray) {
|
if (doFixedArray) {
|
||||||
if (bytesPerArc.length < node.numArcs) {
|
if (bytesPerArc.length < node.numArcs) {
|
||||||
|
@ -518,6 +529,23 @@ public class FST<T> {
|
||||||
return readNextArc(arc);
|
return readNextArc(arc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if <code>arc</code>'s target state is in expanded (or vector) format.
|
||||||
|
*
|
||||||
|
* @return Returns <code>true</code> if <code>arc</code> points to a state in an
|
||||||
|
* expanded array format.
|
||||||
|
*/
|
||||||
|
boolean isExpandedTarget(Arc<T> follow) throws IOException {
|
||||||
|
if (follow.isFinal()) {
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
final BytesReader in = getBytesReader(follow.target);
|
||||||
|
final byte b = in.readByte();
|
||||||
|
|
||||||
|
return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/** In-place read; returns the arc. */
|
/** In-place read; returns the arc. */
|
||||||
public Arc<T> readNextArc(Arc<T> arc) throws IOException {
|
public Arc<T> readNextArc(Arc<T> arc) throws IOException {
|
||||||
if (arc.label == -1) {
|
if (arc.label == -1) {
|
||||||
|
@ -712,6 +740,26 @@ public class FST<T> {
|
||||||
public int getArcWithOutputCount() {
|
public int getArcWithOutputCount() {
|
||||||
return arcWithOutputCount;
|
return arcWithOutputCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Nodes will be expanded if their depth (distance from the root node) is
|
||||||
|
* <= this value and their number of arcs is >=
|
||||||
|
* {@link #FIXED_ARRAY_NUM_ARCS_SHALLOW}.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Fixed array consumes more RAM but enables binary search on the arcs
|
||||||
|
* (instead of a linear scan) on lookup by arc label.
|
||||||
|
*
|
||||||
|
* @return <code>true</code> if <code>node</code> should be stored in an
|
||||||
|
* expanded (array) form.
|
||||||
|
*
|
||||||
|
* @see #FIXED_ARRAY_NUM_ARCS_DEEP
|
||||||
|
* @see Builder.UnCompiledNode#depth
|
||||||
|
*/
|
||||||
|
private boolean shouldExpand(UnCompiledNode<T> node) {
|
||||||
|
return (node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.numArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
|
||||||
|
node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP;
|
||||||
|
}
|
||||||
|
|
||||||
// Non-static: writes to FST's byte[]
|
// Non-static: writes to FST's byte[]
|
||||||
class BytesWriter extends DataOutput {
|
class BytesWriter extends DataOutput {
|
||||||
|
|
|
@ -189,6 +189,8 @@ public final class Util {
|
||||||
*/
|
*/
|
||||||
public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates)
|
public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
final String expandedNodeColor = "blue";
|
||||||
|
|
||||||
// This is the start arc in the automaton (from the epsilon state to the first state
|
// This is the start arc in the automaton (from the epsilon state to the first state
|
||||||
// with outgoing transitions.
|
// with outgoing transitions.
|
||||||
final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
|
final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
|
||||||
|
@ -219,7 +221,9 @@ public final class Util {
|
||||||
}
|
}
|
||||||
|
|
||||||
emitDotState(out, "initial", "point", "white", "");
|
emitDotState(out, "initial", "point", "white", "");
|
||||||
emitDotState(out, Integer.toString(startArc.target), stateShape, null, "");
|
emitDotState(out, Integer.toString(startArc.target), stateShape,
|
||||||
|
fst.isExpandedTarget(startArc) ? expandedNodeColor : null,
|
||||||
|
"");
|
||||||
out.write(" initial -> " + startArc.target + "\n");
|
out.write(" initial -> " + startArc.target + "\n");
|
||||||
|
|
||||||
final T NO_OUTPUT = fst.outputs.getNoOutput();
|
final T NO_OUTPUT = fst.outputs.getNoOutput();
|
||||||
|
@ -243,7 +247,9 @@ public final class Util {
|
||||||
while (true) {
|
while (true) {
|
||||||
// Emit the unseen state and add it to the queue for the next level.
|
// Emit the unseen state and add it to the queue for the next level.
|
||||||
if (arc.target >= 0 && !seen.get(arc.target)) {
|
if (arc.target >= 0 && !seen.get(arc.target)) {
|
||||||
emitDotState(out, Integer.toString(arc.target), stateShape, null,
|
final boolean isExpanded = fst.isExpandedTarget(arc);
|
||||||
|
emitDotState(out, Integer.toString(arc.target), stateShape,
|
||||||
|
isExpanded ? expandedNodeColor : null,
|
||||||
labelStates ? Integer.toString(arc.target) : "");
|
labelStates ? Integer.toString(arc.target) : "");
|
||||||
seen.set(arc.target);
|
seen.set(arc.target);
|
||||||
nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
|
nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
|
||||||
|
@ -285,10 +291,10 @@ public final class Util {
|
||||||
}
|
}
|
||||||
sameLevelStates.clear();
|
sameLevelStates.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Emit terminating state (always there anyway).
|
// Emit terminating state (always there anyway).
|
||||||
out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n");
|
out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n");
|
||||||
out.write(" {rank=sink; -1 } ");
|
out.write(" {rank=sink; -1 }\n");
|
||||||
|
|
||||||
out.write("}\n");
|
out.write("}\n");
|
||||||
out.flush();
|
out.flush();
|
||||||
|
|
|
@ -56,6 +56,7 @@ import org.apache.lucene.util.LineFileDocs;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util.UnicodeUtil;
|
import org.apache.lucene.util.UnicodeUtil;
|
||||||
import org.apache.lucene.util._TestUtil;
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
import org.apache.lucene.util.automaton.fst.FST.Arc;
|
||||||
|
|
||||||
public class TestFSTs extends LuceneTestCase {
|
public class TestFSTs extends LuceneTestCase {
|
||||||
|
|
||||||
|
@ -1322,4 +1323,85 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
assertEquals(b, seekResult.input);
|
assertEquals(b, seekResult.input);
|
||||||
assertEquals(42, (long) seekResult.output);
|
assertEquals(42, (long) seekResult.output);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test state expansion (array format) on close-to-root states. Creates
|
||||||
|
* synthetic input that has one expanded state on each level.
|
||||||
|
*
|
||||||
|
* @see "https://issues.apache.org/jira/browse/LUCENE-2933"
|
||||||
|
*/
|
||||||
|
public void testExpandedCloseToRoot() throws Exception {
|
||||||
|
class SyntheticData {
|
||||||
|
FST<Object> compile(String[] lines) throws IOException {
|
||||||
|
final NoOutputs outputs = NoOutputs.getSingleton();
|
||||||
|
final Object nothing = outputs.getNoOutput();
|
||||||
|
final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
|
||||||
|
|
||||||
|
int line = 0;
|
||||||
|
final BytesRef term = new BytesRef();
|
||||||
|
while (line < lines.length) {
|
||||||
|
String w = lines[line++];
|
||||||
|
if (w == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
term.copy(w);
|
||||||
|
b.add(term, nothing);
|
||||||
|
}
|
||||||
|
|
||||||
|
return b.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
void generate(ArrayList<String> out, StringBuilder b, char from, char to,
|
||||||
|
int depth) {
|
||||||
|
if (depth == 0 || from == to) {
|
||||||
|
String seq = b.toString() + "_" + out.size() + "_end";
|
||||||
|
out.add(seq);
|
||||||
|
} else {
|
||||||
|
for (char c = from; c <= to; c++) {
|
||||||
|
b.append(c);
|
||||||
|
generate(out, b, from, c == to ? to : from, depth - 1);
|
||||||
|
b.deleteCharAt(b.length() - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth)
|
||||||
|
throws IOException {
|
||||||
|
if (fst.targetHasArcs(arc)) {
|
||||||
|
int childCount = 0;
|
||||||
|
for (arc = fst.readFirstTargetArc(arc, arc);;
|
||||||
|
arc = fst.readNextArc(arc), childCount++)
|
||||||
|
{
|
||||||
|
boolean expanded = fst.isExpandedTarget(arc);
|
||||||
|
int children = verifyStateAndBelow(fst, new FST.Arc<Object>().copyFrom(arc), depth + 1);
|
||||||
|
|
||||||
|
assertEquals(
|
||||||
|
expanded,
|
||||||
|
(depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE &&
|
||||||
|
children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
|
||||||
|
children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
|
||||||
|
if (arc.isLast()) break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return childCount;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sanity check.
|
||||||
|
assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
|
||||||
|
assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);
|
||||||
|
|
||||||
|
SyntheticData s = new SyntheticData();
|
||||||
|
|
||||||
|
ArrayList<String> out = new ArrayList<String>();
|
||||||
|
StringBuilder b = new StringBuilder();
|
||||||
|
s.generate(out, b, 'a', 'i', 10);
|
||||||
|
String[] input = out.toArray(new String[out.size()]);
|
||||||
|
Arrays.sort(input);
|
||||||
|
FST<Object> fst = s.compile(input);
|
||||||
|
FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<Object>());
|
||||||
|
s.verifyStateAndBelow(fst, arc, 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue