LUCENE-2792: add FST impl

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1044834 13f79535-47bb-0310-9956-ffa450edef68
Author: Michael McCandless
Date: 2010-12-12 15:36:08 +00:00
parent c45253df51
commit 994aaec2ef
22 changed files with 4288 additions and 137 deletions

File: .hgignore

@@ -1,2 +1,4 @@
 syntax: glob
 */build/*
+*.class

File: SimpleTextFieldsReader.java

@@ -31,15 +31,16 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.fst.Builder;
+import org.apache.lucene.util.automaton.fst.BytesRefFSTEnum;
+import org.apache.lucene.util.automaton.fst.FST;
+import org.apache.lucene.util.automaton.fst.PositiveIntOutputs;
+import org.apache.lucene.util.automaton.fst.PairOutputs;
 import java.io.IOException;
 import java.util.Comparator;
 import java.util.Map;
-import java.util.Set;
 import java.util.HashMap;
-import java.util.TreeMap;
-import java.util.SortedMap;
-import java.util.Iterator;
 class SimpleTextFieldsReader extends FieldsProducer {
@@ -116,73 +117,39 @@ class SimpleTextFieldsReader extends FieldsProducer {
   private class SimpleTextTermsEnum extends TermsEnum {
     private final IndexInput in;
     private final boolean omitTF;
-    private BytesRef current;
     private int docFreq;
     private long docsStart;
     private boolean ended;
-    private final TreeMap<BytesRef,TermData> allTerms;
-    private Iterator<Map.Entry<BytesRef,TermData>> iter;
+    private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum;
-    public SimpleTextTermsEnum(TreeMap<BytesRef,TermData> allTerms, boolean omitTF) throws IOException {
+    public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException {
       this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
-      this.allTerms = allTerms;
       this.omitTF = omitTF;
-      iter = allTerms.entrySet().iterator();
+      fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
     }
     public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
-      final SortedMap<BytesRef,TermData> tailMap = allTerms.tailMap(text);
-      if (tailMap.isEmpty()) {
-        current = null;
+      fstEnum.reset();
+      //System.out.println("seek to text=" + text.utf8ToString());
+      final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.advance(text);
+      if (result == null) {
+        //System.out.println("  end");
         return SeekStatus.END;
       } else {
-        current = tailMap.firstKey();
-        final TermData td = tailMap.get(current);
-        docsStart = td.docsStart;
-        docFreq = td.docFreq;
-        iter = tailMap.entrySet().iterator();
-        assert iter.hasNext();
-        iter.next();
-        if (current.equals(text)) {
+        //System.out.println("  got text=" + term.utf8ToString());
+        PairOutputs.Pair<Long,Long> pair = result.output;
+        docsStart = pair.output1;
+        docFreq = pair.output2.intValue();
+        if (result.input.equals(text)) {
+          //System.out.println("  match docsStart=" + docsStart);
           return SeekStatus.FOUND;
         } else {
+          //System.out.println("  not match docsStart=" + docsStart);
           return SeekStatus.NOT_FOUND;
         }
       }
-      /*
-      if (current != null) {
-        final int cmp = current.compareTo(text);
-        if (cmp == 0) {
-          return SeekStatus.FOUND;
-        } else if (cmp > 0) {
-          ended = false;
-          in.seek(fieldStart);
-        }
-      } else {
-        ended = false;
-        in.seek(fieldStart);
-      }
-      // Naive!! This just scans... would be better to do
-      // up-front scan to build in-RAM index
-      BytesRef b;
-      while((b = next()) != null) {
-        final int cmp = b.compareTo(text);
-        if (cmp == 0) {
-          ended = false;
-          return SeekStatus.FOUND;
-        } else if (cmp > 0) {
-          ended = false;
-          return SeekStatus.NOT_FOUND;
-        }
-      }
-      current = null;
-      ended = true;
-      return SeekStatus.END;
-      */
     }
     @Override
@@ -192,56 +159,20 @@ class SimpleTextFieldsReader extends FieldsProducer {
     @Override
     public BytesRef next() throws IOException {
       assert !ended;
-      if (iter.hasNext()) {
-        Map.Entry<BytesRef,TermData> ent = iter.next();
-        current = ent.getKey();
-        TermData td = ent.getValue();
-        docFreq = td.docFreq;
-        docsStart = td.docsStart;
-        return current;
+      final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next();
+      if (result != null) {
+        final PairOutputs.Pair<Long,Long> pair = result.output;
+        docsStart = pair.output1;
+        docFreq = pair.output2.intValue();
+        return result.input;
       } else {
-        current = null;
         return null;
       }
-      /*
-      readLine(in, scratch);
-      if (scratch.equals(END) || scratch.startsWith(FIELD)) {
-        ended = true;
-        current = null;
-        return null;
-      } else {
-        assert scratch.startsWith(TERM): "got " + scratch.utf8ToString();
-        docsStart = in.getFilePointer();
-        final int len = scratch.length - TERM.length;
-        if (len > scratch2.length) {
-          scratch2.grow(len);
-        }
-        System.arraycopy(scratch.bytes, TERM.length, scratch2.bytes, 0, len);
-        scratch2.length = len;
-        current = scratch2;
-        docFreq = 0;
-        long lineStart = 0;
-        while(true) {
-          lineStart = in.getFilePointer();
-          readLine(in, scratch);
-          if (scratch.equals(END) || scratch.startsWith(FIELD) || scratch.startsWith(TERM)) {
-            break;
-          }
-          if (scratch.startsWith(DOC)) {
-            docFreq++;
-          }
-        }
-        in.seek(lineStart);
-        return current;
-      }
-      */
     }
     @Override
     public BytesRef term() {
-      return current;
+      return fstEnum.current().input;
     }
     @Override
@@ -512,10 +443,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
     private final String field;
     private final long termsStart;
     private final boolean omitTF;
-    // NOTE: horribly, horribly RAM consuming, but then
-    // SimpleText should never be used in production
-    private final TreeMap<BytesRef,TermData> allTerms = new TreeMap<BytesRef,TermData>();
+    private FST<PairOutputs.Pair<Long,Long>> fst;
     private final BytesRef scratch = new BytesRef(10);
@@ -527,6 +455,8 @@ class SimpleTextFieldsReader extends FieldsProducer {
     }
     private void loadTerms() throws IOException {
+      PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
+      Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs));
       IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
       in.seek(termsStart);
       final BytesRef lastTerm = new BytesRef(10);
@@ -536,16 +466,14 @@
         readLine(in, scratch);
         if (scratch.equals(END) || scratch.startsWith(FIELD)) {
           if (lastDocsStart != -1) {
-            allTerms.put(new BytesRef(lastTerm),
-                         new TermData(lastDocsStart, docFreq));
+            b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
           }
           break;
         } else if (scratch.startsWith(DOC)) {
           docFreq++;
         } else if (scratch.startsWith(TERM)) {
           if (lastDocsStart != -1) {
-            allTerms.put(new BytesRef(lastTerm),
-                         new TermData(lastDocsStart, docFreq));
+            b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
           }
           lastDocsStart = in.getFilePointer();
           final int len = scratch.length - TERM.length;
@@ -557,11 +485,23 @@
           docFreq = 0;
         }
       }
+      fst = b.finish();
+      /*
+      PrintStream ps = new PrintStream("out.dot");
+      fst.toDot(ps);
+      ps.close();
+      System.out.println("SAVED out.dot");
+      */
+      //System.out.println("FST " + fst.sizeInBytes());
     }
     @Override
     public TermsEnum iterator() throws IOException {
-      return new SimpleTextTermsEnum(allTerms, omitTF);
+      if (fst != null) {
+        return new SimpleTextTermsEnum(fst, omitTF);
+      } else {
+        return TermsEnum.EMPTY;
+      }
     }
     @Override

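For reference, a minimal sketch (not part of this commit) of how the new classes fit together, mirroring loadTerms() and SimpleTextTermsEnum above; the two terms and their outputs are invented:

    // build an FST mapping each term to (docsStart, docFreq), as loadTerms() does
    PositiveIntOutputs outs = PositiveIntOutputs.getSingleton(false);
    Builder<PairOutputs.Pair<Long,Long>> b =
        new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true,
                                                 new PairOutputs<Long,Long>(outs, outs));
    b.add(new BytesRef("aa"), new PairOutputs.Pair<Long,Long>(17L, 2L)); // terms must be pre-sorted
    b.add(new BytesRef("ab"), new PairOutputs.Pair<Long,Long>(42L, 5L));
    FST<PairOutputs.Pair<Long,Long>> fst = b.finish(); // null if nothing was added

    // seek it the way SimpleTextTermsEnum.seek() does
    BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum =
        new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
    BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.advance(new BytesRef("ab"));
    if (result != null && result.input.equals(new BytesRef("ab"))) {
      long docsStart = result.output.output1;         // exact match: decode the pair
      int docFreq = result.output.output2.intValue();
    }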
File: ArrayUtil.java

@@ -19,6 +19,7 @@ package org.apache.lucene.util;
 import java.util.Collection;
 import java.util.Comparator;
+import java.lang.reflect.Array;
 /**
  * Methods for manipulating arrays.
@@ -392,7 +393,7 @@ public final class ArrayUtil {
   }
   /**
-   * Returns hash of chars in range start (inclusive) to
+   * Returns hash of bytes in range start (inclusive) to
    * end (inclusive)
    */
   public static int hashCode(byte[] array, int start, int end) {
@@ -429,6 +430,31 @@ public final class ArrayUtil {
     return false;
   }
+  public static <T> T[] grow(T[] array, int minSize) {
+    if (array.length < minSize) {
+      @SuppressWarnings("unchecked") final T[] newArray =
+        (T[]) Array.newInstance(array.getClass().getComponentType(), oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJ_REF));
+      System.arraycopy(array, 0, newArray, 0, array.length);
+      return newArray;
+    } else
+      return array;
+  }
+  public static <T> T[] grow(T[] array) {
+    return grow(array, 1 + array.length);
+  }
+  public static <T> T[] shrink(T[] array, int targetSize) {
+    final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_OBJ_REF);
+    if (newSize != array.length) {
+      @SuppressWarnings("unchecked") final T[] newArray =
+        (T[]) Array.newInstance(array.getClass().getComponentType(), newSize);
+      System.arraycopy(array, 0, newArray, 0, newSize);
+      return newArray;
+    } else
+      return array;
+  }
   // Since Arrays.equals doesn't implement offsets for equals
   /**
    * See if two array slices are the same.

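The generic grow/shrink overloads added above are what let the automaton classes below drop their hand-rolled oversize/arraycopy blocks. A quick sketch of the semantics (example values invented):

    String[] a = new String[5];
    String[] b = ArrayUtil.grow(a, 3);  // already large enough: returns a itself
    String[] c = ArrayUtil.grow(a, 50); // new oversized array (length >= 50), contents copied
    String[] d = ArrayUtil.grow(a);     // grows to at least a.length + 1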
File: IntsRef.java

@@ -21,7 +21,7 @@ package org.apache.lucene.util;
  * existing int[].
  *
  * @lucene.internal */
-public final class IntsRef {
+public final class IntsRef implements Comparable<IntsRef> {
   public int[] ints;
   public int offset;
@@ -81,6 +81,31 @@ public final class IntsRef {
     }
   }
+  /** Signed int order comparison */
+  public int compareTo(IntsRef other) {
+    if (this == other) return 0;
+    final int[] aInts = this.ints;
+    int aUpto = this.offset;
+    final int[] bInts = other.ints;
+    int bUpto = other.offset;
+    final int aStop = aUpto + Math.min(this.length, other.length);
+    while(aUpto < aStop) {
+      int aInt = aInts[aUpto++];
+      int bInt = bInts[bUpto++];
+      if (aInt > bInt) {
+        return 1;
+      } else if (aInt < bInt) {
+        return -1;
+      }
+    }
+    // One is a prefix of the other, or, they are equal:
+    return this.length - other.length;
+  }
   public void copy(IntsRef other) {
     if (ints == null) {
       ints = new int[other.length];
@@ -97,4 +122,18 @@ public final class IntsRef {
       ints = ArrayUtil.grow(ints, newLength);
     }
   }
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append('[');
+    final int end = offset + length;
+    for(int i=offset;i<end;i++) {
+      if (i > offset) {
+        sb.append(' ');
+      }
+      sb.append(Integer.toHexString(ints[i]));
+    }
+    sb.append(']');
+    return sb.toString();
+  }
 }

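A quick illustration of the new compareTo (values invented): ints compare in signed order, and a proper prefix sorts before any extension of it, which is exactly the sorted-input invariant the FST Builder below asserts in add():

    IntsRef a = new IntsRef(3);
    a.ints[0] = 1; a.ints[1] = 2; a.length = 2;                // [1 2]
    IntsRef b = new IntsRef(3);
    b.ints[0] = 1; b.ints[1] = 2; b.ints[2] = 3; b.length = 3; // [1 2 3]
    assert a.compareTo(b) < 0;  // prefix sorts first
    assert b.compareTo(a) > 0;
    assert a.compareTo(a) == 0;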
File: RecyclingByteBlockAllocator.java

@@ -93,13 +93,7 @@ public final class RecyclingByteBlockAllocator extends ByteBlockPool.Allocator {
   @Override
   public synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) {
     final int numBlocks = Math.min(maxBufferedBlocks - freeBlocks, end - start);
-    final int size = freeBlocks + numBlocks;
-    if (size >= freeByteBlocks.length) {
-      final byte[][] newBlocks = new byte[ArrayUtil.oversize(size,
-          RamUsageEstimator.NUM_BYTES_OBJ_REF)][];
-      System.arraycopy(freeByteBlocks, 0, newBlocks, 0, freeBlocks);
-      freeByteBlocks = newBlocks;
-    }
+    freeByteBlocks = ArrayUtil.grow(freeByteBlocks, freeBlocks + numBlocks);
     final int stop = start + numBlocks;
     for (int i = start; i < stop; i++) {
       freeByteBlocks[freeBlocks++] = blocks[i];

File: Automaton.java

@@ -40,7 +40,6 @@ import java.util.List;
 import java.util.Set;
 import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.RamUsageEstimator;
 /**
  * Finite-state automaton with regular expression operations.
@@ -281,9 +280,7 @@ public class Automaton implements Serializable, Cloneable {
           worklist.add(t.to);
           t.to.number = upto;
           if (upto == numberedStates.length) {
-            final State[] newArray = new State[ArrayUtil.oversize(1+upto, RamUsageEstimator.NUM_BYTES_OBJ_REF)];
-            System.arraycopy(numberedStates, 0, newArray, 0, upto);
-            numberedStates = newArray;
+            numberedStates = ArrayUtil.grow(numberedStates);
           }
           numberedStates[upto] = t.to;
           upto++;

File: BasicOperations.java

@@ -30,7 +30,6 @@
 package org.apache.lucene.util.automaton;
 import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.RamUsageEstimator;
 import java.util.ArrayList;
 import java.util.BitSet;
@@ -459,9 +458,7 @@ final public class BasicOperations {
     public void add(Transition t) {
       if (transitions.length == count) {
-        Transition[] newArray = new Transition[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJ_REF)];
-        System.arraycopy(transitions, 0, newArray, 0, count);
-        transitions = newArray;
+        transitions = ArrayUtil.grow(transitions);
       }
       transitions[count++] = t;
     }
@@ -503,9 +500,7 @@ final public class BasicOperations {
     private PointTransitions next(int point) {
       // 1st time we are seeing this point
       if (count == points.length) {
-        final PointTransitions[] newArray = new PointTransitions[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJ_REF)];
-        System.arraycopy(points, 0, newArray, 0, count);
-        points = newArray;
+        points = ArrayUtil.grow(points);
       }
       PointTransitions points0 = points[count];
       if (points0 == null) {
@@ -650,9 +645,7 @@ final public class BasicOperations {
         final SortedIntSet.FrozenIntSet p = statesSet.freeze(q);
         worklist.add(p);
         if (newStateUpto == newStatesArray.length) {
-          final State[] newArray = new State[ArrayUtil.oversize(1+newStateUpto, RamUsageEstimator.NUM_BYTES_OBJ_REF)];
-          System.arraycopy(newStatesArray, 0, newArray, 0, newStateUpto);
-          newStatesArray = newArray;
+          newStatesArray = ArrayUtil.grow(newStatesArray);
         }
         newStatesArray[newStateUpto] = q;
         q.number = newStateUpto;

File: State.java

@@ -29,7 +29,6 @@
 package org.apache.lucene.util.automaton;
 import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.RamUsageEstimator;
 import java.io.Serializable;
 import java.util.Collection;
@@ -111,9 +110,7 @@ public class State implements Serializable, Comparable<State> {
    */
   public void addTransition(Transition t) {
     if (numTransitions == transitionsArray.length) {
-      final Transition[] newArray = new Transition[ArrayUtil.oversize(1+numTransitions, RamUsageEstimator.NUM_BYTES_OBJ_REF)];
-      System.arraycopy(transitionsArray, 0, newArray, 0, numTransitions);
-      transitionsArray = newArray;
+      transitionsArray = ArrayUtil.grow(transitionsArray);
     }
     transitionsArray[numTransitions++] = t;
   }

File: UTF32ToUTF8.java

@@ -17,7 +17,6 @@ package org.apache.lucene.util.automaton;
  * limitations under the License.
  */
-import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.ArrayUtil;
 import java.util.List;
@@ -299,9 +298,7 @@ final class UTF32ToUTF8 {
   private State newUTF8State() {
     State s = new State();
     if (utf8StateCount == utf8States.length) {
-      final State[] newArray = new State[ArrayUtil.oversize(1+utf8StateCount, RamUsageEstimator.NUM_BYTES_OBJ_REF)];
-      System.arraycopy(utf8States, 0, newArray, 0, utf8StateCount);
-      utf8States = newArray;
+      utf8States = ArrayUtil.grow(utf8States);
     }
     utf8States[utf8StateCount] = s;
     s.number = utf8StateCount;

File: Builder.java

@@ -0,0 +1,506 @@
package org.apache.lucene.util.automaton.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import java.io.IOException;
/**
* Builds a compact FST (maps an IntsRef term to an arbitrary
* output) from pre-sorted terms with outputs (the FST
* becomes an FSA if you use NoOutputs). The FST is written
* on-the-fly into a compact serialized format byte array, which can
* be saved to / loaded from a Directory or used directly
* for traversal. The FST is always finite (no cycles).
*
* <p>NOTE: The algorithm is described at
* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698</p>
*
* If your outputs are ByteSequenceOutputs then the final FST
* will be minimal, but if you use PositiveIntOutputs then
* it's only "near minimal". For example, aa/0, aab/1, bbb/2
* will produce 6 states when a 5-state FST is also
* possible.
*
* The parameterized type T is the output type. See the
* subclasses of {@link Outputs}.
*
* @lucene.experimental
*/
public class Builder<T> {
private final NodeHash<T> dedupHash;
private final FST<T> fst;
private final T NO_OUTPUT;
// simplistic pruning: we prune node (and all following
// nodes) if less than this number of terms go through it:
private final int minSuffixCount1;
// better pruning: we prune node (and all following
// nodes) if the prior node has less than this number of
// terms go through it:
private final int minSuffixCount2;
private final IntsRef lastInput = new IntsRef();
// NOTE: cutting this over to ArrayList instead loses ~6%
// in build performance on 9.8M Wikipedia terms; so we
// left this as an array:
// current "frontier"
private UnCompiledNode<T>[] frontier;
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doMinSuffix, Outputs<T> outputs) {
this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2;
fst = new FST<T>(inputType, outputs);
if (doMinSuffix) {
dedupHash = new NodeHash<T>(fst);
} else {
dedupHash = null;
}
NO_OUTPUT = outputs.getNoOutput();
@SuppressWarnings("unchecked") final UnCompiledNode<T>[] f = (UnCompiledNode<T>[]) new UnCompiledNode[10];
frontier = f;
for(int idx=0;idx<frontier.length;idx++) {
frontier[idx] = new UnCompiledNode<T>(this);
}
}
public int getTotStateCount() {
return fst.nodeCount;
}
public int getTermCount() {
return frontier[0].inputCount;
}
public int getMappedStateCount() {
return dedupHash == null ? 0 : fst.nodeCount;
}
private CompiledNode compileNode(UnCompiledNode<T> n) throws IOException {
final int address;
if (dedupHash != null) {
if (n.numArcs == 0) {
address = fst.addNode(n);
} else {
address = dedupHash.add(n);
}
} else {
address = fst.addNode(n);
}
assert address != -2;
n.clear();
final CompiledNode fn = new CompiledNode();
fn.address = address;
return fn;
}
private void compilePrevTail(int prefixLenPlus1) throws IOException {
assert prefixLenPlus1 >= 1;
//System.out.println(" compileTail " + prefixLenPlus1);
for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) {
boolean doPrune = false;
boolean doCompile = false;
final UnCompiledNode<T> node = frontier[idx];
final UnCompiledNode<T> parent = frontier[idx-1];
if (node.inputCount < minSuffixCount1) {
doPrune = true;
doCompile = true;
} else if (idx > prefixLenPlus1) {
// prune if parent's inputCount is less than minSuffixCount2
if (parent.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && parent.inputCount == 1) {
// my parent, about to be compiled, doesn't make the cut, so
// I'm definitely pruned
// if pruneCount2 is 1, we keep only up
// until the 'distinguished edge', ie we keep only the
// 'divergent' part of the FST. if my parent, about to be
// compiled, has inputCount 1 then we are already past the
// distinguished edge. NOTE: this only works if
// the FST outputs are not "compressible" (simple
// ords ARE compressible).
doPrune = true;
} else {
// my parent, about to be compiled, does make the cut, so
// I'm definitely not pruned
doPrune = false;
}
doCompile = true;
} else {
// if pruning is disabled (count is 0) we can always
// compile current node
doCompile = minSuffixCount2 == 0;
}
//System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx=" + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune=" + doPrune);
if (node.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && node.inputCount == 1) {
// drop all arcs
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
@SuppressWarnings("unchecked") final UnCompiledNode<T> target = (UnCompiledNode<T>) node.arcs[arcIdx].target;
target.clear();
}
node.numArcs = 0;
}
if (doPrune) {
// this node doesn't make it -- deref it
node.clear();
parent.deleteLast(lastInput.ints[lastInput.offset+idx-1], node);
} else {
if (minSuffixCount2 != 0) {
compileAllTargets(node);
}
final T nextFinalOutput = node.output;
final boolean isFinal = node.isFinal;
if (doCompile) {
// this node makes it and we now compile it. first,
// compile any targets that were previously
// undecided:
parent.replaceLast(lastInput.ints[lastInput.offset + idx-1],
compileNode(node),
nextFinalOutput,
isFinal);
} else {
// replaceLast just to install
// nextFinalOutput/isFinal onto the arc
parent.replaceLast(lastInput.ints[lastInput.offset + idx-1],
node,
nextFinalOutput,
isFinal);
// this node will stay in play for now, since we are
// undecided on whether to prune it. later, it
// will be either compiled or pruned, so we must
// allocate a new node:
frontier[idx] = new UnCompiledNode<T>(this);
}
}
}
}
private final IntsRef scratchIntsRef = new IntsRef(10);
public void add(BytesRef input, T output) throws IOException {
assert fst.getInputType() == FST.INPUT_TYPE.BYTE1;
scratchIntsRef.grow(input.length);
for(int i=0;i<input.length;i++) {
scratchIntsRef.ints[i] = input.bytes[i+input.offset] & 0xFF;
}
scratchIntsRef.length = input.length;
add(scratchIntsRef, output);
}
/** Sugar: adds the UTF32 chars from char[] slice. FST
* must be FST.INPUT_TYPE.BYTE4! */
public void add(char[] s, int offset, int length, T output) throws IOException {
assert fst.getInputType() == FST.INPUT_TYPE.BYTE4;
int charIdx = offset;
int intIdx = 0;
final int charLimit = offset + length;
while(charIdx < charLimit) {
scratchIntsRef.grow(intIdx+1);
final int utf32 = Character.codePointAt(s, charIdx);
scratchIntsRef.ints[intIdx] = utf32;
charIdx += Character.charCount(utf32);
intIdx++;
}
scratchIntsRef.length = intIdx;
add(scratchIntsRef, output);
}
/** Sugar: adds the UTF32 chars from CharSequence. FST
* must be FST.INPUT_TYPE.BYTE4! */
public void add(CharSequence s, T output) throws IOException {
assert fst.getInputType() == FST.INPUT_TYPE.BYTE4;
int charIdx = 0;
int intIdx = 0;
final int charLimit = s.length();
while(charIdx < charLimit) {
scratchIntsRef.grow(intIdx+1);
final int utf32 = Character.codePointAt(s, charIdx);
scratchIntsRef.ints[intIdx] = utf32;
charIdx += Character.charCount(utf32);
intIdx++;
}
scratchIntsRef.length = intIdx;
add(scratchIntsRef, output);
}
public void add(IntsRef input, T output) throws IOException {
//System.out.println("\nADD: " + input.utf8ToString());
assert lastInput.length == 0 || input.compareTo(lastInput) > 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input;
assert validOutput(output);
//System.out.println("\nadd: " + input);
if (input.length == 0) {
// empty input: only allowed as first input. we have
// to special case this because the packed FST
// format cannot represent the empty input since
// 'finalness' is stored on the incoming arc, not on
// the node
frontier[0].inputCount++;
fst.setEmptyOutput(output);
return;
}
// compare shared prefix length
int pos1 = 0;
int pos2 = input.offset;
final int pos1Stop = Math.min(lastInput.length, input.length);
while(true) {
//System.out.println(" incr " + pos1);
frontier[pos1].inputCount++;
if (pos1 >= pos1Stop || lastInput.ints[pos1] != input.ints[pos2]) {
break;
}
pos1++;
pos2++;
}
final int prefixLenPlus1 = pos1+1;
if (frontier.length < input.length+1) {
final UnCompiledNode<T>[] next = ArrayUtil.grow(frontier, input.length+1);
for(int idx=frontier.length;idx<next.length;idx++) {
next[idx] = new UnCompiledNode<T>(this);
}
frontier = next;
}
// minimize/compile states from previous input's
// orphan'd suffix
compilePrevTail(prefixLenPlus1);
// init tail states for current input
for(int idx=prefixLenPlus1;idx<=input.length;idx++) {
frontier[idx-1].addArc(input.ints[input.offset + idx - 1],
frontier[idx]);
//System.out.println(" incr tail " + idx);
frontier[idx].inputCount++;
}
final UnCompiledNode<T> lastNode = frontier[input.length];
lastNode.isFinal = true;
lastNode.output = NO_OUTPUT;
// push conflicting outputs forward, only as far as
// needed
for(int idx=1;idx<prefixLenPlus1;idx++) {
final UnCompiledNode<T> node = frontier[idx];
final UnCompiledNode<T> parentNode = frontier[idx-1];
final T lastOutput = parentNode.getLastOutput(input.ints[input.offset + idx - 1]);
assert validOutput(lastOutput);
final T commonOutputPrefix;
final T wordSuffix;
if (lastOutput != NO_OUTPUT) {
commonOutputPrefix = fst.outputs.common(output, lastOutput);
assert validOutput(commonOutputPrefix);
wordSuffix = fst.outputs.subtract(lastOutput, commonOutputPrefix);
assert validOutput(wordSuffix);
parentNode.setLastOutput(input.ints[input.offset + idx - 1], commonOutputPrefix);
node.prependOutput(wordSuffix);
} else {
commonOutputPrefix = wordSuffix = NO_OUTPUT;
}
output = fst.outputs.subtract(output, commonOutputPrefix);
assert validOutput(output);
}
// push remaining output:
frontier[prefixLenPlus1-1].setLastOutput(input.ints[input.offset + prefixLenPlus1-1], output);
// save last input
lastInput.copy(input);
//System.out.println(" count[0]=" + frontier[0].inputCount);
}
private boolean validOutput(T output) {
return output == NO_OUTPUT || !output.equals(NO_OUTPUT);
}
/** Returns final FST. NOTE: this will return null if
* nothing is accepted by the FST. */
public FST<T> finish() throws IOException {
// minimize nodes in the last word's suffix
compilePrevTail(1);
//System.out.println("finish: inputCount=" + frontier[0].inputCount);
if (frontier[0].inputCount < minSuffixCount1 || frontier[0].inputCount < minSuffixCount2 || frontier[0].numArcs == 0) {
if (fst.getEmptyOutput() == null) {
return null;
} else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) {
// empty string got pruned
return null;
} else {
fst.finish(compileNode(frontier[0]).address);
//System.out.println("compile addr = " + fst.getStartNode());
return fst;
}
} else {
if (minSuffixCount2 != 0) {
compileAllTargets(frontier[0]);
}
//System.out.println("NOW: " + frontier[0].numArcs);
fst.finish(compileNode(frontier[0]).address);
}
return fst;
}
private void compileAllTargets(UnCompiledNode<T> node) throws IOException {
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
final Arc<T> arc = node.arcs[arcIdx];
if (!arc.target.isCompiled()) {
// not yet compiled
@SuppressWarnings("unchecked") final UnCompiledNode<T> n = (UnCompiledNode<T>) arc.target;
arc.target = compileNode(n);
}
}
}
static class Arc<T> {
public int label; // really an "unsigned" byte
public Node target;
public boolean isFinal;
public T output;
public T nextFinalOutput;
}
// NOTE: not many instances of Node or CompiledNode are in
// memory while the FST is being built; it's only the
// current "frontier":
static interface Node {
boolean isCompiled();
}
static final class CompiledNode implements Node {
int address;
public boolean isCompiled() {
return true;
}
}
static final class UnCompiledNode<T> implements Node {
final Builder<T> owner;
int numArcs;
Arc<T>[] arcs;
T output;
boolean isFinal;
int inputCount;
@SuppressWarnings("unchecked")
public UnCompiledNode(Builder<T> owner) {
this.owner = owner;
arcs = (Arc<T>[]) new Arc[1];
arcs[0] = new Arc<T>();
output = owner.NO_OUTPUT;
}
public boolean isCompiled() {
return false;
}
public void clear() {
numArcs = 0;
isFinal = false;
output = owner.NO_OUTPUT;
inputCount = 0;
}
public T getLastOutput(int labelToMatch) {
assert numArcs > 0;
assert arcs[numArcs-1].label == labelToMatch;
return arcs[numArcs-1].output;
}
public void addArc(int label, Node target) {
assert label >= 0;
assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs;
if (numArcs == arcs.length) {
final Arc<T>[] newArcs = ArrayUtil.grow(arcs);
for(int arcIdx=numArcs;arcIdx<newArcs.length;arcIdx++) {
newArcs[arcIdx] = new Arc<T>();
}
arcs = newArcs;
}
final Arc<T> arc = arcs[numArcs++];
arc.label = label;
arc.target = target;
arc.output = arc.nextFinalOutput = owner.NO_OUTPUT;
arc.isFinal = false;
}
public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) {
assert numArcs > 0;
final Arc<T> arc = arcs[numArcs-1];
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;
arc.target = target;
//assert target.address != -2;
arc.nextFinalOutput = nextFinalOutput;
arc.isFinal = isFinal;
}
public void deleteLast(int label, Node target) {
assert numArcs > 0;
assert label == arcs[numArcs-1].label;
assert target == arcs[numArcs-1].target;
numArcs--;
}
public void setLastOutput(int labelToMatch, T newOutput) {
assert owner.validOutput(newOutput);
assert numArcs > 0;
final Arc<T> arc = arcs[numArcs-1];
assert arc.label == labelToMatch;
arc.output = newOutput;
}
// pushes an output prefix forward onto all arcs
public void prependOutput(T outputPrefix) {
assert owner.validOutput(outputPrefix);
for(int arcIdx=0;arcIdx<numArcs;arcIdx++) {
arcs[arcIdx].output = owner.fst.outputs.add(outputPrefix, arcs[arcIdx].output);
assert owner.validOutput(arcs[arcIdx].output);
}
if (isFinal) {
output = owner.fst.outputs.add(outputPrefix, output);
assert owner.validOutput(output);
}
}
}
}

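A hedged usage sketch (not in this commit): with both prune counts at 0 and doMinSuffix=true, the Builder yields the minimal (or near-minimal, per the javadoc above) machine for a pre-sorted input set, here using the ByteSequenceOutputs defined below:

    ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
    Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
    builder.add(new BytesRef("cat"), new BytesRef("feline"));   // inputs must arrive in sorted order
    builder.add(new BytesRef("dog"), new BytesRef("canine"));
    builder.add(new BytesRef("dogs"), new BytesRef("canines")); // shares the "dog" prefix
    FST<BytesRef> fst = builder.finish();                       // null if nothing was accepted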
File: ByteSequenceOutputs.java

@@ -0,0 +1,137 @@
package org.apache.lucene.util.automaton.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
/**
* Output is a sequence of bytes, for each input term.
* @lucene.experimental
*/
public final class ByteSequenceOutputs extends Outputs<BytesRef> {
private final static BytesRef NO_OUTPUT = new BytesRef();
private ByteSequenceOutputs() {
}
public static ByteSequenceOutputs getSingleton() {
return new ByteSequenceOutputs();
}
@Override
public BytesRef common(BytesRef output1, BytesRef output2) {
assert output1 != null;
assert output2 != null;
int pos1 = output1.offset;
int pos2 = output2.offset;
int stopAt1 = pos1 + Math.min(output1.length, output2.length);
while(pos1 < stopAt1) {
if (output1.bytes[pos1] != output2.bytes[pos2]) {
break;
}
pos1++;
pos2++;
}
if (pos1 == output1.offset) {
// no common prefix
return NO_OUTPUT;
} else if (pos1 == output1.offset + output1.length) {
// output1 is a prefix of output2
return output1;
} else if (pos2 == output2.offset + output2.length) {
// output2 is a prefix of output1
return output2;
} else {
return new BytesRef(output1.bytes, output1.offset, pos1-output1.offset);
}
}
@Override
public BytesRef subtract(BytesRef output, BytesRef inc) {
assert output != null;
assert inc != null;
if (inc == NO_OUTPUT) {
// no prefix removed
return output;
} else if (inc.length == output.length) {
// entire output removed
return NO_OUTPUT;
} else {
assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length;
assert inc.length > 0;
return new BytesRef(output.bytes, output.offset + inc.length, output.length-inc.length);
}
}
@Override
public BytesRef add(BytesRef prefix, BytesRef output) {
assert prefix != null;
assert output != null;
if (prefix == NO_OUTPUT) {
return output;
} else if (output == NO_OUTPUT) {
return prefix;
} else {
assert prefix.length > 0;
assert output.length > 0;
BytesRef result = new BytesRef(prefix.length + output.length);
System.arraycopy(prefix.bytes, prefix.offset, result.bytes, 0, prefix.length);
System.arraycopy(output.bytes, output.offset, result.bytes, prefix.length, output.length);
result.length = prefix.length + output.length;
return result;
}
}
@Override
public void write(BytesRef prefix, DataOutput out) throws IOException {
assert prefix != null;
out.writeVInt(prefix.length);
out.writeBytes(prefix.bytes, prefix.offset, prefix.length);
}
@Override
public BytesRef read(DataInput in) throws IOException {
final int len = in.readVInt();
if (len == 0) {
return NO_OUTPUT;
} else {
final BytesRef output = new BytesRef(len);
in.readBytes(output.bytes, 0, len);
output.length = len;
return output;
}
}
@Override
public BytesRef getNoOutput() {
return NO_OUTPUT;
}
@Override
public String outputToString(BytesRef output) {
return output.utf8ToString();
}
}

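common/subtract/add form the small algebra the Builder relies on when pushing outputs toward the root: for any outputs a and b, add(common(a,b), subtract(a, common(a,b))) reconstructs a. A worked example (values invented):

    ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
    BytesRef a = new BytesRef("foobar");
    BytesRef b = new BytesRef("foobaz");
    BytesRef prefix = outputs.common(a, b);        // "fooba"
    BytesRef suffix = outputs.subtract(a, prefix); // "r"
    assert outputs.add(prefix, suffix).utf8ToString().equals("foobar");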
File: BytesRefFSTEnum.java

@@ -0,0 +1,304 @@
package org.apache.lucene.util.automaton.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
/** Can next() and advance() through the terms in an FST
* @lucene.experimental
*/
public class BytesRefFSTEnum<T> {
private final FST<T> fst;
private BytesRef current = new BytesRef(10);
@SuppressWarnings("unchecked") private FST.Arc<T>[] arcs = (FST.Arc<T>[]) new FST.Arc[10];
// outputs are cumulative
@SuppressWarnings("unchecked") private T[] output = (T[]) new Object[10];
private boolean lastFinal;
private boolean didEmpty;
private final T NO_OUTPUT;
private final InputOutput<T> result = new InputOutput<T>();
public static class InputOutput<T> {
public BytesRef input;
public T output;
}
public BytesRefFSTEnum(FST<T> fst) {
this.fst = fst;
result.input = current;
NO_OUTPUT = fst.outputs.getNoOutput();
}
public void reset() {
lastFinal = false;
didEmpty = false;
current.length = 0;
result.output = NO_OUTPUT;
}
/** NOTE: target must be >= where we are already
* positioned */
public InputOutput<T> advance(BytesRef target) throws IOException {
assert target.compareTo(current) >= 0;
//System.out.println(" advance len=" + target.length + " curlen=" + current.length);
// special case empty string
if (current.length == 0) {
if (target.length == 0) {
final T output = fst.getEmptyOutput();
if (output != null) {
if (!didEmpty) {
current.length = 0;
lastFinal = true;
result.output = output;
didEmpty = true;
}
return result;
} else {
return next();
}
}
if (fst.noNodes()) {
return null;
}
}
// TODO: possibly caller could/should provide common
// prefix length? ie this work may be redundant if
// caller is in fact intersecting against its own
// automaton
// what prefix does target share w/ current
int idx = 0;
while (idx < current.length && idx < target.length) {
if (current.bytes[idx] != target.bytes[target.offset + idx]) {
break;
}
idx++;
}
//System.out.println(" shared " + idx);
FST.Arc<T> arc;
if (current.length == 0) {
// new enum (no seek/next yet)
arc = fst.readFirstArc(fst.getStartNode(), getArc(0));
//System.out.println(" new enum");
} else if (idx < current.length) {
// roll back to shared point
lastFinal = false;
current.length = idx;
arc = arcs[idx];
if (arc.isLast()) {
if (idx == 0) {
return null;
} else {
return next();
}
}
arc = fst.readNextArc(arc);
} else if (idx == target.length) {
// degenerate case -- seek to term we are already on
assert target.equals(current);
return result;
} else {
// current is a full prefix of target
if (lastFinal) {
arc = fst.readFirstArc(arcs[current.length-1].target, getArc(current.length));
} else {
return next();
}
}
lastFinal = false;
assert arc == arcs[current.length];
int targetLabel = target.bytes[target.offset+current.length] & 0xFF;
while(true) {
//System.out.println(" cycle len=" + current.length + " target=" + ((char) targetLabel) + " vs " + ((char) arc.label));
if (arc.label == targetLabel) {
grow();
current.bytes[current.length] = (byte) arc.label;
appendOutput(arc.output);
current.length++;
grow();
if (current.length == target.length) {
result.output = output[current.length-1];
if (arc.isFinal()) {
// target is exact match
if (fst.hasArcs(arc.target)) {
// target is also a proper prefix of other terms
lastFinal = true;
appendFinalOutput(arc.nextFinalOutput);
}
} else {
// target is not a match but is a prefix of
// other terms
current.length--;
push();
}
return result;
} else if (!fst.hasArcs(arc.target)) {
// we only match a prefix of the target
return next();
} else {
targetLabel = target.bytes[target.offset+current.length] & 0xFF;
arc = fst.readFirstArc(arc.target, getArc(current.length));
}
} else if (arc.label > targetLabel) {
// we are now past the target
push();
return result;
} else if (arc.isLast()) {
if (current.length == 0) {
return null;
}
return next();
} else {
arc = fst.readNextArc(getArc(current.length));
}
}
}
public InputOutput<T> current() {
return result;
}
public InputOutput<T> next() throws IOException {
//System.out.println(" enum.next");
if (current.length == 0) {
final T output = fst.getEmptyOutput();
if (output != null) {
if (!didEmpty) {
current.length = 0;
lastFinal = true;
result.output = output;
didEmpty = true;
return result;
} else {
lastFinal = false;
}
}
if (fst.noNodes()) {
return null;
}
fst.readFirstArc(fst.getStartNode(), getArc(0));
push();
} else if (lastFinal) {
lastFinal = false;
assert current.length > 0;
// resume pushing
fst.readFirstArc(arcs[current.length-1].target, getArc(current.length));
push();
} else {
//System.out.println(" pop/push");
pop();
if (current.length == 0) {
// enum done
return null;
} else {
current.length--;
fst.readNextArc(arcs[current.length]);
push();
}
}
return result;
}
private void grow() {
final int l = current.length + 1;
current.grow(l);
arcs = ArrayUtil.grow(arcs, l);
output = ArrayUtil.grow(output, l);
}
private void appendOutput(T addedOutput) {
T newOutput;
if (current.length == 0) {
newOutput = addedOutput;
} else if (addedOutput == NO_OUTPUT) {
output[current.length] = output[current.length-1];
return;
} else {
newOutput = fst.outputs.add(output[current.length-1], addedOutput);
}
output[current.length] = newOutput;
}
private void appendFinalOutput(T addedOutput) {
if (current.length == 0) {
result.output = addedOutput;
} else {
result.output = fst.outputs.add(output[current.length-1], addedOutput);
}
}
private void push() throws IOException {
FST.Arc<T> arc = arcs[current.length];
assert arc != null;
while(true) {
grow();
current.bytes[current.length] = (byte) arc.label;
appendOutput(arc.output);
//System.out.println(" push: append label=" + ((char) arc.label) + " output=" + fst.outputs.outputToString(arc.output));
current.length++;
grow();
if (!fst.hasArcs(arc.target)) {
break;
}
if (arc.isFinal()) {
appendFinalOutput(arc.nextFinalOutput);
lastFinal = true;
return;
}
arc = fst.readFirstArc(arc.target, getArc(current.length));
}
result.output = output[current.length-1];
}
private void pop() {
while (current.length > 0 && arcs[current.length-1].isLast()) {
current.length--;
}
}
private FST.Arc<T> getArc(int idx) {
if (arcs[idx] == null) {
arcs[idx] = new FST.Arc<T>();
}
return arcs[idx];
}
}

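A short sketch of driving the enum over some already-built fst (variable assumed): next() visits every accepted input in sorted order with its cumulative output, while advance() may only move forward, as SimpleTextTermsEnum.seek() above does:

    BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<BytesRef>(fst);
    BytesRefFSTEnum.InputOutput<BytesRef> io;
    while ((io = fstEnum.next()) != null) {
      System.out.println(io.input.utf8ToString() + " -> " + fst.outputs.outputToString(io.output));
    }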
File: FST.java

@@ -0,0 +1,922 @@
package org.apache.lucene.util.automaton.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.IntsRef;
/** Represents an FST using a compact byte[] format.
* <p> The format is similar to what's used by Morfologik
* (http://sourceforge.net/projects/morfologik).
* @lucene.experimental
*/
public class FST<T> {
public static enum INPUT_TYPE {BYTE1, BYTE2, BYTE4};
private final INPUT_TYPE inputType;
private final static int BIT_FINAL_ARC = 1 << 0;
private final static int BIT_LAST_ARC = 1 << 1;
private final static int BIT_TARGET_NEXT = 1 << 2;
private final static int BIT_STOP_NODE = 1 << 3;
private final static int BIT_ARC_HAS_OUTPUT = 1 << 4;
private final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
// Arcs are stored as fixed-size (per entry) array, so
// that we can find an arc using binary search. We do
// this when the number of arcs is >= NUM_ARCS_FIXED_ARRAY:
private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6;
// If the node has >= this number of arcs, the arcs are
// stored as a fixed array. Fixed array consumes more RAM
// but enables binary search on the arcs (instead of
// linear scan) on lookup by arc label:
private final static int NUM_ARCS_FIXED_ARRAY = 10;
private int[] bytesPerArc = new int[0];
// Increment version to change it
private final static String FILE_FORMAT_NAME = "FST";
private final static int VERSION_START = 0;
private final static int VERSION_CURRENT = VERSION_START;
// Never serialized; just used to represent the virtual
// final node w/ no arcs:
private final static int FINAL_END_NODE = -1;
// Never serialized; just used to represent the virtual
// non-final node w/ no arcs:
private final static int NON_FINAL_END_NODE = 0;
// if non-null, this FST accepts the empty string and
// produces this output
private T emptyOutput;
private byte[] emptyOutputBytes;
private byte[] bytes;
int byteUpto = 0;
private int startNode = -1;
public final Outputs<T> outputs;
private int lastFrozenNode;
private final T NO_OUTPUT;
public int nodeCount;
public int arcCount;
public int arcWithOutputCount;
public final static class Arc<T> {
int label; // really an "unsigned" byte
int target;
byte flags;
T output;
T nextFinalOutput;
int nextArc;
// This is non-zero if current arcs are fixed array:
int posArcsStart;
int bytesPerArc;
int arcIdx;
int numArcs;
// Must call this before re-using an Arc instance on a
// new node
public void reset() {
bytesPerArc = 0;
}
public boolean flag(int flag) {
return FST.flag(flags, flag);
}
public boolean isLast() {
return flag(BIT_LAST_ARC);
}
public boolean isFinal() {
return flag(BIT_FINAL_ARC);
}
};
static boolean flag(int flags, int bit) {
return (flags & bit) != 0;
}
private final BytesWriter writer;
// make a new empty FST, for building
public FST(INPUT_TYPE inputType, Outputs<T> outputs) {
this.inputType = inputType;
this.outputs = outputs;
bytes = new byte[128];
NO_OUTPUT = outputs.getNoOutput();
writer = new BytesWriter();
emptyOutput = null;
}
// create an existing FST
public FST(IndexInput in, Outputs<T> outputs) throws IOException {
this.outputs = outputs;
writer = null;
CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_START);
if (in.readByte() == 1) {
// accepts empty string
int numBytes = in.readVInt();
// messy
bytes = new byte[numBytes];
in.readBytes(bytes, 0, numBytes);
emptyOutput = outputs.read(new BytesReader(numBytes-1));
} else {
emptyOutput = null;
}
final byte t = in.readByte();
switch(t) {
case 0:
inputType = INPUT_TYPE.BYTE1;
break;
case 1:
inputType = INPUT_TYPE.BYTE2;
break;
case 2:
inputType = INPUT_TYPE.BYTE4;
break;
default:
throw new IllegalStateException("invalid input type " + t);
}
startNode = in.readVInt();
nodeCount = in.readVInt();
arcCount = in.readVInt();
arcWithOutputCount = in.readVInt();
bytes = new byte[in.readVInt()];
in.readBytes(bytes, 0, bytes.length);
NO_OUTPUT = outputs.getNoOutput();
}
public INPUT_TYPE getInputType() {
return inputType;
}
/** Returns bytes used to represent the FST */
public int sizeInBytes() {
return bytes.length;
}
void finish(int startNode) {
if (this.startNode != -1) {
throw new IllegalStateException("already finished");
}
byte[] finalBytes = new byte[writer.posWrite];
System.arraycopy(bytes, 0, finalBytes, 0, writer.posWrite);
bytes = finalBytes;
this.startNode = startNode;
}
public void setEmptyOutput(T v) throws IOException {
if (emptyOutput != null) {
throw new IllegalStateException("empty output is already set");
}
emptyOutput = v;
// TODO: this is messy -- replace with sillyBytesWriter; maybe make
// bytes private
final int posSave = writer.posWrite;
outputs.write(emptyOutput, writer);
emptyOutputBytes = new byte[writer.posWrite-posSave];
// reverse
final int stopAt = (writer.posWrite - posSave)/2;
int upto = 0;
while(upto < stopAt) {
final byte b = bytes[posSave + upto];
bytes[posSave+upto] = bytes[writer.posWrite-upto-1];
bytes[writer.posWrite-upto-1] = b;
upto++;
}
System.arraycopy(bytes, posSave, emptyOutputBytes, 0, writer.posWrite-posSave);
writer.posWrite = posSave;
}
public void save(IndexOutput out) throws IOException {
if (startNode == -1) {
throw new IllegalStateException("call finish first");
}
CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT);
if (emptyOutput != null) {
out.writeByte((byte) 1);
out.writeVInt(emptyOutputBytes.length);
out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length);
} else {
out.writeByte((byte) 0);
}
final byte t;
if (inputType == INPUT_TYPE.BYTE1) {
t = 0;
} else if (inputType == INPUT_TYPE.BYTE2) {
t = 1;
} else {
t = 2;
}
out.writeByte(t);
out.writeVInt(startNode);
out.writeVInt(nodeCount);
out.writeVInt(arcCount);
out.writeVInt(arcWithOutputCount);
out.writeVInt(bytes.length);
out.writeBytes(bytes, 0, bytes.length);
}
private void writeLabel(int v) throws IOException {
assert v >= 0: "v=" + v;
if (inputType == INPUT_TYPE.BYTE1) {
assert v <= 255: "v=" + v;
writer.writeByte((byte) v);
} else if (inputType == INPUT_TYPE.BYTE2) {
assert v <= 65535: "v=" + v;
writer.writeVInt(v);
} else {
//writeInt(v);
writer.writeVInt(v);
}
}
private int readLabel(DataInput in) throws IOException {
final int v;
if (inputType == INPUT_TYPE.BYTE1) {
v = in.readByte()&0xFF;
} else if (inputType == INPUT_TYPE.BYTE2) {
v = in.readVInt();
} else {
v = in.readVInt();
}
return v;
}
// returns true if the node at this address has any
// outgoing arcs
public boolean hasArcs(int address) {
return address != FINAL_END_NODE && address != NON_FINAL_END_NODE;
}
public int getStartNode() {
if (startNode == -1) {
throw new IllegalStateException("call finish first");
}
return startNode;
}
// returns null if this FST does not accept the empty
// string, else, the output for the empty string
public T getEmptyOutput() {
return emptyOutput;
}
// serializes new node by appending its bytes to the end
// of the current byte[]
int addNode(Builder.UnCompiledNode<T> node) throws IOException {
//System.out.println("FST.addNode pos=" + posWrite + " numArcs=" + node.numArcs);
if (node.numArcs == 0) {
if (node.isFinal) {
return FINAL_END_NODE;
} else {
return NON_FINAL_END_NODE;
}
}
int startAddress = writer.posWrite;
//System.out.println(" startAddr=" + startAddress);
final boolean doFixedArray = node.numArcs >= NUM_ARCS_FIXED_ARRAY;
final int fixedArrayStart;
if (doFixedArray) {
if (bytesPerArc.length < node.numArcs) {
bytesPerArc = new int[ArrayUtil.oversize(node.numArcs, 1)];
}
// write a "false" first arc:
writer.writeByte((byte) BIT_ARCS_AS_FIXED_ARRAY);
writer.writeVInt(node.numArcs);
// placeholder -- we'll come back and write the number
// of bytes per arc here:
writer.writeByte((byte) 0);
fixedArrayStart = writer.posWrite;
//System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart);
} else {
fixedArrayStart = 0;
}
nodeCount++;
arcCount += node.numArcs;
final int lastArc = node.numArcs-1;
int lastArcStart = writer.posWrite;
int maxBytesPerArc = 0;
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
final Builder.Arc<T> arc = node.arcs[arcIdx];
final Builder.CompiledNode target = (Builder.CompiledNode) arc.target;
int flags = 0;
if (arcIdx == lastArc) {
flags += BIT_LAST_ARC;
}
if (lastFrozenNode == target.address && !doFixedArray) {
flags += BIT_TARGET_NEXT;
}
if (arc.isFinal) {
flags += BIT_FINAL_ARC;
if (arc.nextFinalOutput != NO_OUTPUT) {
flags += BIT_ARC_HAS_FINAL_OUTPUT;
}
} else {
assert arc.nextFinalOutput == NO_OUTPUT;
}
boolean targetHasArcs = hasArcs(target.address);
if (!targetHasArcs) {
flags += BIT_STOP_NODE;
}
if (arc.output != NO_OUTPUT) {
flags += BIT_ARC_HAS_OUTPUT;
}
writer.writeByte((byte) flags);
writeLabel(arc.label);
//System.out.println(" write arc: label=" + arc.label + " flags=" + flags);
if (arc.output != NO_OUTPUT) {
outputs.write(arc.output, writer);
arcWithOutputCount++;
}
if (arc.nextFinalOutput != NO_OUTPUT) {
outputs.write(arc.nextFinalOutput, writer);
}
if (targetHasArcs && (doFixedArray || lastFrozenNode != target.address)) {
assert target.address > 0;
writer.writeInt(target.address);
}
// just write the arcs "like normal" on first pass,
// but record how many bytes each one took, and max
// byte size:
if (doFixedArray) {
bytesPerArc[arcIdx] = writer.posWrite - lastArcStart;
lastArcStart = writer.posWrite;
maxBytesPerArc = Math.max(maxBytesPerArc, bytesPerArc[arcIdx]);
//System.out.println(" bytes=" + bytesPerArc[arcIdx]);
}
}
if (doFixedArray) {
assert maxBytesPerArc > 0;
// 2nd pass just "expands" all arcs to take up a fixed
// byte size
final int sizeNeeded = fixedArrayStart + node.numArcs * maxBytesPerArc;
bytes = ArrayUtil.grow(bytes, sizeNeeded);
if (maxBytesPerArc > 255) {
throw new IllegalStateException("max arc size is too large (" + maxBytesPerArc + ")");
}
bytes[fixedArrayStart-1] = (byte) maxBytesPerArc;
// expand the arcs in place, backwards
int srcPos = writer.posWrite;
int destPos = fixedArrayStart + node.numArcs*maxBytesPerArc;
writer.posWrite = destPos;
for(int arcIdx=node.numArcs-1;arcIdx>=0;arcIdx--) {
//System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos);
destPos -= maxBytesPerArc;
srcPos -= bytesPerArc[arcIdx];
if (srcPos != destPos) {
assert destPos > srcPos;
System.arraycopy(bytes, srcPos, bytes, destPos, bytesPerArc[arcIdx]);
}
}
}
// reverse bytes in-place; we do this so that the
// "BIT_TARGET_NEXT" opto can work, ie, it reads the
// node just before the current one
final int endAddress = writer.posWrite;
final int stopAt = (endAddress - startAddress)/2;
int upto = 0;
while (upto < stopAt) {
final byte b = bytes[startAddress+upto];
bytes[startAddress+upto] = bytes[endAddress-upto-1];
bytes[endAddress-upto-1] = b;
upto++;
}
lastFrozenNode = endAddress - 1;
/*
System.out.println(" return node addr=" + (endAddress-1));
for(int i=endAddress-1;i>=startAddress;i--) {
System.out.println(" bytes[" + i + "]=" + bytes[i]);
}
*/
return endAddress-1;
}
public Arc<T> readFirstArc(int address, Arc<T> arc) throws IOException {
//System.out.println("readFirstArc addr=" + address);
//int pos = address;
final BytesReader in = new BytesReader(address);
arc.flags = in.readByte();
if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) {
//System.out.println(" fixedArray");
// this is first arc in a fixed-array
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readByte() & 0xFF;
arc.arcIdx = -1;
arc.posArcsStart = in.pos;
//System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos);
} else {
in.pos++;
arc.bytesPerArc = 0;
}
arc.nextArc = in.pos;
return readNextArc(arc);
}
public Arc<T> readNextArc(Arc<T> arc) throws IOException {
// this is a continuing arc in a fixed array
final BytesReader in;
if (arc.bytesPerArc != 0) {
// arcs are at fixed entries
arc.arcIdx++;
in = new BytesReader(arc.posArcsStart - arc.arcIdx*arc.bytesPerArc);
} else {
// arcs are packed
in = new BytesReader(arc.nextArc);
}
arc.flags = in.readByte();
arc.label = readLabel(in);
if (arc.flag(BIT_ARC_HAS_OUTPUT)) {
arc.output = outputs.read(in);
} else {
arc.output = outputs.getNoOutput();
}
if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
arc.nextFinalOutput = outputs.read(in);
} else {
arc.nextFinalOutput = outputs.getNoOutput();
}
if (arc.flag(BIT_STOP_NODE)) {
arc.target = FINAL_END_NODE;
arc.nextArc = in.pos;
} else if (arc.flag(BIT_TARGET_NEXT)) {
arc.nextArc = in.pos;
if (!arc.flag(BIT_LAST_ARC)) {
if (arc.bytesPerArc == 0) {
// must scan
seekToNextNode(in);
} else {
in.pos = arc.posArcsStart - arc.bytesPerArc * arc.numArcs;
}
}
arc.target = in.pos;
} else {
arc.target = in.readInt();
arc.nextArc = in.pos;
}
return arc;
}
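  // Usage sketch (enumerating every arc leaving a node, using only the
  // methods in this class; see toDot below for the same pattern):
  //   FST.Arc<T> arc = fst.readFirstArc(node, new FST.Arc<T>());
  //   while (true) {
  //     ... use arc.label / arc.output ...
  //     if (arc.isLast()) break;
  //     fst.readNextArc(arc);
  //   }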
public Arc<T> findArc(int address, int labelToMatch, Arc<T> arc) throws IOException {
// TODO: maybe make an explicit thread state that holds
// reusable stuff eg BytesReader:
final BytesReader in = new BytesReader(address);
if ((in.readByte() & BIT_ARCS_AS_FIXED_ARRAY) != 0) {
// Arcs are full array; do binary search:
//System.out.println("findArc: array label=" + labelToMatch);
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readByte() & 0xFF;
arc.posArcsStart = in.pos;
int low = 0;
int high = arc.numArcs-1;
while (low <= high) {
int mid = (low + high) >>> 1;
in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
int midLabel = readLabel(in);
final int cmp = midLabel - labelToMatch;
if (cmp < 0)
low = mid + 1;
else if (cmp > 0)
high = mid - 1;
else {
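          // found: park arcIdx just before the match so readNextArc
          // advances onto the mid'th arc and decodes it fully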
arc.arcIdx = mid-1;
return readNextArc(arc);
}
}
return null;
}
//System.out.println("findArc: scan");
readFirstArc(address, arc);
while(true) {
if (arc.label == labelToMatch) {
return arc;
} else if (arc.isLast()) {
return null;
} else {
readNextArc(arc);
}
}
}
/** Looks up the output for this input, or null if the
* input is not accepted. FST must be
* INPUT_TYPE.BYTE4. */
public T get(IntsRef input) throws IOException {
assert inputType == INPUT_TYPE.BYTE4;
if (input.length == 0) {
return getEmptyOutput();
}
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = new FST.Arc<T>();
int node = getStartNode();
T output = NO_OUTPUT;
for(int i=0;i<input.length;i++) {
if (!hasArcs(node)) {
// hit end of FST before input end
return null;
}
if (findArc(node, input.ints[input.offset + i], arc) != null) {
node = arc.target;
if (arc.output != NO_OUTPUT) {
output = outputs.add(output, arc.output);
}
} else {
return null;
}
}
if (!arc.isFinal()) {
// hit input's end before end node
return null;
}
if (arc.nextFinalOutput != NO_OUTPUT) {
output = outputs.add(output, arc.nextFinalOutput);
}
return output;
}
  /** Logically casts the input to UTF32 ints, then looks up the
   *  output; returns null if the input is not accepted.  FST must
   *  be INPUT_TYPE.BYTE4. */
public T get(char[] input, int offset, int length) throws IOException {
assert inputType == INPUT_TYPE.BYTE4;
if (length == 0) {
return getEmptyOutput();
}
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = new FST.Arc<T>();
int node = getStartNode();
int charIdx = offset;
final int charLimit = offset + length;
T output = NO_OUTPUT;
while(charIdx < charLimit) {
if (!hasArcs(node)) {
// hit end of FST before input end
return null;
}
final int utf32 = Character.codePointAt(input, charIdx);
charIdx += Character.charCount(utf32);
if (findArc(node, utf32, arc) != null) {
node = arc.target;
if (arc.output != NO_OUTPUT) {
output = outputs.add(output, arc.output);
}
} else {
return null;
}
}
if (!arc.isFinal()) {
// hit input's end before end node
return null;
}
if (arc.nextFinalOutput != NO_OUTPUT) {
output = outputs.add(output, arc.nextFinalOutput);
}
return output;
}
  /** Logically casts the input to UTF32 ints, then looks up the
   *  output; returns null if the input is not accepted.  FST must
   *  be INPUT_TYPE.BYTE4. */
public T get(CharSequence input) throws IOException {
assert inputType == INPUT_TYPE.BYTE4;
final int len = input.length();
if (len == 0) {
return getEmptyOutput();
}
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = new FST.Arc<T>();
int node = getStartNode();
int charIdx = 0;
final int charLimit = input.length();
T output = NO_OUTPUT;
while(charIdx < charLimit) {
if (!hasArcs(node)) {
// hit end of FST before input end
return null;
}
final int utf32 = Character.codePointAt(input, charIdx);
charIdx += Character.charCount(utf32);
if (findArc(node, utf32, arc) != null) {
node = arc.target;
if (arc.output != NO_OUTPUT) {
output = outputs.add(output, arc.output);
}
} else {
return null;
}
}
if (!arc.isFinal()) {
// hit input's end before end node
return null;
}
if (arc.nextFinalOutput != NO_OUTPUT) {
output = outputs.add(output, arc.nextFinalOutput);
}
return output;
}
/** Looks up the output for this input, or null if the
* input is not accepted */
public T get(BytesRef input) throws IOException {
assert inputType == INPUT_TYPE.BYTE1;
if (input.length == 0) {
return getEmptyOutput();
}
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = new FST.Arc<T>();
int node = getStartNode();
T output = NO_OUTPUT;
for(int i=0;i<input.length;i++) {
if (!hasArcs(node)) {
// hit end of FST before input end
return null;
}
      // mask to unsigned: BYTE1 labels are stored as 0..255
      if (findArc(node, input.bytes[i+input.offset] & 0xFF, arc) != null) {
node = arc.target;
if (arc.output != NO_OUTPUT) {
output = outputs.add(output, arc.output);
}
} else {
return null;
}
}
if (!arc.isFinal()) {
// hit input's end before end node
return null;
}
if (arc.nextFinalOutput != NO_OUTPUT) {
output = outputs.add(output, arc.nextFinalOutput);
}
return output;
}
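  // Usage sketch, assuming an FST<Long> built with PositiveIntOutputs
  // (term and value are illustrative only, not from this commit):
  //   Long output = fst.get(new BytesRef("cat"));
  //   if (output == null) { /* "cat" is not accepted */ }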
/** Returns true if this FST has no nodes */
public boolean noNodes() {
//System.out.println("isempty startNode=" + startNode);
return startNode == 0;
}
private void seekToNextNode(BytesReader in) throws IOException {
while(true) {
final int flags = in.readByte();
readLabel(in);
if (flag(flags, BIT_ARC_HAS_OUTPUT)) {
outputs.read(in);
}
if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) {
outputs.read(in);
}
if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) {
in.readInt();
}
if (flag(flags, BIT_LAST_ARC)) {
return;
}
}
}
  // NOTE: this consumes a lot of RAM!
  // final arcs are drawn with a flat end (tee, not arrow)
  // arcs using the NEXT optimization are drawn in blue
/*
eg:
PrintStream ps = new PrintStream("out.dot");
fst.toDot(ps);
ps.close();
System.out.println("SAVED out.dot");
then dot -Tpng out.dot > /x/tmp/out.png
*/
public void toDot(PrintStream out) throws IOException {
final List<Integer> queue = new ArrayList<Integer>();
queue.add(startNode);
final Set<Integer> seen = new HashSet<Integer>();
seen.add(startNode);
out.println("digraph FST {");
out.println(" rankdir = LR;");
//out.println(" " + startNode + " [shape=circle label=" + startNode + "];");
out.println(" " + startNode + " [label=\"\" shape=circle];");
out.println(" initial [shape=point color=white label=\"\"];");
if (emptyOutput != null) {
out.println(" initial -> " + startNode + " [arrowhead=tee label=\"(" + outputs.outputToString(emptyOutput) + ")\"];");
} else {
out.println(" initial -> " + startNode);
}
final Arc<T> arc = new Arc<T>();
while(queue.size() != 0) {
Integer node = queue.get(queue.size()-1);
queue.remove(queue.size()-1);
if (node == FINAL_END_NODE || node == NON_FINAL_END_NODE) {
continue;
}
// scan all arcs
readFirstArc(node, arc);
while(true) {
if (!seen.contains(arc.target)) {
//out.println(" " + arc.target + " [label=" + arc.target + "];");
out.println(" " + arc.target + " [label=\"\" shape=circle];");
seen.add(arc.target);
queue.add(arc.target);
}
String outs;
if (arc.output != NO_OUTPUT) {
outs = "/" + outputs.outputToString(arc.output);
} else {
outs = "";
}
if (arc.isFinal() && arc.nextFinalOutput != NO_OUTPUT) {
outs += " (" + outputs.outputToString(arc.nextFinalOutput) + ")";
}
out.print(" " + node + " -> " + arc.target + " [label=\"" + arc.label + outs + "\"");
if (arc.isFinal()) {
out.print(" arrowhead=tee");
}
if (arc.flag(BIT_TARGET_NEXT)) {
out.print(" color=blue");
}
out.println("];");
if (arc.isLast()) {
break;
} else {
readNextArc(arc);
}
}
}
out.println("}");
}
public int getNodeCount() {
// 1+ in order to count the -1 implicit final node
return 1+nodeCount;
}
public int getArcCount() {
return arcCount;
}
public int getArcWithOutputCount() {
return arcWithOutputCount;
}
// Non-static: writes to FST's byte[]
private class BytesWriter extends DataOutput {
int posWrite;
public BytesWriter() {
// pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs
posWrite = 1;
}
@Override
public void writeByte(byte b) {
if (bytes.length == posWrite) {
bytes = ArrayUtil.grow(bytes);
}
assert posWrite < bytes.length: "posWrite=" + posWrite + " bytes.length=" + bytes.length;
bytes[posWrite++] = b;
}
@Override
public void writeBytes(byte[] b, int offset, int length) {
final int size = posWrite + length;
bytes = ArrayUtil.grow(bytes, size);
System.arraycopy(b, offset, bytes, posWrite, length);
posWrite += length;
}
}
// Non-static: reads byte[] from FST
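  // NOTE: reads run backwards (pos is decremented) to match the
  // reversed per-node serialization done in addNode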
private class BytesReader extends DataInput {
int pos;
public BytesReader(int pos) {
this.pos = pos;
}
@Override
public byte readByte() {
return bytes[pos--];
}
@Override
public void readBytes(byte[] b, int offset, int len) {
for(int i=0;i<len;i++) {
b[offset+i] = bytes[pos--];
}
}
}
}

View File

@ -0,0 +1,141 @@
package org.apache.lucene.util.automaton.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.IntsRef;
/**
* Output is a sequence of ints, for each input term.
* @lucene.experimental
*/
public final class IntSequenceOutputs extends Outputs<IntsRef> {
private final static IntsRef NO_OUTPUT = new IntsRef();
private IntSequenceOutputs() {
}
public static IntSequenceOutputs getSingleton() {
return new IntSequenceOutputs();
}
@Override
public IntsRef common(IntsRef output1, IntsRef output2) {
assert output1 != null;
assert output2 != null;
int pos1 = output1.offset;
int pos2 = output2.offset;
int stopAt1 = pos1 + Math.min(output1.length, output2.length);
while(pos1 < stopAt1) {
if (output1.ints[pos1] != output2.ints[pos2]) {
break;
}
pos1++;
pos2++;
}
if (pos1 == output1.offset) {
// no common prefix
return NO_OUTPUT;
} else if (pos1 == output1.offset + output1.length) {
// output1 is a prefix of output2
return output1;
} else if (pos2 == output2.offset + output2.length) {
// output2 is a prefix of output1
return output2;
} else {
return new IntsRef(output1.ints, output1.offset, pos1-output1.offset);
}
}
@Override
public IntsRef subtract(IntsRef output, IntsRef inc) {
assert output != null;
assert inc != null;
if (inc == NO_OUTPUT) {
// no prefix removed
return output;
} else if (inc.length == output.length) {
// entire output removed
return NO_OUTPUT;
} else {
assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length;
assert inc.length > 0;
return new IntsRef(output.ints, output.offset + inc.length, output.length-inc.length);
}
}
@Override
public IntsRef add(IntsRef prefix, IntsRef output) {
assert prefix != null;
assert output != null;
if (prefix == NO_OUTPUT) {
return output;
} else if (output == NO_OUTPUT) {
return prefix;
} else {
assert prefix.length > 0;
assert output.length > 0;
IntsRef result = new IntsRef(prefix.length + output.length);
System.arraycopy(prefix.ints, prefix.offset, result.ints, 0, prefix.length);
System.arraycopy(output.ints, output.offset, result.ints, prefix.length, output.length);
result.length = prefix.length + output.length;
return result;
}
}
@Override
public void write(IntsRef prefix, DataOutput out) throws IOException {
assert prefix != null;
out.writeVInt(prefix.length);
for(int idx=0;idx<prefix.length;idx++) {
out.writeVInt(prefix.ints[prefix.offset+idx]);
}
}
@Override
public IntsRef read(DataInput in) throws IOException {
final int len = in.readVInt();
if (len == 0) {
return NO_OUTPUT;
} else {
final IntsRef output = new IntsRef(len);
for(int idx=0;idx<len;idx++) {
output.ints[idx] = in.readVInt();
}
output.length = len;
return output;
}
}
@Override
public IntsRef getNoOutput() {
return NO_OUTPUT;
}
@Override
public String outputToString(IntsRef output) {
return output.toString();
}
}
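// Example of the output algebra above (values are illustrative only):
//   a = [1,2,3], b = [1,2,7]
//   common(a, b)        -> [1,2]
//   subtract(a, [1,2])  -> [3]
//   add([1,2], [3])     -> [1,2,3]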

View File

@ -0,0 +1,304 @@
package org.apache.lucene.util.automaton.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import java.io.IOException;
/** Can next() and advance() through the terms in an FST
* @lucene.experimental
*/
public class IntsRefFSTEnum<T> {
private final FST<T> fst;
private IntsRef current = new IntsRef(10);
@SuppressWarnings("unchecked") private FST.Arc<T>[] arcs = (FST.Arc<T>[]) new FST.Arc[10];
// outputs are cumulative
@SuppressWarnings("unchecked") private T[] output = (T[]) new Object[10];
private boolean lastFinal;
private boolean didEmpty;
private final T NO_OUTPUT;
private final InputOutput<T> result = new InputOutput<T>();
public static class InputOutput<T> {
public IntsRef input;
public T output;
}
public IntsRefFSTEnum(FST<T> fst) {
this.fst = fst;
result.input = current;
NO_OUTPUT = fst.outputs.getNoOutput();
}
public void reset() {
lastFinal = false;
didEmpty = false;
current.length = 0;
result.output = NO_OUTPUT;
}
/** NOTE: target must be >= where we are already
* positioned */
public InputOutput<T> advance(IntsRef target) throws IOException {
assert target.compareTo(current) >= 0;
//System.out.println(" advance len=" + target.length + " curlen=" + current.length);
// special case empty string
if (current.length == 0) {
if (target.length == 0) {
final T output = fst.getEmptyOutput();
if (output != null) {
if (!didEmpty) {
current.length = 0;
lastFinal = true;
result.output = output;
didEmpty = true;
}
return result;
} else {
return next();
}
}
if (fst.noNodes()) {
return null;
}
}
// TODO: possibly caller could/should provide common
// prefix length? ie this work may be redundant if
// caller is in fact intersecting against its own
// automaton
// what prefix does target share w/ current
int idx = 0;
while (idx < current.length && idx < target.length) {
if (current.ints[idx] != target.ints[target.offset + idx]) {
break;
}
idx++;
}
//System.out.println(" shared " + idx);
FST.Arc<T> arc;
if (current.length == 0) {
// new enum (no seek/next yet)
arc = fst.readFirstArc(fst.getStartNode(), getArc(0));
//System.out.println(" new enum");
} else if (idx < current.length) {
// roll back to shared point
lastFinal = false;
current.length = idx;
arc = arcs[idx];
if (arc.isLast()) {
if (idx == 0) {
return null;
} else {
return next();
}
}
arc = fst.readNextArc(arc);
} else if (idx == target.length) {
// degenerate case -- seek to term we are already on
assert target.equals(current);
return result;
} else {
// current is a full prefix of target
if (lastFinal) {
arc = fst.readFirstArc(arcs[current.length-1].target, getArc(current.length));
} else {
return next();
}
}
lastFinal = false;
assert arc == arcs[current.length];
int targetLabel = target.ints[target.offset+current.length];
while(true) {
//System.out.println(" cycle len=" + current.length + " target=" + ((char) targetLabel) + " vs " + ((char) arc.label));
if (arc.label == targetLabel) {
grow();
current.ints[current.length] = arc.label;
appendOutput(arc.output);
current.length++;
grow();
if (current.length == target.length) {
result.output = output[current.length-1];
if (arc.isFinal()) {
// target is exact match
if (fst.hasArcs(arc.target)) {
// target is also a proper prefix of other terms
lastFinal = true;
appendFinalOutput(arc.nextFinalOutput);
}
} else {
// target is not a match but is a prefix of
// other terms
current.length--;
push();
}
return result;
} else if (!fst.hasArcs(arc.target)) {
// we only match a prefix of the target
return next();
} else {
targetLabel = target.ints[target.offset+current.length];
arc = fst.readFirstArc(arc.target, getArc(current.length));
}
} else if (arc.label > targetLabel) {
// we are now past the target
push();
return result;
} else if (arc.isLast()) {
if (current.length == 0) {
return null;
}
return next();
} else {
arc = fst.readNextArc(getArc(current.length));
}
}
}
public InputOutput<T> current() {
return result;
}
public InputOutput<T> next() throws IOException {
//System.out.println(" enum.next");
if (current.length == 0) {
final T output = fst.getEmptyOutput();
if (output != null) {
if (!didEmpty) {
current.length = 0;
lastFinal = true;
result.output = output;
didEmpty = true;
return result;
} else {
lastFinal = false;
}
}
if (fst.noNodes()) {
return null;
}
fst.readFirstArc(fst.getStartNode(), getArc(0));
push();
} else if (lastFinal) {
lastFinal = false;
assert current.length > 0;
// resume pushing
fst.readFirstArc(arcs[current.length-1].target, getArc(current.length));
push();
} else {
//System.out.println(" pop/push");
pop();
if (current.length == 0) {
// enum done
return null;
} else {
current.length--;
fst.readNextArc(arcs[current.length]);
push();
}
}
return result;
}
private void grow() {
final int l = current.length + 1;
current.grow(l);
arcs = ArrayUtil.grow(arcs, l);
output = ArrayUtil.grow(output, l);
}
private void appendOutput(T addedOutput) {
T newOutput;
if (current.length == 0) {
newOutput = addedOutput;
} else if (addedOutput == NO_OUTPUT) {
output[current.length] = output[current.length-1];
return;
} else {
newOutput = fst.outputs.add(output[current.length-1], addedOutput);
}
output[current.length] = newOutput;
}
private void appendFinalOutput(T addedOutput) {
if (current.length == 0) {
result.output = addedOutput;
} else {
result.output = fst.outputs.add(output[current.length-1], addedOutput);
}
}
private void push() throws IOException {
FST.Arc<T> arc = arcs[current.length];
assert arc != null;
while(true) {
grow();
current.ints[current.length] = arc.label;
appendOutput(arc.output);
//System.out.println(" push: append label=" + ((char) arc.label) + " output=" + fst.outputs.outputToString(arc.output));
current.length++;
grow();
if (!fst.hasArcs(arc.target)) {
break;
}
if (arc.isFinal()) {
appendFinalOutput(arc.nextFinalOutput);
lastFinal = true;
return;
}
arc = fst.readFirstArc(arc.target, getArc(current.length));
}
result.output = output[current.length-1];
}
private void pop() {
while (current.length > 0 && arcs[current.length-1].isLast()) {
current.length--;
}
}
private FST.Arc<T> getArc(int idx) {
if (arcs[idx] == null) {
arcs[idx] = new FST.Arc<T>();
}
return arcs[idx];
}
}
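// Usage sketch (mirrors BytesRefFSTEnum; fst is an already-built FST<T>):
//   IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
//   IntsRefFSTEnum.InputOutput<T> io;
//   while ((io = fstEnum.next()) != null) {
//     ... io.input is the term, io.output its cumulative output ...
//   }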

View File

@ -0,0 +1,94 @@
package org.apache.lucene.util.automaton.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
/**
* Use this if you just want to build an FSA.
*/
public final class NoOutputs extends Outputs<Object> {
final Object NO_OUTPUT = new Object() {
// NodeHash calls hashCode for this output; we fix this
// so we get deterministic hashing.
@Override
public int hashCode() {
return 42;
}
@Override
public boolean equals(Object other) {
return other == this;
}
};
private static final NoOutputs singleton = new NoOutputs();
private NoOutputs() {
}
public static NoOutputs getSingleton() {
return singleton;
}
@Override
public Object common(Object output1, Object output2) {
assert output1 == NO_OUTPUT;
assert output2 == NO_OUTPUT;
return NO_OUTPUT;
}
@Override
public Object subtract(Object output, Object inc) {
assert output == NO_OUTPUT;
assert inc == NO_OUTPUT;
return NO_OUTPUT;
}
@Override
public Object add(Object prefix, Object output) {
assert prefix == NO_OUTPUT: "got " + prefix;
assert output == NO_OUTPUT;
return NO_OUTPUT;
}
@Override
public void write(Object prefix, DataOutput out) {
//assert false;
}
@Override
public Object read(DataInput in) {
//assert false;
//return null;
return NO_OUTPUT;
}
@Override
public Object getNoOutput() {
return NO_OUTPUT;
}
@Override
public String outputToString(Object output) {
return "";
}
}
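// Usage sketch for building a plain acceptor (FSA); the Builder constructor
// shape below is an assumption, not verified against this commit:
//   Outputs<Object> outputs = NoOutputs.getSingleton();
//   Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);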

View File

@ -0,0 +1,174 @@
package org.apache.lucene.util.automaton.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
// Used to dedup states (lookup already-frozen states)
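// Sharing suffixes this way is what makes the resulting FST minimal: a node
// about to be frozen is first looked up here and reused if an identical
// frozen node already exists. Open addressing with quadratic probing.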
final class NodeHash<T> {
private int[] table;
private int count;
private int mask;
private final FST<T> fst;
private final FST.Arc<T> scratchArc = new FST.Arc<T>();
public static int conf;
public NodeHash(FST<T> fst) {
table = new int[16];
mask = 15;
this.fst = fst;
}
private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address) throws IOException {
fst.readFirstArc(address, scratchArc);
if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) {
return false;
}
for(int arcUpto=0;arcUpto<node.numArcs;arcUpto++) {
final Builder.Arc arc = node.arcs[arcUpto];
if (arc.label != scratchArc.label ||
!arc.output.equals(scratchArc.output) ||
((Builder.CompiledNode) arc.target).address != scratchArc.target ||
!arc.nextFinalOutput.equals(scratchArc.nextFinalOutput) ||
arc.isFinal != scratchArc.isFinal()) {
return false;
}
if (scratchArc.isLast()) {
if (arcUpto == node.numArcs-1) {
return true;
} else {
return false;
}
}
fst.readNextArc(scratchArc);
}
return false;
}
  // hash code for an unfrozen node.  This must be identical
  // to the frozen case (below)!!
private int hash(Builder.UnCompiledNode<T> node) {
final int PRIME = 31;
//System.out.println("hash unfrozen");
int h = 0;
// TODO: maybe if number of arcs is high we can safely subsample?
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
final Builder.Arc<T> arc = node.arcs[arcIdx];
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).address + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
h = PRIME * h + arc.label;
h = PRIME * h + ((Builder.CompiledNode) arc.target).address;
h = PRIME * h + arc.output.hashCode();
h = PRIME * h + arc.nextFinalOutput.hashCode();
if (arc.isFinal) {
h += 17;
}
}
//System.out.println(" ret " + (h&Integer.MAX_VALUE));
return h & Integer.MAX_VALUE;
}
// hash code for a frozen node
private int hash(int node) throws IOException {
final int PRIME = 31;
//System.out.println("hash frozen");
int h = 0;
fst.readFirstArc(node, scratchArc);
while(true) {
//System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal());
h = PRIME * h + scratchArc.label;
h = PRIME * h + scratchArc.target;
h = PRIME * h + scratchArc.output.hashCode();
h = PRIME * h + scratchArc.nextFinalOutput.hashCode();
if (scratchArc.isFinal()) {
h += 17;
}
if (scratchArc.isLast()) {
break;
}
fst.readNextArc(scratchArc);
}
//System.out.println(" ret " + (h&Integer.MAX_VALUE));
return h & Integer.MAX_VALUE;
}
public int add(Builder.UnCompiledNode<T> node) throws IOException {
// System.out.println("hash: add count=" + count + " vs " + table.length);
final int h = hash(node);
int h2 = h;
int c = 1;
while(true) {
final int pos = h2 & mask;
final int v = table[pos];
if (v == 0) {
// freeze & add
final int address = fst.addNode(node);
//System.out.println(" now freeze addr=" + address);
assert hash(address) == h : "frozenHash=" + hash(address) + " vs h=" + h;
count++;
table[pos] = address;
if (table.length < 2*count) {
rehash();
}
return address;
} else if (nodesEqual(node, v)) {
// same node is already here
return v;
}
// quadratic probe
h2 = h+(c + c*c)/2;
c++;
conf++;
}
}
// called only by rehash
private void addNew(int address) throws IOException {
final int h = hash(address);
int h2 = h;
int c = 1;
while(true) {
final int pos = h2 & mask;
if (table[pos] == 0) {
table[pos] = address;
break;
}
// quadratic probe
h2 = h + (c + c*c)/2;
c++;
conf++;
}
}
private void rehash() throws IOException {
final int[] oldTable = table;
table = new int[2*table.length];
mask = table.length-1;
for(int idx=0;idx<oldTable.length;idx++) {
final int address = oldTable[idx];
if (address != 0) {
addNew(address);
}
}
}
}

View File

@ -0,0 +1,57 @@
package org.apache.lucene.util.automaton.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
/**
* Represents the outputs for an FST, providing the basic
* algebra needed for the FST.
* @lucene.experimental
*/
public abstract class Outputs<T> {
// TODO: maybe change this API to allow for re-use of the
// output instances -- this is an insane amount of garbage
// (new object per byte/char/int) if eg used during
// analysis
/** Eg common("foo", "foobar") -> "foo" */
public abstract T common(T output1, T output2);
/** Eg subtract("foobar", "foo") -> "bar" */
public abstract T subtract(T output, T inc);
/** Eg add("foo", "bar") -> "foobar" */
public abstract T add(T prefix, T output);
public abstract void write(T output, DataOutput out) throws IOException;
public abstract T read(DataInput in) throws IOException;
/** NOTE: this output is compared with == so you must
* ensure that all methods return the single object if
* it's really no output */
public abstract T getNoOutput();
public abstract String outputToString(T output);
}

View File

@ -0,0 +1,117 @@
package org.apache.lucene.util.automaton.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
/**
* Pairs up two outputs into one.
* @lucene.experimental
*/
public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
private final Pair<A,B> NO_OUTPUT;
private final Outputs<A> outputs1;
private final Outputs<B> outputs2;
public static class Pair<A,B> {
public final A output1;
public final B output2;
public Pair(A output1, B output2) {
this.output1 = output1;
this.output2 = output2;
}
@Override @SuppressWarnings("unchecked")
public boolean equals(Object other) {
if (other == this) {
return true;
} else if (other instanceof Pair) {
Pair pair = (Pair) other;
return output1.equals(pair.output1) && output2.equals(pair.output2);
} else {
return false;
}
}
    @Override
    public int hashCode() {
      return output1.hashCode() + output2.hashCode();
    }
  }
public PairOutputs(Outputs<A> outputs1, Outputs<B> outputs2) {
this.outputs1 = outputs1;
this.outputs2 = outputs2;
NO_OUTPUT = new Pair<A,B>(outputs1.getNoOutput(), outputs2.getNoOutput());
}
public Pair<A,B> get(A output1, B output2) {
if (output1 == outputs1.getNoOutput() && output2 == outputs2.getNoOutput()) {
return NO_OUTPUT;
} else {
return new Pair<A,B>(output1, output2);
}
}
@Override
public Pair<A,B> common(Pair<A,B> pair1, Pair<A,B> pair2) {
return get(outputs1.common(pair1.output1, pair2.output1),
outputs2.common(pair1.output2, pair2.output2));
}
@Override
public Pair<A,B> subtract(Pair<A,B> output, Pair<A,B> inc) {
return get(outputs1.subtract(output.output1, inc.output1),
outputs2.subtract(output.output2, inc.output2));
}
@Override
public Pair<A,B> add(Pair<A,B> prefix, Pair<A,B> output) {
return get(outputs1.add(prefix.output1, output.output1),
outputs2.add(prefix.output2, output.output2));
}
@Override
public void write(Pair<A,B> output, DataOutput writer) throws IOException {
outputs1.write(output.output1, writer);
outputs2.write(output.output2, writer);
}
@Override
public Pair<A,B> read(DataInput in) throws IOException {
A output1 = outputs1.read(in);
B output2 = outputs2.read(in);
return get(output1, output2);
}
@Override
public Pair<A,B> getNoOutput() {
return NO_OUTPUT;
}
@Override
public String outputToString(Pair<A,B> output) {
return "<pair:" + outputs1.outputToString(output.output1) + "," + outputs2.outputToString(output.output2) + ">";
}
}
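// Usage sketch: one FST output carrying two longs per input term
// (values are illustrative only):
//   PositiveIntOutputs ints = PositiveIntOutputs.getSingleton(true);
//   PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(ints, ints);
//   PairOutputs.Pair<Long,Long> pair = outputs.get(ints.get(42), ints.get(7));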

View File

@ -0,0 +1,138 @@
package org.apache.lucene.util.automaton.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
// TODO: make a sharing and non-sharing variant; eg if you
// output docFreq per term the FST will be smaller if you
// don't share since they are not "well shared"
/**
* Output is a long, for each input term. NOTE: the
* resulting FST is not guaranteed to be minimal! See
* {@link Builder}.
* @lucene.experimental
*/
public final class PositiveIntOutputs extends Outputs<Long> {
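  // deliberately a fresh instance (not Long.valueOf(0)) so the sentinel is
  // distinguishable by == from any other boxed zero -- an assumption
  // inferred from the == comparisons below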
private final static Long NO_OUTPUT = new Long(0);
private final boolean doShare;
private final static PositiveIntOutputs singletonShare = new PositiveIntOutputs(true);
private final static PositiveIntOutputs singletonNoShare = new PositiveIntOutputs(false);
private PositiveIntOutputs(boolean doShare) {
this.doShare = doShare;
}
public static PositiveIntOutputs getSingleton(boolean doShare) {
return doShare ? singletonShare : singletonNoShare;
}
public Long get(long v) {
if (v == 0) {
return NO_OUTPUT;
} else {
return Long.valueOf(v);
}
}
@Override
public Long common(Long output1, Long output2) {
assert valid(output1);
assert valid(output2);
if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) {
return NO_OUTPUT;
} else if (doShare) {
assert output1 > 0;
assert output2 > 0;
return Math.min(output1, output2);
} else if (output1.equals(output2)) {
return output1;
} else {
return NO_OUTPUT;
}
}
@Override
public Long subtract(Long output, Long inc) {
assert valid(output);
assert valid(inc);
assert output >= inc;
if (inc == NO_OUTPUT) {
return output;
} else if (output.equals(inc)) {
return NO_OUTPUT;
} else {
return output - inc;
}
}
@Override
public Long add(Long prefix, Long output) {
assert valid(prefix);
assert valid(output);
if (prefix == NO_OUTPUT) {
return output;
} else if (output == NO_OUTPUT) {
return prefix;
} else {
return prefix + output;
}
}
@Override
public void write(Long output, DataOutput out) throws IOException {
assert valid(output);
out.writeVLong(output);
}
@Override
public Long read(DataInput in) throws IOException {
long v = in.readVLong();
if (v == 0) {
return NO_OUTPUT;
} else {
return v;
}
}
private boolean valid(Long o) {
assert o != null;
assert o instanceof Long;
assert o == NO_OUTPUT || o > 0;
return true;
}
@Override
public Long getNoOutput() {
return NO_OUTPUT;
}
@Override
public String outputToString(Long output) {
return output.toString();
}
}
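// Example of the sharing algebra (doShare=true; values illustrative only):
//   common(15, 12)   -> 12   (min: the shareable prefix)
//   subtract(15, 12) -> 3
//   add(12, 3)       -> 15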

View File

@ -0,0 +1,39 @@
is threadlocal.get costly? if so maybe make an FSTReader? it would hold this "relative" pos, and each thread would use it for reading, instead of PosRef
maybe change the Outputs class to "reuse" stuff? eg the new BytesRef in ByteSequenceOutputs...
do i even "need" both non_final_end_state and final_end_state?
hmm -- can I get weights working here?
can FST be used to index all internal substrings, mapping to term?
- maybe put back ability to add multiple outputs per input...?
make this work w/ char...?
- then FSTCharFilter/FSTTokenFilter
- syn filter?
experiment: try reversing terms before compressing -- how much smaller?
maybe separate out a 'writable/growing fst' from a read-only one?
can we somehow [partially] tableize lookups like oal.util.automaton?
make an FST terms index option for codecs...?
make an FSTCharsMap?
need a benchmark testing FST traversal -- just fix the static main to rewind & visit all terms
thread state
when writing FST to disk:
  - sequential writing (would save memory in the codec during indexing). We are now using DataOutput, which could also go directly to disk
  - problem: the size of the BytesRef must be known beforehand
later
- maybe don't require FSTEnum.advance to be forward only?
- should i make a posIntOutputs separate from posLongOutputs?
- mv randomAccpetedWord / run / etc. from test into FST?
- hmm get multi-outputs working again? do we ever need this?

File diff suppressed because it is too large