diff --git a/.hgignore b/.hgignore index 3cb9afd9fae..1e17ec787de 100644 --- a/.hgignore +++ b/.hgignore @@ -1,2 +1,4 @@ syntax: glob */build/* +*.class + diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java index 95ed9554c67..61297284a2a 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java @@ -31,15 +31,16 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.fst.Builder; +import org.apache.lucene.util.automaton.fst.BytesRefFSTEnum; +import org.apache.lucene.util.automaton.fst.FST; +import org.apache.lucene.util.automaton.fst.PositiveIntOutputs; +import org.apache.lucene.util.automaton.fst.PairOutputs; import java.io.IOException; import java.util.Comparator; import java.util.Map; -import java.util.Set; import java.util.HashMap; -import java.util.TreeMap; -import java.util.SortedMap; -import java.util.Iterator; class SimpleTextFieldsReader extends FieldsProducer { @@ -116,73 +117,39 @@ class SimpleTextFieldsReader extends FieldsProducer { private class SimpleTextTermsEnum extends TermsEnum { private final IndexInput in; private final boolean omitTF; - private BytesRef current; private int docFreq; private long docsStart; private boolean ended; - private final TreeMap allTerms; - private Iterator> iter; + private final BytesRefFSTEnum> fstEnum; - public SimpleTextTermsEnum(TreeMap allTerms, boolean omitTF) throws IOException { + public SimpleTextTermsEnum(FST> fst, boolean omitTF) throws IOException { this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); - this.allTerms = allTerms; this.omitTF = omitTF; - iter = 
allTerms.entrySet().iterator(); + fstEnum = new BytesRefFSTEnum>(fst); } public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException { - - final SortedMap tailMap = allTerms.tailMap(text); - if (tailMap.isEmpty()) { - current = null; + fstEnum.reset(); + //System.out.println("seek to text=" + text.utf8ToString()); + final BytesRefFSTEnum.InputOutput> result = fstEnum.advance(text); + if (result == null) { + //System.out.println(" end"); return SeekStatus.END; } else { - current = tailMap.firstKey(); - final TermData td = tailMap.get(current); - docsStart = td.docsStart; - docFreq = td.docFreq; - iter = tailMap.entrySet().iterator(); - assert iter.hasNext(); - iter.next(); - if (current.equals(text)) { + //System.out.println(" got text=" + term.utf8ToString()); + PairOutputs.Pair pair = result.output; + docsStart = pair.output1; + docFreq = pair.output2.intValue(); + + if (result.input.equals(text)) { + //System.out.println(" match docsStart=" + docsStart); return SeekStatus.FOUND; } else { + //System.out.println(" not match docsStart=" + docsStart); return SeekStatus.NOT_FOUND; } } - - /* - if (current != null) { - final int cmp = current.compareTo(text); - if (cmp == 0) { - return SeekStatus.FOUND; - } else if (cmp > 0) { - ended = false; - in.seek(fieldStart); - } - } else { - ended = false; - in.seek(fieldStart); - } - - // Naive!! This just scans... 
would be better to do - // up-front scan to build in-RAM index - BytesRef b; - while((b = next()) != null) { - final int cmp = b.compareTo(text); - if (cmp == 0) { - ended = false; - return SeekStatus.FOUND; - } else if (cmp > 0) { - ended = false; - return SeekStatus.NOT_FOUND; - } - } - current = null; - ended = true; - return SeekStatus.END; - */ } @Override @@ -192,56 +159,20 @@ class SimpleTextFieldsReader extends FieldsProducer { @Override public BytesRef next() throws IOException { assert !ended; - - if (iter.hasNext()) { - Map.Entry ent = iter.next(); - current = ent.getKey(); - TermData td = ent.getValue(); - docFreq = td.docFreq; - docsStart = td.docsStart; - return current; + final BytesRefFSTEnum.InputOutput> result = fstEnum.next(); + if (result != null) { + final PairOutputs.Pair pair = result.output; + docsStart = pair.output1; + docFreq = pair.output2.intValue(); + return result.input; } else { - current = null; return null; } - - /* - readLine(in, scratch); - if (scratch.equals(END) || scratch.startsWith(FIELD)) { - ended = true; - current = null; - return null; - } else { - assert scratch.startsWith(TERM): "got " + scratch.utf8ToString(); - docsStart = in.getFilePointer(); - final int len = scratch.length - TERM.length; - if (len > scratch2.length) { - scratch2.grow(len); - } - System.arraycopy(scratch.bytes, TERM.length, scratch2.bytes, 0, len); - scratch2.length = len; - current = scratch2; - docFreq = 0; - long lineStart = 0; - while(true) { - lineStart = in.getFilePointer(); - readLine(in, scratch); - if (scratch.equals(END) || scratch.startsWith(FIELD) || scratch.startsWith(TERM)) { - break; - } - if (scratch.startsWith(DOC)) { - docFreq++; - } - } - in.seek(lineStart); - return current; - } - */ } @Override public BytesRef term() { - return current; + return fstEnum.current().input; } @Override @@ -512,10 +443,7 @@ class SimpleTextFieldsReader extends FieldsProducer { private final String field; private final long termsStart; private final 
boolean omitTF; - - // NOTE: horribly, horribly RAM consuming, but then - // SimpleText should never be used in production - private final TreeMap allTerms = new TreeMap(); + private FST> fst; private final BytesRef scratch = new BytesRef(10); @@ -527,6 +455,8 @@ class SimpleTextFieldsReader extends FieldsProducer { } private void loadTerms() throws IOException { + PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); + Builder> b = new Builder>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs(posIntOutputs, posIntOutputs)); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRef lastTerm = new BytesRef(10); @@ -536,16 +466,14 @@ class SimpleTextFieldsReader extends FieldsProducer { readLine(in, scratch); if (scratch.equals(END) || scratch.startsWith(FIELD)) { if (lastDocsStart != -1) { - allTerms.put(new BytesRef(lastTerm), - new TermData(lastDocsStart, docFreq)); + b.add(lastTerm, new PairOutputs.Pair(lastDocsStart, Long.valueOf(docFreq))); } break; } else if (scratch.startsWith(DOC)) { docFreq++; } else if (scratch.startsWith(TERM)) { if (lastDocsStart != -1) { - allTerms.put(new BytesRef(lastTerm), - new TermData(lastDocsStart, docFreq)); + b.add(lastTerm, new PairOutputs.Pair(lastDocsStart, Long.valueOf(docFreq))); } lastDocsStart = in.getFilePointer(); final int len = scratch.length - TERM.length; @@ -557,11 +485,23 @@ class SimpleTextFieldsReader extends FieldsProducer { docFreq = 0; } } + fst = b.finish(); + /* + PrintStream ps = new PrintStream("out.dot"); + fst.toDot(ps); + ps.close(); + System.out.println("SAVED out.dot"); + */ + //System.out.println("FST " + fst.sizeInBytes()); } @Override public TermsEnum iterator() throws IOException { - return new SimpleTextTermsEnum(allTerms, omitTF); + if (fst != null) { + return new SimpleTextTermsEnum(fst, omitTF); + } else { + return TermsEnum.EMPTY; + } } @Override diff --git a/lucene/src/java/org/apache/lucene/util/ArrayUtil.java 
b/lucene/src/java/org/apache/lucene/util/ArrayUtil.java index d9dd51e956f..fbf62507bae 100644 --- a/lucene/src/java/org/apache/lucene/util/ArrayUtil.java +++ b/lucene/src/java/org/apache/lucene/util/ArrayUtil.java @@ -19,6 +19,7 @@ package org.apache.lucene.util; import java.util.Collection; import java.util.Comparator; +import java.lang.reflect.Array; /** * Methods for manipulating arrays. @@ -392,7 +393,7 @@ public final class ArrayUtil { } /** - * Returns hash of chars in range start (inclusive) to + * Returns hash of bytes in range start (inclusive) to * end (inclusive) */ public static int hashCode(byte[] array, int start, int end) { @@ -429,6 +430,31 @@ public final class ArrayUtil { return false; } + public static T[] grow(T[] array, int minSize) { + if (array.length < minSize) { + @SuppressWarnings("unchecked") final T[] newArray = + (T[]) Array.newInstance(array.getClass().getComponentType(), oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJ_REF)); + System.arraycopy(array, 0, newArray, 0, array.length); + return newArray; + } else + return array; + } + + public static T[] grow(T[] array) { + return grow(array, 1 + array.length); + } + + public static T[] shrink(T[] array, int targetSize) { + final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_OBJ_REF); + if (newSize != array.length) { + @SuppressWarnings("unchecked") final T[] newArray = + (T[]) Array.newInstance(array.getClass().getComponentType(), newSize); + System.arraycopy(array, 0, newArray, 0, newSize); + return newArray; + } else + return array; + } + // Since Arrays.equals doesn't implement offsets for equals /** * See if two array slices are the same. 
diff --git a/lucene/src/java/org/apache/lucene/util/IntsRef.java b/lucene/src/java/org/apache/lucene/util/IntsRef.java index 33a486b09c0..1f284b5ea51 100644 --- a/lucene/src/java/org/apache/lucene/util/IntsRef.java +++ b/lucene/src/java/org/apache/lucene/util/IntsRef.java @@ -21,7 +21,7 @@ package org.apache.lucene.util; * existing int[]. * * @lucene.internal */ -public final class IntsRef { +public final class IntsRef implements Comparable { public int[] ints; public int offset; @@ -81,6 +81,31 @@ public final class IntsRef { } } + /** Signed int order comparison */ + public int compareTo(IntsRef other) { + if (this == other) return 0; + + final int[] aInts = this.ints; + int aUpto = this.offset; + final int[] bInts = other.ints; + int bUpto = other.offset; + + final int aStop = aUpto + Math.min(this.length, other.length); + + while(aUpto < aStop) { + int aInt = aInts[aUpto++]; + int bInt = bInts[bUpto++]; + if (aInt > bInt) { + return 1; + } else if (aInt < bInt) { + return -1; + } + } + + // One is a prefix of the other, or, they are equal: + return this.length - other.length; + } + public void copy(IntsRef other) { if (ints == null) { ints = new int[other.length]; @@ -97,4 +122,18 @@ public final class IntsRef { ints = ArrayUtil.grow(ints, newLength); } } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append('['); + final int end = offset + length; + for(int i=offset;i offset) { + sb.append(' '); + } + sb.append(Integer.toHexString(ints[i])); + } + sb.append(']'); + return sb.toString(); + } } diff --git a/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java b/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java index 5346f9fcc3a..af1a48f18b1 100644 --- a/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java +++ b/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java @@ -93,13 +93,7 @@ public final class RecyclingByteBlockAllocator extends 
ByteBlockPool.Allocator { @Override public synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) { final int numBlocks = Math.min(maxBufferedBlocks - freeBlocks, end - start); - final int size = freeBlocks + numBlocks; - if (size >= freeByteBlocks.length) { - final byte[][] newBlocks = new byte[ArrayUtil.oversize(size, - RamUsageEstimator.NUM_BYTES_OBJ_REF)][]; - System.arraycopy(freeByteBlocks, 0, newBlocks, 0, freeBlocks); - freeByteBlocks = newBlocks; - } + freeByteBlocks = ArrayUtil.grow(freeByteBlocks, freeBlocks + numBlocks); final int stop = start + numBlocks; for (int i = start; i < stop; i++) { freeByteBlocks[freeBlocks++] = blocks[i]; diff --git a/lucene/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/src/java/org/apache/lucene/util/automaton/Automaton.java index b4a893039a2..990dbf58298 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/Automaton.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/Automaton.java @@ -40,7 +40,6 @@ import java.util.List; import java.util.Set; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.RamUsageEstimator; /** * Finite-state automaton with regular expression operations. 
@@ -281,9 +280,7 @@ public class Automaton implements Serializable, Cloneable { worklist.add(t.to); t.to.number = upto; if (upto == numberedStates.length) { - final State[] newArray = new State[ArrayUtil.oversize(1+upto, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; - System.arraycopy(numberedStates, 0, newArray, 0, upto); - numberedStates = newArray; + numberedStates = ArrayUtil.grow(numberedStates); } numberedStates[upto] = t.to; upto++; diff --git a/lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java b/lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java index 5061ec315d4..965ef6025f6 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java @@ -30,7 +30,6 @@ package org.apache.lucene.util.automaton; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.RamUsageEstimator; import java.util.ArrayList; import java.util.BitSet; @@ -459,9 +458,7 @@ final public class BasicOperations { public void add(Transition t) { if (transitions.length == count) { - Transition[] newArray = new Transition[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; - System.arraycopy(transitions, 0, newArray, 0, count); - transitions = newArray; + transitions = ArrayUtil.grow(transitions); } transitions[count++] = t; } @@ -503,9 +500,7 @@ final public class BasicOperations { private PointTransitions next(int point) { // 1st time we are seeing this point if (count == points.length) { - final PointTransitions[] newArray = new PointTransitions[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; - System.arraycopy(points, 0, newArray, 0, count); - points = newArray; + points = ArrayUtil.grow(points); } PointTransitions points0 = points[count]; if (points0 == null) { @@ -650,9 +645,7 @@ final public class BasicOperations { final SortedIntSet.FrozenIntSet p = statesSet.freeze(q); worklist.add(p); if (newStateUpto == 
newStatesArray.length) { - final State[] newArray = new State[ArrayUtil.oversize(1+newStateUpto, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; - System.arraycopy(newStatesArray, 0, newArray, 0, newStateUpto); - newStatesArray = newArray; + newStatesArray = ArrayUtil.grow(newStatesArray); } newStatesArray[newStateUpto] = q; q.number = newStateUpto; diff --git a/lucene/src/java/org/apache/lucene/util/automaton/State.java b/lucene/src/java/org/apache/lucene/util/automaton/State.java index b4040c9b583..148d946d4d0 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/State.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/State.java @@ -29,7 +29,6 @@ package org.apache.lucene.util.automaton; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.RamUsageEstimator; import java.io.Serializable; import java.util.Collection; @@ -111,9 +110,7 @@ public class State implements Serializable, Comparable { */ public void addTransition(Transition t) { if (numTransitions == transitionsArray.length) { - final Transition[] newArray = new Transition[ArrayUtil.oversize(1+numTransitions, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; - System.arraycopy(transitionsArray, 0, newArray, 0, numTransitions); - transitionsArray = newArray; + transitionsArray = ArrayUtil.grow(transitionsArray); } transitionsArray[numTransitions++] = t; } diff --git a/lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java b/lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java index 9a2bee79917..ee252f25873 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java @@ -17,7 +17,6 @@ package org.apache.lucene.util.automaton; * limitations under the License. 
*/ -import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.ArrayUtil; import java.util.List; @@ -299,9 +298,7 @@ final class UTF32ToUTF8 { private State newUTF8State() { State s = new State(); if (utf8StateCount == utf8States.length) { - final State[] newArray = new State[ArrayUtil.oversize(1+utf8StateCount, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; - System.arraycopy(utf8States, 0, newArray, 0, utf8StateCount); - utf8States = newArray; + utf8States = ArrayUtil.grow(utf8States); } utf8States[utf8StateCount] = s; s.number = utf8StateCount; diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java new file mode 100644 index 00000000000..2445e40f06d --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java @@ -0,0 +1,506 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + +import java.io.IOException; + +/** + * Builds a compact FST (maps an IntsRef term to an arbitrary + * output) from pre-sorted terms with outputs (the FST + * becomes an FSA if you use NoOutputs). The FST is written + * on-the-fly into a compact serialized format byte array, which can + * be saved to / loaded from a Directory or used directly + * for traversal. The FST is always finite (no cycles). + * + *

NOTE: The algorithm is described at + * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698

+ * If your outputs are ByteSequenceOutputs then the final FST + * will be minimal, but if you use PositiveIntOutputs then + * it's only "near minimal". For example, aa/0, aab/1, bbb/2 + * will produce 6 states when a 5 state fst is also + * possible. + * + * The parameterized type T is the output type. See the + * subclasses of {@link Outputs}. + * + * @lucene.experimental + */ + +public class Builder { + private final NodeHash dedupHash; + private final FST fst; + private final T NO_OUTPUT; + + // simplistic pruning: we prune node (and all following + // nodes) if less than this number of terms go through it: + private final int minSuffixCount1; + + // better pruning: we prune node (and all following + // nodes) if the prior node has less than this number of + // terms go through it: + private final int minSuffixCount2; + + private final IntsRef lastInput = new IntsRef(); + + // NOTE: cutting this over to ArrayList instead loses ~6% + // in build performance on 9.8M Wikipedia terms; so we + // left this as an array: + // current "frontier" + private UnCompiledNode[] frontier; + + public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doMinSuffix, Outputs outputs) { + this.minSuffixCount1 = minSuffixCount1; + this.minSuffixCount2 = minSuffixCount2; + fst = new FST(inputType, outputs); + if (doMinSuffix) { + dedupHash = new NodeHash(fst); + } else { + dedupHash = null; + } + NO_OUTPUT = outputs.getNoOutput(); + + @SuppressWarnings("unchecked") final UnCompiledNode[] f = (UnCompiledNode[]) new UnCompiledNode[10]; + frontier = f; + for(int idx=0;idx(this); + } + } + + public int getTotStateCount() { + return fst.nodeCount; + } + + public int getTermCount() { + return frontier[0].inputCount; + } + + public int getMappedStateCount() { + return dedupHash == null ? 
0 : fst.nodeCount; + } + + private CompiledNode compileNode(UnCompiledNode n) throws IOException { + + final int address; + if (dedupHash != null) { + if (n.numArcs == 0) { + address = fst.addNode(n); + } else { + address = dedupHash.add(n); + } + } else { + address = fst.addNode(n); + } + assert address != -2; + + n.clear(); + + final CompiledNode fn = new CompiledNode(); + fn.address = address; + return fn; + } + + private void compilePrevTail(int prefixLenPlus1) throws IOException { + assert prefixLenPlus1 >= 1; + //System.out.println(" compileTail " + prefixLenPlus1); + for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) { + boolean doPrune = false; + boolean doCompile = false; + + final UnCompiledNode node = frontier[idx]; + final UnCompiledNode parent = frontier[idx-1]; + + if (node.inputCount < minSuffixCount1) { + doPrune = true; + doCompile = true; + } else if (idx > prefixLenPlus1) { + // prune if parent's inputCount is less than minSuffixCount2 + if (parent.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && parent.inputCount == 1) { + // my parent, about to be compiled, doesn't make the cut, so + // I'm definitely pruned + + // if minSuffixCount2 is 1, we keep only up + // until the 'distinguished edge', ie we keep only the + // 'divergent' part of the FST. if my parent, about to be + // compiled, has inputCount 1 then we are already past the + // distinguished edge. NOTE: this only works if + // the FST outputs are not "compressible" (simple + // ords ARE compressible). 
+ doPrune = true; + } else { + // my parent, about to be compiled, does make the cut, so + // I'm definitely not pruned + doPrune = false; + } + doCompile = true; + } else { + // if pruning is disabled (count is 0) we can always + // compile current node + doCompile = minSuffixCount2 == 0; + } + + //System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx=" + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune=" + doPrune); + + if (node.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && node.inputCount == 1) { + // drop all arcs + for(int arcIdx=0;arcIdx target = (UnCompiledNode) node.arcs[arcIdx].target; + target.clear(); + } + node.numArcs = 0; + } + + if (doPrune) { + // this node doesn't make it -- deref it + node.clear(); + parent.deleteLast(lastInput.ints[lastInput.offset+idx-1], node); + } else { + + if (minSuffixCount2 != 0) { + compileAllTargets(node); + } + final T nextFinalOutput = node.output; + final boolean isFinal = node.isFinal; + + if (doCompile) { + // this node makes it and we now compile it. first, + // compile any targets that were previously + // undecided: + parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], + compileNode(node), + nextFinalOutput, + isFinal); + } else { + // replaceLast just to install + // nextFinalOutput/isFinal onto the arc + parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], + node, + nextFinalOutput, + isFinal); + // this node will stay in play for now, since we are + // undecided on whether to prune it. 
later, it + // will be either compiled or pruned, so we must + // allocate a new node: + frontier[idx] = new UnCompiledNode(this); + } + } + } + } + + private final IntsRef scratchIntsRef = new IntsRef(10); + + public void add(BytesRef input, T output) throws IOException { + assert fst.getInputType() == FST.INPUT_TYPE.BYTE1; + scratchIntsRef.grow(input.length); + for(int i=0;i 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input; + assert validOutput(output); + + //System.out.println("\nadd: " + input); + if (input.length == 0) { + // empty input: only allowed as first input. we have + // to special case this because the packed FST + // format cannot represent the empty input since + // 'finalness' is stored on the incoming arc, not on + // the node + frontier[0].inputCount++; + fst.setEmptyOutput(output); + return; + } + + // compare shared prefix length + int pos1 = 0; + int pos2 = input.offset; + final int pos1Stop = Math.min(lastInput.length, input.length); + while(true) { + //System.out.println(" incr " + pos1); + frontier[pos1].inputCount++; + if (pos1 >= pos1Stop || lastInput.ints[pos1] != input.ints[pos2]) { + break; + } + pos1++; + pos2++; + } + final int prefixLenPlus1 = pos1+1; + + if (frontier.length < input.length+1) { + final UnCompiledNode[] next = ArrayUtil.grow(frontier, input.length+1); + for(int idx=frontier.length;idx(this); + } + frontier = next; + } + + // minimize/compile states from previous input's + // orphan'd suffix + compilePrevTail(prefixLenPlus1); + + // init tail states for current input + for(int idx=prefixLenPlus1;idx<=input.length;idx++) { + frontier[idx-1].addArc(input.ints[input.offset + idx - 1], + frontier[idx]); + //System.out.println(" incr tail " + idx); + frontier[idx].inputCount++; + } + + final UnCompiledNode lastNode = frontier[input.length]; + lastNode.isFinal = true; + lastNode.output = NO_OUTPUT; + + // push conflicting outputs forward, only as far as + // needed + for(int idx=1;idx node = 
frontier[idx]; + final UnCompiledNode parentNode = frontier[idx-1]; + + final T lastOutput = parentNode.getLastOutput(input.ints[input.offset + idx - 1]); + assert validOutput(lastOutput); + + final T commonOutputPrefix; + final T wordSuffix; + + if (lastOutput != NO_OUTPUT) { + commonOutputPrefix = fst.outputs.common(output, lastOutput); + assert validOutput(commonOutputPrefix); + wordSuffix = fst.outputs.subtract(lastOutput, commonOutputPrefix); + assert validOutput(wordSuffix); + parentNode.setLastOutput(input.ints[input.offset + idx - 1], commonOutputPrefix); + node.prependOutput(wordSuffix); + } else { + commonOutputPrefix = wordSuffix = NO_OUTPUT; + } + + output = fst.outputs.subtract(output, commonOutputPrefix); + assert validOutput(output); + } + + // push remaining output: + frontier[prefixLenPlus1-1].setLastOutput(input.ints[input.offset + prefixLenPlus1-1], output); + + // save last input + lastInput.copy(input); + + //System.out.println(" count[0]=" + frontier[0].inputCount); + } + + private boolean validOutput(T output) { + return output == NO_OUTPUT || !output.equals(NO_OUTPUT); + } + + /** Returns final FST. NOTE: this will return null if + * nothing is accepted by the FST. 
*/ + public FST finish() throws IOException { + + // minimize nodes in the last word's suffix + compilePrevTail(1); + //System.out.println("finish: inputCount=" + frontier[0].inputCount); + if (frontier[0].inputCount < minSuffixCount1 || frontier[0].inputCount < minSuffixCount2 || frontier[0].numArcs == 0) { + if (fst.getEmptyOutput() == null) { + return null; + } else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) { + // empty string got pruned + return null; + } else { + fst.finish(compileNode(frontier[0]).address); + //System.out.println("compile addr = " + fst.getStartNode()); + return fst; + } + } else { + if (minSuffixCount2 != 0) { + compileAllTargets(frontier[0]); + } + //System.out.println("NOW: " + frontier[0].numArcs); + fst.finish(compileNode(frontier[0]).address); + } + + return fst; + } + + private void compileAllTargets(UnCompiledNode node) throws IOException { + for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; + if (!arc.target.isCompiled()) { + // not yet compiled + @SuppressWarnings("unchecked") final UnCompiledNode n = (UnCompiledNode) arc.target; + arc.target = compileNode(n); + } + } + } + + static class Arc { + public int label; // really an "unsigned" byte + public Node target; + public boolean isFinal; + public T output; + public T nextFinalOutput; + } + + // NOTE: not many instances of Node or CompiledNode are in + // memory while the FST is being built; it's only the + // current "frontier": + + static interface Node { + boolean isCompiled(); + } + + static final class CompiledNode implements Node { + int address; + public boolean isCompiled() { + return true; + } + } + + static final class UnCompiledNode implements Node { + final Builder owner; + int numArcs; + Arc[] arcs; + T output; + boolean isFinal; + int inputCount; + + @SuppressWarnings("unchecked") + public UnCompiledNode(Builder owner) { + this.owner = owner; + arcs = (Arc[]) new Arc[1]; + arcs[0] = new Arc(); + output = owner.NO_OUTPUT; + } + + public boolean isCompiled() { + 
return false; + } + + public void clear() { + numArcs = 0; + isFinal = false; + output = owner.NO_OUTPUT; + inputCount = 0; + } + + public T getLastOutput(int labelToMatch) { + assert numArcs > 0; + assert arcs[numArcs-1].label == labelToMatch; + return arcs[numArcs-1].output; + } + + public void addArc(int label, Node target) { + assert label >= 0; + assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs; + if (numArcs == arcs.length) { + final Arc[] newArcs = ArrayUtil.grow(arcs); + for(int arcIdx=numArcs;arcIdx(); + } + arcs = newArcs; + } + final Arc arc = arcs[numArcs++]; + arc.label = label; + arc.target = target; + arc.output = arc.nextFinalOutput = owner.NO_OUTPUT; + arc.isFinal = false; + } + + public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) { + assert numArcs > 0; + final Arc arc = arcs[numArcs-1]; + assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch; + arc.target = target; + //assert target.address != -2; + arc.nextFinalOutput = nextFinalOutput; + arc.isFinal = isFinal; + } + + public void deleteLast(int label, Node target) { + assert numArcs > 0; + assert label == arcs[numArcs-1].label; + assert target == arcs[numArcs-1].target; + numArcs--; + } + + public void setLastOutput(int labelToMatch, T newOutput) { + assert owner.validOutput(newOutput); + assert numArcs > 0; + final Arc arc = arcs[numArcs-1]; + assert arc.label == labelToMatch; + arc.output = newOutput; + } + + // pushes an output prefix forward onto all arcs + public void prependOutput(T outputPrefix) { + assert owner.validOutput(outputPrefix); + + for(int arcIdx=0;arcIdx { + + private final static BytesRef NO_OUTPUT = new BytesRef(); + + private ByteSequenceOutputs() { + } + + public static ByteSequenceOutputs getSingleton() { + return new ByteSequenceOutputs(); + } + + @Override + public BytesRef common(BytesRef output1, BytesRef 
output2) { + assert output1 != null; + assert output2 != null; + + int pos1 = output1.offset; + int pos2 = output2.offset; + int stopAt1 = pos1 + Math.min(output1.length, output2.length); + while(pos1 < stopAt1) { + if (output1.bytes[pos1] != output2.bytes[pos2]) { + break; + } + pos1++; + pos2++; + } + + if (pos1 == output1.offset) { + // no common prefix + return NO_OUTPUT; + } else if (pos1 == output1.offset + output1.length) { + // output1 is a prefix of output2 + return output1; + } else if (pos2 == output2.offset + output2.length) { + // output2 is a prefix of output1 + return output2; + } else { + return new BytesRef(output1.bytes, output1.offset, pos1-output1.offset); + } + } + + @Override + public BytesRef subtract(BytesRef output, BytesRef inc) { + assert output != null; + assert inc != null; + if (inc == NO_OUTPUT) { + // no prefix removed + return output; + } else if (inc.length == output.length) { + // entire output removed + return NO_OUTPUT; + } else { + assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length; + assert inc.length > 0; + return new BytesRef(output.bytes, output.offset + inc.length, output.length-inc.length); + } + } + + @Override + public BytesRef add(BytesRef prefix, BytesRef output) { + assert prefix != null; + assert output != null; + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + assert prefix.length > 0; + assert output.length > 0; + BytesRef result = new BytesRef(prefix.length + output.length); + System.arraycopy(prefix.bytes, prefix.offset, result.bytes, 0, prefix.length); + System.arraycopy(output.bytes, output.offset, result.bytes, prefix.length, output.length); + result.length = prefix.length + output.length; + return result; + } + } + + @Override + public void write(BytesRef prefix, DataOutput out) throws IOException { + assert prefix != null; + out.writeVInt(prefix.length); + out.writeBytes(prefix.bytes, 
prefix.offset, prefix.length); + } + + @Override + public BytesRef read(DataInput in) throws IOException { + final int len = in.readVInt(); + if (len == 0) { + return NO_OUTPUT; + } else { + final BytesRef output = new BytesRef(len); + in.readBytes(output.bytes, 0, len); + output.length = len; + return output; + } + } + + @Override + public BytesRef getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(BytesRef output) { + return output.utf8ToString(); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/BytesRefFSTEnum.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/BytesRefFSTEnum.java new file mode 100644 index 00000000000..150a0e7dcf6 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/BytesRefFSTEnum.java @@ -0,0 +1,304 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +/** Can next() and advance() through the terms in an FST + * @lucene.experimental +*/ + +public class BytesRefFSTEnum { + private final FST fst; + + private BytesRef current = new BytesRef(10); + @SuppressWarnings("unchecked") private FST.Arc[] arcs = (FST.Arc[]) new FST.Arc[10]; + // outputs are cumulative + @SuppressWarnings("unchecked") private T[] output = (T[]) new Object[10]; + + private boolean lastFinal; + private boolean didEmpty; + private final T NO_OUTPUT; + private final InputOutput result = new InputOutput(); + + public static class InputOutput { + public BytesRef input; + public T output; + } + + public BytesRefFSTEnum(FST fst) { + this.fst = fst; + result.input = current; + NO_OUTPUT = fst.outputs.getNoOutput(); + } + + public void reset() { + lastFinal = false; + didEmpty = false; + current.length = 0; + result.output = NO_OUTPUT; + } + + /** NOTE: target must be >= where we are already + * positioned */ + public InputOutput advance(BytesRef target) throws IOException { + + assert target.compareTo(current) >= 0; + + //System.out.println(" advance len=" + target.length + " curlen=" + current.length); + + // special case empty string + if (current.length == 0) { + if (target.length == 0) { + final T output = fst.getEmptyOutput(); + if (output != null) { + if (!didEmpty) { + current.length = 0; + lastFinal = true; + result.output = output; + didEmpty = true; + } + return result; + } else { + return next(); + } + } + + if (fst.noNodes()) { + return null; + } + } + + // TODO: possibly caller could/should provide common + // prefix length? 
ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + + // what prefix does target share w/ current + int idx = 0; + while (idx < current.length && idx < target.length) { + if (current.bytes[idx] != target.bytes[target.offset + idx]) { + break; + } + idx++; + } + + //System.out.println(" shared " + idx); + + FST.Arc arc; + if (current.length == 0) { + // new enum (no seek/next yet) + arc = fst.readFirstArc(fst.getStartNode(), getArc(0)); + //System.out.println(" new enum"); + } else if (idx < current.length) { + // roll back to shared point + lastFinal = false; + current.length = idx; + arc = arcs[idx]; + if (arc.isLast()) { + if (idx == 0) { + return null; + } else { + return next(); + } + } + arc = fst.readNextArc(arc); + } else if (idx == target.length) { + // degenerate case -- seek to term we are already on + assert target.equals(current); + return result; + } else { + // current is a full prefix of target + if (lastFinal) { + arc = fst.readFirstArc(arcs[current.length-1].target, getArc(current.length)); + } else { + return next(); + } + } + + lastFinal = false; + + assert arc == arcs[current.length]; + int targetLabel = target.bytes[target.offset+current.length] & 0xFF; + + while(true) { + //System.out.println(" cycle len=" + current.length + " target=" + ((char) targetLabel) + " vs " + ((char) arc.label)); + if (arc.label == targetLabel) { + grow(); + current.bytes[current.length] = (byte) arc.label; + appendOutput(arc.output); + current.length++; + grow(); + if (current.length == target.length) { + result.output = output[current.length-1]; + if (arc.isFinal()) { + // target is exact match + if (fst.hasArcs(arc.target)) { + // target is also a proper prefix of other terms + lastFinal = true; + appendFinalOutput(arc.nextFinalOutput); + } + } else { + // target is not a match but is a prefix of + // other terms + current.length--; + push(); + } + return result; + } else if (!fst.hasArcs(arc.target)) { + // we 
only match a prefix of the target + return next(); + } else { + targetLabel = target.bytes[target.offset+current.length] & 0xFF; + arc = fst.readFirstArc(arc.target, getArc(current.length)); + } + } else if (arc.label > targetLabel) { + // we are now past the target + push(); + return result; + } else if (arc.isLast()) { + if (current.length == 0) { + return null; + } + return next(); + } else { + arc = fst.readNextArc(getArc(current.length)); + } + } + } + + public InputOutput current() { + return result; + } + + public InputOutput next() throws IOException { + //System.out.println(" enum.next"); + + if (current.length == 0) { + final T output = fst.getEmptyOutput(); + if (output != null) { + if (!didEmpty) { + current.length = 0; + lastFinal = true; + result.output = output; + didEmpty = true; + return result; + } else { + lastFinal = false; + } + } + if (fst.noNodes()) { + return null; + } + fst.readFirstArc(fst.getStartNode(), getArc(0)); + push(); + } else if (lastFinal) { + lastFinal = false; + assert current.length > 0; + // resume pushing + fst.readFirstArc(arcs[current.length-1].target, getArc(current.length)); + push(); + } else { + //System.out.println(" pop/push"); + pop(); + if (current.length == 0) { + // enum done + return null; + } else { + current.length--; + fst.readNextArc(arcs[current.length]); + push(); + } + } + + return result; + } + + private void grow() { + final int l = current.length + 1; + current.grow(l); + arcs = ArrayUtil.grow(arcs, l); + output = ArrayUtil.grow(output, l); + } + + private void appendOutput(T addedOutput) { + T newOutput; + if (current.length == 0) { + newOutput = addedOutput; + } else if (addedOutput == NO_OUTPUT) { + output[current.length] = output[current.length-1]; + return; + } else { + newOutput = fst.outputs.add(output[current.length-1], addedOutput); + } + output[current.length] = newOutput; + } + + private void appendFinalOutput(T addedOutput) { + if (current.length == 0) { + result.output = addedOutput; + } 
else { + result.output = fst.outputs.add(output[current.length-1], addedOutput); + } + } + + private void push() throws IOException { + + FST.Arc arc = arcs[current.length]; + assert arc != null; + + while(true) { + grow(); + + current.bytes[current.length] = (byte) arc.label; + appendOutput(arc.output); + //System.out.println(" push: append label=" + ((char) arc.label) + " output=" + fst.outputs.outputToString(arc.output)); + current.length++; + grow(); + + if (!fst.hasArcs(arc.target)) { + break; + } + + if (arc.isFinal()) { + appendFinalOutput(arc.nextFinalOutput); + lastFinal = true; + return; + } + + arc = fst.readFirstArc(arc.target, getArc(current.length)); + } + result.output = output[current.length-1]; + } + + private void pop() { + while (current.length > 0 && arcs[current.length-1].isLast()) { + current.length--; + } + } + + private FST.Arc getArc(int idx) { + if (arcs[idx] == null) { + arcs[idx] = new FST.Arc(); + } + return arcs[idx]; + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java new file mode 100644 index 00000000000..8de2e33e747 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java @@ -0,0 +1,922 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.IntsRef; + +/** Represents an FST using a compact byte[] format. + *

The format is similar to what's used by Morfologik + * (http://sourceforge.net/projects/morfologik). + * @lucene.experimental + */ +public class FST { + public static enum INPUT_TYPE {BYTE1, BYTE2, BYTE4}; + private final INPUT_TYPE inputType; + + private final static int BIT_FINAL_ARC = 1 << 0; + private final static int BIT_LAST_ARC = 1 << 1; + private final static int BIT_TARGET_NEXT = 1 << 2; + private final static int BIT_STOP_NODE = 1 << 3; + private final static int BIT_ARC_HAS_OUTPUT = 1 << 4; + private final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; + + // Arcs are stored as fixed-size (per entry) array, so + // that we can find an arc using binary search. We do + // this when number of arcs is > NUM_ARCS_ARRAY: + private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6; + + // If the node has >= this number of arcs, the arcs are + // stored as a fixed array. Fixed array consumes more RAM + // but enables binary search on the arcs (instead of + // linear scan) on lookup by arc label: + private final static int NUM_ARCS_FIXED_ARRAY = 10; + private int[] bytesPerArc = new int[0]; + + // Increment version to change it + private final static String FILE_FORMAT_NAME = "FST"; + private final static int VERSION_START = 0; + private final static int VERSION_CURRENT = VERSION_START; + + // Never serialized; just used to represent the virtual + // final node w/ no arcs: + private final static int FINAL_END_NODE = -1; + + // Never serialized; just used to represent the virtual + // non-final node w/ no arcs: + private final static int NON_FINAL_END_NODE = 0; + + // if non-null, this FST accepts the empty string and + // produces this output + private T emptyOutput; + private byte[] emptyOutputBytes; + + private byte[] bytes; + int byteUpto = 0; + + private int startNode = -1; + + public final Outputs outputs; + + private int lastFrozenNode; + + private final T NO_OUTPUT; + + public int nodeCount; + public int arcCount; + public int arcWithOutputCount; + + public 
final static class Arc { + int label; // really a "unsigned" byte + int target; + byte flags; + T output; + T nextFinalOutput; + int nextArc; + + // This is non-zero if current arcs are fixed array: + int posArcsStart; + int bytesPerArc; + int arcIdx; + int numArcs; + + // Must call this before re-using an Arc instance on a + // new node + public void reset() { + bytesPerArc = 0; + } + + public boolean flag(int flag) { + return FST.flag(flags, flag); + } + + public boolean isLast() { + return flag(BIT_LAST_ARC); + } + + public boolean isFinal() { + return flag(BIT_FINAL_ARC); + } + }; + + static boolean flag(int flags, int bit) { + return (flags & bit) != 0; + } + + private final BytesWriter writer; + + // make a new empty FST, for building + public FST(INPUT_TYPE inputType, Outputs outputs) { + this.inputType = inputType; + this.outputs = outputs; + bytes = new byte[128]; + NO_OUTPUT = outputs.getNoOutput(); + + writer = new BytesWriter(); + + emptyOutput = null; + } + + // create an existing FST + public FST(IndexInput in, Outputs outputs) throws IOException { + this.outputs = outputs; + writer = null; + CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_START); + if (in.readByte() == 1) { + // accepts empty string + int numBytes = in.readVInt(); + // messy + bytes = new byte[numBytes]; + in.readBytes(bytes, 0, numBytes); + emptyOutput = outputs.read(new BytesReader(numBytes-1)); + } else { + emptyOutput = null; + } + final byte t = in.readByte(); + switch(t) { + case 0: + inputType = INPUT_TYPE.BYTE1; + break; + case 1: + inputType = INPUT_TYPE.BYTE2; + break; + case 2: + inputType = INPUT_TYPE.BYTE4; + break; + default: + throw new IllegalStateException("invalid input type " + t); + } + startNode = in.readVInt(); + nodeCount = in.readVInt(); + arcCount = in.readVInt(); + arcWithOutputCount = in.readVInt(); + + bytes = new byte[in.readVInt()]; + in.readBytes(bytes, 0, bytes.length); + NO_OUTPUT = outputs.getNoOutput(); + } + + public INPUT_TYPE 
getInputType() { + return inputType; + } + + /** Returns bytes used to represent the FST */ + public int sizeInBytes() { + return bytes.length; + } + + void finish(int startNode) { + if (this.startNode != -1) { + throw new IllegalStateException("already finished"); + } + byte[] finalBytes = new byte[writer.posWrite]; + System.arraycopy(bytes, 0, finalBytes, 0, writer.posWrite); + bytes = finalBytes; + this.startNode = startNode; + } + + public void setEmptyOutput(T v) throws IOException { + if (emptyOutput != null) { + throw new IllegalStateException("empty output is already set"); + } + emptyOutput = v; + + // TODO: this is messy -- replace with sillyBytesWriter; maybe make + // bytes private + final int posSave = writer.posWrite; + outputs.write(emptyOutput, writer); + emptyOutputBytes = new byte[writer.posWrite-posSave]; + + // reverse + final int stopAt = (writer.posWrite - posSave)/2; + int upto = 0; + while(upto < stopAt) { + final byte b = bytes[posSave + upto]; + bytes[posSave+upto] = bytes[writer.posWrite-upto-1]; + bytes[writer.posWrite-upto-1] = b; + upto++; + } + System.arraycopy(bytes, posSave, emptyOutputBytes, 0, writer.posWrite-posSave); + writer.posWrite = posSave; + } + + public void save(IndexOutput out) throws IOException { + if (startNode == -1) { + throw new IllegalStateException("call finish first"); + } + CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT); + if (emptyOutput != null) { + out.writeByte((byte) 1); + out.writeVInt(emptyOutputBytes.length); + out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length); + } else { + out.writeByte((byte) 0); + } + final byte t; + if (inputType == INPUT_TYPE.BYTE1) { + t = 0; + } else if (inputType == INPUT_TYPE.BYTE2) { + t = 1; + } else { + t = 2; + } + out.writeByte(t); + out.writeVInt(startNode); + out.writeVInt(nodeCount); + out.writeVInt(arcCount); + out.writeVInt(arcWithOutputCount); + out.writeVInt(bytes.length); + out.writeBytes(bytes, 0, bytes.length); + } + + private void 
writeLabel(int v) throws IOException { + assert v >= 0: "v=" + v; + if (inputType == INPUT_TYPE.BYTE1) { + assert v <= 255: "v=" + v; + writer.writeByte((byte) v); + } else if (inputType == INPUT_TYPE.BYTE2) { + assert v <= 65535: "v=" + v; + writer.writeVInt(v); + } else { + //writeInt(v); + writer.writeVInt(v); + } + } + + private int readLabel(DataInput in) throws IOException { + final int v; + if (inputType == INPUT_TYPE.BYTE1) { + v = in.readByte()&0xFF; + } else if (inputType == INPUT_TYPE.BYTE2) { + v = in.readVInt(); + } else { + v = in.readVInt(); + } + return v; + } + + // returns true if the node at this address has any + // outgoing arcs + public boolean hasArcs(int address) { + return address != FINAL_END_NODE && address != NON_FINAL_END_NODE; + } + + public int getStartNode() { + if (startNode == -1) { + throw new IllegalStateException("call finish first"); + } + return startNode; + } + + // returns null if this FST does not accept the empty + // string, else, the output for the empty string + public T getEmptyOutput() { + return emptyOutput; + } + + // serializes new node by appending its bytes to the end + // of the current byte[] + int addNode(Builder.UnCompiledNode node) throws IOException { + //System.out.println("FST.addNode pos=" + posWrite + " numArcs=" + node.numArcs); + if (node.numArcs == 0) { + if (node.isFinal) { + return FINAL_END_NODE; + } else { + return NON_FINAL_END_NODE; + } + } + + int startAddress = writer.posWrite; + //System.out.println(" startAddr=" + startAddress); + + final boolean doFixedArray = node.numArcs >= NUM_ARCS_FIXED_ARRAY; + final int fixedArrayStart; + if (doFixedArray) { + if (bytesPerArc.length < node.numArcs) { + bytesPerArc = new int[ArrayUtil.oversize(node.numArcs, 1)]; + } + // write a "false" first arc: + writer.writeByte((byte) BIT_ARCS_AS_FIXED_ARRAY); + writer.writeVInt(node.numArcs); + // placeholder -- we'll come back and write the number + // of bytes per arc here: + writer.writeByte((byte) 0); + 
fixedArrayStart = writer.posWrite; + //System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart); + } else { + fixedArrayStart = 0; + } + + nodeCount++; + arcCount += node.numArcs; + + final int lastArc = node.numArcs-1; + + int lastArcStart = writer.posWrite; + int maxBytesPerArc = 0; + for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; + final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; + int flags = 0; + + if (arcIdx == lastArc) { + flags += BIT_LAST_ARC; + } + + if (lastFrozenNode == target.address && !doFixedArray) { + flags += BIT_TARGET_NEXT; + } + + if (arc.isFinal) { + flags += BIT_FINAL_ARC; + if (arc.nextFinalOutput != NO_OUTPUT) { + flags += BIT_ARC_HAS_FINAL_OUTPUT; + } + } else { + assert arc.nextFinalOutput == NO_OUTPUT; + } + + boolean targetHasArcs = hasArcs(target.address); + + if (!targetHasArcs) { + flags += BIT_STOP_NODE; + } + + if (arc.output != NO_OUTPUT) { + flags += BIT_ARC_HAS_OUTPUT; + } + + writer.writeByte((byte) flags); + writeLabel(arc.label); + + //System.out.println(" write arc: label=" + arc.label + " flags=" + flags); + + if (arc.output != NO_OUTPUT) { + outputs.write(arc.output, writer); + arcWithOutputCount++; + } + if (arc.nextFinalOutput != NO_OUTPUT) { + outputs.write(arc.nextFinalOutput, writer); + } + + if (targetHasArcs && (doFixedArray || lastFrozenNode != target.address)) { + assert target.address > 0; + writer.writeInt(target.address); + } + + // just write the arcs "like normal" on first pass, + // but record how many bytes each one took, and max + // byte size: + if (doFixedArray) { + bytesPerArc[arcIdx] = writer.posWrite - lastArcStart; + lastArcStart = writer.posWrite; + maxBytesPerArc = Math.max(maxBytesPerArc, bytesPerArc[arcIdx]); + //System.out.println(" bytes=" + bytesPerArc[arcIdx]); + } + } + + if (doFixedArray) { + assert maxBytesPerArc > 0; + // 2nd pass just "expands" all arcs to take up a fixed + // byte size + final int sizeNeeded = fixedArrayStart + node.numArcs * 
maxBytesPerArc; + bytes = ArrayUtil.grow(bytes, sizeNeeded); + if (maxBytesPerArc > 255) { + throw new IllegalStateException("max arc size is too large (" + maxBytesPerArc + ")"); + } + bytes[fixedArrayStart-1] = (byte) maxBytesPerArc; + + // expand the arcs in place, backwards + int srcPos = writer.posWrite; + int destPos = fixedArrayStart + node.numArcs*maxBytesPerArc; + writer.posWrite = destPos; + for(int arcIdx=node.numArcs-1;arcIdx>=0;arcIdx--) { + //System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos); + destPos -= maxBytesPerArc; + srcPos -= bytesPerArc[arcIdx]; + if (srcPos != destPos) { + assert destPos > srcPos; + System.arraycopy(bytes, srcPos, bytes, destPos, bytesPerArc[arcIdx]); + } + } + } + + // reverse bytes in-place; we do this so that the + // "BIT_TARGET_NEXT" opto can work, ie, it reads the + // node just before the current one + final int endAddress = writer.posWrite; + final int stopAt = (endAddress - startAddress)/2; + int upto = 0; + while (upto < stopAt) { + final byte b = bytes[startAddress+upto]; + bytes[startAddress+upto] = bytes[endAddress-upto-1]; + bytes[endAddress-upto-1] = b; + upto++; + } + + lastFrozenNode = endAddress - 1; + /* + System.out.println(" return node addr=" + (endAddress-1)); + for(int i=endAddress-1;i>=startAddress;i--) { + System.out.println(" bytes[" + i + "]=" + bytes[i]); + } + */ + + return endAddress-1; + } + + public Arc readFirstArc(int address, Arc arc) throws IOException { + //System.out.println("readFirstArc addr=" + address); + //int pos = address; + final BytesReader in = new BytesReader(address); + + arc.flags = in.readByte(); + + if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) { + //System.out.println(" fixedArray"); + // this is first arc in a fixed-array + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readByte() & 0xFF; + arc.arcIdx = -1; + arc.posArcsStart = in.pos; + //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " 
arcsStart=" + pos); + } else { + in.pos++; + arc.bytesPerArc = 0; + } + arc.nextArc = in.pos; + return readNextArc(arc); + } + + public Arc readNextArc(Arc arc) throws IOException { + // this is a continuing arc in a fixed array + final BytesReader in; + if (arc.bytesPerArc != 0) { + // arcs are at fixed entries + arc.arcIdx++; + in = new BytesReader(arc.posArcsStart - arc.arcIdx*arc.bytesPerArc); + } else { + // arcs are packed + in = new BytesReader(arc.nextArc); + } + arc.flags = in.readByte(); + arc.label = readLabel(in); + + if (arc.flag(BIT_ARC_HAS_OUTPUT)) { + arc.output = outputs.read(in); + } else { + arc.output = outputs.getNoOutput(); + } + + if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + arc.nextFinalOutput = outputs.read(in); + } else { + arc.nextFinalOutput = outputs.getNoOutput(); + } + + if (arc.flag(BIT_STOP_NODE)) { + arc.target = FINAL_END_NODE; + arc.nextArc = in.pos; + } else if (arc.flag(BIT_TARGET_NEXT)) { + arc.nextArc = in.pos; + if (!arc.flag(BIT_LAST_ARC)) { + if (arc.bytesPerArc == 0) { + // must scan + seekToNextNode(in); + } else { + in.pos = arc.posArcsStart - arc.bytesPerArc * arc.numArcs; + } + } + arc.target = in.pos; + } else { + arc.target = in.readInt(); + arc.nextArc = in.pos; + } + + return arc; + } + + public Arc findArc(int address, int labelToMatch, Arc arc) throws IOException { + // TODO: maybe make an explicit thread state that holds + // reusable stuff eg BytesReader: + final BytesReader in = new BytesReader(address); + + if ((in.readByte() & BIT_ARCS_AS_FIXED_ARRAY) != 0) { + // Arcs are full array; do binary search: + //System.out.println("findArc: array label=" + labelToMatch); + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readByte() & 0xFF; + arc.posArcsStart = in.pos; + int low = 0; + int high = arc.numArcs-1; + while (low <= high) { + int mid = (low + high) >>> 1; + in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; + int midLabel = readLabel(in); + final int cmp = midLabel - labelToMatch; + if (cmp < 0) + 
low = mid + 1; + else if (cmp > 0) + high = mid - 1; + else { + arc.arcIdx = mid-1; + return readNextArc(arc); + } + } + + return null; + } + //System.out.println("findArc: scan"); + + readFirstArc(address, arc); + + while(true) { + if (arc.label == labelToMatch) { + return arc; + } else if (arc.isLast()) { + return null; + } else { + readNextArc(arc); + } + } + } + + /** Looks up the output for this input, or null if the + * input is not accepted. FST must be + * INPUT_TYPE.BYTE4. */ + public T get(IntsRef input) throws IOException { + assert inputType == INPUT_TYPE.BYTE4; + + if (input.length == 0) { + return getEmptyOutput(); + } + + // TODO: would be nice not to alloc this on every lookup + final FST.Arc arc = new FST.Arc(); + int node = getStartNode(); + T output = NO_OUTPUT; + for(int i=0;i arc = new FST.Arc(); + int node = getStartNode(); + int charIdx = offset; + final int charLimit = offset + length; + T output = NO_OUTPUT; + while(charIdx < charLimit) { + if (!hasArcs(node)) { + // hit end of FST before input end + return null; + } + + final int utf32 = Character.codePointAt(input, charIdx); + charIdx += Character.charCount(utf32); + + if (findArc(node, utf32, arc) != null) { + node = arc.target; + if (arc.output != NO_OUTPUT) { + output = outputs.add(output, arc.output); + } + } else { + return null; + } + } + + if (!arc.isFinal()) { + // hit input's end before end node + return null; + } + + if (arc.nextFinalOutput != NO_OUTPUT) { + output = outputs.add(output, arc.nextFinalOutput); + } + + return output; + } + + + /** Logically casts input to UTF32 ints then looks up the output + * or null if the input is not accepted. FST must be + * INPUT_TYPE.BYTE4. 
*/ + public T get(CharSequence input) throws IOException { + assert inputType == INPUT_TYPE.BYTE4; + + final int len = input.length(); + if (len == 0) { + return getEmptyOutput(); + } + + // TODO: would be nice not to alloc this on every lookup + final FST.Arc arc = new FST.Arc(); + int node = getStartNode(); + int charIdx = 0; + final int charLimit = input.length(); + T output = NO_OUTPUT; + while(charIdx < charLimit) { + if (!hasArcs(node)) { + // hit end of FST before input end + return null; + } + + final int utf32 = Character.codePointAt(input, charIdx); + charIdx += Character.charCount(utf32); + + if (findArc(node, utf32, arc) != null) { + node = arc.target; + if (arc.output != NO_OUTPUT) { + output = outputs.add(output, arc.output); + } + } else { + return null; + } + } + + if (!arc.isFinal()) { + // hit input's end before end node + return null; + } + + if (arc.nextFinalOutput != NO_OUTPUT) { + output = outputs.add(output, arc.nextFinalOutput); + } + + return output; + } + + /** Looks up the output for this input, or null if the + * input is not accepted */ + public T get(BytesRef input) throws IOException { + assert inputType == INPUT_TYPE.BYTE1; + + if (input.length == 0) { + return getEmptyOutput(); + } + + // TODO: would be nice not to alloc this on every lookup + final FST.Arc arc = new FST.Arc(); + int node = getStartNode(); + T output = NO_OUTPUT; + for(int i=0;i /x/tmp/out.png + */ + public void toDot(PrintStream out) throws IOException { + + final List queue = new ArrayList(); + queue.add(startNode); + + final Set seen = new HashSet(); + seen.add(startNode); + + out.println("digraph FST {"); + out.println(" rankdir = LR;"); + //out.println(" " + startNode + " [shape=circle label=" + startNode + "];"); + out.println(" " + startNode + " [label=\"\" shape=circle];"); + out.println(" initial [shape=point color=white label=\"\"];"); + if (emptyOutput != null) { + out.println(" initial -> " + startNode + " [arrowhead=tee label=\"(" + 
outputs.outputToString(emptyOutput) + ")\"];"); + } else { + out.println(" initial -> " + startNode); + } + + final Arc arc = new Arc(); + + while(queue.size() != 0) { + Integer node = queue.get(queue.size()-1); + queue.remove(queue.size()-1); + + if (node == FINAL_END_NODE || node == NON_FINAL_END_NODE) { + continue; + } + + // scan all arcs + readFirstArc(node, arc); + while(true) { + + if (!seen.contains(arc.target)) { + //out.println(" " + arc.target + " [label=" + arc.target + "];"); + out.println(" " + arc.target + " [label=\"\" shape=circle];"); + seen.add(arc.target); + queue.add(arc.target); + } + String outs; + if (arc.output != NO_OUTPUT) { + outs = "/" + outputs.outputToString(arc.output); + } else { + outs = ""; + } + if (arc.isFinal() && arc.nextFinalOutput != NO_OUTPUT) { + outs += " (" + outputs.outputToString(arc.nextFinalOutput) + ")"; + } + out.print(" " + node + " -> " + arc.target + " [label=\"" + arc.label + outs + "\""); + if (arc.isFinal()) { + out.print(" arrowhead=tee"); + } + if (arc.flag(BIT_TARGET_NEXT)) { + out.print(" color=blue"); + } + out.println("];"); + + if (arc.isLast()) { + break; + } else { + readNextArc(arc); + } + } + } + out.println("}"); + } + + public int getNodeCount() { + // 1+ in order to count the -1 implicit final node + return 1+nodeCount; + } + + public int getArcCount() { + return arcCount; + } + + public int getArcWithOutputCount() { + return arcWithOutputCount; + } + + // Non-static: writes to FST's byte[] + private class BytesWriter extends DataOutput { + int posWrite; + + public BytesWriter() { + // pad: ensure no node gets address 0 which is reserved to mean + // the stop state w/ no arcs + posWrite = 1; + } + + @Override + public void writeByte(byte b) { + if (bytes.length == posWrite) { + bytes = ArrayUtil.grow(bytes); + } + assert posWrite < bytes.length: "posWrite=" + posWrite + " bytes.length=" + bytes.length; + bytes[posWrite++] = b; + } + + @Override + public void writeBytes(byte[] b, int offset, int 
length) { + final int size = posWrite + length; + bytes = ArrayUtil.grow(bytes, size); + System.arraycopy(b, offset, bytes, posWrite, length); + posWrite += length; + } + } + + // Non-static: reads byte[] from FST + private class BytesReader extends DataInput { + int pos; + + public BytesReader(int pos) { + this.pos = pos; + } + + @Override + public byte readByte() { + return bytes[pos--]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + for(int i=0;i { + + private final static IntsRef NO_OUTPUT = new IntsRef(); + + private IntSequenceOutputs() { + } + + public static IntSequenceOutputs getSingleton() { + return new IntSequenceOutputs(); + } + + @Override + public IntsRef common(IntsRef output1, IntsRef output2) { + assert output1 != null; + assert output2 != null; + + int pos1 = output1.offset; + int pos2 = output2.offset; + int stopAt1 = pos1 + Math.min(output1.length, output2.length); + while(pos1 < stopAt1) { + if (output1.ints[pos1] != output2.ints[pos2]) { + break; + } + pos1++; + pos2++; + } + + if (pos1 == output1.offset) { + // no common prefix + return NO_OUTPUT; + } else if (pos1 == output1.offset + output1.length) { + // output1 is a prefix of output2 + return output1; + } else if (pos2 == output2.offset + output2.length) { + // output2 is a prefix of output1 + return output2; + } else { + return new IntsRef(output1.ints, output1.offset, pos1-output1.offset); + } + } + + @Override + public IntsRef subtract(IntsRef output, IntsRef inc) { + assert output != null; + assert inc != null; + if (inc == NO_OUTPUT) { + // no prefix removed + return output; + } else if (inc.length == output.length) { + // entire output removed + return NO_OUTPUT; + } else { + assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length; + assert inc.length > 0; + return new IntsRef(output.ints, output.offset + inc.length, output.length-inc.length); + } + } + + @Override + public IntsRef add(IntsRef prefix, 
IntsRef output) { + assert prefix != null; + assert output != null; + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + assert prefix.length > 0; + assert output.length > 0; + IntsRef result = new IntsRef(prefix.length + output.length); + System.arraycopy(prefix.ints, prefix.offset, result.ints, 0, prefix.length); + System.arraycopy(output.ints, output.offset, result.ints, prefix.length, output.length); + result.length = prefix.length + output.length; + return result; + } + } + + @Override + public void write(IntsRef prefix, DataOutput out) throws IOException { + assert prefix != null; + out.writeVInt(prefix.length); + for(int idx=0;idx { + private final FST fst; + + private IntsRef current = new IntsRef(10); + @SuppressWarnings("unchecked") private FST.Arc[] arcs = (FST.Arc[]) new FST.Arc[10]; + // outputs are cumulative + @SuppressWarnings("unchecked") private T[] output = (T[]) new Object[10]; + + private boolean lastFinal; + private boolean didEmpty; + private final T NO_OUTPUT; + private final InputOutput result = new InputOutput(); + + public static class InputOutput { + public IntsRef input; + public T output; + } + + public IntsRefFSTEnum(FST fst) { + this.fst = fst; + result.input = current; + NO_OUTPUT = fst.outputs.getNoOutput(); + } + + public void reset() { + lastFinal = false; + didEmpty = false; + current.length = 0; + result.output = NO_OUTPUT; + } + + /** NOTE: target must be >= where we are already + * positioned */ + public InputOutput advance(IntsRef target) throws IOException { + + assert target.compareTo(current) >= 0; + + //System.out.println(" advance len=" + target.length + " curlen=" + current.length); + + // special case empty string + if (current.length == 0) { + if (target.length == 0) { + final T output = fst.getEmptyOutput(); + if (output != null) { + if (!didEmpty) { + current.length = 0; + lastFinal = true; + result.output = output; + didEmpty = true; + } + return result; + 
} else { + return next(); + } + } + + if (fst.noNodes()) { + return null; + } + } + + // TODO: possibly caller could/should provide common + // prefix length? ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + + // what prefix does target share w/ current + int idx = 0; + while (idx < current.length && idx < target.length) { + if (current.ints[idx] != target.ints[target.offset + idx]) { + break; + } + idx++; + } + + //System.out.println(" shared " + idx); + + FST.Arc arc; + if (current.length == 0) { + // new enum (no seek/next yet) + arc = fst.readFirstArc(fst.getStartNode(), getArc(0)); + //System.out.println(" new enum"); + } else if (idx < current.length) { + // roll back to shared point + lastFinal = false; + current.length = idx; + arc = arcs[idx]; + if (arc.isLast()) { + if (idx == 0) { + return null; + } else { + return next(); + } + } + arc = fst.readNextArc(arc); + } else if (idx == target.length) { + // degenerate case -- seek to term we are already on + assert target.equals(current); + return result; + } else { + // current is a full prefix of target + if (lastFinal) { + arc = fst.readFirstArc(arcs[current.length-1].target, getArc(current.length)); + } else { + return next(); + } + } + + lastFinal = false; + + assert arc == arcs[current.length]; + int targetLabel = target.ints[target.offset+current.length]; + + while(true) { + //System.out.println(" cycle len=" + current.length + " target=" + ((char) targetLabel) + " vs " + ((char) arc.label)); + if (arc.label == targetLabel) { + grow(); + current.ints[current.length] = arc.label; + appendOutput(arc.output); + current.length++; + grow(); + if (current.length == target.length) { + result.output = output[current.length-1]; + if (arc.isFinal()) { + // target is exact match + if (fst.hasArcs(arc.target)) { + // target is also a proper prefix of other terms + lastFinal = true; + appendFinalOutput(arc.nextFinalOutput); + } + } else { + // target is not a 
match but is a prefix of + // other terms + current.length--; + push(); + } + return result; + } else if (!fst.hasArcs(arc.target)) { + // we only match a prefix of the target + return next(); + } else { + targetLabel = target.ints[target.offset+current.length]; + arc = fst.readFirstArc(arc.target, getArc(current.length)); + } + } else if (arc.label > targetLabel) { + // we are now past the target + push(); + return result; + } else if (arc.isLast()) { + if (current.length == 0) { + return null; + } + return next(); + } else { + arc = fst.readNextArc(getArc(current.length)); + } + } + } + + public InputOutput current() { + return result; + } + + public InputOutput next() throws IOException { + //System.out.println(" enum.next"); + + if (current.length == 0) { + final T output = fst.getEmptyOutput(); + if (output != null) { + if (!didEmpty) { + current.length = 0; + lastFinal = true; + result.output = output; + didEmpty = true; + return result; + } else { + lastFinal = false; + } + } + if (fst.noNodes()) { + return null; + } + fst.readFirstArc(fst.getStartNode(), getArc(0)); + push(); + } else if (lastFinal) { + lastFinal = false; + assert current.length > 0; + // resume pushing + fst.readFirstArc(arcs[current.length-1].target, getArc(current.length)); + push(); + } else { + //System.out.println(" pop/push"); + pop(); + if (current.length == 0) { + // enum done + return null; + } else { + current.length--; + fst.readNextArc(arcs[current.length]); + push(); + } + } + + return result; + } + + private void grow() { + final int l = current.length + 1; + current.grow(l); + arcs = ArrayUtil.grow(arcs, l); + output = ArrayUtil.grow(output, l); + } + + private void appendOutput(T addedOutput) { + T newOutput; + if (current.length == 0) { + newOutput = addedOutput; + } else if (addedOutput == NO_OUTPUT) { + output[current.length] = output[current.length-1]; + return; + } else { + newOutput = fst.outputs.add(output[current.length-1], addedOutput); + } + output[current.length] 
= newOutput; + } + + private void appendFinalOutput(T addedOutput) { + if (current.length == 0) { + result.output = addedOutput; + } else { + result.output = fst.outputs.add(output[current.length-1], addedOutput); + } + } + + private void push() throws IOException { + + FST.Arc arc = arcs[current.length]; + assert arc != null; + + while(true) { + grow(); + + current.ints[current.length] = arc.label; + appendOutput(arc.output); + //System.out.println(" push: append label=" + ((char) arc.label) + " output=" + fst.outputs.outputToString(arc.output)); + current.length++; + grow(); + + if (!fst.hasArcs(arc.target)) { + break; + } + + if (arc.isFinal()) { + appendFinalOutput(arc.nextFinalOutput); + lastFinal = true; + return; + } + + arc = fst.readFirstArc(arc.target, getArc(current.length)); + } + result.output = output[current.length-1]; + } + + private void pop() { + while (current.length > 0 && arcs[current.length-1].isLast()) { + current.length--; + } + } + + private FST.Arc getArc(int idx) { + if (arcs[idx] == null) { + arcs[idx] = new FST.Arc(); + } + return arcs[idx]; + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/NoOutputs.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/NoOutputs.java new file mode 100644 index 00000000000..edb9167e84c --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/NoOutputs.java @@ -0,0 +1,94 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + +/** + * Use this if you just want to build an FSA. + */ + +public final class NoOutputs extends Outputs { + + final Object NO_OUTPUT = new Object() { + // NodeHash calls hashCode for this output; we fix this + // so we get deterministic hashing. + @Override + public int hashCode() { + return 42; + } + + @Override + public boolean equals(Object other) { + return other == this; + } + }; + + private static final NoOutputs singleton = new NoOutputs(); + + private NoOutputs() { + } + + public static NoOutputs getSingleton() { + return singleton; + } + + @Override + public Object common(Object output1, Object output2) { + assert output1 == NO_OUTPUT; + assert output2 == NO_OUTPUT; + return NO_OUTPUT; + } + + @Override + public Object subtract(Object output, Object inc) { + assert output == NO_OUTPUT; + assert inc == NO_OUTPUT; + return NO_OUTPUT; + } + + @Override + public Object add(Object prefix, Object output) { + assert prefix == NO_OUTPUT: "got " + prefix; + assert output == NO_OUTPUT; + return NO_OUTPUT; + } + + @Override + public void write(Object prefix, DataOutput out) { + //assert false; + } + + @Override + public Object read(DataInput in) { + //assert false; + //return null; + return NO_OUTPUT; + } + + @Override + public Object getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(Object output) { + return ""; + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/NodeHash.java 
b/lucene/src/java/org/apache/lucene/util/automaton/fst/NodeHash.java new file mode 100644 index 00000000000..7c244467669 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/NodeHash.java @@ -0,0 +1,174 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +// Used to dedup states (lookup already-frozen states) +final class NodeHash { + + private int[] table; + private int count; + private int mask; + private final FST fst; + private final FST.Arc scratchArc = new FST.Arc(); + + public static int conf; + + public NodeHash(FST fst) { + table = new int[16]; + mask = 15; + this.fst = fst; + } + + private boolean nodesEqual(Builder.UnCompiledNode node, int address) throws IOException { + fst.readFirstArc(address, scratchArc); + if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) { + return false; + } + for(int arcUpto=0;arcUpto node) { + final int PRIME = 31; + //System.out.println("hash unfrozen"); + int h = 0; + // TODO: maybe if number of arcs is high we can safely subsample? 
+ for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; + //System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).address + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal); + h = PRIME * h + arc.label; + h = PRIME * h + ((Builder.CompiledNode) arc.target).address; + h = PRIME * h + arc.output.hashCode(); + h = PRIME * h + arc.nextFinalOutput.hashCode(); + if (arc.isFinal) { + h += 17; + } + } + //System.out.println(" ret " + (h&Integer.MAX_VALUE)); + return h & Integer.MAX_VALUE; + } + + // hash code for a frozen node + private int hash(int node) throws IOException { + final int PRIME = 31; + //System.out.println("hash frozen"); + int h = 0; + fst.readFirstArc(node, scratchArc); + while(true) { + //System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal()); + h = PRIME * h + scratchArc.label; + h = PRIME * h + scratchArc.target; + h = PRIME * h + scratchArc.output.hashCode(); + h = PRIME * h + scratchArc.nextFinalOutput.hashCode(); + if (scratchArc.isFinal()) { + h += 17; + } + if (scratchArc.isLast()) { + break; + } + fst.readNextArc(scratchArc); + } + //System.out.println(" ret " + (h&Integer.MAX_VALUE)); + return h & Integer.MAX_VALUE; + } + + public int add(Builder.UnCompiledNode node) throws IOException { + // System.out.println("hash: add count=" + count + " vs " + table.length); + final int h = hash(node); + int h2 = h; + int c = 1; + while(true) { + final int pos = h2 & mask; + final int v = table[pos]; + if (v == 0) { + // freeze & add + final int address = fst.addNode(node); + //System.out.println(" now freeze addr=" + address); + assert hash(address) == h : "frozenHash=" + hash(address) + " vs h=" + h; + count++; + table[pos] = address; + if (table.length < 2*count) { + rehash(); + } + return address; + } else if 
(nodesEqual(node, v)) { + // same node is already here + return v; + } + + // quadratic probe + h2 = h+(c + c*c)/2; + c++; + conf++; + } + } + + // called only by rehash + private void addNew(int address) throws IOException { + final int h = hash(address); + int h2 = h; + int c = 1; + while(true) { + final int pos = h2 & mask; + if (table[pos] == 0) { + table[pos] = address; + break; + } + + // quadratic probe + h2 = h + (c + c*c)/2; + c++; + conf++; + } + } + + private void rehash() throws IOException { + final int[] oldTable = table; + table = new int[2*table.length]; + mask = table.length-1; + for(int idx=0;idx { + + // TODO: maybe change this API to allow for re-use of the + // output instances -- this is an insane amount of garbage + // (new object per byte/char/int) if eg used during + // analysis + + /** Eg common("foo", "foobar") -> "foo" */ + public abstract T common(T output1, T output2); + + /** Eg subtract("foobar", "foo") -> "bar" */ + public abstract T subtract(T output, T inc); + + /** Eg add("foo", "bar") -> "foobar" */ + public abstract T add(T prefix, T output); + + public abstract void write(T output, DataOutput out) throws IOException; + + public abstract T read(DataInput in) throws IOException; + + /** NOTE: this output is compared with == so you must + * ensure that all methods return the single object if + * it's really no output */ + public abstract T getNoOutput(); + + public abstract String outputToString(T output); +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/PairOutputs.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/PairOutputs.java new file mode 100644 index 00000000000..64275bc55fd --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/PairOutputs.java @@ -0,0 +1,117 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + +/** + * Pairs up two outputs into one. + * @lucene.experimental + */ + + +public class PairOutputs extends Outputs> { + + private final Pair NO_OUTPUT; + private final Outputs outputs1; + private final Outputs outputs2; + + public static class Pair { + public final A output1; + public final B output2; + + public Pair(A output1, B output2) { + this.output1 = output1; + this.output2 = output2; + } + + @Override @SuppressWarnings("unchecked") + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof Pair) { + Pair pair = (Pair) other; + return output1.equals(pair.output1) && output2.equals(pair.output2); + } else { + return false; + } + } + + public int hashCode() { + return output1.hashCode() + output2.hashCode(); + } + }; + + public PairOutputs(Outputs outputs1, Outputs outputs2) { + this.outputs1 = outputs1; + this.outputs2 = outputs2; + NO_OUTPUT = new Pair(outputs1.getNoOutput(), outputs2.getNoOutput()); + } + + public Pair get(A output1, B output2) { + if (output1 == outputs1.getNoOutput() && output2 == outputs2.getNoOutput()) { + return NO_OUTPUT; + } else { + return new Pair(output1, output2); + } + } + 
+ @Override + public Pair common(Pair pair1, Pair pair2) { + return get(outputs1.common(pair1.output1, pair2.output1), + outputs2.common(pair1.output2, pair2.output2)); + } + + @Override + public Pair subtract(Pair output, Pair inc) { + return get(outputs1.subtract(output.output1, inc.output1), + outputs2.subtract(output.output2, inc.output2)); + } + + @Override + public Pair add(Pair prefix, Pair output) { + return get(outputs1.add(prefix.output1, output.output1), + outputs2.add(prefix.output2, output.output2)); + } + + @Override + public void write(Pair output, DataOutput writer) throws IOException { + outputs1.write(output.output1, writer); + outputs2.write(output.output2, writer); + } + + @Override + public Pair read(DataInput in) throws IOException { + A output1 = outputs1.read(in); + B output2 = outputs2.read(in); + return get(output1, output2); + } + + @Override + public Pair getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(Pair output) { + return ""; + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/PositiveIntOutputs.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/PositiveIntOutputs.java new file mode 100644 index 00000000000..ba17fe99dee --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/PositiveIntOutputs.java @@ -0,0 +1,138 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + +// TODO: make a sharing and non-sharing variant; eg if you +// output docFreq per term the FST will be smaller if you +// don't share since they are not "well shared" + +/** + * Output is a long, for each input term. NOTE: the + * resulting FST is not guaranteed to be minimal! See + * {@link Builder}. + * @lucene.experimental + */ + +public final class PositiveIntOutputs extends Outputs { + + private final static Long NO_OUTPUT = new Long(0); + + private final boolean doShare; + + private final static PositiveIntOutputs singletonShare = new PositiveIntOutputs(true); + private final static PositiveIntOutputs singletonNoShare = new PositiveIntOutputs(false); + + private PositiveIntOutputs(boolean doShare) { + this.doShare = doShare; + } + + public static PositiveIntOutputs getSingleton(boolean doShare) { + return doShare ? 
singletonShare : singletonNoShare; + } + + public Long get(long v) { + if (v == 0) { + return NO_OUTPUT; + } else { + return Long.valueOf(v); + } + } + + @Override + public Long common(Long output1, Long output2) { + assert valid(output1); + assert valid(output2); + if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) { + return NO_OUTPUT; + } else if (doShare) { + assert output1 > 0; + assert output2 > 0; + return Math.min(output1, output2); + } else if (output1.equals(output2)) { + return output1; + } else { + return NO_OUTPUT; + } + } + + @Override + public Long subtract(Long output, Long inc) { + assert valid(output); + assert valid(inc); + assert output >= inc; + + if (inc == NO_OUTPUT) { + return output; + } else if (output.equals(inc)) { + return NO_OUTPUT; + } else { + return output - inc; + } + } + + @Override + public Long add(Long prefix, Long output) { + assert valid(prefix); + assert valid(output); + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + return prefix + output; + } + } + + @Override + public void write(Long output, DataOutput out) throws IOException { + assert valid(output); + out.writeVLong(output); + } + + @Override + public Long read(DataInput in) throws IOException { + long v = in.readVLong(); + if (v == 0) { + return NO_OUTPUT; + } else { + return v; + } + } + + private boolean valid(Long o) { + assert o != null; + assert o instanceof Long; + assert o == NO_OUTPUT || o > 0; + return true; + } + + @Override + public Long getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(Long output) { + return output.toString(); + } +} diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/TODO b/lucene/src/java/org/apache/lucene/util/automaton/fst/TODO new file mode 100644 index 00000000000..98fc6797e04 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/TODO @@ -0,0 +1,39 @@ +is threadlocal.get costly? 
if so maybe make an FSTReader? would hold this "relative" pos, and each thread'd use it for reading, instead of PosRef + +maybe change Outputs class to "reuse" stuff? eg this new BytesRef in ByteSequenceOutputs.. + +do i even "need" both non_final_end_state and final_end_state? + +hmm -- can I get weights working here? + +can FST be used to index all internal substrings, mapping to term? + - maybe put back ability to add multiple outputs per input...? + +make this work w/ char...? + - then FSTCharFilter/FSTTokenFilter + - syn filter? + +experiment: try reversing terms before compressing -- how much smaller? + +maybe separate out a 'writable/growing fst' from a read-only one? + +can we somehow [partially] tableize lookups like oal.util.automaton? + +make an FST terms index option for codecs...? + +make an FSTCharsMap? + +need a benchmark testing FST traversal -- just fix the static main to rewind & visit all terms + +thread state + +when writing FST to disk: +- Sequentially writing (would save memory in codec during indexing). We are now using DataOutput, which could also go directly to disk +- problem: size of BytesRef must be known before + +later + - maybe don't require FSTEnum.advance to be forward only? + - should i make a posIntOutputs separate from posLongOutputs? + - mv randomAcceptedWord / run / etc. from test into FST? + - hmm get multi-outputs working again? do we ever need this? + diff --git a/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java b/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java new file mode 100644 index 00000000000..c444d488cdf --- /dev/null +++ b/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java @@ -0,0 +1,1233 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStreamReader; +import java.io.IOException; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.MockDirectoryWrapper; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.LineFileDocs; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util._TestUtil; + +public class TestFSTs extends LuceneTestCase { + + 
private MockDirectoryWrapper dir; + + public void setUp() throws IOException { + dir = newDirectory(); + dir.setPreventDoubleWrite(false); + } + + public void tearDown() throws IOException { + dir.close(); + } + + private static BytesRef toBytesRef(IntsRef ir) { + BytesRef br = new BytesRef(ir.length); + for(int i=0;i= 0 && x <= 255; + br.bytes[i] = (byte) x; + } + br.length = ir.length; + return br; + } + + private static IntsRef toIntsRef(String s, int inputMode) { + return toIntsRef(s, inputMode, new IntsRef(10)); + } + + private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) { + if (inputMode == 0) { + // utf8 + return toIntsRef(new BytesRef(s), ir); + } else { + // utf32 + return toIntsRefUTF32(s, ir); + } + } + + private static IntsRef toIntsRefUTF32(String s, IntsRef ir) { + final int charLength = s.length(); + int charIdx = 0; + int intIdx = 0; + while(charIdx < charLength) { + if (intIdx == ir.ints.length) { + ir.grow(intIdx+1); + } + final int utf32 = s.codePointAt(charIdx); + ir.ints[intIdx] = utf32; + charIdx += Character.charCount(utf32); + intIdx++; + } + ir.length = intIdx; + return ir; + } + + private static IntsRef toIntsRef(BytesRef br, IntsRef ir) { + if (br.length > ir.ints.length) { + ir.grow(br.length); + } + for(int i=0;i outputs = NoOutputs.getSingleton(); + final Object NO_OUTPUT = outputs.getNoOutput(); + final List> pairs = new ArrayList>(terms.length); + for(IntsRef term : terms) { + pairs.add(new FSTTester.InputOutput(term, NO_OUTPUT)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // PositiveIntOutput (ord) + { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + final List> pairs = new ArrayList>(terms.length); + for(int idx=0;idx(terms[idx], outputs.get(idx))); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // PositiveIntOutput (random monotonically increasing positive number) + { + final PositiveIntOutputs outputs = 
PositiveIntOutputs.getSingleton(random.nextBoolean()); + final List> pairs = new ArrayList>(terms.length); + long lastOutput = 0; + for(int idx=0;idx(terms[idx], outputs.get(value))); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // PositiveIntOutput (random positive number) + { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); + final List> pairs = new ArrayList>(terms.length); + for(int idx=0;idx(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // Pair + { + final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(random.nextBoolean()); + final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(random.nextBoolean()); + final PairOutputs outputs = new PairOutputs(o1, o2); + final List>> pairs = new ArrayList>>(terms.length); + long lastOutput = 0; + for(int idx=0;idx>(terms[idx], + outputs.get(o1.get(idx), + o2.get(value)))); + } + new FSTTester>(random, dir, inputMode, pairs, outputs).doTest(); + } + + // Sequence-of-bytes + { + final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); + final BytesRef NO_OUTPUT = outputs.getNoOutput(); + final List> pairs = new ArrayList>(terms.length); + for(int idx=0;idx(terms[idx], output)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // Sequence-of-ints + { + final IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); + final List> pairs = new ArrayList>(terms.length); + for(int idx=0;idx(terms[idx], output)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + } + + private static class FSTTester { + + final Random random; + final List> pairs; + final int inputMode; + final Outputs outputs; + final Directory dir; + + public FSTTester(Random random, Directory dir, int inputMode, List> pairs, Outputs outputs) { + this.random = random; + this.dir = dir; + this.inputMode = 
inputMode; + this.pairs = pairs; + this.outputs = outputs; + } + + private static class InputOutput implements Comparable> { + public final IntsRef input; + public final T output; + + public InputOutput(IntsRef input, T output) { + this.input = input; + this.output = output; + } + + public int compareTo(InputOutput other) { + if (other instanceof InputOutput) { + return input.compareTo((other).input); + } else { + throw new IllegalArgumentException(); + } + } + } + + private String getRandomString() { + final String term; + if (random.nextBoolean()) { + term = _TestUtil.randomRealisticUnicodeString(random); + } else { + // we want to mix in limited-alphabet symbols so + // we get more sharing of the nodes given how few + // terms we are testing... + term = simpleRandomString(random); + } + return term; + } + + public void doTest() throws IOException { + // no pruning + doTest(0, 0); + + // simple pruning + doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0); + + // leafy pruning + doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size())); + } + + // NOTE: only copies the stuff this test needs!! + private FST.Arc copyArc(FST.Arc arc) { + final FST.Arc copy = new FST.Arc(); + copy.label = arc.label; + copy.target = arc.target; + copy.output = arc.output; + copy.nextFinalOutput = arc.nextFinalOutput; + return arc; + } + + // runs the term, returning the output, or null if term + // isn't accepted. 
if stopNode is non-null it must be + // length 2 int array; stopNode[0] will be the last + // matching node (-1 if the term is accepted) + // and stopNode[1] will be the length of the + // term prefix that matches + private T run(FST fst, IntsRef term, int[] stopNode) throws IOException { + if (term.length == 0) { + final T output = fst.getEmptyOutput(); + if (stopNode != null) { + stopNode[1] = 0; + if (output != null) { + // accepted + stopNode[0] = -1; + } else { + stopNode[0] = fst.getStartNode(); + } + } + return output; + } + + final FST.Arc arc = new FST.Arc(); + int node = fst.getStartNode(); + int lastNode = -1; + T output = fst.outputs.getNoOutput(); + //System.out.println("match?"); + for(int i=0;i fst, IntsRef in) throws IOException { + int node = fst.getStartNode(); + + if (fst.noNodes()) { + // degenerate FST: only accepts the empty string + assertTrue(fst.getEmptyOutput() != null); + in.length = 0; + return fst.getEmptyOutput(); + } + final List> arcs = new ArrayList>(); + in.length = 0; + in.offset = 0; + T output = fst.outputs.getNoOutput(); + //System.out.println("get random"); + while(true) { + // read all arcs: + //System.out.println(" n=" + node); + int arcAddress = node; + FST.Arc arc = new FST.Arc(); + fst.readFirstArc(arcAddress, arc); + arcs.add(copyArc(arc)); + while(!arc.isLast()) { + fst.readNextArc(arc); + arcs.add(copyArc(arc)); + } + + // pick one + arc = arcs.get(random.nextInt(arcs.size())); + + arcs.clear(); + + // append label + if (in.ints.length == in.length) { + in.grow(1+in.length); + } + in.ints[in.length++] = arc.label; + + output = fst.outputs.add(output, arc.output); + + // maybe stop + if (arc.isFinal()) { + if (fst.hasArcs(arc.target)) { + // final state but it also has outgoing edges + if (random.nextBoolean()) { + output = fst.outputs.add(output, arc.nextFinalOutput); + break; + } + } else { + break; + } + } + + node = arc.target; + } + + return output; + } + + + private FST doTest(int prune1, int prune2) throws 
IOException { + if (VERBOSE) { + System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2); + } + + final Builder builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, + prune1, prune2, + prune1==0 && prune2==0, outputs); + + for(InputOutput pair : pairs) { + builder.add(pair.input, pair.output); + } + FST fst = builder.finish(); + + /* on a coin flip, round-trip the FST through fst.bin so the reloaded instance is the one that gets verified */ + if (random.nextBoolean() && fst != null) { + IndexOutput out = dir.createOutput("fst.bin"); + fst.save(out); + out.close(); + IndexInput in = dir.openInput("fst.bin"); + try { + fst = new FST(in, outputs); + } finally { + in.close(); + dir.deleteFile("fst.bin"); + } + } + + if (VERBOSE && pairs.size() <= 20 && fst != null) { + PrintStream ps = new PrintStream("out.dot"); + fst.toDot(ps); + ps.close(); + System.out.println("SAVED out.dot"); + } + + if (VERBOSE) { + if (fst == null) { + System.out.println(" fst has 0 nodes (fully pruned)"); + } else { + System.out.println(" fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs"); + } + } + + if (prune1 == 0 && prune2 == 0) { + verifyUnPruned(inputMode, fst); + } else { + verifyPruned(inputMode, fst, prune1, prune2); + } + + return fst; + } + + // FST is complete + // Checks an unpruned FST against pairs: with no pairs the FST must + // be null; otherwise every input must be accepted by run() with its + // expected output and IntsRefFSTEnum.next() must return the pairs in + // list order. Random accepted words and advance() are checked too. + private void verifyUnPruned(int inputMode, FST fst) throws IOException { + + if (pairs.size() == 0) { + assertNull(fst); + return; + } + + if (VERBOSE) { + System.out.println("TEST: now verify " + pairs.size() + " terms"); + for(InputOutput pair : pairs) { + assertNotNull(pair); + assertNotNull(pair.input); + assertNotNull(pair.output); + System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output)); + } + } + + assertNotNull(fst); + + // make sure all words are accepted + { + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); + for(InputOutput pair : pairs) { + IntsRef term = pair.input; + Object output = run(fst, term, null); + + assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output); + 
assertEquals(output, pair.output); + + // verify enum's next + IntsRefFSTEnum.InputOutput t = fstEnum.next(); + + assertEquals(term, t.input); + assertEquals(pair.output, t.output); + } + assertNull(fstEnum.next()); + } + + final Map termsMap = new HashMap(); + for(InputOutput pair : pairs) { + termsMap.put(pair.input, pair.output); + } + + // find random matching word and make sure it's valid + final IntsRef scratch = new IntsRef(10); + for(int iter=0;iter<500*RANDOM_MULTIPLIER;iter++) { + T output = randomAcceptedWord(fst, scratch); + assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch)); + assertEquals(termsMap.get(scratch), output); + } + + // test single IntsRefFSTEnum.advance: + //System.out.println("TEST: verify advance"); + for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { + final IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); + if (random.nextBoolean()) { + // seek to term that doesn't exist: + while(true) { + final IntsRef term = toIntsRef(getRandomString(), inputMode); + int pos = Collections.binarySearch(pairs, new InputOutput(term, null)); + if (pos < 0) { + pos = -(pos+1); + // ok doesn't exist + //System.out.println(" seek " + inputToString(inputMode, term)); + final IntsRefFSTEnum.InputOutput seekResult = fstEnum.advance(term); + if (pos < pairs.size()) { + //System.out.println(" got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output)); + assertEquals(pairs.get(pos).input, seekResult.input); + assertEquals(pairs.get(pos).output, seekResult.output); + } else { + // seeked beyond end + //System.out.println("seek=" + seekTerm); + assertNull("expected null but got " + (seekResult==null ? 
"null" : inputToString(inputMode, seekResult.input)), seekResult); + } + + break; + } + } + } else { + // seek to term that does exist: + InputOutput pair = pairs.get(random.nextInt(pairs.size())); + //System.out.println(" seek " + inputToString(inputMode, pair.input)); + final IntsRefFSTEnum.InputOutput seekResult = fstEnum.advance(pair.input); + assertEquals(pair.input, seekResult.input); + assertEquals(pair.output, seekResult.output); + } + } + + if (VERBOSE) { + System.out.println("TEST: mixed next/advance"); + } + + // test mixed next/advance + for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { + if (VERBOSE) { + System.out.println("TEST: iter " + iter); + } + final IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); + int upto = -1; + while(true) { + boolean isDone = false; + if (upto == pairs.size()-1 || random.nextBoolean()) { + // next + upto++; + if (VERBOSE) { + System.out.println(" do next"); + } + isDone = fstEnum.next() == null; + } else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) { + int attempt = 0; + for(;attempt<10;attempt++) { + IntsRef term = toIntsRef(getRandomString(), inputMode); + if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) { + if (VERBOSE) { + System.out.println(" do non-exist advance(" + inputToString(inputMode, term) + "]"); + } + int pos = Collections.binarySearch(pairs, new InputOutput(term, null)); + assert pos < 0; + upto = -(pos+1); + isDone = fstEnum.advance(term) == null; + break; + } + } + if (attempt == 10) { + continue; + } + + } else { + final int inc = random.nextInt(pairs.size() - upto - 1); + upto += inc; + if (upto == -1) { + upto = 0; + } + + if (VERBOSE) { + System.out.println(" do advance(" + inputToString(inputMode, pairs.get(upto).input) + "]"); + } + isDone = fstEnum.advance(pairs.get(upto).input) == null; + } + if (VERBOSE) { + if (!isDone) { + System.out.println(" got " + inputToString(inputMode, fstEnum.current().input)); + } else { + System.out.println(" 
got null"); + } + } + + if (upto == pairs.size()) { + assertTrue(isDone); + break; + } else { + assertFalse(isDone); + assertEquals(pairs.get(upto).input, fstEnum.current().input); + assertEquals(pairs.get(upto).output, fstEnum.current().output); + } + } + } + } + + private static class CountMinOutput { + int count; + T output; + T finalOutput; + boolean isLeaf = true; + boolean isFinal; + } + + // FST is pruned + private void verifyPruned(int inputMode, FST fst, int prune1, int prune2) throws IOException { + + if (VERBOSE) { + System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs); + for(InputOutput pair : pairs) { + System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output)); + } + } + + // To validate the FST, we brute-force compute all prefixes + // in the terms, matched to their "common" outputs, prune that + // set according to the prune thresholds, then assert the FST + // matches that same set. + + // NOTE: Crazy RAM intensive!! 
+ + //System.out.println("TEST: tally prefixes"); + + // build all prefixes + final Map> prefixes = new HashMap>(); + final IntsRef scratch = new IntsRef(10); + for(InputOutput pair: pairs) { + scratch.copy(pair.input); + for(int idx=0;idx<=pair.input.length;idx++) { + scratch.length = idx; + CountMinOutput cmo = prefixes.get(scratch); + if (cmo == null) { + cmo = new CountMinOutput(); + cmo.count = 1; + cmo.output = pair.output; + prefixes.put(new IntsRef(scratch), cmo); + } else { + cmo.count++; + cmo.output = outputs.common(cmo.output, pair.output); + } + if (idx == pair.input.length) { + cmo.isFinal = true; + cmo.finalOutput = cmo.output; + } + } + } + + //System.out.println("TEST: now prune"); + + // prune 'em + final Iterator>> it = prefixes.entrySet().iterator(); + while(it.hasNext()) { + Map.Entry> ent = it.next(); + final IntsRef prefix = ent.getKey(); + final CountMinOutput cmo = ent.getValue(); + //System.out.println(" term=" + inputToString(inputMode, prefix) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf); + final boolean keep; + if (prune1 > 0) { + keep = cmo.count >= prune1; + } else { + assert prune2 > 0; + if (prune2 > 1 && cmo.count >= prune2) { + keep = true; + } else if (prefix.length > 0) { + // consult our parent + scratch.length = prefix.length-1; + System.arraycopy(prefix.ints, prefix.offset, scratch.ints, 0, scratch.length); + final CountMinOutput cmo2 = prefixes.get(scratch); + //System.out.println(" parent count = " + (cmo2 == null ? 
-1 : cmo2.count)); + keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1))); + } else if (cmo.count >= prune2) { + keep = true; + } else { + keep = false; + } + } + + if (!keep) { + it.remove(); + //System.out.println(" remove"); + } else { + // clear isLeaf for all ancestors + //System.out.println(" keep"); + scratch.copy(prefix); + scratch.length--; + while(scratch.length >= 0) { + final CountMinOutput cmo2 = prefixes.get(scratch); + if (cmo2 != null) { + //System.out.println(" clear isLeaf " + inputToString(inputMode, scratch)); + cmo2.isLeaf = false; + } + scratch.length--; + } + } + } + + //System.out.println("TEST: after prune"); + /* + for(Map.Entry ent : prefixes.entrySet()) { + System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal); + if (ent.getValue().isFinal) { + System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput)); + } + } + */ + + if (prefixes.size() <= 1) { + assertNull(fst); + return; + } + + assertNotNull(fst); + + // make sure FST only enums valid prefixes + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); + IntsRefFSTEnum.InputOutput current; + while((current = fstEnum.next()) != null) { + //System.out.println(" fst enum term=" + inputToString(inputMode, current.input) + " output=" + outputs.outputToString(current.output)); + final CountMinOutput cmo = prefixes.get(current.input); + assertNotNull(cmo); + assertTrue(cmo.isLeaf || cmo.isFinal); + if (cmo.isFinal && !cmo.isLeaf) { + assertEquals(cmo.finalOutput, current.output); + } else { + assertEquals(cmo.output, current.output); + } + } + + // make sure all non-pruned prefixes are present in the FST + final int[] stopNode = new int[2]; + for(Map.Entry> ent : prefixes.entrySet()) { + if (ent.getKey().length > 0) { + final CountMinOutput cmo = ent.getValue(); + final T output = run(fst, ent.getKey(), 
stopNode); + //System.out.println(" term=" + inputToString(inputMode, ent.getKey()) + " output=" + outputs.outputToString(cmo.output)); + // if (cmo.isFinal && !cmo.isLeaf) { + if (cmo.isFinal) { + assertEquals(cmo.finalOutput, output); + } else { + assertEquals(cmo.output, output); + } + assertEquals(ent.getKey().length, stopNode[1]); + } + } + } + } + + // Runs the random-words test with maxNumWords=1000 for + // 5 * RANDOM_MULTIPLIER iterations. + public void testRandomWords() throws IOException { + testRandomWords(1000, 5 * RANDOM_MULTIPLIER); + //testRandomWords(10, 100); + } + + // Display name for an input mode: 0 is "utf8", anything else "utf32". + private String inputModeToString(int mode) { + if (mode == 0) { + return "utf8"; + } else { + return "utf32"; + } + } + + // Collects numWords distinct random terms (as IntsRefs) into a set + // and hands them to doTest. + // NOTE(review): the loop header below looks truncated in this copy + // (the text between 'iter' and 'termsSet' is missing) -- restore it + // from the original before relying on this method. + private void testRandomWords(int maxNumWords, int numIter) throws IOException { + for(int iter=0;iter termsSet = new HashSet(); + IntsRef[] terms = new IntsRef[numWords]; + while(termsSet.size() < numWords) { + final String term = getRandomString(); + termsSet.add(toIntsRef(term, inputMode)); + } + doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()])); + } + } + } + + // Returns a random term: a coin flip picks + // _TestUtil.randomRealisticUnicodeString or simpleRandomString. + private String getRandomString() { + final String term; + if (random.nextBoolean()) { + term = _TestUtil.randomRealisticUnicodeString(random); + } else { + // we want to mix in limited-alphabet symbols so + // we get more sharing of the nodes given how few + // terms we are testing... 
+ term = simpleRandomString(random); + } + return term; + } + + /** Nightly-only variant: runs the random-words test with maxNumWords=50000 for RANDOM_MULTIPLIER iterations. */ + @Nightly + public void testBigSet() throws IOException { + testRandomWords(50000, RANDOM_MULTIPLIER); + } + + /** Renders a term for messages: via toBytesRef(term).utf8ToString() when inputMode is 0, otherwise via UnicodeUtil.newString over the term's ints. */ + private static String inputToString(int inputMode, IntsRef term) { + if (inputMode == 0) { + // utf8 + return toBytesRef(term).utf8ToString(); + } else { + // utf32 + return UnicodeUtil.newString(term.ints, term.offset, term.length); + } + } + + // Build FST for all unique terms in the test line docs + // file, up until a time limit + public void testRealTerms() throws Exception { + + if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) { + // no + CodecProvider.getDefault().setDefaultFieldCodec("Standard"); + } + + final LineFileDocs docs = new LineFileDocs(false); + final int RUN_TIME_SEC = LuceneTestCase.TEST_NIGHTLY ? 300 : 1; + final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64); + final File tempDir = _TestUtil.getTempDir("fstlines"); + final MockDirectoryWrapper dir = new MockDirectoryWrapper(random, FSDirectory.open(tempDir)); + final IndexWriter writer = new IndexWriter(dir, conf); + final long stopTime = System.currentTimeMillis() + RUN_TIME_SEC * 1000; + Document doc; + int docCount = 0; + while((doc = docs.nextDoc()) != null && System.currentTimeMillis() < stopTime) { + writer.addDocument(doc); + docCount++; + } + IndexReader r = IndexReader.open(writer); + writer.close(); + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); + Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + + // randomly choose what the FST stores per term: its ord or its docFreq + boolean storeOrd = random.nextBoolean(); + if (VERBOSE) { + if (storeOrd) { + System.out.println("FST stores ord"); + } else { + System.out.println("FST stores docFreq"); + } + } + Terms terms = MultiFields.getTerms(r, "body"); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + BytesRef term; + int ord = 0; + 
while((term = termsEnum.next()) != null) { + /* on the first term, probe whether this TermsEnum supports ord(); if it throws UnsupportedOperationException, store docFreq instead */ + if (ord == 0) { + try { + termsEnum.ord(); + } catch (UnsupportedOperationException uoe) { + storeOrd = false; + } + } + final int output; + if (storeOrd) { + output = ord; + } else { + output = termsEnum.docFreq(); + } + builder.add(term, outputs.get(output)); + ord++; + } + final FST fst = builder.finish(); + if (VERBOSE) { + System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes"); + } + + if (ord > 0) { + // Now confirm BytesRefFSTEnum and TermsEnum act the + // same: + final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); + for(int iter=0;iter<1000*RANDOM_MULTIPLIER;iter++) { + fstEnum.reset(); + final BytesRef randomTerm = new BytesRef(getRandomString()); + + final TermsEnum.SeekStatus seekResult = termsEnum.seek(randomTerm); + final BytesRefFSTEnum.InputOutput fstSeekResult = fstEnum.advance(randomTerm); + + if (VERBOSE) { + System.out.println("TEST: seek " + randomTerm.utf8ToString()); + } + + if (seekResult == TermsEnum.SeekStatus.END) { + assertNull(fstSeekResult); + } else { + assertSame(termsEnum, fstEnum, storeOrd); + for(int nextIter=0;nextIter<10;nextIter++) { + if (VERBOSE) { + System.out.println("TEST: next"); + } + if (termsEnum.next() != null) { + if (VERBOSE) { + System.out.println(" term=" + termsEnum.term().utf8ToString()); + } + assertNotNull(fstEnum.next()); + assertSame(termsEnum, fstEnum, storeOrd); + } else { + BytesRefFSTEnum.InputOutput nextResult = fstEnum.next(); + if (nextResult != null) { + System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output)); + fail(); + } + break; + } + } + } + } + } + } + + r.close(); + dir.close(); + } + + // Asserts that both enums are on the same term (or both have none) + // and that the FST output equals the TermsEnum's ord when storeOrd + // is true, or its docFreq otherwise. + private void assertSame(TermsEnum termsEnum, BytesRefFSTEnum fstEnum, boolean storeOrd) throws Exception { + if (termsEnum.term() == null) { + 
assertNull(fstEnum.current()); + } else { + assertEquals(termsEnum.term(), fstEnum.current().input); + if (storeOrd) { + // fst stored the ord + assertEquals(termsEnum.ord(), ((Long) fstEnum.current().output).longValue()); + } else { + // fst stored the docFreq + assertEquals(termsEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue())); + } + } + } + + // Builds an FST from the lines of a words file (one term per line), + // saves it under dirOut, then re-reads the file and checks every + // lookup. Subclasses supply each term's output via getOutput. + private static abstract class VisitTerms { + private final String dirOut; + private final String wordsFileIn; + private int inputMode; + private final Outputs outputs; + private final Builder builder; + + // inputMode 0 selects BYTE1 input, anything else BYTE4; 'prune' is + // passed as the builder's second prune argument. + public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs outputs) { + this.dirOut = dirOut; + this.wordsFileIn = wordsFileIn; + this.inputMode = inputMode; + this.outputs = outputs; + + builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, outputs); + } + + // Returns the output to store for the term read at position 'ord'. + protected abstract T getOutput(IntsRef input, int ord) throws IOException; + + // Builds, saves and verifies the FST, reading at most 'limit' terms. + public void run(int limit) throws IOException { + BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); + try { + final IntsRef intsRef = new IntsRef(10); + long tStart = System.currentTimeMillis(); + int ord = 0; + while(true) { + String w = is.readLine(); + if (w == null) { + break; + } + toIntsRef(w, inputMode, intsRef); + builder.add(intsRef, + getOutput(intsRef, ord)); + + ord++; + if (ord % 500000 == 0) { + System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "..."); + } + if (ord >= limit) { + break; + } + } + + assert builder.getTermCount() == ord; + final FST fst = builder.finish(); + if (fst == null) { + System.out.println("FST was fully pruned!"); + System.exit(0); + } + + System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes()); + if (fst.getNodeCount() < 100) { + 
PrintStream ps = new PrintStream("out.dot"); + fst.toDot(ps); + ps.close(); + System.out.println("Wrote FST to out.dot"); + } + + /* NOTE(review): dir is opened here but never closed in this method -- confirm that is acceptable for this command-line tool */ + Directory dir = FSDirectory.open(new File(dirOut)); + IndexOutput out = dir.createOutput("fst.bin"); + fst.save(out); + out.close(); + + System.out.println("Saved FST to fst.bin."); + + System.out.println("\nNow verify..."); + + is.close(); + is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); + + ord = 0; + tStart = System.currentTimeMillis(); + while(true) { + String w = is.readLine(); + if (w == null) { + break; + } + toIntsRef(w, inputMode, intsRef); + T expected = getOutput(intsRef, ord); + T actual = fst.get(intsRef); + if (actual == null) { + throw new RuntimeException("unexpected null output on input=" + w); + } + if (!actual.equals(expected)) { + throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w); + } + + ord++; + if (ord % 500000 == 0) { + System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "..."); + } + if (ord >= limit) { + break; + } + } + + double totSec = ((System.currentTimeMillis() - tStart)/1000.0); + System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)"); + + } finally { + is.close(); + } + } + } + + // java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out + // Command-line driver. args[0] is the words file and args[1] the + // output directory; optional flags: -prune N, -limit N, -utf8, + // -utf32, -docFreq, -ords. + public static void main(String[] args) throws IOException { + final String wordsFileIn = args[0]; + final String dirOut = args[1]; + int idx = 2; + int prune = 0; + int limit = Integer.MAX_VALUE; + int inputMode = 0; // utf8 + boolean storeOrds = false; + boolean storeDocFreqs = false; + while(idx < args.length) { + if (args[idx].equals("-prune")) { + prune = Integer.valueOf(args[1+idx]); + idx++; + } + if (args[idx].equals("-limit")) { + limit = 
Integer.valueOf(args[1+idx]); + idx++; + } + if (args[idx].equals("-utf8")) { + inputMode = 0; + } + if (args[idx].equals("-utf32")) { + inputMode = 1; + } + if (args[idx].equals("-docFreq")) { + storeDocFreqs = true; + } + if (args[idx].equals("-ords")) { + storeOrds = true; + } + idx++; + } + + // ord benefits from share, docFreqs don't: + + if (storeOrds && storeDocFreqs) { + // Store both ord & docFreq: + final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true); + final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false); + final PairOutputs outputs = new PairOutputs(o1, o2); + new VisitTerms>(dirOut, wordsFileIn, inputMode, prune, outputs) { + Random rand; + @Override + public PairOutputs.Pair getOutput(IntsRef input, int ord) { + if (ord == 0) { + rand = new Random(17); + } + return new PairOutputs.Pair(o1.get(ord), + o2.get(_TestUtil.nextInt(rand, 1, 5000))); + } + }.run(limit); + } else if (storeOrds) { + // Store only ords + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + @Override + public Long getOutput(IntsRef input, int ord) { + return outputs.get(ord); + } + }.run(limit); + } else if (storeDocFreqs) { + // Store only docFreq + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false); + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + Random rand; + @Override + public Long getOutput(IntsRef input, int ord) { + if (ord == 0) { + rand = new Random(17); + } + return outputs.get(_TestUtil.nextInt(rand, 1, 5000)); + } + }.run(limit); + } else { + // Store nothing + final NoOutputs outputs = NoOutputs.getSingleton(); + final Object NO_OUTPUT = outputs.getNoOutput(); + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + @Override + public Object getOutput(IntsRef input, int ord) { + return NO_OUTPUT; + } + }.run(limit); + } + } +}