mirror of https://github.com/apache/lucene.git
LUCENE-4404: add ListOfOutputs for FST to hold more than one output per input
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1388935 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5be591e97a
commit
fd748920f8
|
@ -21,8 +21,12 @@ Changes in backwards compatibility policy
|
||||||
Robert Muir)
|
Robert Muir)
|
||||||
|
|
||||||
======================= Lucene 4.1.0 =======================
|
======================= Lucene 4.1.0 =======================
|
||||||
|
New Features
|
||||||
|
|
||||||
(No Changes)
|
* LUCENE-4404: New ListOfOutputs (in lucene/misc) for FSTs wraps
|
||||||
|
another Outputs implementation, allowing you to store more than one
|
||||||
|
output for a single input. UpToTwoPositiveIntOutputs was moved
|
||||||
|
from lucene/core to lucene/misc. (Mike McCandless)
|
||||||
|
|
||||||
======================= Lucene 4.0.0 =======================
|
======================= Lucene 4.0.0 =======================
|
||||||
|
|
||||||
|
|
|
@ -399,8 +399,10 @@ public class Builder<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
final UnCompiledNode<T> lastNode = frontier[input.length];
|
final UnCompiledNode<T> lastNode = frontier[input.length];
|
||||||
lastNode.isFinal = true;
|
if (lastInput.length != input.length || prefixLenPlus1 != input.length + 1) {
|
||||||
lastNode.output = NO_OUTPUT;
|
lastNode.isFinal = true;
|
||||||
|
lastNode.output = NO_OUTPUT;
|
||||||
|
}
|
||||||
|
|
||||||
// push conflicting outputs forward, only as far as
|
// push conflicting outputs forward, only as far as
|
||||||
// needed
|
// needed
|
||||||
|
|
|
@ -296,11 +296,13 @@ public final class FST<T> {
|
||||||
// messy
|
// messy
|
||||||
bytes = new byte[numBytes];
|
bytes = new byte[numBytes];
|
||||||
in.readBytes(bytes, 0, numBytes);
|
in.readBytes(bytes, 0, numBytes);
|
||||||
|
BytesReader reader;
|
||||||
if (packed) {
|
if (packed) {
|
||||||
emptyOutput = outputs.read(getBytesReader(0));
|
reader = getBytesReader(0);
|
||||||
} else {
|
} else {
|
||||||
emptyOutput = outputs.read(getBytesReader(numBytes-1));
|
reader = getBytesReader(numBytes-1);
|
||||||
}
|
}
|
||||||
|
emptyOutput = outputs.readFinalOutput(reader);
|
||||||
} else {
|
} else {
|
||||||
emptyOutput = null;
|
emptyOutput = null;
|
||||||
}
|
}
|
||||||
|
@ -414,7 +416,7 @@ public final class FST<T> {
|
||||||
// TODO: this is messy -- replace with sillyBytesWriter; maybe make
|
// TODO: this is messy -- replace with sillyBytesWriter; maybe make
|
||||||
// bytes private
|
// bytes private
|
||||||
final int posSave = writer.posWrite;
|
final int posSave = writer.posWrite;
|
||||||
outputs.write(emptyOutput, writer);
|
outputs.writeFinalOutput(emptyOutput, writer);
|
||||||
emptyOutputBytes = new byte[writer.posWrite-posSave];
|
emptyOutputBytes = new byte[writer.posWrite-posSave];
|
||||||
|
|
||||||
if (!packed) {
|
if (!packed) {
|
||||||
|
@ -638,7 +640,7 @@ public final class FST<T> {
|
||||||
|
|
||||||
if (arc.nextFinalOutput != NO_OUTPUT) {
|
if (arc.nextFinalOutput != NO_OUTPUT) {
|
||||||
//System.out.println(" write final output");
|
//System.out.println(" write final output");
|
||||||
outputs.write(arc.nextFinalOutput, writer);
|
outputs.writeFinalOutput(arc.nextFinalOutput, writer);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
|
if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
|
||||||
|
@ -788,7 +790,7 @@ public final class FST<T> {
|
||||||
outputs.read(in);
|
outputs.read(in);
|
||||||
}
|
}
|
||||||
if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
|
if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
|
||||||
outputs.read(in);
|
outputs.readFinalOutput(in);
|
||||||
}
|
}
|
||||||
if (arc.flag(BIT_STOP_NODE)) {
|
if (arc.flag(BIT_STOP_NODE)) {
|
||||||
} else if (arc.flag(BIT_TARGET_NEXT)) {
|
} else if (arc.flag(BIT_TARGET_NEXT)) {
|
||||||
|
@ -963,7 +965,7 @@ public final class FST<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
|
if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
|
||||||
arc.nextFinalOutput = outputs.read(in);
|
arc.nextFinalOutput = outputs.readFinalOutput(in);
|
||||||
} else {
|
} else {
|
||||||
arc.nextFinalOutput = outputs.getNoOutput();
|
arc.nextFinalOutput = outputs.getNoOutput();
|
||||||
}
|
}
|
||||||
|
@ -1127,7 +1129,7 @@ public final class FST<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) {
|
if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) {
|
||||||
outputs.read(in);
|
outputs.readFinalOutput(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) {
|
if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) {
|
||||||
|
@ -1221,6 +1223,14 @@ public final class FST<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns a {@link BytesReader} for this FST, positioned at
|
||||||
|
* position 0. */
|
||||||
|
public BytesReader getBytesReader() {
|
||||||
|
return getBytesReader(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns a {@link BytesReader} for this FST, positioned at
|
||||||
|
* the provided position. */
|
||||||
public BytesReader getBytesReader(int pos) {
|
public BytesReader getBytesReader(int pos) {
|
||||||
// TODO: maybe re-use via ThreadLocal?
|
// TODO: maybe re-use via ThreadLocal?
|
||||||
if (packed) {
|
if (packed) {
|
||||||
|
@ -1654,7 +1664,7 @@ public final class FST<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (arc.nextFinalOutput != NO_OUTPUT) {
|
if (arc.nextFinalOutput != NO_OUTPUT) {
|
||||||
outputs.write(arc.nextFinalOutput, writer);
|
outputs.writeFinalOutput(arc.nextFinalOutput, writer);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (doWriteTarget) {
|
if (doWriteTarget) {
|
||||||
|
|
|
@ -49,10 +49,27 @@ public abstract class Outputs<T> {
|
||||||
/** Eg add("foo", "bar") -> "foobar" */
|
/** Eg add("foo", "bar") -> "foobar" */
|
||||||
public abstract T add(T prefix, T output);
|
public abstract T add(T prefix, T output);
|
||||||
|
|
||||||
|
/** Encode an output value into a {@link DataOutput}. */
|
||||||
public abstract void write(T output, DataOutput out) throws IOException;
|
public abstract void write(T output, DataOutput out) throws IOException;
|
||||||
|
|
||||||
|
/** Encode a final node output value into a {@link
|
||||||
|
* DataOutput}. By default this just calls {@link #write(Object,
|
||||||
|
* DataOutput)}. */
|
||||||
|
public void writeFinalOutput(T output, DataOutput out) throws IOException {
|
||||||
|
write(output, out);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Decode an output value previously written with {@link
|
||||||
|
* #write(Object, DataOutput)}. */
|
||||||
public abstract T read(DataInput in) throws IOException;
|
public abstract T read(DataInput in) throws IOException;
|
||||||
|
|
||||||
|
/** Decode an output value previously written with {@link
|
||||||
|
* #writeFinalOutput(Object, DataOutput)}. By default this
|
||||||
|
* just calls {@link #read(DataInput)}. */
|
||||||
|
public T readFinalOutput(DataInput in) throws IOException {
|
||||||
|
return read(in);
|
||||||
|
}
|
||||||
|
|
||||||
/** NOTE: this output is compared with == so you must
|
/** NOTE: this output is compared with == so you must
|
||||||
* ensure that all methods return the single object if
|
* ensure that all methods return the single object if
|
||||||
* it's really no output */
|
* it's really no output */
|
||||||
|
|
|
@ -29,8 +29,6 @@ import java.io.Writer;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.MockAnalyzer;
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
import org.apache.lucene.codecs.Codec;
|
|
||||||
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
|
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
|
@ -56,7 +54,6 @@ import org.apache.lucene.util.LineFileDocs;
|
||||||
import org.apache.lucene.util.LuceneTestCase.Slow;
|
import org.apache.lucene.util.LuceneTestCase.Slow;
|
||||||
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util.UnicodeUtil;
|
|
||||||
import org.apache.lucene.util._TestUtil;
|
import org.apache.lucene.util._TestUtil;
|
||||||
import org.apache.lucene.util.automaton.Automaton;
|
import org.apache.lucene.util.automaton.Automaton;
|
||||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||||
|
@ -67,6 +64,10 @@ import org.apache.lucene.util.fst.FST.BytesReader;
|
||||||
import org.apache.lucene.util.fst.PairOutputs.Pair;
|
import org.apache.lucene.util.fst.PairOutputs.Pair;
|
||||||
import org.apache.lucene.util.packed.PackedInts;
|
import org.apache.lucene.util.packed.PackedInts;
|
||||||
|
|
||||||
|
import static org.apache.lucene.util.fst.FSTTester.getRandomString;
|
||||||
|
import static org.apache.lucene.util.fst.FSTTester.simpleRandomString;
|
||||||
|
import static org.apache.lucene.util.fst.FSTTester.toIntsRef;
|
||||||
|
|
||||||
@SuppressCodecs({ "SimpleText", "Memory", "Direct" })
|
@SuppressCodecs({ "SimpleText", "Memory", "Direct" })
|
||||||
@Slow
|
@Slow
|
||||||
public class TestFSTs extends LuceneTestCase {
|
public class TestFSTs extends LuceneTestCase {
|
||||||
|
@ -87,59 +88,6 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
super.tearDown();
|
super.tearDown();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static BytesRef toBytesRef(IntsRef ir) {
|
|
||||||
BytesRef br = new BytesRef(ir.length);
|
|
||||||
for(int i=0;i<ir.length;i++) {
|
|
||||||
int x = ir.ints[ir.offset+i];
|
|
||||||
assert x >= 0 && x <= 255;
|
|
||||||
br.bytes[i] = (byte) x;
|
|
||||||
}
|
|
||||||
br.length = ir.length;
|
|
||||||
return br;
|
|
||||||
}
|
|
||||||
|
|
||||||
static IntsRef toIntsRef(String s, int inputMode) {
|
|
||||||
return toIntsRef(s, inputMode, new IntsRef(10));
|
|
||||||
}
|
|
||||||
|
|
||||||
static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
|
|
||||||
if (inputMode == 0) {
|
|
||||||
// utf8
|
|
||||||
return toIntsRef(new BytesRef(s), ir);
|
|
||||||
} else {
|
|
||||||
// utf32
|
|
||||||
return toIntsRefUTF32(s, ir);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
|
|
||||||
final int charLength = s.length();
|
|
||||||
int charIdx = 0;
|
|
||||||
int intIdx = 0;
|
|
||||||
while(charIdx < charLength) {
|
|
||||||
if (intIdx == ir.ints.length) {
|
|
||||||
ir.grow(intIdx+1);
|
|
||||||
}
|
|
||||||
final int utf32 = s.codePointAt(charIdx);
|
|
||||||
ir.ints[intIdx] = utf32;
|
|
||||||
charIdx += Character.charCount(utf32);
|
|
||||||
intIdx++;
|
|
||||||
}
|
|
||||||
ir.length = intIdx;
|
|
||||||
return ir;
|
|
||||||
}
|
|
||||||
|
|
||||||
static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
|
|
||||||
if (br.length > ir.ints.length) {
|
|
||||||
ir.grow(br.length);
|
|
||||||
}
|
|
||||||
for(int i=0;i<br.length;i++) {
|
|
||||||
ir.ints[i] = br.bytes[br.offset+i]&0xFF;
|
|
||||||
}
|
|
||||||
ir.length = br.length;
|
|
||||||
return ir;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testBasicFSA() throws IOException {
|
public void testBasicFSA() throws IOException {
|
||||||
String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
|
String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
|
||||||
String[] strings2 = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation"};
|
String[] strings2 = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation"};
|
||||||
|
@ -206,19 +154,6 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String simpleRandomString(Random r) {
|
|
||||||
final int end = r.nextInt(10);
|
|
||||||
if (end == 0) {
|
|
||||||
// allow 0 length
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
final char[] buffer = new char[end];
|
|
||||||
for (int i = 0; i < end; i++) {
|
|
||||||
buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
|
|
||||||
}
|
|
||||||
return new String(buffer, 0, end);
|
|
||||||
}
|
|
||||||
|
|
||||||
// given set of terms, test the different outputs for them
|
// given set of terms, test the different outputs for them
|
||||||
private void doTest(int inputMode, IntsRef[] terms) throws IOException {
|
private void doTest(int inputMode, IntsRef[] terms) throws IOException {
|
||||||
Arrays.sort(terms);
|
Arrays.sort(terms);
|
||||||
|
@ -231,7 +166,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
for(IntsRef term : terms) {
|
for(IntsRef term : terms) {
|
||||||
pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
|
pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
|
||||||
}
|
}
|
||||||
new FSTTester<Object>(random(), dir, inputMode, pairs, outputs, false).doTest();
|
new FSTTester<Object>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// PositiveIntOutput (ord)
|
// PositiveIntOutput (ord)
|
||||||
|
@ -241,7 +176,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
for(int idx=0;idx<terms.length;idx++) {
|
for(int idx=0;idx<terms.length;idx++) {
|
||||||
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], (long) idx));
|
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], (long) idx));
|
||||||
}
|
}
|
||||||
new FSTTester<Long>(random(), dir, inputMode, pairs, outputs, true).doTest();
|
new FSTTester<Long>(random(), dir, inputMode, pairs, outputs, true).doTest(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// PositiveIntOutput (random monotonically increasing positive number)
|
// PositiveIntOutput (random monotonically increasing positive number)
|
||||||
|
@ -255,7 +190,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
lastOutput = value;
|
lastOutput = value;
|
||||||
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], value));
|
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], value));
|
||||||
}
|
}
|
||||||
new FSTTester<Long>(random(), dir, inputMode, pairs, outputs, doShare).doTest();
|
new FSTTester<Long>(random(), dir, inputMode, pairs, outputs, doShare).doTest(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// PositiveIntOutput (random positive number)
|
// PositiveIntOutput (random positive number)
|
||||||
|
@ -265,7 +200,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
for(int idx=0;idx<terms.length;idx++) {
|
for(int idx=0;idx<terms.length;idx++) {
|
||||||
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], _TestUtil.nextLong(random(), 0, Long.MAX_VALUE)));
|
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], _TestUtil.nextLong(random(), 0, Long.MAX_VALUE)));
|
||||||
}
|
}
|
||||||
new FSTTester<Long>(random(), dir, inputMode, pairs, outputs, false).doTest();
|
new FSTTester<Long>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pair<ord, (random monotonically increasing positive number)>
|
// Pair<ord, (random monotonically increasing positive number)>
|
||||||
|
@ -281,7 +216,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx],
|
pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx],
|
||||||
outputs.newPair((long) idx, value)));
|
outputs.newPair((long) idx, value)));
|
||||||
}
|
}
|
||||||
new FSTTester<PairOutputs.Pair<Long,Long>>(random(), dir, inputMode, pairs, outputs, false).doTest();
|
new FSTTester<PairOutputs.Pair<Long,Long>>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sequence-of-bytes
|
// Sequence-of-bytes
|
||||||
|
@ -293,7 +228,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
final BytesRef output = random().nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
|
final BytesRef output = random().nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
|
||||||
pairs.add(new FSTTester.InputOutput<BytesRef>(terms[idx], output));
|
pairs.add(new FSTTester.InputOutput<BytesRef>(terms[idx], output));
|
||||||
}
|
}
|
||||||
new FSTTester<BytesRef>(random(), dir, inputMode, pairs, outputs, false).doTest();
|
new FSTTester<BytesRef>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sequence-of-ints
|
// Sequence-of-ints
|
||||||
|
@ -309,722 +244,11 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
pairs.add(new FSTTester.InputOutput<IntsRef>(terms[idx], output));
|
pairs.add(new FSTTester.InputOutput<IntsRef>(terms[idx], output));
|
||||||
}
|
}
|
||||||
new FSTTester<IntsRef>(random(), dir, inputMode, pairs, outputs, false).doTest();
|
new FSTTester<IntsRef>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Up to two positive ints, shared, generally but not
|
|
||||||
// monotonically increasing
|
|
||||||
{
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: now test UpToTwoPositiveIntOutputs");
|
|
||||||
}
|
|
||||||
final UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true);
|
|
||||||
final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length);
|
|
||||||
long lastOutput = 0;
|
|
||||||
for(int idx=0;idx<terms.length;idx++) {
|
|
||||||
// Sometimes go backwards
|
|
||||||
long value = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
|
|
||||||
while(value < 0) {
|
|
||||||
value = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
|
|
||||||
}
|
|
||||||
final Object output;
|
|
||||||
if (random().nextInt(5) == 3) {
|
|
||||||
long value2 = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
|
|
||||||
while(value2 < 0) {
|
|
||||||
value2 = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
|
|
||||||
}
|
|
||||||
output = outputs.get(value, value2);
|
|
||||||
} else {
|
|
||||||
output = outputs.get(value);
|
|
||||||
}
|
|
||||||
pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
|
|
||||||
}
|
|
||||||
new FSTTester<Object>(random(), dir, inputMode, pairs, outputs, false).doTest();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class FSTTester<T> {
|
|
||||||
|
|
||||||
final Random random;
|
|
||||||
final List<InputOutput<T>> pairs;
|
|
||||||
final int inputMode;
|
|
||||||
final Outputs<T> outputs;
|
|
||||||
final Directory dir;
|
|
||||||
final boolean doReverseLookup;
|
|
||||||
|
|
||||||
public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs, boolean doReverseLookup) {
|
|
||||||
this.random = random;
|
|
||||||
this.dir = dir;
|
|
||||||
this.inputMode = inputMode;
|
|
||||||
this.pairs = pairs;
|
|
||||||
this.outputs = outputs;
|
|
||||||
this.doReverseLookup = doReverseLookup;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static class InputOutput<T> implements Comparable<InputOutput<T>> {
|
|
||||||
public final IntsRef input;
|
|
||||||
public final T output;
|
|
||||||
|
|
||||||
public InputOutput(IntsRef input, T output) {
|
|
||||||
this.input = input;
|
|
||||||
this.output = output;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int compareTo(InputOutput<T> other) {
|
|
||||||
if (other instanceof InputOutput) {
|
|
||||||
return input.compareTo((other).input);
|
|
||||||
} else {
|
|
||||||
throw new IllegalArgumentException();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void doTest() throws IOException {
|
|
||||||
// no pruning
|
|
||||||
doTest(0, 0, true);
|
|
||||||
|
|
||||||
if (!(outputs instanceof UpToTwoPositiveIntOutputs)) {
|
|
||||||
// simple pruning
|
|
||||||
doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true);
|
|
||||||
|
|
||||||
// leafy pruning
|
|
||||||
doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// runs the term, returning the output, or null if term
|
|
||||||
// isn't accepted. if prefixLength is non-null it must be
|
|
||||||
// length 1 int array; prefixLength[0] is set to the length
|
|
||||||
// of the term prefix that matches
|
|
||||||
private T run(FST<T> fst, IntsRef term, int[] prefixLength) throws IOException {
|
|
||||||
assert prefixLength == null || prefixLength.length == 1;
|
|
||||||
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
|
|
||||||
final T NO_OUTPUT = fst.outputs.getNoOutput();
|
|
||||||
T output = NO_OUTPUT;
|
|
||||||
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
|
||||||
|
|
||||||
for(int i=0;i<=term.length;i++) {
|
|
||||||
final int label;
|
|
||||||
if (i == term.length) {
|
|
||||||
label = FST.END_LABEL;
|
|
||||||
} else {
|
|
||||||
label = term.ints[term.offset+i];
|
|
||||||
}
|
|
||||||
// System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
|
|
||||||
if (fst.findTargetArc(label, arc, arc, fstReader) == null) {
|
|
||||||
// System.out.println(" not found");
|
|
||||||
if (prefixLength != null) {
|
|
||||||
prefixLength[0] = i;
|
|
||||||
return output;
|
|
||||||
} else {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
output = fst.outputs.add(output, arc.output);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (prefixLength != null) {
|
|
||||||
prefixLength[0] = term.length;
|
|
||||||
}
|
|
||||||
|
|
||||||
return output;
|
|
||||||
}
|
|
||||||
|
|
||||||
private T randomAcceptedWord(FST<T> fst, IntsRef in) throws IOException {
|
|
||||||
FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
|
|
||||||
|
|
||||||
final List<FST.Arc<T>> arcs = new ArrayList<FST.Arc<T>>();
|
|
||||||
in.length = 0;
|
|
||||||
in.offset = 0;
|
|
||||||
final T NO_OUTPUT = fst.outputs.getNoOutput();
|
|
||||||
T output = NO_OUTPUT;
|
|
||||||
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
|
||||||
|
|
||||||
while(true) {
|
|
||||||
// read all arcs:
|
|
||||||
fst.readFirstTargetArc(arc, arc, fstReader);
|
|
||||||
arcs.add(new FST.Arc<T>().copyFrom(arc));
|
|
||||||
while(!arc.isLast()) {
|
|
||||||
fst.readNextArc(arc, fstReader);
|
|
||||||
arcs.add(new FST.Arc<T>().copyFrom(arc));
|
|
||||||
}
|
|
||||||
|
|
||||||
// pick one
|
|
||||||
arc = arcs.get(random.nextInt(arcs.size()));
|
|
||||||
arcs.clear();
|
|
||||||
|
|
||||||
// accumulate output
|
|
||||||
output = fst.outputs.add(output, arc.output);
|
|
||||||
|
|
||||||
// append label
|
|
||||||
if (arc.label == FST.END_LABEL) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (in.ints.length == in.length) {
|
|
||||||
in.grow(1+in.length);
|
|
||||||
}
|
|
||||||
in.ints[in.length++] = arc.label;
|
|
||||||
}
|
|
||||||
|
|
||||||
return output;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
|
|
||||||
}
|
|
||||||
|
|
||||||
final boolean willRewrite = random.nextBoolean();
|
|
||||||
|
|
||||||
final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
|
|
||||||
prune1, prune2,
|
|
||||||
prune1==0 && prune2==0,
|
|
||||||
allowRandomSuffixSharing ? random.nextBoolean() : true,
|
|
||||||
allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
|
|
||||||
outputs,
|
|
||||||
null,
|
|
||||||
willRewrite);
|
|
||||||
|
|
||||||
for(InputOutput<T> pair : pairs) {
|
|
||||||
if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
|
|
||||||
final UpToTwoPositiveIntOutputs _outputs = (UpToTwoPositiveIntOutputs) outputs;
|
|
||||||
final UpToTwoPositiveIntOutputs.TwoLongs twoLongs = (UpToTwoPositiveIntOutputs.TwoLongs) pair.output;
|
|
||||||
@SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder;
|
|
||||||
builderObject.add(pair.input, _outputs.get(twoLongs.first));
|
|
||||||
builderObject.add(pair.input, _outputs.get(twoLongs.second));
|
|
||||||
} else {
|
|
||||||
builder.add(pair.input, pair.output);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
FST<T> fst = builder.finish();
|
|
||||||
|
|
||||||
if (random.nextBoolean() && fst != null && !willRewrite) {
|
|
||||||
IOContext context = LuceneTestCase.newIOContext(random);
|
|
||||||
IndexOutput out = dir.createOutput("fst.bin", context);
|
|
||||||
fst.save(out);
|
|
||||||
out.close();
|
|
||||||
IndexInput in = dir.openInput("fst.bin", context);
|
|
||||||
try {
|
|
||||||
fst = new FST<T>(in, outputs);
|
|
||||||
} finally {
|
|
||||||
in.close();
|
|
||||||
dir.deleteFile("fst.bin");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (VERBOSE && pairs.size() <= 20 && fst != null) {
|
|
||||||
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
|
|
||||||
Util.toDot(fst, w, false, false);
|
|
||||||
w.close();
|
|
||||||
System.out.println("SAVED out.dot");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (VERBOSE) {
|
|
||||||
if (fst == null) {
|
|
||||||
System.out.println(" fst has 0 nodes (fully pruned)");
|
|
||||||
} else {
|
|
||||||
System.out.println(" fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (prune1 == 0 && prune2 == 0) {
|
|
||||||
verifyUnPruned(inputMode, fst);
|
|
||||||
} else {
|
|
||||||
verifyPruned(inputMode, fst, prune1, prune2);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (willRewrite && fst != null) {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: now rewrite");
|
|
||||||
}
|
|
||||||
final FST<T> packed = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000), random.nextFloat());
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: now verify packed FST");
|
|
||||||
}
|
|
||||||
if (prune1 == 0 && prune2 == 0) {
|
|
||||||
verifyUnPruned(inputMode, packed);
|
|
||||||
} else {
|
|
||||||
verifyPruned(inputMode, packed, prune1, prune2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return fst;
|
|
||||||
}
|
|
||||||
|
|
||||||
// FST is complete
|
|
||||||
private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {
|
|
||||||
|
|
||||||
final FST<Long> fstLong;
|
|
||||||
final Set<Long> validOutputs;
|
|
||||||
long minLong = Long.MAX_VALUE;
|
|
||||||
long maxLong = Long.MIN_VALUE;
|
|
||||||
|
|
||||||
if (doReverseLookup) {
|
|
||||||
@SuppressWarnings("unchecked") FST<Long> fstLong0 = (FST<Long>) fst;
|
|
||||||
fstLong = fstLong0;
|
|
||||||
validOutputs = new HashSet<Long>();
|
|
||||||
for(InputOutput<T> pair: pairs) {
|
|
||||||
Long output = (Long) pair.output;
|
|
||||||
maxLong = Math.max(maxLong, output);
|
|
||||||
minLong = Math.min(minLong, output);
|
|
||||||
validOutputs.add(output);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
fstLong = null;
|
|
||||||
validOutputs = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (pairs.size() == 0) {
|
|
||||||
assertNull(fst);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: now verify " + pairs.size() + " terms");
|
|
||||||
for(InputOutput<T> pair : pairs) {
|
|
||||||
assertNotNull(pair);
|
|
||||||
assertNotNull(pair.input);
|
|
||||||
assertNotNull(pair.output);
|
|
||||||
System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
assertNotNull(fst);
|
|
||||||
|
|
||||||
// visit valid pairs in order -- make sure all words
|
|
||||||
// are accepted, and FSTEnum's next() steps through
|
|
||||||
// them correctly
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: check valid terms/next()");
|
|
||||||
}
|
|
||||||
{
|
|
||||||
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
|
|
||||||
for(InputOutput<T> pair : pairs) {
|
|
||||||
IntsRef term = pair.input;
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
|
|
||||||
}
|
|
||||||
Object output = run(fst, term, null);
|
|
||||||
assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
|
|
||||||
assertEquals(pair.output, output);
|
|
||||||
|
|
||||||
// verify enum's next
|
|
||||||
IntsRefFSTEnum.InputOutput<T> t = fstEnum.next();
|
|
||||||
assertNotNull(t);
|
|
||||||
assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input);
|
|
||||||
assertEquals(pair.output, t.output);
|
|
||||||
}
|
|
||||||
assertNull(fstEnum.next());
|
|
||||||
}
|
|
||||||
|
|
||||||
final Map<IntsRef,T> termsMap = new HashMap<IntsRef,T>();
|
|
||||||
for(InputOutput<T> pair : pairs) {
|
|
||||||
termsMap.put(pair.input, pair.output);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (doReverseLookup && maxLong > minLong) {
|
|
||||||
// Do random lookups so we test null (output doesn't
|
|
||||||
// exist) case:
|
|
||||||
assertNull(Util.getByOutput(fstLong, minLong-7));
|
|
||||||
assertNull(Util.getByOutput(fstLong, maxLong+7));
|
|
||||||
|
|
||||||
final int num = atLeast(100);
|
|
||||||
for(int iter=0;iter<num;iter++) {
|
|
||||||
Long v = _TestUtil.nextLong(random, minLong, maxLong);
|
|
||||||
IntsRef input = Util.getByOutput(fstLong, v);
|
|
||||||
assertTrue(validOutputs.contains(v) || input == null);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// find random matching word and make sure it's valid
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: verify random accepted terms");
|
|
||||||
}
|
|
||||||
final IntsRef scratch = new IntsRef(10);
|
|
||||||
int num = atLeast(500);
|
|
||||||
for(int iter=0;iter<num;iter++) {
|
|
||||||
T output = randomAcceptedWord(fst, scratch);
|
|
||||||
assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
|
|
||||||
assertEquals(termsMap.get(scratch), output);
|
|
||||||
|
|
||||||
if (doReverseLookup) {
|
|
||||||
//System.out.println("lookup output=" + output + " outs=" + fst.outputs);
|
|
||||||
IntsRef input = Util.getByOutput(fstLong, (Long) output);
|
|
||||||
assertNotNull(input);
|
|
||||||
//System.out.println(" got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
|
|
||||||
assertEquals(scratch, input);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// test IntsRefFSTEnum.seek:
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: verify seek");
|
|
||||||
}
|
|
||||||
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
|
|
||||||
num = atLeast(100);
|
|
||||||
for(int iter=0;iter<num;iter++) {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" iter=" + iter);
|
|
||||||
}
|
|
||||||
if (random.nextBoolean()) {
|
|
||||||
// seek to term that doesn't exist:
|
|
||||||
while(true) {
|
|
||||||
final IntsRef term = toIntsRef(getRandomString(random), inputMode);
|
|
||||||
int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
|
|
||||||
if (pos < 0) {
|
|
||||||
pos = -(pos+1);
|
|
||||||
// ok doesn't exist
|
|
||||||
//System.out.println(" seek " + inputToString(inputMode, term));
|
|
||||||
final IntsRefFSTEnum.InputOutput<T> seekResult;
|
|
||||||
if (random.nextInt(3) == 0) {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" do non-exist seekExact term=" + inputToString(inputMode, term));
|
|
||||||
}
|
|
||||||
seekResult = fstEnum.seekExact(term);
|
|
||||||
pos = -1;
|
|
||||||
} else if (random.nextBoolean()) {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" do non-exist seekFloor term=" + inputToString(inputMode, term));
|
|
||||||
}
|
|
||||||
seekResult = fstEnum.seekFloor(term);
|
|
||||||
pos--;
|
|
||||||
} else {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" do non-exist seekCeil term=" + inputToString(inputMode, term));
|
|
||||||
}
|
|
||||||
seekResult = fstEnum.seekCeil(term);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (pos != -1 && pos < pairs.size()) {
|
|
||||||
//System.out.println(" got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output));
|
|
||||||
assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult);
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" got " + inputToString(inputMode, seekResult.input));
|
|
||||||
}
|
|
||||||
assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input);
|
|
||||||
assertEquals(pairs.get(pos).output, seekResult.output);
|
|
||||||
} else {
|
|
||||||
// seeked before start or beyond end
|
|
||||||
//System.out.println("seek=" + seekTerm);
|
|
||||||
assertNull("expected null but got " + (seekResult==null ? "null" : inputToString(inputMode, seekResult.input)), seekResult);
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" got null");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// seek to term that does exist:
|
|
||||||
InputOutput<T> pair = pairs.get(random.nextInt(pairs.size()));
|
|
||||||
final IntsRefFSTEnum.InputOutput<T> seekResult;
|
|
||||||
if (random.nextInt(3) == 2) {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" do exists seekExact term=" + inputToString(inputMode, pair.input));
|
|
||||||
}
|
|
||||||
seekResult = fstEnum.seekExact(pair.input);
|
|
||||||
} else if (random.nextBoolean()) {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" do exists seekFloor " + inputToString(inputMode, pair.input));
|
|
||||||
}
|
|
||||||
seekResult = fstEnum.seekFloor(pair.input);
|
|
||||||
} else {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" do exists seekCeil " + inputToString(inputMode, pair.input));
|
|
||||||
}
|
|
||||||
seekResult = fstEnum.seekCeil(pair.input);
|
|
||||||
}
|
|
||||||
assertNotNull(seekResult);
|
|
||||||
assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input);
|
|
||||||
assertEquals(pair.output, seekResult.output);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: mixed next/seek");
|
|
||||||
}
|
|
||||||
|
|
||||||
// test mixed next/seek
|
|
||||||
num = atLeast(100);
|
|
||||||
for(int iter=0;iter<num;iter++) {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: iter " + iter);
|
|
||||||
}
|
|
||||||
// reset:
|
|
||||||
fstEnum = new IntsRefFSTEnum<T>(fst);
|
|
||||||
int upto = -1;
|
|
||||||
while(true) {
|
|
||||||
boolean isDone = false;
|
|
||||||
if (upto == pairs.size()-1 || random.nextBoolean()) {
|
|
||||||
// next
|
|
||||||
upto++;
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" do next");
|
|
||||||
}
|
|
||||||
isDone = fstEnum.next() == null;
|
|
||||||
} else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) {
|
|
||||||
int attempt = 0;
|
|
||||||
for(;attempt<10;attempt++) {
|
|
||||||
IntsRef term = toIntsRef(getRandomString(random), inputMode);
|
|
||||||
if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) {
|
|
||||||
int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
|
|
||||||
assert pos < 0;
|
|
||||||
upto = -(pos+1);
|
|
||||||
|
|
||||||
if (random.nextBoolean()) {
|
|
||||||
upto--;
|
|
||||||
assertTrue(upto != -1);
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" do non-exist seekFloor(" + inputToString(inputMode, term) + ")");
|
|
||||||
}
|
|
||||||
isDone = fstEnum.seekFloor(term) == null;
|
|
||||||
} else {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" do non-exist seekCeil(" + inputToString(inputMode, term) + ")");
|
|
||||||
}
|
|
||||||
isDone = fstEnum.seekCeil(term) == null;
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (attempt == 10) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
|
||||||
final int inc = random.nextInt(pairs.size() - upto - 1);
|
|
||||||
upto += inc;
|
|
||||||
if (upto == -1) {
|
|
||||||
upto = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (random.nextBoolean()) {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" do seekCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")");
|
|
||||||
}
|
|
||||||
isDone = fstEnum.seekCeil(pairs.get(upto).input) == null;
|
|
||||||
} else {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" do seekFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")");
|
|
||||||
}
|
|
||||||
isDone = fstEnum.seekFloor(pairs.get(upto).input) == null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (VERBOSE) {
|
|
||||||
if (!isDone) {
|
|
||||||
System.out.println(" got " + inputToString(inputMode, fstEnum.current().input));
|
|
||||||
} else {
|
|
||||||
System.out.println(" got null");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (upto == pairs.size()) {
|
|
||||||
assertTrue(isDone);
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
assertFalse(isDone);
|
|
||||||
assertEquals(pairs.get(upto).input, fstEnum.current().input);
|
|
||||||
assertEquals(pairs.get(upto).output, fstEnum.current().output);
|
|
||||||
|
|
||||||
/*
|
|
||||||
if (upto < pairs.size()-1) {
|
|
||||||
int tryCount = 0;
|
|
||||||
while(tryCount < 10) {
|
|
||||||
final IntsRef t = toIntsRef(getRandomString(), inputMode);
|
|
||||||
if (pairs.get(upto).input.compareTo(t) < 0) {
|
|
||||||
final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0;
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected);
|
|
||||||
}
|
|
||||||
assertEquals(expected, fstEnum.beforeNext(t));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
tryCount++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static class CountMinOutput<T> {
|
|
||||||
int count;
|
|
||||||
T output;
|
|
||||||
T finalOutput;
|
|
||||||
boolean isLeaf = true;
|
|
||||||
boolean isFinal;
|
|
||||||
}
|
|
||||||
|
|
||||||
// FST is pruned
|
|
||||||
private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {
|
|
||||||
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
|
|
||||||
for(InputOutput<T> pair : pairs) {
|
|
||||||
System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// To validate the FST, we brute-force compute all prefixes
|
|
||||||
// in the terms, matched to their "common" outputs, prune that
|
|
||||||
// set according to the prune thresholds, then assert the FST
|
|
||||||
// matches that same set.
|
|
||||||
|
|
||||||
// NOTE: Crazy RAM intensive!!
|
|
||||||
|
|
||||||
//System.out.println("TEST: tally prefixes");
|
|
||||||
|
|
||||||
// build all prefixes
|
|
||||||
final Map<IntsRef,CountMinOutput<T>> prefixes = new HashMap<IntsRef,CountMinOutput<T>>();
|
|
||||||
final IntsRef scratch = new IntsRef(10);
|
|
||||||
for(InputOutput<T> pair: pairs) {
|
|
||||||
scratch.copyInts(pair.input);
|
|
||||||
for(int idx=0;idx<=pair.input.length;idx++) {
|
|
||||||
scratch.length = idx;
|
|
||||||
CountMinOutput<T> cmo = prefixes.get(scratch);
|
|
||||||
if (cmo == null) {
|
|
||||||
cmo = new CountMinOutput<T>();
|
|
||||||
cmo.count = 1;
|
|
||||||
cmo.output = pair.output;
|
|
||||||
prefixes.put(IntsRef.deepCopyOf(scratch), cmo);
|
|
||||||
} else {
|
|
||||||
cmo.count++;
|
|
||||||
T output1 = cmo.output;
|
|
||||||
if (output1.equals(outputs.getNoOutput())) {
|
|
||||||
output1 = outputs.getNoOutput();
|
|
||||||
}
|
|
||||||
T output2 = pair.output;
|
|
||||||
if (output2.equals(outputs.getNoOutput())) {
|
|
||||||
output2 = outputs.getNoOutput();
|
|
||||||
}
|
|
||||||
cmo.output = outputs.common(output1, output2);
|
|
||||||
}
|
|
||||||
if (idx == pair.input.length) {
|
|
||||||
cmo.isFinal = true;
|
|
||||||
cmo.finalOutput = cmo.output;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: now prune");
|
|
||||||
}
|
|
||||||
|
|
||||||
// prune 'em
|
|
||||||
final Iterator<Map.Entry<IntsRef,CountMinOutput<T>>> it = prefixes.entrySet().iterator();
|
|
||||||
while(it.hasNext()) {
|
|
||||||
Map.Entry<IntsRef,CountMinOutput<T>> ent = it.next();
|
|
||||||
final IntsRef prefix = ent.getKey();
|
|
||||||
final CountMinOutput<T> cmo = ent.getValue();
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
|
|
||||||
}
|
|
||||||
final boolean keep;
|
|
||||||
if (prune1 > 0) {
|
|
||||||
keep = cmo.count >= prune1;
|
|
||||||
} else {
|
|
||||||
assert prune2 > 0;
|
|
||||||
if (prune2 > 1 && cmo.count >= prune2) {
|
|
||||||
keep = true;
|
|
||||||
} else if (prefix.length > 0) {
|
|
||||||
// consult our parent
|
|
||||||
scratch.length = prefix.length-1;
|
|
||||||
System.arraycopy(prefix.ints, prefix.offset, scratch.ints, 0, scratch.length);
|
|
||||||
final CountMinOutput<T> cmo2 = prefixes.get(scratch);
|
|
||||||
//System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
|
|
||||||
keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
|
|
||||||
} else if (cmo.count >= prune2) {
|
|
||||||
keep = true;
|
|
||||||
} else {
|
|
||||||
keep = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!keep) {
|
|
||||||
it.remove();
|
|
||||||
//System.out.println(" remove");
|
|
||||||
} else {
|
|
||||||
// clear isLeaf for all ancestors
|
|
||||||
//System.out.println(" keep");
|
|
||||||
scratch.copyInts(prefix);
|
|
||||||
scratch.length--;
|
|
||||||
while(scratch.length >= 0) {
|
|
||||||
final CountMinOutput<T> cmo2 = prefixes.get(scratch);
|
|
||||||
if (cmo2 != null) {
|
|
||||||
//System.out.println(" clear isLeaf " + inputToString(inputMode, scratch));
|
|
||||||
cmo2.isLeaf = false;
|
|
||||||
}
|
|
||||||
scratch.length--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: after prune");
|
|
||||||
for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
|
|
||||||
System.out.println(" " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
|
|
||||||
if (ent.getValue().isFinal) {
|
|
||||||
System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (prefixes.size() <= 1) {
|
|
||||||
assertNull(fst);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
assertNotNull(fst);
|
|
||||||
|
|
||||||
// make sure FST only enums valid prefixes
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: check pruned enum");
|
|
||||||
}
|
|
||||||
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
|
|
||||||
IntsRefFSTEnum.InputOutput<T> current;
|
|
||||||
while((current = fstEnum.next()) != null) {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(" fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output));
|
|
||||||
}
|
|
||||||
final CountMinOutput<T> cmo = prefixes.get(current.input);
|
|
||||||
assertNotNull(cmo);
|
|
||||||
assertTrue(cmo.isLeaf || cmo.isFinal);
|
|
||||||
//if (cmo.isFinal && !cmo.isLeaf) {
|
|
||||||
if (cmo.isFinal) {
|
|
||||||
assertEquals(cmo.finalOutput, current.output);
|
|
||||||
} else {
|
|
||||||
assertEquals(cmo.output, current.output);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// make sure all non-pruned prefixes are present in the FST
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: verify all prefixes");
|
|
||||||
}
|
|
||||||
final int[] stopNode = new int[1];
|
|
||||||
for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
|
|
||||||
if (ent.getKey().length > 0) {
|
|
||||||
final CountMinOutput<T> cmo = ent.getValue();
|
|
||||||
final T output = run(fst, ent.getKey(), stopNode);
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output));
|
|
||||||
}
|
|
||||||
// if (cmo.isFinal && !cmo.isLeaf) {
|
|
||||||
if (cmo.isFinal) {
|
|
||||||
assertEquals(cmo.finalOutput, output);
|
|
||||||
} else {
|
|
||||||
assertEquals(cmo.output, output);
|
|
||||||
}
|
|
||||||
assertEquals(ent.getKey().length, stopNode[0]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testRandomWords() throws IOException {
|
public void testRandomWords() throws IOException {
|
||||||
testRandomWords(1000, atLeast(2));
|
testRandomWords(1000, atLeast(2));
|
||||||
|
@ -1058,40 +282,11 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static String getRandomString(Random random) {
|
|
||||||
final String term;
|
|
||||||
if (random.nextBoolean()) {
|
|
||||||
term = _TestUtil.randomRealisticUnicodeString(random);
|
|
||||||
} else {
|
|
||||||
// we want to mix in limited-alphabet symbols so
|
|
||||||
// we get more sharing of the nodes given how few
|
|
||||||
// terms we are testing...
|
|
||||||
term = simpleRandomString(random);
|
|
||||||
}
|
|
||||||
return term;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Nightly
|
@Nightly
|
||||||
public void testBigSet() throws IOException {
|
public void testBigSet() throws IOException {
|
||||||
testRandomWords(_TestUtil.nextInt(random(), 50000, 60000), 1);
|
testRandomWords(_TestUtil.nextInt(random(), 50000, 60000), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
static String inputToString(int inputMode, IntsRef term) {
|
|
||||||
return inputToString(inputMode, term, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
|
|
||||||
if (!isValidUnicode) {
|
|
||||||
return term.toString();
|
|
||||||
} else if (inputMode == 0) {
|
|
||||||
// utf8
|
|
||||||
return toBytesRef(term).utf8ToString() + " " + term;
|
|
||||||
} else {
|
|
||||||
// utf32
|
|
||||||
return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Build FST for all unique terms in the test line docs
|
// Build FST for all unique terms in the test line docs
|
||||||
// file, up until a time limit
|
// file, up until a time limit
|
||||||
public void testRealTerms() throws Exception {
|
public void testRealTerms() throws Exception {
|
||||||
|
|
|
@ -0,0 +1,188 @@
|
||||||
|
package org.apache.lucene.util.fst;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.lucene.store.DataInput;
|
||||||
|
import org.apache.lucene.store.DataOutput;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wraps another Outputs implementation and encodes one or
|
||||||
|
* more of its output values. You can use this when a single
|
||||||
|
* input may need to map to more than one output,
|
||||||
|
* maintaining order: pass the same input with a different
|
||||||
|
* output by calling {@link Builder#add(IntsRef,Object)} multiple
|
||||||
|
* times. The builder will then combine the outputs using
|
||||||
|
* the {@link Outputs#merge(Object,Object)} method.
|
||||||
|
*
|
||||||
|
* <p>The resulting FST may not be minimal when an input has
|
||||||
|
* more than one output, as this requires pushing all
|
||||||
|
* multi-output values to a final state.
|
||||||
|
*
|
||||||
|
* <p>NOTE: this cannot wrap itself (ie you cannot make an
|
||||||
|
* FST with List<List<Object>> outputs using this).
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
// NOTE: i think we could get a more compact FST if, instead
|
||||||
|
// of adding the same input multiple times with a different
|
||||||
|
// output each time, we added it only once with a
|
||||||
|
// pre-constructed List<T> output. This way the "multiple
|
||||||
|
// values" is fully opaque to the Builder/FST. It would
|
||||||
|
// require implementing the full algebra using set
|
||||||
|
// arithmetic (I think?); maybe SetOfOutputs is a good name.
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public final class ListOfOutputs<T> extends Outputs<Object> {
|
||||||
|
|
||||||
|
private final Outputs<T> outputs;
|
||||||
|
|
||||||
|
public ListOfOutputs(Outputs<T> outputs) {
|
||||||
|
this.outputs = outputs;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object common(Object output1, Object output2) {
|
||||||
|
// These will never be a list:
|
||||||
|
return outputs.common((T) output1, (T) output2);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object subtract(Object object, Object inc) {
|
||||||
|
// These will never be a list:
|
||||||
|
return outputs.subtract((T) object, (T) inc);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object add(Object prefix, Object output) {
|
||||||
|
assert !(prefix instanceof List);
|
||||||
|
if (!(output instanceof List)) {
|
||||||
|
return outputs.add((T) prefix, (T) output);
|
||||||
|
} else {
|
||||||
|
List<T> outputList = (List<T>) output;
|
||||||
|
List<T> addedList = new ArrayList<T>(outputList.size());
|
||||||
|
for(T _output : outputList) {
|
||||||
|
addedList.add(outputs.add((T) prefix, _output));
|
||||||
|
}
|
||||||
|
return addedList;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(Object output, DataOutput out) throws IOException {
|
||||||
|
assert !(output instanceof List);
|
||||||
|
outputs.write((T) output, out);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void writeFinalOutput(Object output, DataOutput out) throws IOException {
|
||||||
|
if (!(output instanceof List)) {
|
||||||
|
out.writeVInt(1);
|
||||||
|
outputs.write((T) output, out);
|
||||||
|
} else {
|
||||||
|
List<T> outputList = (List<T>) output;
|
||||||
|
out.writeVInt(outputList.size());
|
||||||
|
for(T eachOutput : outputList) {
|
||||||
|
outputs.write(eachOutput, out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object read(DataInput in) throws IOException {
|
||||||
|
return outputs.read(in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object readFinalOutput(DataInput in) throws IOException {
|
||||||
|
int count = in.readVInt();
|
||||||
|
if (count == 1) {
|
||||||
|
return outputs.read(in);
|
||||||
|
} else {
|
||||||
|
List<T> outputList = new ArrayList<T>(count);
|
||||||
|
for(int i=0;i<count;i++) {
|
||||||
|
outputList.add(outputs.read(in));
|
||||||
|
}
|
||||||
|
return outputList;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object getNoOutput() {
|
||||||
|
return outputs.getNoOutput();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String outputToString(Object output) {
|
||||||
|
if (!(output instanceof List)) {
|
||||||
|
return outputs.outputToString((T) output);
|
||||||
|
} else {
|
||||||
|
List<T> outputList = (List<T>) output;
|
||||||
|
|
||||||
|
StringBuilder b = new StringBuilder();
|
||||||
|
b.append('[');
|
||||||
|
|
||||||
|
for(int i=0;i<outputList.size();i++) {
|
||||||
|
if (i > 0) {
|
||||||
|
b.append(", ");
|
||||||
|
}
|
||||||
|
b.append(outputs.outputToString(outputList.get(i)));
|
||||||
|
}
|
||||||
|
b.append(']');
|
||||||
|
return b.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object merge(Object first, Object second) {
|
||||||
|
List<T> outputList = new ArrayList<T>();
|
||||||
|
if (!(first instanceof List)) {
|
||||||
|
outputList.add((T) first);
|
||||||
|
} else {
|
||||||
|
outputList.addAll((List<T>) first);
|
||||||
|
}
|
||||||
|
if (!(second instanceof List)) {
|
||||||
|
outputList.add((T) second);
|
||||||
|
} else {
|
||||||
|
outputList.addAll((List<T>) second);
|
||||||
|
}
|
||||||
|
//System.out.println("MERGE: now " + outputList.size() + " first=" + outputToString(first) + " second=" + outputToString(second));
|
||||||
|
//System.out.println(" return " + outputToString(outputList));
|
||||||
|
return outputList;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "OneOrMoreOutputs(" + outputs + ")";
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<T> asList(Object output) {
|
||||||
|
if (!(output instanceof List)) {
|
||||||
|
List<T> result = new ArrayList<T>(1);
|
||||||
|
result.add((T) output);
|
||||||
|
return result;
|
||||||
|
} else {
|
||||||
|
return (List<T>) output;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,6 +17,21 @@ package org.apache.lucene.util.fst;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An FST {@link Outputs} implementation where each output
|
||||||
|
* is one or two non-negative long values. If it's a
|
||||||
|
* single output, Long is returned; else, TwoLongs. Order
|
||||||
|
* is preserved in the TwoLongs case, ie .first is the first
|
||||||
|
* input/output added to Builder, and .second is the
|
||||||
|
* second. You cannot store 0 output with this (that's
|
||||||
|
* reserved to mean "no output")!
|
||||||
|
*
|
||||||
|
* NOTE: the resulting FST is not guaranteed to be minimal!
|
||||||
|
* See {@link Builder}.
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.store.DataInput;
|
import org.apache.lucene.store.DataInput;
|
||||||
|
@ -36,7 +51,6 @@ import org.apache.lucene.store.DataOutput;
|
||||||
*
|
*
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public final class UpToTwoPositiveIntOutputs extends Outputs<Object> {
|
public final class UpToTwoPositiveIntOutputs extends Outputs<Object> {
|
||||||
|
|
||||||
/** Holds two long outputs. */
|
/** Holds two long outputs. */
|
|
@ -0,0 +1,21 @@
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
Misc FST classes.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,237 @@
|
||||||
|
package org.apache.lucene.util.fst;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.store.MockDirectoryWrapper;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
import org.apache.lucene.util.fst.UpToTwoPositiveIntOutputs.TwoLongs;
|
||||||
|
|
||||||
|
import static org.apache.lucene.util.fst.FSTTester.getRandomString;
|
||||||
|
import static org.apache.lucene.util.fst.FSTTester.toIntsRef;
|
||||||
|
|
||||||
|
public class TestFSTsMisc extends LuceneTestCase {
|
||||||
|
|
||||||
|
private MockDirectoryWrapper dir;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
dir = newMockDirectory();
|
||||||
|
dir.setPreventDoubleWrite(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void tearDown() throws Exception {
|
||||||
|
// can be null if we force simpletext (funky, some kind of bug in test runner maybe)
|
||||||
|
if (dir != null) dir.close();
|
||||||
|
super.tearDown();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandomWords() throws IOException {
|
||||||
|
testRandomWords(1000, LuceneTestCase.atLeast(random(), 2));
|
||||||
|
//testRandomWords(100, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testRandomWords(int maxNumWords, int numIter) throws IOException {
|
||||||
|
Random random = new Random(random().nextLong());
|
||||||
|
for(int iter=0;iter<numIter;iter++) {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("\nTEST: iter " + iter);
|
||||||
|
}
|
||||||
|
for(int inputMode=0;inputMode<2;inputMode++) {
|
||||||
|
final int numWords = random.nextInt(maxNumWords+1);
|
||||||
|
Set<IntsRef> termsSet = new HashSet<IntsRef>();
|
||||||
|
IntsRef[] terms = new IntsRef[numWords];
|
||||||
|
while(termsSet.size() < numWords) {
|
||||||
|
final String term = getRandomString(random);
|
||||||
|
termsSet.add(toIntsRef(term, inputMode));
|
||||||
|
}
|
||||||
|
doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doTest(int inputMode, IntsRef[] terms) throws IOException {
|
||||||
|
Arrays.sort(terms);
|
||||||
|
|
||||||
|
// Up to two positive ints, shared, generally but not
|
||||||
|
// monotonically increasing
|
||||||
|
{
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: now test UpToTwoPositiveIntOutputs");
|
||||||
|
}
|
||||||
|
final UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true);
|
||||||
|
final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length);
|
||||||
|
long lastOutput = 0;
|
||||||
|
for(int idx=0;idx<terms.length;idx++) {
|
||||||
|
// Sometimes go backwards
|
||||||
|
long value = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
|
||||||
|
while(value < 0) {
|
||||||
|
value = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
|
||||||
|
}
|
||||||
|
final Object output;
|
||||||
|
if (random().nextInt(5) == 3) {
|
||||||
|
long value2 = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
|
||||||
|
while(value2 < 0) {
|
||||||
|
value2 = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
|
||||||
|
}
|
||||||
|
List<Long> values = new ArrayList<Long>();
|
||||||
|
values.add(value);
|
||||||
|
values.add(value2);
|
||||||
|
output = values;
|
||||||
|
} else {
|
||||||
|
output = outputs.get(value);
|
||||||
|
}
|
||||||
|
pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
|
||||||
|
}
|
||||||
|
new FSTTester<Object>(random(), dir, inputMode, pairs, outputs, false) {
|
||||||
|
@Override
|
||||||
|
protected boolean outputsEqual(Object output1, Object output2) {
|
||||||
|
if (output1 instanceof TwoLongs && output2 instanceof List) {
|
||||||
|
TwoLongs twoLongs1 = (TwoLongs) output1;
|
||||||
|
return Arrays.asList(new Long[] {twoLongs1.first, twoLongs1.second}).equals(output2);
|
||||||
|
} else if (output2 instanceof TwoLongs && output1 instanceof List) {
|
||||||
|
TwoLongs twoLongs2 = (TwoLongs) output2;
|
||||||
|
return Arrays.asList(new Long[] {twoLongs2.first, twoLongs2.second}).equals(output1);
|
||||||
|
}
|
||||||
|
return output1.equals(output2);
|
||||||
|
}
|
||||||
|
}.doTest(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListOfOutputs(PositiveIntOutputs), generally but not
|
||||||
|
// monotonically increasing
|
||||||
|
{
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: now test OneOrMoreOutputs");
|
||||||
|
}
|
||||||
|
final PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
|
||||||
|
final ListOfOutputs<Long> outputs = new ListOfOutputs<Long>(_outputs);
|
||||||
|
final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length);
|
||||||
|
long lastOutput = 0;
|
||||||
|
for(int idx=0;idx<terms.length;idx++) {
|
||||||
|
|
||||||
|
int outputCount = _TestUtil.nextInt(random(), 1, 7);
|
||||||
|
List<Long> values = new ArrayList<Long>();
|
||||||
|
for(int i=0;i<outputCount;i++) {
|
||||||
|
// Sometimes go backwards
|
||||||
|
long value = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
|
||||||
|
while(value < 0) {
|
||||||
|
value = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
|
||||||
|
}
|
||||||
|
values.add(value);
|
||||||
|
lastOutput = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
final Object output;
|
||||||
|
if (values.size() == 1) {
|
||||||
|
output = values.get(0);
|
||||||
|
} else {
|
||||||
|
output = values;
|
||||||
|
}
|
||||||
|
|
||||||
|
pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
|
||||||
|
}
|
||||||
|
new FSTTester<Object>(random(), dir, inputMode, pairs, outputs, false).doTest(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testListOfOutputs() throws Exception {
|
||||||
|
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
|
||||||
|
ListOfOutputs<Long> outputs = new ListOfOutputs<Long>(_outputs);
|
||||||
|
final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
|
final IntsRef scratch = new IntsRef();
|
||||||
|
// Add the same input more than once and the outputs
|
||||||
|
// are merged:
|
||||||
|
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
|
||||||
|
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
|
||||||
|
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
|
||||||
|
builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L);
|
||||||
|
final FST<Object> fst = builder.finish();
|
||||||
|
|
||||||
|
Object output = Util.get(fst, new BytesRef("a"));
|
||||||
|
assertNotNull(output);
|
||||||
|
List<Long> outputList = outputs.asList(output);
|
||||||
|
assertEquals(3, outputList.size());
|
||||||
|
assertEquals(1L, outputList.get(0).longValue());
|
||||||
|
assertEquals(3L, outputList.get(1).longValue());
|
||||||
|
assertEquals(0L, outputList.get(2).longValue());
|
||||||
|
|
||||||
|
output = Util.get(fst, new BytesRef("b"));
|
||||||
|
assertNotNull(output);
|
||||||
|
outputList = outputs.asList(output);
|
||||||
|
assertEquals(1, outputList.size());
|
||||||
|
assertEquals(17L, outputList.get(0).longValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testListOfOutputsEmptyString() throws Exception {
|
||||||
|
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
|
||||||
|
ListOfOutputs<Long> outputs = new ListOfOutputs<Long>(_outputs);
|
||||||
|
final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
|
final IntsRef scratch = new IntsRef();
|
||||||
|
builder.add(scratch, 0L);
|
||||||
|
builder.add(scratch, 1L);
|
||||||
|
builder.add(scratch, 17L);
|
||||||
|
builder.add(scratch, 1L);
|
||||||
|
|
||||||
|
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
|
||||||
|
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
|
||||||
|
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
|
||||||
|
builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L);
|
||||||
|
|
||||||
|
final FST<Object> fst = builder.finish();
|
||||||
|
|
||||||
|
Object output = Util.get(fst, new BytesRef(""));
|
||||||
|
assertNotNull(output);
|
||||||
|
List<Long> outputList = outputs.asList(output);
|
||||||
|
assertEquals(4, outputList.size());
|
||||||
|
assertEquals(0L, outputList.get(0).longValue());
|
||||||
|
assertEquals(1L, outputList.get(1).longValue());
|
||||||
|
assertEquals(17L, outputList.get(2).longValue());
|
||||||
|
assertEquals(1L, outputList.get(3).longValue());
|
||||||
|
|
||||||
|
output = Util.get(fst, new BytesRef("a"));
|
||||||
|
assertNotNull(output);
|
||||||
|
outputList = outputs.asList(output);
|
||||||
|
assertEquals(3, outputList.size());
|
||||||
|
assertEquals(1L, outputList.get(0).longValue());
|
||||||
|
assertEquals(3L, outputList.get(1).longValue());
|
||||||
|
assertEquals(0L, outputList.get(2).longValue());
|
||||||
|
|
||||||
|
output = Util.get(fst, new BytesRef("b"));
|
||||||
|
assertNotNull(output);
|
||||||
|
outputList = outputs.asList(output);
|
||||||
|
assertEquals(1, outputList.size());
|
||||||
|
assertEquals(0L, outputList.get(0).longValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,832 @@
|
||||||
|
package org.apache.lucene.util.fst;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
* <p/>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p/>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.io.Writer;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util.UnicodeUtil;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
|
import static org.junit.Assert.assertNotNull;
|
||||||
|
import static org.junit.Assert.assertNull;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
|
/** Helper class to test FSTs. */
|
||||||
|
public class FSTTester<T> {
|
||||||
|
|
||||||
|
final Random random;
|
||||||
|
final List<InputOutput<T>> pairs;
|
||||||
|
final int inputMode;
|
||||||
|
final Outputs<T> outputs;
|
||||||
|
final Directory dir;
|
||||||
|
final boolean doReverseLookup;
|
||||||
|
|
||||||
|
/**
 * Creates a tester; all arguments are stored as-is.
 *
 * @param random source of randomness for building and verifying
 * @param dir directory used when the FST is saved and re-loaded
 * @param inputMode 0 for byte (utf8) inputs, anything else for utf32 inputs
 * @param pairs the input/output pairs to build the FST from
 * @param outputs the outputs implementation handed to the builder
 * @param doReverseLookup whether verification also looks inputs up by output
 */
public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs, boolean doReverseLookup) {
  this.pairs = pairs;
  this.outputs = outputs;
  this.inputMode = inputMode;
  this.doReverseLookup = doReverseLookup;
  this.dir = dir;
  this.random = random;
}
|
||||||
|
|
||||||
|
static String inputToString(int inputMode, IntsRef term) {
|
||||||
|
return inputToString(inputMode, term, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
|
||||||
|
if (!isValidUnicode) {
|
||||||
|
return term.toString();
|
||||||
|
} else if (inputMode == 0) {
|
||||||
|
// utf8
|
||||||
|
return toBytesRef(term).utf8ToString() + " " + term;
|
||||||
|
} else {
|
||||||
|
// utf32
|
||||||
|
return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static BytesRef toBytesRef(IntsRef ir) {
|
||||||
|
BytesRef br = new BytesRef(ir.length);
|
||||||
|
for(int i=0;i<ir.length;i++) {
|
||||||
|
int x = ir.ints[ir.offset+i];
|
||||||
|
assert x >= 0 && x <= 255;
|
||||||
|
br.bytes[i] = (byte) x;
|
||||||
|
}
|
||||||
|
br.length = ir.length;
|
||||||
|
return br;
|
||||||
|
}
|
||||||
|
|
||||||
|
static String getRandomString(Random random) {
|
||||||
|
final String term;
|
||||||
|
if (random.nextBoolean()) {
|
||||||
|
term = _TestUtil.randomRealisticUnicodeString(random);
|
||||||
|
} else {
|
||||||
|
// we want to mix in limited-alphabet symbols so
|
||||||
|
// we get more sharing of the nodes given how few
|
||||||
|
// terms we are testing...
|
||||||
|
term = simpleRandomString(random);
|
||||||
|
}
|
||||||
|
return term;
|
||||||
|
}
|
||||||
|
|
||||||
|
static String simpleRandomString(Random r) {
|
||||||
|
final int end = r.nextInt(10);
|
||||||
|
if (end == 0) {
|
||||||
|
// allow 0 length
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
final char[] buffer = new char[end];
|
||||||
|
for (int i = 0; i < end; i++) {
|
||||||
|
buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
|
||||||
|
}
|
||||||
|
return new String(buffer, 0, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
static IntsRef toIntsRef(String s, int inputMode) {
|
||||||
|
return toIntsRef(s, inputMode, new IntsRef(10));
|
||||||
|
}
|
||||||
|
|
||||||
|
static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
|
||||||
|
if (inputMode == 0) {
|
||||||
|
// utf8
|
||||||
|
return toIntsRef(new BytesRef(s), ir);
|
||||||
|
} else {
|
||||||
|
// utf32
|
||||||
|
return toIntsRefUTF32(s, ir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
|
||||||
|
final int charLength = s.length();
|
||||||
|
int charIdx = 0;
|
||||||
|
int intIdx = 0;
|
||||||
|
while(charIdx < charLength) {
|
||||||
|
if (intIdx == ir.ints.length) {
|
||||||
|
ir.grow(intIdx+1);
|
||||||
|
}
|
||||||
|
final int utf32 = s.codePointAt(charIdx);
|
||||||
|
ir.ints[intIdx] = utf32;
|
||||||
|
charIdx += Character.charCount(utf32);
|
||||||
|
intIdx++;
|
||||||
|
}
|
||||||
|
ir.length = intIdx;
|
||||||
|
return ir;
|
||||||
|
}
|
||||||
|
|
||||||
|
static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
|
||||||
|
if (br.length > ir.ints.length) {
|
||||||
|
ir.grow(br.length);
|
||||||
|
}
|
||||||
|
for(int i=0;i<br.length;i++) {
|
||||||
|
ir.ints[i] = br.bytes[br.offset+i]&0xFF;
|
||||||
|
}
|
||||||
|
ir.length = br.length;
|
||||||
|
return ir;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Holds one input/output pair. */
|
||||||
|
public static class InputOutput<T> implements Comparable<InputOutput<T>> {
|
||||||
|
public final IntsRef input;
|
||||||
|
public final T output;
|
||||||
|
|
||||||
|
public InputOutput(IntsRef input, T output) {
|
||||||
|
this.input = input;
|
||||||
|
this.output = output;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int compareTo(InputOutput<T> other) {
|
||||||
|
if (other instanceof InputOutput) {
|
||||||
|
return input.compareTo((other).input);
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void doTest(boolean testPruning) throws IOException {
|
||||||
|
// no pruning
|
||||||
|
doTest(0, 0, true);
|
||||||
|
|
||||||
|
if (testPruning) {
|
||||||
|
// simple pruning
|
||||||
|
doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true);
|
||||||
|
|
||||||
|
// leafy pruning
|
||||||
|
doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// runs the term, returning the output, or null if term
|
||||||
|
// isn't accepted. if prefixLength is non-null it must be
|
||||||
|
// length 1 int array; prefixLength[0] is set to the length
|
||||||
|
// of the term prefix that matches
|
||||||
|
private T run(FST<T> fst, IntsRef term, int[] prefixLength) throws IOException {
|
||||||
|
assert prefixLength == null || prefixLength.length == 1;
|
||||||
|
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
|
||||||
|
final T NO_OUTPUT = fst.outputs.getNoOutput();
|
||||||
|
T output = NO_OUTPUT;
|
||||||
|
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||||
|
|
||||||
|
for(int i=0;i<=term.length;i++) {
|
||||||
|
final int label;
|
||||||
|
if (i == term.length) {
|
||||||
|
label = FST.END_LABEL;
|
||||||
|
} else {
|
||||||
|
label = term.ints[term.offset+i];
|
||||||
|
}
|
||||||
|
// System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
|
||||||
|
if (fst.findTargetArc(label, arc, arc, fstReader) == null) {
|
||||||
|
// System.out.println(" not found");
|
||||||
|
if (prefixLength != null) {
|
||||||
|
prefixLength[0] = i;
|
||||||
|
return output;
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
output = fst.outputs.add(output, arc.output);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (prefixLength != null) {
|
||||||
|
prefixLength[0] = term.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
private T randomAcceptedWord(FST<T> fst, IntsRef in) throws IOException {
|
||||||
|
FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
|
||||||
|
|
||||||
|
final List<FST.Arc<T>> arcs = new ArrayList<FST.Arc<T>>();
|
||||||
|
in.length = 0;
|
||||||
|
in.offset = 0;
|
||||||
|
final T NO_OUTPUT = fst.outputs.getNoOutput();
|
||||||
|
T output = NO_OUTPUT;
|
||||||
|
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||||
|
|
||||||
|
while(true) {
|
||||||
|
// read all arcs:
|
||||||
|
fst.readFirstTargetArc(arc, arc, fstReader);
|
||||||
|
arcs.add(new FST.Arc<T>().copyFrom(arc));
|
||||||
|
while(!arc.isLast()) {
|
||||||
|
fst.readNextArc(arc, fstReader);
|
||||||
|
arcs.add(new FST.Arc<T>().copyFrom(arc));
|
||||||
|
}
|
||||||
|
|
||||||
|
// pick one
|
||||||
|
arc = arcs.get(random.nextInt(arcs.size()));
|
||||||
|
arcs.clear();
|
||||||
|
|
||||||
|
// accumulate output
|
||||||
|
output = fst.outputs.add(output, arc.output);
|
||||||
|
|
||||||
|
// append label
|
||||||
|
if (arc.label == FST.END_LABEL) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (in.ints.length == in.length) {
|
||||||
|
in.grow(1+in.length);
|
||||||
|
}
|
||||||
|
in.ints[in.length++] = arc.label;
|
||||||
|
}
|
||||||
|
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
|
||||||
|
}
|
||||||
|
|
||||||
|
final boolean willRewrite = random.nextBoolean();
|
||||||
|
|
||||||
|
final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
|
||||||
|
prune1, prune2,
|
||||||
|
prune1==0 && prune2==0,
|
||||||
|
allowRandomSuffixSharing ? random.nextBoolean() : true,
|
||||||
|
allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
|
||||||
|
outputs,
|
||||||
|
null,
|
||||||
|
willRewrite);
|
||||||
|
|
||||||
|
for(InputOutput<T> pair : pairs) {
|
||||||
|
if (pair.output instanceof List) {
|
||||||
|
@SuppressWarnings("unchecked") List<Long> longValues = (List<Long>) pair.output;
|
||||||
|
@SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder;
|
||||||
|
for(Long value : longValues) {
|
||||||
|
builderObject.add(pair.input, value);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
builder.add(pair.input, pair.output);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
FST<T> fst = builder.finish();
|
||||||
|
|
||||||
|
if (random.nextBoolean() && fst != null && !willRewrite) {
|
||||||
|
IOContext context = LuceneTestCase.newIOContext(random);
|
||||||
|
IndexOutput out = dir.createOutput("fst.bin", context);
|
||||||
|
fst.save(out);
|
||||||
|
out.close();
|
||||||
|
IndexInput in = dir.openInput("fst.bin", context);
|
||||||
|
try {
|
||||||
|
fst = new FST<T>(in, outputs);
|
||||||
|
} finally {
|
||||||
|
in.close();
|
||||||
|
dir.deleteFile("fst.bin");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (LuceneTestCase.VERBOSE && pairs.size() <= 20 && fst != null) {
|
||||||
|
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
|
||||||
|
Util.toDot(fst, w, false, false);
|
||||||
|
w.close();
|
||||||
|
System.out.println("SAVED out.dot");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
if (fst == null) {
|
||||||
|
System.out.println(" fst has 0 nodes (fully pruned)");
|
||||||
|
} else {
|
||||||
|
System.out.println(" fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (prune1 == 0 && prune2 == 0) {
|
||||||
|
verifyUnPruned(inputMode, fst);
|
||||||
|
} else {
|
||||||
|
verifyPruned(inputMode, fst, prune1, prune2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (willRewrite && fst != null) {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: now rewrite");
|
||||||
|
}
|
||||||
|
final FST<T> packed = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000), random.nextFloat());
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: now verify packed FST");
|
||||||
|
}
|
||||||
|
if (prune1 == 0 && prune2 == 0) {
|
||||||
|
verifyUnPruned(inputMode, packed);
|
||||||
|
} else {
|
||||||
|
verifyPruned(inputMode, packed, prune1, prune2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return fst;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean outputsEqual(T a, T b) {
|
||||||
|
return a.equals(b);
|
||||||
|
}
|
||||||
|
|
||||||
|
// FST is complete
|
||||||
|
private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {
|
||||||
|
|
||||||
|
final FST<Long> fstLong;
|
||||||
|
final Set<Long> validOutputs;
|
||||||
|
long minLong = Long.MAX_VALUE;
|
||||||
|
long maxLong = Long.MIN_VALUE;
|
||||||
|
|
||||||
|
if (doReverseLookup) {
|
||||||
|
@SuppressWarnings("unchecked") FST<Long> fstLong0 = (FST<Long>) fst;
|
||||||
|
fstLong = fstLong0;
|
||||||
|
validOutputs = new HashSet<Long>();
|
||||||
|
for(InputOutput<T> pair: pairs) {
|
||||||
|
Long output = (Long) pair.output;
|
||||||
|
maxLong = Math.max(maxLong, output);
|
||||||
|
minLong = Math.min(minLong, output);
|
||||||
|
validOutputs.add(output);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
fstLong = null;
|
||||||
|
validOutputs = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pairs.size() == 0) {
|
||||||
|
assertNull(fst);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: now verify " + pairs.size() + " terms");
|
||||||
|
for(InputOutput<T> pair : pairs) {
|
||||||
|
assertNotNull(pair);
|
||||||
|
assertNotNull(pair.input);
|
||||||
|
assertNotNull(pair.output);
|
||||||
|
System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
assertNotNull(fst);
|
||||||
|
|
||||||
|
// visit valid pairs in order -- make sure all words
|
||||||
|
// are accepted, and FSTEnum's next() steps through
|
||||||
|
// them correctly
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: check valid terms/next()");
|
||||||
|
}
|
||||||
|
{
|
||||||
|
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
|
||||||
|
for(InputOutput<T> pair : pairs) {
|
||||||
|
IntsRef term = pair.input;
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
|
||||||
|
}
|
||||||
|
T output = run(fst, term, null);
|
||||||
|
assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
|
||||||
|
assertTrue(outputsEqual(pair.output, output));
|
||||||
|
|
||||||
|
// verify enum's next
|
||||||
|
IntsRefFSTEnum.InputOutput<T> t = fstEnum.next();
|
||||||
|
assertNotNull(t);
|
||||||
|
assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input);
|
||||||
|
assertTrue(outputsEqual(pair.output, t.output));
|
||||||
|
}
|
||||||
|
assertNull(fstEnum.next());
|
||||||
|
}
|
||||||
|
|
||||||
|
final Map<IntsRef,T> termsMap = new HashMap<IntsRef,T>();
|
||||||
|
for(InputOutput<T> pair : pairs) {
|
||||||
|
termsMap.put(pair.input, pair.output);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (doReverseLookup && maxLong > minLong) {
|
||||||
|
// Do random lookups so we test null (output doesn't
|
||||||
|
// exist) case:
|
||||||
|
assertNull(Util.getByOutput(fstLong, minLong-7));
|
||||||
|
assertNull(Util.getByOutput(fstLong, maxLong+7));
|
||||||
|
|
||||||
|
final int num = LuceneTestCase.atLeast(random, 100);
|
||||||
|
for(int iter=0;iter<num;iter++) {
|
||||||
|
Long v = _TestUtil.nextLong(random, minLong, maxLong);
|
||||||
|
IntsRef input = Util.getByOutput(fstLong, v);
|
||||||
|
assertTrue(validOutputs.contains(v) || input == null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// find random matching word and make sure it's valid
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: verify random accepted terms");
|
||||||
|
}
|
||||||
|
final IntsRef scratch = new IntsRef(10);
|
||||||
|
int num = LuceneTestCase.atLeast(random, 500);
|
||||||
|
for(int iter=0;iter<num;iter++) {
|
||||||
|
T output = randomAcceptedWord(fst, scratch);
|
||||||
|
assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
|
||||||
|
assertTrue(outputsEqual(termsMap.get(scratch), output));
|
||||||
|
|
||||||
|
if (doReverseLookup) {
|
||||||
|
//System.out.println("lookup output=" + output + " outs=" + fst.outputs);
|
||||||
|
IntsRef input = Util.getByOutput(fstLong, (Long) output);
|
||||||
|
assertNotNull(input);
|
||||||
|
//System.out.println(" got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
|
||||||
|
assertEquals(scratch, input);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// test IntsRefFSTEnum.seek:
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: verify seek");
|
||||||
|
}
|
||||||
|
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
|
||||||
|
num = LuceneTestCase.atLeast(random, 100);
|
||||||
|
for(int iter=0;iter<num;iter++) {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" iter=" + iter);
|
||||||
|
}
|
||||||
|
if (random.nextBoolean()) {
|
||||||
|
// seek to term that doesn't exist:
|
||||||
|
while(true) {
|
||||||
|
final IntsRef term = toIntsRef(getRandomString(random), inputMode);
|
||||||
|
int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
|
||||||
|
if (pos < 0) {
|
||||||
|
pos = -(pos+1);
|
||||||
|
// ok doesn't exist
|
||||||
|
//System.out.println(" seek " + inputToString(inputMode, term));
|
||||||
|
final IntsRefFSTEnum.InputOutput<T> seekResult;
|
||||||
|
if (random.nextInt(3) == 0) {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" do non-exist seekExact term=" + inputToString(inputMode, term));
|
||||||
|
}
|
||||||
|
seekResult = fstEnum.seekExact(term);
|
||||||
|
pos = -1;
|
||||||
|
} else if (random.nextBoolean()) {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" do non-exist seekFloor term=" + inputToString(inputMode, term));
|
||||||
|
}
|
||||||
|
seekResult = fstEnum.seekFloor(term);
|
||||||
|
pos--;
|
||||||
|
} else {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" do non-exist seekCeil term=" + inputToString(inputMode, term));
|
||||||
|
}
|
||||||
|
seekResult = fstEnum.seekCeil(term);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pos != -1 && pos < pairs.size()) {
|
||||||
|
//System.out.println(" got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output));
|
||||||
|
assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult);
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" got " + inputToString(inputMode, seekResult.input));
|
||||||
|
}
|
||||||
|
assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input);
|
||||||
|
assertTrue(outputsEqual(pairs.get(pos).output, seekResult.output));
|
||||||
|
} else {
|
||||||
|
// seeked before start or beyond end
|
||||||
|
//System.out.println("seek=" + seekTerm);
|
||||||
|
assertNull("expected null but got " + (seekResult==null ? "null" : inputToString(inputMode, seekResult.input)), seekResult);
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" got null");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// seek to term that does exist:
|
||||||
|
InputOutput<T> pair = pairs.get(random.nextInt(pairs.size()));
|
||||||
|
final IntsRefFSTEnum.InputOutput<T> seekResult;
|
||||||
|
if (random.nextInt(3) == 2) {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" do exists seekExact term=" + inputToString(inputMode, pair.input));
|
||||||
|
}
|
||||||
|
seekResult = fstEnum.seekExact(pair.input);
|
||||||
|
} else if (random.nextBoolean()) {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" do exists seekFloor " + inputToString(inputMode, pair.input));
|
||||||
|
}
|
||||||
|
seekResult = fstEnum.seekFloor(pair.input);
|
||||||
|
} else {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" do exists seekCeil " + inputToString(inputMode, pair.input));
|
||||||
|
}
|
||||||
|
seekResult = fstEnum.seekCeil(pair.input);
|
||||||
|
}
|
||||||
|
assertNotNull(seekResult);
|
||||||
|
assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input);
|
||||||
|
assertTrue(outputsEqual(pair.output, seekResult.output));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: mixed next/seek");
|
||||||
|
}
|
||||||
|
|
||||||
|
// test mixed next/seek
|
||||||
|
num = LuceneTestCase.atLeast(random, 100);
|
||||||
|
for(int iter=0;iter<num;iter++) {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: iter " + iter);
|
||||||
|
}
|
||||||
|
// reset:
|
||||||
|
fstEnum = new IntsRefFSTEnum<T>(fst);
|
||||||
|
int upto = -1;
|
||||||
|
while(true) {
|
||||||
|
boolean isDone = false;
|
||||||
|
if (upto == pairs.size()-1 || random.nextBoolean()) {
|
||||||
|
// next
|
||||||
|
upto++;
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" do next");
|
||||||
|
}
|
||||||
|
isDone = fstEnum.next() == null;
|
||||||
|
} else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) {
|
||||||
|
int attempt = 0;
|
||||||
|
for(;attempt<10;attempt++) {
|
||||||
|
IntsRef term = toIntsRef(getRandomString(random), inputMode);
|
||||||
|
if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) {
|
||||||
|
int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
|
||||||
|
assert pos < 0;
|
||||||
|
upto = -(pos+1);
|
||||||
|
|
||||||
|
if (random.nextBoolean()) {
|
||||||
|
upto--;
|
||||||
|
assertTrue(upto != -1);
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" do non-exist seekFloor(" + inputToString(inputMode, term) + ")");
|
||||||
|
}
|
||||||
|
isDone = fstEnum.seekFloor(term) == null;
|
||||||
|
} else {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" do non-exist seekCeil(" + inputToString(inputMode, term) + ")");
|
||||||
|
}
|
||||||
|
isDone = fstEnum.seekCeil(term) == null;
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (attempt == 10) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
final int inc = random.nextInt(pairs.size() - upto - 1);
|
||||||
|
upto += inc;
|
||||||
|
if (upto == -1) {
|
||||||
|
upto = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (random.nextBoolean()) {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" do seekCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")");
|
||||||
|
}
|
||||||
|
isDone = fstEnum.seekCeil(pairs.get(upto).input) == null;
|
||||||
|
} else {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" do seekFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")");
|
||||||
|
}
|
||||||
|
isDone = fstEnum.seekFloor(pairs.get(upto).input) == null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
if (!isDone) {
|
||||||
|
System.out.println(" got " + inputToString(inputMode, fstEnum.current().input));
|
||||||
|
} else {
|
||||||
|
System.out.println(" got null");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (upto == pairs.size()) {
|
||||||
|
assertTrue(isDone);
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
assertFalse(isDone);
|
||||||
|
assertEquals(pairs.get(upto).input, fstEnum.current().input);
|
||||||
|
assertTrue(outputsEqual(pairs.get(upto).output, fstEnum.current().output));
|
||||||
|
|
||||||
|
/*
|
||||||
|
if (upto < pairs.size()-1) {
|
||||||
|
int tryCount = 0;
|
||||||
|
while(tryCount < 10) {
|
||||||
|
final IntsRef t = toIntsRef(getRandomString(), inputMode);
|
||||||
|
if (pairs.get(upto).input.compareTo(t) < 0) {
|
||||||
|
final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0;
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected);
|
||||||
|
}
|
||||||
|
assertEquals(expected, fstEnum.beforeNext(t));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
tryCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class CountMinOutput<T> {
|
||||||
|
int count;
|
||||||
|
T output;
|
||||||
|
T finalOutput;
|
||||||
|
boolean isLeaf = true;
|
||||||
|
boolean isFinal;
|
||||||
|
}
|
||||||
|
|
||||||
|
// FST is pruned
|
||||||
|
private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {
|
||||||
|
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
|
||||||
|
for(InputOutput<T> pair : pairs) {
|
||||||
|
System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// To validate the FST, we brute-force compute all prefixes
|
||||||
|
// in the terms, matched to their "common" outputs, prune that
|
||||||
|
// set according to the prune thresholds, then assert the FST
|
||||||
|
// matches that same set.
|
||||||
|
|
||||||
|
// NOTE: Crazy RAM intensive!!
|
||||||
|
|
||||||
|
//System.out.println("TEST: tally prefixes");
|
||||||
|
|
||||||
|
// build all prefixes
|
||||||
|
final Map<IntsRef,CountMinOutput<T>> prefixes = new HashMap<IntsRef,CountMinOutput<T>>();
|
||||||
|
final IntsRef scratch = new IntsRef(10);
|
||||||
|
for(InputOutput<T> pair: pairs) {
|
||||||
|
scratch.copyInts(pair.input);
|
||||||
|
for(int idx=0;idx<=pair.input.length;idx++) {
|
||||||
|
scratch.length = idx;
|
||||||
|
CountMinOutput<T> cmo = prefixes.get(scratch);
|
||||||
|
if (cmo == null) {
|
||||||
|
cmo = new CountMinOutput<T>();
|
||||||
|
cmo.count = 1;
|
||||||
|
cmo.output = pair.output;
|
||||||
|
prefixes.put(IntsRef.deepCopyOf(scratch), cmo);
|
||||||
|
} else {
|
||||||
|
cmo.count++;
|
||||||
|
T output1 = cmo.output;
|
||||||
|
if (output1.equals(outputs.getNoOutput())) {
|
||||||
|
output1 = outputs.getNoOutput();
|
||||||
|
}
|
||||||
|
T output2 = pair.output;
|
||||||
|
if (output2.equals(outputs.getNoOutput())) {
|
||||||
|
output2 = outputs.getNoOutput();
|
||||||
|
}
|
||||||
|
cmo.output = outputs.common(output1, output2);
|
||||||
|
}
|
||||||
|
if (idx == pair.input.length) {
|
||||||
|
cmo.isFinal = true;
|
||||||
|
cmo.finalOutput = cmo.output;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: now prune");
|
||||||
|
}
|
||||||
|
|
||||||
|
// prune 'em
|
||||||
|
final Iterator<Map.Entry<IntsRef,CountMinOutput<T>>> it = prefixes.entrySet().iterator();
|
||||||
|
while(it.hasNext()) {
|
||||||
|
Map.Entry<IntsRef,CountMinOutput<T>> ent = it.next();
|
||||||
|
final IntsRef prefix = ent.getKey();
|
||||||
|
final CountMinOutput<T> cmo = ent.getValue();
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
|
||||||
|
}
|
||||||
|
final boolean keep;
|
||||||
|
if (prune1 > 0) {
|
||||||
|
keep = cmo.count >= prune1;
|
||||||
|
} else {
|
||||||
|
assert prune2 > 0;
|
||||||
|
if (prune2 > 1 && cmo.count >= prune2) {
|
||||||
|
keep = true;
|
||||||
|
} else if (prefix.length > 0) {
|
||||||
|
// consult our parent
|
||||||
|
scratch.length = prefix.length-1;
|
||||||
|
System.arraycopy(prefix.ints, prefix.offset, scratch.ints, 0, scratch.length);
|
||||||
|
final CountMinOutput<T> cmo2 = prefixes.get(scratch);
|
||||||
|
//System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
|
||||||
|
keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
|
||||||
|
} else if (cmo.count >= prune2) {
|
||||||
|
keep = true;
|
||||||
|
} else {
|
||||||
|
keep = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!keep) {
|
||||||
|
it.remove();
|
||||||
|
//System.out.println(" remove");
|
||||||
|
} else {
|
||||||
|
// clear isLeaf for all ancestors
|
||||||
|
//System.out.println(" keep");
|
||||||
|
scratch.copyInts(prefix);
|
||||||
|
scratch.length--;
|
||||||
|
while(scratch.length >= 0) {
|
||||||
|
final CountMinOutput<T> cmo2 = prefixes.get(scratch);
|
||||||
|
if (cmo2 != null) {
|
||||||
|
//System.out.println(" clear isLeaf " + inputToString(inputMode, scratch));
|
||||||
|
cmo2.isLeaf = false;
|
||||||
|
}
|
||||||
|
scratch.length--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: after prune");
|
||||||
|
for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
|
||||||
|
System.out.println(" " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
|
||||||
|
if (ent.getValue().isFinal) {
|
||||||
|
System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (prefixes.size() <= 1) {
|
||||||
|
assertNull(fst);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
assertNotNull(fst);
|
||||||
|
|
||||||
|
// make sure FST only enums valid prefixes
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: check pruned enum");
|
||||||
|
}
|
||||||
|
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
|
||||||
|
IntsRefFSTEnum.InputOutput<T> current;
|
||||||
|
while((current = fstEnum.next()) != null) {
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println(" fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output));
|
||||||
|
}
|
||||||
|
final CountMinOutput<T> cmo = prefixes.get(current.input);
|
||||||
|
assertNotNull(cmo);
|
||||||
|
assertTrue(cmo.isLeaf || cmo.isFinal);
|
||||||
|
//if (cmo.isFinal && !cmo.isLeaf) {
|
||||||
|
if (cmo.isFinal) {
|
||||||
|
assertEquals(cmo.finalOutput, current.output);
|
||||||
|
} else {
|
||||||
|
assertEquals(cmo.output, current.output);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// make sure all non-pruned prefixes are present in the FST
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: verify all prefixes");
|
||||||
|
}
|
||||||
|
final int[] stopNode = new int[1];
|
||||||
|
for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
|
||||||
|
if (ent.getKey().length > 0) {
|
||||||
|
final CountMinOutput<T> cmo = ent.getValue();
|
||||||
|
final T output = run(fst, ent.getKey(), stopNode);
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output));
|
||||||
|
}
|
||||||
|
// if (cmo.isFinal && !cmo.isLeaf) {
|
||||||
|
if (cmo.isFinal) {
|
||||||
|
assertEquals(cmo.finalOutput, output);
|
||||||
|
} else {
|
||||||
|
assertEquals(cmo.output, output);
|
||||||
|
}
|
||||||
|
assertEquals(ent.getKey().length, stopNode[0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,25 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
Support for FST testing.
|
||||||
|
</body>
|
||||||
|
</html>
|
Loading…
Reference in New Issue