LUCENE-4404: add ListOfOutputs for FST to hold more than one output per input

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1388935 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-09-23 00:52:36 +00:00
parent 5be591e97a
commit fd748920f8
11 changed files with 1373 additions and 828 deletions

View File

@ -21,8 +21,12 @@ Changes in backwards compatibility policy
Robert Muir) Robert Muir)
======================= Lucene 4.1.0 ======================= ======================= Lucene 4.1.0 =======================
New Features
(No Changes) * LUCENE-4404: New ListOfOutputs (in lucene/misc) for FSTs wraps
another Outputs implementation, allowing you to store more than one
output for a single input. UpToTwoPositiveIntsOutputs was moved
from lucene/core to lucene/misc. (Mike McCandless)
======================= Lucene 4.0.0 ======================= ======================= Lucene 4.0.0 =======================

View File

@ -399,8 +399,10 @@ public class Builder<T> {
} }
final UnCompiledNode<T> lastNode = frontier[input.length]; final UnCompiledNode<T> lastNode = frontier[input.length];
if (lastInput.length != input.length || prefixLenPlus1 != input.length + 1) {
lastNode.isFinal = true; lastNode.isFinal = true;
lastNode.output = NO_OUTPUT; lastNode.output = NO_OUTPUT;
}
// push conflicting outputs forward, only as far as // push conflicting outputs forward, only as far as
// needed // needed

View File

@ -296,11 +296,13 @@ public final class FST<T> {
// messy // messy
bytes = new byte[numBytes]; bytes = new byte[numBytes];
in.readBytes(bytes, 0, numBytes); in.readBytes(bytes, 0, numBytes);
BytesReader reader;
if (packed) { if (packed) {
emptyOutput = outputs.read(getBytesReader(0)); reader = getBytesReader(0);
} else { } else {
emptyOutput = outputs.read(getBytesReader(numBytes-1)); reader = getBytesReader(numBytes-1);
} }
emptyOutput = outputs.readFinalOutput(reader);
} else { } else {
emptyOutput = null; emptyOutput = null;
} }
@ -414,7 +416,7 @@ public final class FST<T> {
// TODO: this is messy -- replace with sillyBytesWriter; maybe make // TODO: this is messy -- replace with sillyBytesWriter; maybe make
// bytes private // bytes private
final int posSave = writer.posWrite; final int posSave = writer.posWrite;
outputs.write(emptyOutput, writer); outputs.writeFinalOutput(emptyOutput, writer);
emptyOutputBytes = new byte[writer.posWrite-posSave]; emptyOutputBytes = new byte[writer.posWrite-posSave];
if (!packed) { if (!packed) {
@ -638,7 +640,7 @@ public final class FST<T> {
if (arc.nextFinalOutput != NO_OUTPUT) { if (arc.nextFinalOutput != NO_OUTPUT) {
//System.out.println(" write final output"); //System.out.println(" write final output");
outputs.write(arc.nextFinalOutput, writer); outputs.writeFinalOutput(arc.nextFinalOutput, writer);
} }
if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
@ -788,7 +790,7 @@ public final class FST<T> {
outputs.read(in); outputs.read(in);
} }
if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
outputs.read(in); outputs.readFinalOutput(in);
} }
if (arc.flag(BIT_STOP_NODE)) { if (arc.flag(BIT_STOP_NODE)) {
} else if (arc.flag(BIT_TARGET_NEXT)) { } else if (arc.flag(BIT_TARGET_NEXT)) {
@ -963,7 +965,7 @@ public final class FST<T> {
} }
if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
arc.nextFinalOutput = outputs.read(in); arc.nextFinalOutput = outputs.readFinalOutput(in);
} else { } else {
arc.nextFinalOutput = outputs.getNoOutput(); arc.nextFinalOutput = outputs.getNoOutput();
} }
@ -1127,7 +1129,7 @@ public final class FST<T> {
} }
if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) {
outputs.read(in); outputs.readFinalOutput(in);
} }
if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) { if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) {
@ -1221,6 +1223,14 @@ public final class FST<T> {
} }
} }
/** Returns a {@link BytesReader} for this FST, positioned at
* position 0. */
public BytesReader getBytesReader() {
return getBytesReader(0);
}
/** Returns a {@link BytesReader} for this FST, positioned at
* the provided position. */
public BytesReader getBytesReader(int pos) { public BytesReader getBytesReader(int pos) {
// TODO: maybe re-use via ThreadLocal? // TODO: maybe re-use via ThreadLocal?
if (packed) { if (packed) {
@ -1654,7 +1664,7 @@ public final class FST<T> {
} }
} }
if (arc.nextFinalOutput != NO_OUTPUT) { if (arc.nextFinalOutput != NO_OUTPUT) {
outputs.write(arc.nextFinalOutput, writer); outputs.writeFinalOutput(arc.nextFinalOutput, writer);
} }
if (doWriteTarget) { if (doWriteTarget) {

View File

@ -49,10 +49,27 @@ public abstract class Outputs<T> {
/** Eg add("foo", "bar") -> "foobar" */ /** Eg add("foo", "bar") -> "foobar" */
public abstract T add(T prefix, T output); public abstract T add(T prefix, T output);
/** Encode an output value into a {@link DataOutput}. */
public abstract void write(T output, DataOutput out) throws IOException; public abstract void write(T output, DataOutput out) throws IOException;
/** Encode an final node output value into a {@link
* DataOutput}. By default this just calls {@link #write(Object,
* DataOutput)}. */
public void writeFinalOutput(T output, DataOutput out) throws IOException {
write(output, out);
}
/** Decode an output value previously written with {@link
* #write(Object, DataOutput)}. */
public abstract T read(DataInput in) throws IOException; public abstract T read(DataInput in) throws IOException;
/** Decode an output value previously written with {@link
* #writeFinalOutput(Object, DataOutput)}. By default this
* just calls {@link #read(DataInput)}. */
public T readFinalOutput(DataInput in) throws IOException {
return read(in);
}
/** NOTE: this output is compared with == so you must /** NOTE: this output is compared with == so you must
* ensure that all methods return the single object if * ensure that all methods return the single object if
* it's really no output */ * it's really no output */

View File

@ -29,8 +29,6 @@ import java.io.Writer;
import java.util.*; import java.util.*;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DirectoryReader;
@ -56,7 +54,6 @@ import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase.Slow; import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil; import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.CompiledAutomaton;
@ -67,6 +64,10 @@ import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PairOutputs.Pair; import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.util.fst.FSTTester.getRandomString;
import static org.apache.lucene.util.fst.FSTTester.simpleRandomString;
import static org.apache.lucene.util.fst.FSTTester.toIntsRef;
@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) @SuppressCodecs({ "SimpleText", "Memory", "Direct" })
@Slow @Slow
public class TestFSTs extends LuceneTestCase { public class TestFSTs extends LuceneTestCase {
@ -87,59 +88,6 @@ public class TestFSTs extends LuceneTestCase {
super.tearDown(); super.tearDown();
} }
private static BytesRef toBytesRef(IntsRef ir) {
BytesRef br = new BytesRef(ir.length);
for(int i=0;i<ir.length;i++) {
int x = ir.ints[ir.offset+i];
assert x >= 0 && x <= 255;
br.bytes[i] = (byte) x;
}
br.length = ir.length;
return br;
}
static IntsRef toIntsRef(String s, int inputMode) {
return toIntsRef(s, inputMode, new IntsRef(10));
}
static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
if (inputMode == 0) {
// utf8
return toIntsRef(new BytesRef(s), ir);
} else {
// utf32
return toIntsRefUTF32(s, ir);
}
}
static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
final int charLength = s.length();
int charIdx = 0;
int intIdx = 0;
while(charIdx < charLength) {
if (intIdx == ir.ints.length) {
ir.grow(intIdx+1);
}
final int utf32 = s.codePointAt(charIdx);
ir.ints[intIdx] = utf32;
charIdx += Character.charCount(utf32);
intIdx++;
}
ir.length = intIdx;
return ir;
}
static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
if (br.length > ir.ints.length) {
ir.grow(br.length);
}
for(int i=0;i<br.length;i++) {
ir.ints[i] = br.bytes[br.offset+i]&0xFF;
}
ir.length = br.length;
return ir;
}
public void testBasicFSA() throws IOException { public void testBasicFSA() throws IOException {
String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"}; String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
String[] strings2 = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation"}; String[] strings2 = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation"};
@ -206,19 +154,6 @@ public class TestFSTs extends LuceneTestCase {
} }
} }
private static String simpleRandomString(Random r) {
final int end = r.nextInt(10);
if (end == 0) {
// allow 0 length
return "";
}
final char[] buffer = new char[end];
for (int i = 0; i < end; i++) {
buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
}
return new String(buffer, 0, end);
}
// given set of terms, test the different outputs for them // given set of terms, test the different outputs for them
private void doTest(int inputMode, IntsRef[] terms) throws IOException { private void doTest(int inputMode, IntsRef[] terms) throws IOException {
Arrays.sort(terms); Arrays.sort(terms);
@ -231,7 +166,7 @@ public class TestFSTs extends LuceneTestCase {
for(IntsRef term : terms) { for(IntsRef term : terms) {
pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT)); pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
} }
new FSTTester<Object>(random(), dir, inputMode, pairs, outputs, false).doTest(); new FSTTester<Object>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
} }
// PositiveIntOutput (ord) // PositiveIntOutput (ord)
@ -241,7 +176,7 @@ public class TestFSTs extends LuceneTestCase {
for(int idx=0;idx<terms.length;idx++) { for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], (long) idx)); pairs.add(new FSTTester.InputOutput<Long>(terms[idx], (long) idx));
} }
new FSTTester<Long>(random(), dir, inputMode, pairs, outputs, true).doTest(); new FSTTester<Long>(random(), dir, inputMode, pairs, outputs, true).doTest(true);
} }
// PositiveIntOutput (random monotonically increasing positive number) // PositiveIntOutput (random monotonically increasing positive number)
@ -255,7 +190,7 @@ public class TestFSTs extends LuceneTestCase {
lastOutput = value; lastOutput = value;
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], value)); pairs.add(new FSTTester.InputOutput<Long>(terms[idx], value));
} }
new FSTTester<Long>(random(), dir, inputMode, pairs, outputs, doShare).doTest(); new FSTTester<Long>(random(), dir, inputMode, pairs, outputs, doShare).doTest(true);
} }
// PositiveIntOutput (random positive number) // PositiveIntOutput (random positive number)
@ -265,7 +200,7 @@ public class TestFSTs extends LuceneTestCase {
for(int idx=0;idx<terms.length;idx++) { for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], _TestUtil.nextLong(random(), 0, Long.MAX_VALUE))); pairs.add(new FSTTester.InputOutput<Long>(terms[idx], _TestUtil.nextLong(random(), 0, Long.MAX_VALUE)));
} }
new FSTTester<Long>(random(), dir, inputMode, pairs, outputs, false).doTest(); new FSTTester<Long>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
} }
// Pair<ord, (random monotonically increasing positive number> // Pair<ord, (random monotonically increasing positive number>
@ -281,7 +216,7 @@ public class TestFSTs extends LuceneTestCase {
pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx], pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx],
outputs.newPair((long) idx, value))); outputs.newPair((long) idx, value)));
} }
new FSTTester<PairOutputs.Pair<Long,Long>>(random(), dir, inputMode, pairs, outputs, false).doTest(); new FSTTester<PairOutputs.Pair<Long,Long>>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
} }
// Sequence-of-bytes // Sequence-of-bytes
@ -293,7 +228,7 @@ public class TestFSTs extends LuceneTestCase {
final BytesRef output = random().nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx)); final BytesRef output = random().nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
pairs.add(new FSTTester.InputOutput<BytesRef>(terms[idx], output)); pairs.add(new FSTTester.InputOutput<BytesRef>(terms[idx], output));
} }
new FSTTester<BytesRef>(random(), dir, inputMode, pairs, outputs, false).doTest(); new FSTTester<BytesRef>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
} }
// Sequence-of-ints // Sequence-of-ints
@ -309,722 +244,11 @@ public class TestFSTs extends LuceneTestCase {
} }
pairs.add(new FSTTester.InputOutput<IntsRef>(terms[idx], output)); pairs.add(new FSTTester.InputOutput<IntsRef>(terms[idx], output));
} }
new FSTTester<IntsRef>(random(), dir, inputMode, pairs, outputs, false).doTest(); new FSTTester<IntsRef>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
} }
// Up to two positive ints, shared, generally but not
// monotonically increasing
{
if (VERBOSE) {
System.out.println("TEST: now test UpToTwoPositiveIntOutputs");
}
final UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true);
final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length);
long lastOutput = 0;
for(int idx=0;idx<terms.length;idx++) {
// Sometimes go backwards
long value = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
while(value < 0) {
value = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
}
final Object output;
if (random().nextInt(5) == 3) {
long value2 = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
while(value2 < 0) {
value2 = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
}
output = outputs.get(value, value2);
} else {
output = outputs.get(value);
}
pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
}
new FSTTester<Object>(random(), dir, inputMode, pairs, outputs, false).doTest();
}
}
private static class FSTTester<T> {
final Random random;
final List<InputOutput<T>> pairs;
final int inputMode;
final Outputs<T> outputs;
final Directory dir;
final boolean doReverseLookup;
public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs, boolean doReverseLookup) {
this.random = random;
this.dir = dir;
this.inputMode = inputMode;
this.pairs = pairs;
this.outputs = outputs;
this.doReverseLookup = doReverseLookup;
}
private static class InputOutput<T> implements Comparable<InputOutput<T>> {
public final IntsRef input;
public final T output;
public InputOutput(IntsRef input, T output) {
this.input = input;
this.output = output;
}
public int compareTo(InputOutput<T> other) {
if (other instanceof InputOutput) {
return input.compareTo((other).input);
} else {
throw new IllegalArgumentException();
}
}
}
public void doTest() throws IOException {
// no pruning
doTest(0, 0, true);
if (!(outputs instanceof UpToTwoPositiveIntOutputs)) {
// simple pruning
doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true);
// leafy pruning
doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true);
}
}
// runs the term, returning the output, or null if term
// isn't accepted. if prefixLength is non-null it must be
// length 1 int array; prefixLength[0] is set to the length
// of the term prefix that matches
private T run(FST<T> fst, IntsRef term, int[] prefixLength) throws IOException {
assert prefixLength == null || prefixLength.length == 1;
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
final T NO_OUTPUT = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
final FST.BytesReader fstReader = fst.getBytesReader(0);
for(int i=0;i<=term.length;i++) {
final int label;
if (i == term.length) {
label = FST.END_LABEL;
} else {
label = term.ints[term.offset+i];
}
// System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
if (fst.findTargetArc(label, arc, arc, fstReader) == null) {
// System.out.println(" not found");
if (prefixLength != null) {
prefixLength[0] = i;
return output;
} else {
return null;
}
}
output = fst.outputs.add(output, arc.output);
}
if (prefixLength != null) {
prefixLength[0] = term.length;
}
return output;
}
private T randomAcceptedWord(FST<T> fst, IntsRef in) throws IOException {
FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
final List<FST.Arc<T>> arcs = new ArrayList<FST.Arc<T>>();
in.length = 0;
in.offset = 0;
final T NO_OUTPUT = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
final FST.BytesReader fstReader = fst.getBytesReader(0);
while(true) {
// read all arcs:
fst.readFirstTargetArc(arc, arc, fstReader);
arcs.add(new FST.Arc<T>().copyFrom(arc));
while(!arc.isLast()) {
fst.readNextArc(arc, fstReader);
arcs.add(new FST.Arc<T>().copyFrom(arc));
}
// pick one
arc = arcs.get(random.nextInt(arcs.size()));
arcs.clear();
// accumulate output
output = fst.outputs.add(output, arc.output);
// append label
if (arc.label == FST.END_LABEL) {
break;
}
if (in.ints.length == in.length) {
in.grow(1+in.length);
}
in.ints[in.length++] = arc.label;
}
return output;
}
FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
if (VERBOSE) {
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
}
final boolean willRewrite = random.nextBoolean();
final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2,
prune1==0 && prune2==0,
allowRandomSuffixSharing ? random.nextBoolean() : true,
allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
outputs,
null,
willRewrite);
for(InputOutput<T> pair : pairs) {
if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
final UpToTwoPositiveIntOutputs _outputs = (UpToTwoPositiveIntOutputs) outputs;
final UpToTwoPositiveIntOutputs.TwoLongs twoLongs = (UpToTwoPositiveIntOutputs.TwoLongs) pair.output;
@SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder;
builderObject.add(pair.input, _outputs.get(twoLongs.first));
builderObject.add(pair.input, _outputs.get(twoLongs.second));
} else {
builder.add(pair.input, pair.output);
}
}
FST<T> fst = builder.finish();
if (random.nextBoolean() && fst != null && !willRewrite) {
IOContext context = LuceneTestCase.newIOContext(random);
IndexOutput out = dir.createOutput("fst.bin", context);
fst.save(out);
out.close();
IndexInput in = dir.openInput("fst.bin", context);
try {
fst = new FST<T>(in, outputs);
} finally {
in.close();
dir.deleteFile("fst.bin");
}
}
if (VERBOSE && pairs.size() <= 20 && fst != null) {
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
Util.toDot(fst, w, false, false);
w.close();
System.out.println("SAVED out.dot");
}
if (VERBOSE) {
if (fst == null) {
System.out.println(" fst has 0 nodes (fully pruned)");
} else {
System.out.println(" fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs");
}
}
if (prune1 == 0 && prune2 == 0) {
verifyUnPruned(inputMode, fst);
} else {
verifyPruned(inputMode, fst, prune1, prune2);
}
if (willRewrite && fst != null) {
if (VERBOSE) {
System.out.println("TEST: now rewrite");
}
final FST<T> packed = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000), random.nextFloat());
if (VERBOSE) {
System.out.println("TEST: now verify packed FST");
}
if (prune1 == 0 && prune2 == 0) {
verifyUnPruned(inputMode, packed);
} else {
verifyPruned(inputMode, packed, prune1, prune2);
}
}
return fst;
}
// FST is complete
private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {
final FST<Long> fstLong;
final Set<Long> validOutputs;
long minLong = Long.MAX_VALUE;
long maxLong = Long.MIN_VALUE;
if (doReverseLookup) {
@SuppressWarnings("unchecked") FST<Long> fstLong0 = (FST<Long>) fst;
fstLong = fstLong0;
validOutputs = new HashSet<Long>();
for(InputOutput<T> pair: pairs) {
Long output = (Long) pair.output;
maxLong = Math.max(maxLong, output);
minLong = Math.min(minLong, output);
validOutputs.add(output);
}
} else {
fstLong = null;
validOutputs = null;
}
if (pairs.size() == 0) {
assertNull(fst);
return;
}
if (VERBOSE) {
System.out.println("TEST: now verify " + pairs.size() + " terms");
for(InputOutput<T> pair : pairs) {
assertNotNull(pair);
assertNotNull(pair.input);
assertNotNull(pair.output);
System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
}
}
assertNotNull(fst);
// visit valid pairs in order -- make sure all words
// are accepted, and FSTEnum's next() steps through
// them correctly
if (VERBOSE) {
System.out.println("TEST: check valid terms/next()");
}
{
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
for(InputOutput<T> pair : pairs) {
IntsRef term = pair.input;
if (VERBOSE) {
System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
}
Object output = run(fst, term, null);
assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
assertEquals(pair.output, output);
// verify enum's next
IntsRefFSTEnum.InputOutput<T> t = fstEnum.next();
assertNotNull(t);
assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input);
assertEquals(pair.output, t.output);
}
assertNull(fstEnum.next());
}
final Map<IntsRef,T> termsMap = new HashMap<IntsRef,T>();
for(InputOutput<T> pair : pairs) {
termsMap.put(pair.input, pair.output);
}
if (doReverseLookup && maxLong > minLong) {
// Do random lookups so we test null (output doesn't
// exist) case:
assertNull(Util.getByOutput(fstLong, minLong-7));
assertNull(Util.getByOutput(fstLong, maxLong+7));
final int num = atLeast(100);
for(int iter=0;iter<num;iter++) {
Long v = _TestUtil.nextLong(random, minLong, maxLong);
IntsRef input = Util.getByOutput(fstLong, v);
assertTrue(validOutputs.contains(v) || input == null);
}
}
// find random matching word and make sure it's valid
if (VERBOSE) {
System.out.println("TEST: verify random accepted terms");
}
final IntsRef scratch = new IntsRef(10);
int num = atLeast(500);
for(int iter=0;iter<num;iter++) {
T output = randomAcceptedWord(fst, scratch);
assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
assertEquals(termsMap.get(scratch), output);
if (doReverseLookup) {
//System.out.println("lookup output=" + output + " outs=" + fst.outputs);
IntsRef input = Util.getByOutput(fstLong, (Long) output);
assertNotNull(input);
//System.out.println(" got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
assertEquals(scratch, input);
}
} }
// test IntsRefFSTEnum.seek:
if (VERBOSE) {
System.out.println("TEST: verify seek");
}
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
num = atLeast(100);
for(int iter=0;iter<num;iter++) {
if (VERBOSE) {
System.out.println(" iter=" + iter);
}
if (random.nextBoolean()) {
// seek to term that doesn't exist:
while(true) {
final IntsRef term = toIntsRef(getRandomString(random), inputMode);
int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
if (pos < 0) {
pos = -(pos+1);
// ok doesn't exist
//System.out.println(" seek " + inputToString(inputMode, term));
final IntsRefFSTEnum.InputOutput<T> seekResult;
if (random.nextInt(3) == 0) {
if (VERBOSE) {
System.out.println(" do non-exist seekExact term=" + inputToString(inputMode, term));
}
seekResult = fstEnum.seekExact(term);
pos = -1;
} else if (random.nextBoolean()) {
if (VERBOSE) {
System.out.println(" do non-exist seekFloor term=" + inputToString(inputMode, term));
}
seekResult = fstEnum.seekFloor(term);
pos--;
} else {
if (VERBOSE) {
System.out.println(" do non-exist seekCeil term=" + inputToString(inputMode, term));
}
seekResult = fstEnum.seekCeil(term);
}
if (pos != -1 && pos < pairs.size()) {
//System.out.println(" got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output));
assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult);
if (VERBOSE) {
System.out.println(" got " + inputToString(inputMode, seekResult.input));
}
assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input);
assertEquals(pairs.get(pos).output, seekResult.output);
} else {
// seeked before start or beyond end
//System.out.println("seek=" + seekTerm);
assertNull("expected null but got " + (seekResult==null ? "null" : inputToString(inputMode, seekResult.input)), seekResult);
if (VERBOSE) {
System.out.println(" got null");
}
}
break;
}
}
} else {
// seek to term that does exist:
InputOutput<T> pair = pairs.get(random.nextInt(pairs.size()));
final IntsRefFSTEnum.InputOutput<T> seekResult;
if (random.nextInt(3) == 2) {
if (VERBOSE) {
System.out.println(" do exists seekExact term=" + inputToString(inputMode, pair.input));
}
seekResult = fstEnum.seekExact(pair.input);
} else if (random.nextBoolean()) {
if (VERBOSE) {
System.out.println(" do exists seekFloor " + inputToString(inputMode, pair.input));
}
seekResult = fstEnum.seekFloor(pair.input);
} else {
if (VERBOSE) {
System.out.println(" do exists seekCeil " + inputToString(inputMode, pair.input));
}
seekResult = fstEnum.seekCeil(pair.input);
}
assertNotNull(seekResult);
assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input);
assertEquals(pair.output, seekResult.output);
}
}
if (VERBOSE) {
System.out.println("TEST: mixed next/seek");
}
// test mixed next/seek
num = atLeast(100);
for(int iter=0;iter<num;iter++) {
if (VERBOSE) {
System.out.println("TEST: iter " + iter);
}
// reset:
fstEnum = new IntsRefFSTEnum<T>(fst);
int upto = -1;
while(true) {
boolean isDone = false;
if (upto == pairs.size()-1 || random.nextBoolean()) {
// next
upto++;
if (VERBOSE) {
System.out.println(" do next");
}
isDone = fstEnum.next() == null;
} else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) {
int attempt = 0;
for(;attempt<10;attempt++) {
IntsRef term = toIntsRef(getRandomString(random), inputMode);
if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) {
int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
assert pos < 0;
upto = -(pos+1);
if (random.nextBoolean()) {
upto--;
assertTrue(upto != -1);
if (VERBOSE) {
System.out.println(" do non-exist seekFloor(" + inputToString(inputMode, term) + ")");
}
isDone = fstEnum.seekFloor(term) == null;
} else {
if (VERBOSE) {
System.out.println(" do non-exist seekCeil(" + inputToString(inputMode, term) + ")");
}
isDone = fstEnum.seekCeil(term) == null;
}
break;
}
}
if (attempt == 10) {
continue;
}
} else {
final int inc = random.nextInt(pairs.size() - upto - 1);
upto += inc;
if (upto == -1) {
upto = 0;
}
if (random.nextBoolean()) {
if (VERBOSE) {
System.out.println(" do seekCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")");
}
isDone = fstEnum.seekCeil(pairs.get(upto).input) == null;
} else {
if (VERBOSE) {
System.out.println(" do seekFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")");
}
isDone = fstEnum.seekFloor(pairs.get(upto).input) == null;
}
}
if (VERBOSE) {
if (!isDone) {
System.out.println(" got " + inputToString(inputMode, fstEnum.current().input));
} else {
System.out.println(" got null");
}
}
if (upto == pairs.size()) {
assertTrue(isDone);
break;
} else {
assertFalse(isDone);
assertEquals(pairs.get(upto).input, fstEnum.current().input);
assertEquals(pairs.get(upto).output, fstEnum.current().output);
/*
if (upto < pairs.size()-1) {
int tryCount = 0;
while(tryCount < 10) {
final IntsRef t = toIntsRef(getRandomString(), inputMode);
if (pairs.get(upto).input.compareTo(t) < 0) {
final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0;
if (VERBOSE) {
System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected);
}
assertEquals(expected, fstEnum.beforeNext(t));
break;
}
tryCount++;
}
}
*/
}
}
}
}
private static class CountMinOutput<T> {
int count;
T output;
T finalOutput;
boolean isLeaf = true;
boolean isFinal;
}
// FST is pruned
private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {
if (VERBOSE) {
System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
for(InputOutput<T> pair : pairs) {
System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
}
}
// To validate the FST, we brute-force compute all prefixes
// in the terms, matched to their "common" outputs, prune that
// set according to the prune thresholds, then assert the FST
// matches that same set.
// NOTE: Crazy RAM intensive!!
//System.out.println("TEST: tally prefixes");
// build all prefixes
final Map<IntsRef,CountMinOutput<T>> prefixes = new HashMap<IntsRef,CountMinOutput<T>>();
final IntsRef scratch = new IntsRef(10);
for(InputOutput<T> pair: pairs) {
scratch.copyInts(pair.input);
for(int idx=0;idx<=pair.input.length;idx++) {
scratch.length = idx;
CountMinOutput<T> cmo = prefixes.get(scratch);
if (cmo == null) {
cmo = new CountMinOutput<T>();
cmo.count = 1;
cmo.output = pair.output;
prefixes.put(IntsRef.deepCopyOf(scratch), cmo);
} else {
cmo.count++;
T output1 = cmo.output;
if (output1.equals(outputs.getNoOutput())) {
output1 = outputs.getNoOutput();
}
T output2 = pair.output;
if (output2.equals(outputs.getNoOutput())) {
output2 = outputs.getNoOutput();
}
cmo.output = outputs.common(output1, output2);
}
if (idx == pair.input.length) {
cmo.isFinal = true;
cmo.finalOutput = cmo.output;
}
}
}
if (VERBOSE) {
System.out.println("TEST: now prune");
}
// prune 'em
final Iterator<Map.Entry<IntsRef,CountMinOutput<T>>> it = prefixes.entrySet().iterator();
while(it.hasNext()) {
Map.Entry<IntsRef,CountMinOutput<T>> ent = it.next();
final IntsRef prefix = ent.getKey();
final CountMinOutput<T> cmo = ent.getValue();
if (VERBOSE) {
System.out.println(" term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
}
final boolean keep;
if (prune1 > 0) {
keep = cmo.count >= prune1;
} else {
assert prune2 > 0;
if (prune2 > 1 && cmo.count >= prune2) {
keep = true;
} else if (prefix.length > 0) {
// consult our parent
scratch.length = prefix.length-1;
System.arraycopy(prefix.ints, prefix.offset, scratch.ints, 0, scratch.length);
final CountMinOutput<T> cmo2 = prefixes.get(scratch);
//System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
} else if (cmo.count >= prune2) {
keep = true;
} else {
keep = false;
}
}
if (!keep) {
it.remove();
//System.out.println(" remove");
} else {
// clear isLeaf for all ancestors
//System.out.println(" keep");
scratch.copyInts(prefix);
scratch.length--;
while(scratch.length >= 0) {
final CountMinOutput<T> cmo2 = prefixes.get(scratch);
if (cmo2 != null) {
//System.out.println(" clear isLeaf " + inputToString(inputMode, scratch));
cmo2.isLeaf = false;
}
scratch.length--;
}
}
}
if (VERBOSE) {
System.out.println("TEST: after prune");
for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
System.out.println(" " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
if (ent.getValue().isFinal) {
System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
}
}
}
if (prefixes.size() <= 1) {
assertNull(fst);
return;
}
assertNotNull(fst);
// make sure FST only enums valid prefixes
if (VERBOSE) {
System.out.println("TEST: check pruned enum");
}
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
IntsRefFSTEnum.InputOutput<T> current;
while((current = fstEnum.next()) != null) {
if (VERBOSE) {
System.out.println(" fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output));
}
final CountMinOutput<T> cmo = prefixes.get(current.input);
assertNotNull(cmo);
assertTrue(cmo.isLeaf || cmo.isFinal);
//if (cmo.isFinal && !cmo.isLeaf) {
if (cmo.isFinal) {
assertEquals(cmo.finalOutput, current.output);
} else {
assertEquals(cmo.output, current.output);
}
}
// make sure all non-pruned prefixes are present in the FST
if (VERBOSE) {
System.out.println("TEST: verify all prefixes");
}
final int[] stopNode = new int[1];
for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
if (ent.getKey().length > 0) {
final CountMinOutput<T> cmo = ent.getValue();
final T output = run(fst, ent.getKey(), stopNode);
if (VERBOSE) {
System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output));
}
// if (cmo.isFinal && !cmo.isLeaf) {
if (cmo.isFinal) {
assertEquals(cmo.finalOutput, output);
} else {
assertEquals(cmo.output, output);
}
assertEquals(ent.getKey().length, stopNode[0]);
}
}
}
}
public void testRandomWords() throws IOException { public void testRandomWords() throws IOException {
testRandomWords(1000, atLeast(2)); testRandomWords(1000, atLeast(2));
@ -1058,40 +282,11 @@ public class TestFSTs extends LuceneTestCase {
} }
} }
static String getRandomString(Random random) {
final String term;
if (random.nextBoolean()) {
term = _TestUtil.randomRealisticUnicodeString(random);
} else {
// we want to mix in limited-alphabet symbols so
// we get more sharing of the nodes given how few
// terms we are testing...
term = simpleRandomString(random);
}
return term;
}
@Nightly @Nightly
public void testBigSet() throws IOException { public void testBigSet() throws IOException {
testRandomWords(_TestUtil.nextInt(random(), 50000, 60000), 1); testRandomWords(_TestUtil.nextInt(random(), 50000, 60000), 1);
} }
static String inputToString(int inputMode, IntsRef term) {
return inputToString(inputMode, term, true);
}
private static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
if (!isValidUnicode) {
return term.toString();
} else if (inputMode == 0) {
// utf8
return toBytesRef(term).utf8ToString() + " " + term;
} else {
// utf32
return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term;
}
}
// Build FST for all unique terms in the test line docs // Build FST for all unique terms in the test line docs
// file, up until a time limit // file, up until a time limit
public void testRealTerms() throws Exception { public void testRealTerms() throws Exception {

View File

@ -0,0 +1,188 @@
package org.apache.lucene.util.fst;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
/**
* Wraps another Outputs implementation and encodes one or
* more of its output values. You can use this when a single
* input may need to map to more than one output,
* maintaining order: pass the same input with a different
* output by calling {@link Builder#add(IntsRef,Object)} multiple
* times. The builder will then combine the outputs using
* the {@link Outputs#merge(Object,Object)} method.
*
* <p>The resulting FST may not be minimal when an input has
* more than one output, as this requires pushing all
* multi-output values to a final state.
*
* <p>NOTE: this cannot wrap itself (ie you cannot make an
* FST with List&lt;List&lt;Object&gt;&gt; outputs using this).
*
* @lucene.experimental
*/
// NOTE: i think we could get a more compact FST if, instead
// of adding the same input multiple times with a different
// output each time, we added it only once with a
// pre-constructed List<T> output. This way the "multiple
// values" is fully opaque to the Builder/FST. It would
// require implementing the full algebra using set
// arithmetic (I think?); maybe SetOfOutputs is a good name.
@SuppressWarnings("unchecked")
public final class ListOfOutputs<T> extends Outputs<Object> {
private final Outputs<T> outputs;
public ListOfOutputs(Outputs<T> outputs) {
this.outputs = outputs;
}
@Override
public Object common(Object output1, Object output2) {
// These will never be a list:
return outputs.common((T) output1, (T) output2);
}
@Override
public Object subtract(Object object, Object inc) {
// These will never be a list:
return outputs.subtract((T) object, (T) inc);
}
@Override
public Object add(Object prefix, Object output) {
assert !(prefix instanceof List);
if (!(output instanceof List)) {
return outputs.add((T) prefix, (T) output);
} else {
List<T> outputList = (List<T>) output;
List<T> addedList = new ArrayList<T>(outputList.size());
for(T _output : outputList) {
addedList.add(outputs.add((T) prefix, _output));
}
return addedList;
}
}
@Override
public void write(Object output, DataOutput out) throws IOException {
assert !(output instanceof List);
outputs.write((T) output, out);
}
@Override
public void writeFinalOutput(Object output, DataOutput out) throws IOException {
if (!(output instanceof List)) {
out.writeVInt(1);
outputs.write((T) output, out);
} else {
List<T> outputList = (List<T>) output;
out.writeVInt(outputList.size());
for(T eachOutput : outputList) {
outputs.write(eachOutput, out);
}
}
}
@Override
public Object read(DataInput in) throws IOException {
return outputs.read(in);
}
@Override
public Object readFinalOutput(DataInput in) throws IOException {
int count = in.readVInt();
if (count == 1) {
return outputs.read(in);
} else {
List<T> outputList = new ArrayList<T>(count);
for(int i=0;i<count;i++) {
outputList.add(outputs.read(in));
}
return outputList;
}
}
@Override
public Object getNoOutput() {
return outputs.getNoOutput();
}
@Override
public String outputToString(Object output) {
if (!(output instanceof List)) {
return outputs.outputToString((T) output);
} else {
List<T> outputList = (List<T>) output;
StringBuilder b = new StringBuilder();
b.append('[');
for(int i=0;i<outputList.size();i++) {
if (i > 0) {
b.append(", ");
}
b.append(outputs.outputToString(outputList.get(i)));
}
b.append(']');
return b.toString();
}
}
@Override
public Object merge(Object first, Object second) {
List<T> outputList = new ArrayList<T>();
if (!(first instanceof List)) {
outputList.add((T) first);
} else {
outputList.addAll((List<T>) first);
}
if (!(second instanceof List)) {
outputList.add((T) second);
} else {
outputList.addAll((List<T>) second);
}
//System.out.println("MERGE: now " + outputList.size() + " first=" + outputToString(first) + " second=" + outputToString(second));
//System.out.println(" return " + outputToString(outputList));
return outputList;
}
@Override
public String toString() {
return "OneOrMoreOutputs(" + outputs + ")";
}
public List<T> asList(Object output) {
if (!(output instanceof List)) {
List<T> result = new ArrayList<T>(1);
result.add((T) output);
return result;
} else {
return (List<T>) output;
}
}
}

View File

@ -17,6 +17,21 @@ package org.apache.lucene.util.fst;
* limitations under the License. * limitations under the License.
*/ */
/**
* An FST {@link Outputs} implementation where each output
* is one or two non-negative long values. If it's a
* single output, Long is returned; else, TwoLongs. Order
* is preserved in the TwoLongs case, ie .first is the first
* input/output added to Builder, and .second is the
* second. You cannot store 0 output with this (that's
* reserved to mean "no output")!
*
* NOTE: the resulting FST is not guaranteed to be minimal!
* See {@link Builder}.
*
* @lucene.experimental
*/
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataInput;
@ -36,7 +51,6 @@ import org.apache.lucene.store.DataOutput;
* *
* @lucene.experimental * @lucene.experimental
*/ */
public final class UpToTwoPositiveIntOutputs extends Outputs<Object> { public final class UpToTwoPositiveIntOutputs extends Outputs<Object> {
/** Holds two long outputs. */ /** Holds two long outputs. */

View File

@ -0,0 +1,21 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Misc FST classes.
</body>
</html>

View File

@ -0,0 +1,237 @@
package org.apache.lucene.util.fst;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.fst.UpToTwoPositiveIntOutputs.TwoLongs;
import static org.apache.lucene.util.fst.FSTTester.getRandomString;
import static org.apache.lucene.util.fst.FSTTester.toIntsRef;
public class TestFSTsMisc extends LuceneTestCase {
private MockDirectoryWrapper dir;
@Override
public void setUp() throws Exception {
super.setUp();
dir = newMockDirectory();
dir.setPreventDoubleWrite(false);
}
@Override
public void tearDown() throws Exception {
// can be null if we force simpletext (funky, some kind of bug in test runner maybe)
if (dir != null) dir.close();
super.tearDown();
}
public void testRandomWords() throws IOException {
testRandomWords(1000, LuceneTestCase.atLeast(random(), 2));
//testRandomWords(100, 1);
}
private void testRandomWords(int maxNumWords, int numIter) throws IOException {
Random random = new Random(random().nextLong());
for(int iter=0;iter<numIter;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter " + iter);
}
for(int inputMode=0;inputMode<2;inputMode++) {
final int numWords = random.nextInt(maxNumWords+1);
Set<IntsRef> termsSet = new HashSet<IntsRef>();
IntsRef[] terms = new IntsRef[numWords];
while(termsSet.size() < numWords) {
final String term = getRandomString(random);
termsSet.add(toIntsRef(term, inputMode));
}
doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()]));
}
}
}
private void doTest(int inputMode, IntsRef[] terms) throws IOException {
Arrays.sort(terms);
// Up to two positive ints, shared, generally but not
// monotonically increasing
{
if (VERBOSE) {
System.out.println("TEST: now test UpToTwoPositiveIntOutputs");
}
final UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true);
final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length);
long lastOutput = 0;
for(int idx=0;idx<terms.length;idx++) {
// Sometimes go backwards
long value = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
while(value < 0) {
value = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
}
final Object output;
if (random().nextInt(5) == 3) {
long value2 = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
while(value2 < 0) {
value2 = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
}
List<Long> values = new ArrayList<Long>();
values.add(value);
values.add(value2);
output = values;
} else {
output = outputs.get(value);
}
pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
}
new FSTTester<Object>(random(), dir, inputMode, pairs, outputs, false) {
@Override
protected boolean outputsEqual(Object output1, Object output2) {
if (output1 instanceof TwoLongs && output2 instanceof List) {
TwoLongs twoLongs1 = (TwoLongs) output1;
return Arrays.asList(new Long[] {twoLongs1.first, twoLongs1.second}).equals(output2);
} else if (output2 instanceof TwoLongs && output1 instanceof List) {
TwoLongs twoLongs2 = (TwoLongs) output2;
return Arrays.asList(new Long[] {twoLongs2.first, twoLongs2.second}).equals(output1);
}
return output1.equals(output2);
}
}.doTest(false);
}
// ListOfOutputs(PositiveIntOutputs), generally but not
// monotonically increasing
{
if (VERBOSE) {
System.out.println("TEST: now test OneOrMoreOutputs");
}
final PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
final ListOfOutputs<Long> outputs = new ListOfOutputs<Long>(_outputs);
final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length);
long lastOutput = 0;
for(int idx=0;idx<terms.length;idx++) {
int outputCount = _TestUtil.nextInt(random(), 1, 7);
List<Long> values = new ArrayList<Long>();
for(int i=0;i<outputCount;i++) {
// Sometimes go backwards
long value = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
while(value < 0) {
value = lastOutput + _TestUtil.nextInt(random(), -100, 1000);
}
values.add(value);
lastOutput = value;
}
final Object output;
if (values.size() == 1) {
output = values.get(0);
} else {
output = values;
}
pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
}
new FSTTester<Object>(random(), dir, inputMode, pairs, outputs, false).doTest(false);
}
}
public void testListOfOutputs() throws Exception {
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
ListOfOutputs<Long> outputs = new ListOfOutputs<Long>(_outputs);
final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRef scratch = new IntsRef();
// Add the same input more than once and the outputs
// are merged:
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L);
final FST<Object> fst = builder.finish();
Object output = Util.get(fst, new BytesRef("a"));
assertNotNull(output);
List<Long> outputList = outputs.asList(output);
assertEquals(3, outputList.size());
assertEquals(1L, outputList.get(0).longValue());
assertEquals(3L, outputList.get(1).longValue());
assertEquals(0L, outputList.get(2).longValue());
output = Util.get(fst, new BytesRef("b"));
assertNotNull(output);
outputList = outputs.asList(output);
assertEquals(1, outputList.size());
assertEquals(17L, outputList.get(0).longValue());
}
public void testListOfOutputsEmptyString() throws Exception {
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
ListOfOutputs<Long> outputs = new ListOfOutputs<Long>(_outputs);
final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRef scratch = new IntsRef();
builder.add(scratch, 0L);
builder.add(scratch, 1L);
builder.add(scratch, 17L);
builder.add(scratch, 1L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L);
final FST<Object> fst = builder.finish();
Object output = Util.get(fst, new BytesRef(""));
assertNotNull(output);
List<Long> outputList = outputs.asList(output);
assertEquals(4, outputList.size());
assertEquals(0L, outputList.get(0).longValue());
assertEquals(1L, outputList.get(1).longValue());
assertEquals(17L, outputList.get(2).longValue());
assertEquals(1L, outputList.get(3).longValue());
output = Util.get(fst, new BytesRef("a"));
assertNotNull(output);
outputList = outputs.asList(output);
assertEquals(3, outputList.size());
assertEquals(1L, outputList.get(0).longValue());
assertEquals(3L, outputList.get(1).longValue());
assertEquals(0L, outputList.get(2).longValue());
output = Util.get(fst, new BytesRef("b"));
assertNotNull(output);
outputList = outputs.asList(output);
assertEquals(1, outputList.size());
assertEquals(0L, outputList.get(0).longValue());
}
}

View File

@ -0,0 +1,832 @@
package org.apache.lucene.util.fst;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
/** Helper class to test FSTs. */
public class FSTTester<T> {
final Random random;
final List<InputOutput<T>> pairs;
final int inputMode;
final Outputs<T> outputs;
final Directory dir;
final boolean doReverseLookup;
public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs, boolean doReverseLookup) {
this.random = random;
this.dir = dir;
this.inputMode = inputMode;
this.pairs = pairs;
this.outputs = outputs;
this.doReverseLookup = doReverseLookup;
}
static String inputToString(int inputMode, IntsRef term) {
return inputToString(inputMode, term, true);
}
static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
if (!isValidUnicode) {
return term.toString();
} else if (inputMode == 0) {
// utf8
return toBytesRef(term).utf8ToString() + " " + term;
} else {
// utf32
return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term;
}
}
private static BytesRef toBytesRef(IntsRef ir) {
BytesRef br = new BytesRef(ir.length);
for(int i=0;i<ir.length;i++) {
int x = ir.ints[ir.offset+i];
assert x >= 0 && x <= 255;
br.bytes[i] = (byte) x;
}
br.length = ir.length;
return br;
}
static String getRandomString(Random random) {
final String term;
if (random.nextBoolean()) {
term = _TestUtil.randomRealisticUnicodeString(random);
} else {
// we want to mix in limited-alphabet symbols so
// we get more sharing of the nodes given how few
// terms we are testing...
term = simpleRandomString(random);
}
return term;
}
static String simpleRandomString(Random r) {
final int end = r.nextInt(10);
if (end == 0) {
// allow 0 length
return "";
}
final char[] buffer = new char[end];
for (int i = 0; i < end; i++) {
buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
}
return new String(buffer, 0, end);
}
static IntsRef toIntsRef(String s, int inputMode) {
return toIntsRef(s, inputMode, new IntsRef(10));
}
static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
if (inputMode == 0) {
// utf8
return toIntsRef(new BytesRef(s), ir);
} else {
// utf32
return toIntsRefUTF32(s, ir);
}
}
static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
final int charLength = s.length();
int charIdx = 0;
int intIdx = 0;
while(charIdx < charLength) {
if (intIdx == ir.ints.length) {
ir.grow(intIdx+1);
}
final int utf32 = s.codePointAt(charIdx);
ir.ints[intIdx] = utf32;
charIdx += Character.charCount(utf32);
intIdx++;
}
ir.length = intIdx;
return ir;
}
static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
if (br.length > ir.ints.length) {
ir.grow(br.length);
}
for(int i=0;i<br.length;i++) {
ir.ints[i] = br.bytes[br.offset+i]&0xFF;
}
ir.length = br.length;
return ir;
}
/** Holds one input/output pair. */
public static class InputOutput<T> implements Comparable<InputOutput<T>> {
public final IntsRef input;
public final T output;
public InputOutput(IntsRef input, T output) {
this.input = input;
this.output = output;
}
public int compareTo(InputOutput<T> other) {
if (other instanceof InputOutput) {
return input.compareTo((other).input);
} else {
throw new IllegalArgumentException();
}
}
}
public void doTest(boolean testPruning) throws IOException {
// no pruning
doTest(0, 0, true);
if (testPruning) {
// simple pruning
doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true);
// leafy pruning
doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true);
}
}
// runs the term, returning the output, or null if term
// isn't accepted. if prefixLength is non-null it must be
// length 1 int array; prefixLength[0] is set to the length
// of the term prefix that matches
private T run(FST<T> fst, IntsRef term, int[] prefixLength) throws IOException {
assert prefixLength == null || prefixLength.length == 1;
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
final T NO_OUTPUT = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
final FST.BytesReader fstReader = fst.getBytesReader(0);
for(int i=0;i<=term.length;i++) {
final int label;
if (i == term.length) {
label = FST.END_LABEL;
} else {
label = term.ints[term.offset+i];
}
// System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
if (fst.findTargetArc(label, arc, arc, fstReader) == null) {
// System.out.println(" not found");
if (prefixLength != null) {
prefixLength[0] = i;
return output;
} else {
return null;
}
}
output = fst.outputs.add(output, arc.output);
}
if (prefixLength != null) {
prefixLength[0] = term.length;
}
return output;
}
private T randomAcceptedWord(FST<T> fst, IntsRef in) throws IOException {
FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
final List<FST.Arc<T>> arcs = new ArrayList<FST.Arc<T>>();
in.length = 0;
in.offset = 0;
final T NO_OUTPUT = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
final FST.BytesReader fstReader = fst.getBytesReader(0);
while(true) {
// read all arcs:
fst.readFirstTargetArc(arc, arc, fstReader);
arcs.add(new FST.Arc<T>().copyFrom(arc));
while(!arc.isLast()) {
fst.readNextArc(arc, fstReader);
arcs.add(new FST.Arc<T>().copyFrom(arc));
}
// pick one
arc = arcs.get(random.nextInt(arcs.size()));
arcs.clear();
// accumulate output
output = fst.outputs.add(output, arc.output);
// append label
if (arc.label == FST.END_LABEL) {
break;
}
if (in.ints.length == in.length) {
in.grow(1+in.length);
}
in.ints[in.length++] = arc.label;
}
return output;
}
FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
if (LuceneTestCase.VERBOSE) {
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
}
final boolean willRewrite = random.nextBoolean();
final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2,
prune1==0 && prune2==0,
allowRandomSuffixSharing ? random.nextBoolean() : true,
allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
outputs,
null,
willRewrite);
for(InputOutput<T> pair : pairs) {
if (pair.output instanceof List) {
@SuppressWarnings("unchecked") List<Long> longValues = (List<Long>) pair.output;
@SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder;
for(Long value : longValues) {
builderObject.add(pair.input, value);
}
} else {
builder.add(pair.input, pair.output);
}
}
FST<T> fst = builder.finish();
if (random.nextBoolean() && fst != null && !willRewrite) {
IOContext context = LuceneTestCase.newIOContext(random);
IndexOutput out = dir.createOutput("fst.bin", context);
fst.save(out);
out.close();
IndexInput in = dir.openInput("fst.bin", context);
try {
fst = new FST<T>(in, outputs);
} finally {
in.close();
dir.deleteFile("fst.bin");
}
}
if (LuceneTestCase.VERBOSE && pairs.size() <= 20 && fst != null) {
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
Util.toDot(fst, w, false, false);
w.close();
System.out.println("SAVED out.dot");
}
if (LuceneTestCase.VERBOSE) {
if (fst == null) {
System.out.println(" fst has 0 nodes (fully pruned)");
} else {
System.out.println(" fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs");
}
}
if (prune1 == 0 && prune2 == 0) {
verifyUnPruned(inputMode, fst);
} else {
verifyPruned(inputMode, fst, prune1, prune2);
}
if (willRewrite && fst != null) {
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now rewrite");
}
final FST<T> packed = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000), random.nextFloat());
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now verify packed FST");
}
if (prune1 == 0 && prune2 == 0) {
verifyUnPruned(inputMode, packed);
} else {
verifyPruned(inputMode, packed, prune1, prune2);
}
}
return fst;
}
protected boolean outputsEqual(T a, T b) {
return a.equals(b);
}
// FST is complete
private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {
final FST<Long> fstLong;
final Set<Long> validOutputs;
long minLong = Long.MAX_VALUE;
long maxLong = Long.MIN_VALUE;
if (doReverseLookup) {
@SuppressWarnings("unchecked") FST<Long> fstLong0 = (FST<Long>) fst;
fstLong = fstLong0;
validOutputs = new HashSet<Long>();
for(InputOutput<T> pair: pairs) {
Long output = (Long) pair.output;
maxLong = Math.max(maxLong, output);
minLong = Math.min(minLong, output);
validOutputs.add(output);
}
} else {
fstLong = null;
validOutputs = null;
}
if (pairs.size() == 0) {
assertNull(fst);
return;
}
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now verify " + pairs.size() + " terms");
for(InputOutput<T> pair : pairs) {
assertNotNull(pair);
assertNotNull(pair.input);
assertNotNull(pair.output);
System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
}
}
assertNotNull(fst);
// visit valid pairs in order -- make sure all words
// are accepted, and FSTEnum's next() steps through
// them correctly
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: check valid terms/next()");
}
{
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
for(InputOutput<T> pair : pairs) {
IntsRef term = pair.input;
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
}
T output = run(fst, term, null);
assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
assertTrue(outputsEqual(pair.output, output));
// verify enum's next
IntsRefFSTEnum.InputOutput<T> t = fstEnum.next();
assertNotNull(t);
assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input);
assertTrue(outputsEqual(pair.output, t.output));
}
assertNull(fstEnum.next());
}
final Map<IntsRef,T> termsMap = new HashMap<IntsRef,T>();
for(InputOutput<T> pair : pairs) {
termsMap.put(pair.input, pair.output);
}
if (doReverseLookup && maxLong > minLong) {
// Do random lookups so we test null (output doesn't
// exist) case:
assertNull(Util.getByOutput(fstLong, minLong-7));
assertNull(Util.getByOutput(fstLong, maxLong+7));
final int num = LuceneTestCase.atLeast(random, 100);
for(int iter=0;iter<num;iter++) {
Long v = _TestUtil.nextLong(random, minLong, maxLong);
IntsRef input = Util.getByOutput(fstLong, v);
assertTrue(validOutputs.contains(v) || input == null);
}
}
// find random matching word and make sure it's valid
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: verify random accepted terms");
}
final IntsRef scratch = new IntsRef(10);
int num = LuceneTestCase.atLeast(random, 500);
for(int iter=0;iter<num;iter++) {
T output = randomAcceptedWord(fst, scratch);
assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
assertTrue(outputsEqual(termsMap.get(scratch), output));
if (doReverseLookup) {
//System.out.println("lookup output=" + output + " outs=" + fst.outputs);
IntsRef input = Util.getByOutput(fstLong, (Long) output);
assertNotNull(input);
//System.out.println(" got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
assertEquals(scratch, input);
}
}
// test IntsRefFSTEnum.seek:
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: verify seek");
}
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
num = LuceneTestCase.atLeast(random, 100);
for(int iter=0;iter<num;iter++) {
if (LuceneTestCase.VERBOSE) {
System.out.println(" iter=" + iter);
}
if (random.nextBoolean()) {
// seek to term that doesn't exist:
while(true) {
final IntsRef term = toIntsRef(getRandomString(random), inputMode);
int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
if (pos < 0) {
pos = -(pos+1);
// ok doesn't exist
//System.out.println(" seek " + inputToString(inputMode, term));
final IntsRefFSTEnum.InputOutput<T> seekResult;
if (random.nextInt(3) == 0) {
if (LuceneTestCase.VERBOSE) {
System.out.println(" do non-exist seekExact term=" + inputToString(inputMode, term));
}
seekResult = fstEnum.seekExact(term);
pos = -1;
} else if (random.nextBoolean()) {
if (LuceneTestCase.VERBOSE) {
System.out.println(" do non-exist seekFloor term=" + inputToString(inputMode, term));
}
seekResult = fstEnum.seekFloor(term);
pos--;
} else {
if (LuceneTestCase.VERBOSE) {
System.out.println(" do non-exist seekCeil term=" + inputToString(inputMode, term));
}
seekResult = fstEnum.seekCeil(term);
}
if (pos != -1 && pos < pairs.size()) {
//System.out.println(" got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output));
assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult);
if (LuceneTestCase.VERBOSE) {
System.out.println(" got " + inputToString(inputMode, seekResult.input));
}
assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input);
assertTrue(outputsEqual(pairs.get(pos).output, seekResult.output));
} else {
// seeked before start or beyond end
//System.out.println("seek=" + seekTerm);
assertNull("expected null but got " + (seekResult==null ? "null" : inputToString(inputMode, seekResult.input)), seekResult);
if (LuceneTestCase.VERBOSE) {
System.out.println(" got null");
}
}
break;
}
}
} else {
// seek to term that does exist:
InputOutput<T> pair = pairs.get(random.nextInt(pairs.size()));
final IntsRefFSTEnum.InputOutput<T> seekResult;
if (random.nextInt(3) == 2) {
if (LuceneTestCase.VERBOSE) {
System.out.println(" do exists seekExact term=" + inputToString(inputMode, pair.input));
}
seekResult = fstEnum.seekExact(pair.input);
} else if (random.nextBoolean()) {
if (LuceneTestCase.VERBOSE) {
System.out.println(" do exists seekFloor " + inputToString(inputMode, pair.input));
}
seekResult = fstEnum.seekFloor(pair.input);
} else {
if (LuceneTestCase.VERBOSE) {
System.out.println(" do exists seekCeil " + inputToString(inputMode, pair.input));
}
seekResult = fstEnum.seekCeil(pair.input);
}
assertNotNull(seekResult);
assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input);
assertTrue(outputsEqual(pair.output, seekResult.output));
}
}
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: mixed next/seek");
}
// test mixed next/seek
num = LuceneTestCase.atLeast(random, 100);
for(int iter=0;iter<num;iter++) {
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: iter " + iter);
}
// reset:
fstEnum = new IntsRefFSTEnum<T>(fst);
int upto = -1;
while(true) {
boolean isDone = false;
if (upto == pairs.size()-1 || random.nextBoolean()) {
// next
upto++;
if (LuceneTestCase.VERBOSE) {
System.out.println(" do next");
}
isDone = fstEnum.next() == null;
} else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) {
int attempt = 0;
for(;attempt<10;attempt++) {
IntsRef term = toIntsRef(getRandomString(random), inputMode);
if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) {
int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
assert pos < 0;
upto = -(pos+1);
if (random.nextBoolean()) {
upto--;
assertTrue(upto != -1);
if (LuceneTestCase.VERBOSE) {
System.out.println(" do non-exist seekFloor(" + inputToString(inputMode, term) + ")");
}
isDone = fstEnum.seekFloor(term) == null;
} else {
if (LuceneTestCase.VERBOSE) {
System.out.println(" do non-exist seekCeil(" + inputToString(inputMode, term) + ")");
}
isDone = fstEnum.seekCeil(term) == null;
}
break;
}
}
if (attempt == 10) {
continue;
}
} else {
final int inc = random.nextInt(pairs.size() - upto - 1);
upto += inc;
if (upto == -1) {
upto = 0;
}
if (random.nextBoolean()) {
if (LuceneTestCase.VERBOSE) {
System.out.println(" do seekCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")");
}
isDone = fstEnum.seekCeil(pairs.get(upto).input) == null;
} else {
if (LuceneTestCase.VERBOSE) {
System.out.println(" do seekFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")");
}
isDone = fstEnum.seekFloor(pairs.get(upto).input) == null;
}
}
if (LuceneTestCase.VERBOSE) {
if (!isDone) {
System.out.println(" got " + inputToString(inputMode, fstEnum.current().input));
} else {
System.out.println(" got null");
}
}
if (upto == pairs.size()) {
assertTrue(isDone);
break;
} else {
assertFalse(isDone);
assertEquals(pairs.get(upto).input, fstEnum.current().input);
assertTrue(outputsEqual(pairs.get(upto).output, fstEnum.current().output));
/*
if (upto < pairs.size()-1) {
int tryCount = 0;
while(tryCount < 10) {
final IntsRef t = toIntsRef(getRandomString(), inputMode);
if (pairs.get(upto).input.compareTo(t) < 0) {
final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0;
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected);
}
assertEquals(expected, fstEnum.beforeNext(t));
break;
}
tryCount++;
}
}
*/
}
}
}
}
private static class CountMinOutput<T> {
int count;
T output;
T finalOutput;
boolean isLeaf = true;
boolean isFinal;
}
// FST is pruned
private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
for(InputOutput<T> pair : pairs) {
System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
}
}
// To validate the FST, we brute-force compute all prefixes
// in the terms, matched to their "common" outputs, prune that
// set according to the prune thresholds, then assert the FST
// matches that same set.
// NOTE: Crazy RAM intensive!!
//System.out.println("TEST: tally prefixes");
// build all prefixes
final Map<IntsRef,CountMinOutput<T>> prefixes = new HashMap<IntsRef,CountMinOutput<T>>();
final IntsRef scratch = new IntsRef(10);
for(InputOutput<T> pair: pairs) {
scratch.copyInts(pair.input);
for(int idx=0;idx<=pair.input.length;idx++) {
scratch.length = idx;
CountMinOutput<T> cmo = prefixes.get(scratch);
if (cmo == null) {
cmo = new CountMinOutput<T>();
cmo.count = 1;
cmo.output = pair.output;
prefixes.put(IntsRef.deepCopyOf(scratch), cmo);
} else {
cmo.count++;
T output1 = cmo.output;
if (output1.equals(outputs.getNoOutput())) {
output1 = outputs.getNoOutput();
}
T output2 = pair.output;
if (output2.equals(outputs.getNoOutput())) {
output2 = outputs.getNoOutput();
}
cmo.output = outputs.common(output1, output2);
}
if (idx == pair.input.length) {
cmo.isFinal = true;
cmo.finalOutput = cmo.output;
}
}
}
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now prune");
}
// prune 'em
final Iterator<Map.Entry<IntsRef,CountMinOutput<T>>> it = prefixes.entrySet().iterator();
while(it.hasNext()) {
Map.Entry<IntsRef,CountMinOutput<T>> ent = it.next();
final IntsRef prefix = ent.getKey();
final CountMinOutput<T> cmo = ent.getValue();
if (LuceneTestCase.VERBOSE) {
System.out.println(" term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
}
final boolean keep;
if (prune1 > 0) {
keep = cmo.count >= prune1;
} else {
assert prune2 > 0;
if (prune2 > 1 && cmo.count >= prune2) {
keep = true;
} else if (prefix.length > 0) {
// consult our parent
scratch.length = prefix.length-1;
System.arraycopy(prefix.ints, prefix.offset, scratch.ints, 0, scratch.length);
final CountMinOutput<T> cmo2 = prefixes.get(scratch);
//System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
} else if (cmo.count >= prune2) {
keep = true;
} else {
keep = false;
}
}
if (!keep) {
it.remove();
//System.out.println(" remove");
} else {
// clear isLeaf for all ancestors
//System.out.println(" keep");
scratch.copyInts(prefix);
scratch.length--;
while(scratch.length >= 0) {
final CountMinOutput<T> cmo2 = prefixes.get(scratch);
if (cmo2 != null) {
//System.out.println(" clear isLeaf " + inputToString(inputMode, scratch));
cmo2.isLeaf = false;
}
scratch.length--;
}
}
}
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: after prune");
for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
System.out.println(" " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
if (ent.getValue().isFinal) {
System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
}
}
}
if (prefixes.size() <= 1) {
assertNull(fst);
return;
}
assertNotNull(fst);
// make sure FST only enums valid prefixes
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: check pruned enum");
}
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
IntsRefFSTEnum.InputOutput<T> current;
while((current = fstEnum.next()) != null) {
if (LuceneTestCase.VERBOSE) {
System.out.println(" fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output));
}
final CountMinOutput<T> cmo = prefixes.get(current.input);
assertNotNull(cmo);
assertTrue(cmo.isLeaf || cmo.isFinal);
//if (cmo.isFinal && !cmo.isLeaf) {
if (cmo.isFinal) {
assertEquals(cmo.finalOutput, current.output);
} else {
assertEquals(cmo.output, current.output);
}
}
// make sure all non-pruned prefixes are present in the FST
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: verify all prefixes");
}
final int[] stopNode = new int[1];
for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
if (ent.getKey().length > 0) {
final CountMinOutput<T> cmo = ent.getValue();
final T output = run(fst, ent.getKey(), stopNode);
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output));
}
// if (cmo.isFinal && !cmo.isLeaf) {
if (cmo.isFinal) {
assertEquals(cmo.finalOutput, output);
} else {
assertEquals(cmo.output, output);
}
assertEquals(ent.getKey().length, stopNode[0]);
}
}
}
}

View File

@ -0,0 +1,25 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Support for FST testing.
</body>
</html>