LUCENE-3121: add reverse-lookup by long to FST, when output grows strictly monotonically w/ input (eg, ord, address, etc.)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1233381 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-01-19 14:07:40 +00:00
parent 47f474735f
commit 5bc7f5bbd0
7 changed files with 216 additions and 37 deletions

View File

@ -786,6 +786,10 @@ New Features
will be passed null for any open endpoint. (Ingo Renner, Adriano
Crestani, yonik, Mike McCandless)
* LUCENE-3121: Add sugar reverse lookup (given an output, find the
input mapping to it) for FSTs that have strictly monotonic long
outputs (such as an ord). (Mike McCandless)
Bug fixes
* LUCENE-3595: Fixed FieldCacheRangeFilter and FieldCacheTermsFilter

View File

@ -274,7 +274,7 @@ public class FST<T> {
getFirstArc(arc);
final BytesReader in = getBytesReader(0);
if (targetHasArcs(arc)) {
readFirstRealArc(arc.target, arc);
readFirstRealArc(arc.target, arc, in);
while(true) {
assert arc.label != END_LABEL;
if (arc.label < cachedRootArcs.length) {
@ -666,14 +666,12 @@ public class FST<T> {
//System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + arc.isLast() + " output=" + outputs.outputToString(arc.output));
return arc;
} else {
return readFirstRealArc(follow.target, arc);
return readFirstRealArc(follow.target, arc, getBytesReader(0));
}
}
public Arc<T> readFirstRealArc(int address, Arc<T> arc) throws IOException {
final BytesReader in = getBytesReader(address);
public Arc<T> readFirstRealArc(int address, Arc<T> arc, final BytesReader in) throws IOException {
in.pos = address;
arc.flags = in.readByte();
if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) {
@ -715,7 +713,7 @@ public class FST<T> {
// This arc went to virtual final node, ie has no outgoing arcs
return null;
}
return readFirstRealArc(arc.nextArc, arc);
return readFirstRealArc(arc.nextArc, arc, getBytesReader(0));
} else {
return readNextRealArc(arc, getBytesReader(0));
}
@ -989,14 +987,16 @@ public class FST<T> {
public final BytesReader getBytesReader(int pos) {
// TODO: maybe re-use via ThreadLocal?
return new BytesReader(pos);
return new BytesReader(bytes, pos);
}
// Non-static: reads byte[] from FST
final class BytesReader extends DataInput {
/** Expert */
public final static class BytesReader extends DataInput {
final byte[] bytes;
int pos;
public BytesReader(int pos) {
public BytesReader(byte[] bytes, int pos) {
this.bytes = bytes;
this.pos = pos;
}

View File

@ -143,7 +143,7 @@ abstract class FSTEnum<T> {
// Arcs are fixed array -- use binary search to find
// the target.
final FST<T>.BytesReader in = fst.getBytesReader(0);
final FST.BytesReader in = fst.getBytesReader(0);
int low = arc.arcIdx;
int high = arc.numArcs-1;
int mid = 0;
@ -281,7 +281,7 @@ abstract class FSTEnum<T> {
// Arcs are fixed array -- use binary search to find
// the target.
final FST<T>.BytesReader in = fst.getBytesReader(0);
final FST.BytesReader in = fst.getBytesReader(0);
int low = arc.arcIdx;
int high = arc.numArcs-1;
int mid = 0;

View File

@ -34,9 +34,8 @@ final class NodeHash<T> {
this.fst = fst;
}
private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address) throws IOException {
final FST<T>.BytesReader in = fst.getBytesReader(0);
fst.readFirstRealArc(address, scratchArc);
private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address, FST.BytesReader in) throws IOException {
fst.readFirstRealArc(address, scratchArc, in);
if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) {
return false;
}
@ -88,10 +87,10 @@ final class NodeHash<T> {
// hash code for a frozen node
private int hash(int node) throws IOException {
final int PRIME = 31;
final FST<T>.BytesReader in = fst.getBytesReader(0);
final FST.BytesReader in = fst.getBytesReader(0);
//System.out.println("hash frozen");
int h = 0;
fst.readFirstRealArc(node, scratchArc);
fst.readFirstRealArc(node, scratchArc, in);
while(true) {
//System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal());
h = PRIME * h + scratchArc.label;
@ -112,6 +111,7 @@ final class NodeHash<T> {
public int add(Builder.UnCompiledNode<T> node) throws IOException {
// System.out.println("hash: add count=" + count + " vs " + table.length);
final FST.BytesReader in = fst.getBytesReader(0);
final int h = hash(node);
int pos = h & mask;
int c = 0;
@ -128,7 +128,7 @@ final class NodeHash<T> {
rehash();
}
return address;
} else if (nodesEqual(node, v)) {
} else if (nodesEqual(node, v, in)) {
// same node is already here
return v;
}

View File

@ -135,4 +135,9 @@ public final class PositiveIntOutputs extends Outputs<Long> {
public String outputToString(Long output) {
return output.toString();
}
/** Renders this outputs implementation's name followed by the
 *  current value of its doShare field. */
@Override
public String toString() {
  final StringBuilder sb = new StringBuilder("PositiveIntOutputs(doShare=");
  sb.append(doShare);
  sb.append(")");
  return sb.toString();
}
}

View File

@ -87,6 +87,113 @@ public final class Util {
}
}
// TODO: parameterize the FST type <T> and allow passing in a
// comparator; eg maybe your output is a PairOutput and
// one of the outputs in the pair is monotonic so you
// compare by that
/** Reverse lookup (lookup by output instead of by input),
* in the special case when your FST's outputs are
* strictly ascending. This locates the input/output
* pair where the output is equal to the target, and will
* return null if that output does not exist.
*
* <p>NOTE: this only works with {@code FST<Long>}, only
* works when the outputs are ascending in order with
* the inputs and only works when you shared
* the outputs (pass doShare=true to {@link
* PositiveIntOutputs#getSingleton}).
* For example, simple ordinals (0, 1,
* 2, ...), or file offsets (when appending to a file)
* fit this.
*
* @param fst the FST to search
* @param targetOutput the output whose input should be located
* @return a newly allocated IntsRef holding the arc labels of the
* matching input, or null if no input maps to exactly this output
* @throws IOException propagated from the FST arc-reading calls */
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {
// One reader is shared by every arc read below.
final FST.BytesReader in = fst.getBytesReader(0);
// TODO: would be nice not to alloc this on every lookup
FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>());
// Backing storage for prevArc in the inner loop.
FST.Arc<Long> scratchArc = new FST.Arc<Long>();
final IntsRef result = new IntsRef();
// Sum of the arc outputs along the path followed so far.
long output = arc.output;
// Number of labels written into result.ints so far.
int upto = 0;
//System.out.println("reverseLookup output=" + targetOutput);
// Each iteration of this loop descends one node deeper into the FST.
while(true) {
if (arc.isFinal()) {
// The path so far is a complete input; its total output
// includes the arc's final output.
final long finalOutput = output + arc.nextFinalOutput;
//System.out.println(" isFinal finalOutput=" + finalOutput);
if (finalOutput == targetOutput) {
result.length = upto;
//System.out.println(" found!");
return result;
} else if (finalOutput > targetOutput) {
// Already past the target; give up on this path.
//System.out.println(" not found!");
return null;
}
}
if (fst.targetHasArcs(arc)) {
//System.out.println(" targetHasArcs");
// Make room for the one label this level will append.
if (result.ints.length == upto) {
result.grow(1+upto);
}
fst.readFirstRealArc(arc.target, arc, in);
// The arc scanned just before the current one; null while
// we are still on the node's first arc.
FST.Arc<Long> prevArc = null;
// TODO: we could do binary search here if node arcs
// are array'd:
// Linear scan over this node's arcs to pick the one to follow.
while(true) {
//System.out.println(" cycle label=" + arc.label + " output=" + arc.output);
// This is the min output we'd hit if we follow
// this arc:
final long minArcOutput = output + arc.output;
if (minArcOutput == targetOutput) {
// Recurse on this arc:
//System.out.println(" match! break");
output = minArcOutput;
result.ints[upto++] = arc.label;
break;
} else if (minArcOutput > targetOutput) {
// This arc starts past the target, so the target (if it
// exists) can only be reached through the previous arc.
if (prevArc == null) {
// Output doesn't exist
return null;
} else {
// Recurse on previous arc:
arc.copyFrom(prevArc);
result.ints[upto++] = arc.label;
output += arc.output;
//System.out.println(" recurse prev label=" + (char) arc.label + " output=" + output);
break;
}
} else if (arc.isLast()) {
// No further arcs to compare against, so the last arc is
// the only remaining candidate.
// Recurse on this arc:
output = minArcOutput;
//System.out.println(" recurse last label=" + (char) arc.label + " output=" + output);
result.ints[upto++] = arc.label;
break;
} else {
// Read next arc in this node:
// Snapshot the current arc first: readNextRealArc is handed
// the same arc object and so reads the next arc into it.
prevArc = scratchArc;
prevArc.copyFrom(arc);
//System.out.println(" after copy label=" + (char) prevArc.label + " vs " + (char) arc.label);
fst.readNextRealArc(arc, in);
}
}
} else {
// Nothing left to descend into and no match was found above.
//System.out.println(" no target arcs; not found!");
return null;
}
}
}
/**
* Dumps an {@link FST} to a GraphViz's <code>dot</code> language description
* for visualization. Example of use:
@ -356,4 +463,15 @@ public final class Util {
scratch.length = input.length;
return scratch;
}
/** Just converts IntsRef to BytesRef; you must ensure the
 *  int values fit into a byte.  When assertions are enabled,
 *  a value outside -128..255 trips an assert instead of being
 *  silently truncated by the narrowing cast.
 *
 *  @param input the ints to convert; read from input.offset for
 *         input.length entries
 *  @param scratch destination, grown to hold input.length bytes
 *         and returned
 *  @return scratch, with its length set to input.length */
public static BytesRef toBytesRef(IntsRef input, BytesRef scratch) {
  scratch.grow(input.length);
  for(int i=0;i<input.length;i++) {
    final int value = input.ints[i+input.offset];
    // Accept both the signed (-128..127) and unsigned (0..255)
    // readings of a byte; anything else would lose bits in the cast.
    assert value >= Byte.MIN_VALUE && value <= 255: "value " + value + " doesn't fit into byte";
    // NOTE(review): bytes are written starting at index 0, which
    // presumably assumes scratch.offset == 0 -- confirm against BytesRef.
    scratch.bytes[i] = (byte) value;
  }
  scratch.length = input.length;
  return scratch;
}
}

View File

@ -161,7 +161,7 @@ public class TestFSTs extends LuceneTestCase {
for(IntsRef term : terms2) {
pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
}
FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs, false).doTest(0, 0, false);
assertNotNull(fst);
assertEquals(22, fst.getNodeCount());
assertEquals(27, fst.getArcCount());
@ -174,7 +174,7 @@ public class TestFSTs extends LuceneTestCase {
for(int idx=0;idx<terms2.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
}
final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false);
assertNotNull(fst);
assertEquals(22, fst.getNodeCount());
assertEquals(27, fst.getArcCount());
@ -189,7 +189,7 @@ public class TestFSTs extends LuceneTestCase {
final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output));
}
final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs, false).doTest(0, 0, false);
assertNotNull(fst);
assertEquals(24, fst.getNodeCount());
assertEquals(30, fst.getArcCount());
@ -222,7 +222,7 @@ public class TestFSTs extends LuceneTestCase {
for(IntsRef term : terms) {
pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
}
new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<Object>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// PositiveIntOutput (ord)
@ -232,12 +232,13 @@ public class TestFSTs extends LuceneTestCase {
for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx)));
}
new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest();
}
// PositiveIntOutput (random monotonically increasing positive number)
{
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
final boolean doShare = random.nextBoolean();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(doShare);
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
long lastOutput = 0;
for(int idx=0;idx<terms.length;idx++) {
@ -245,7 +246,7 @@ public class TestFSTs extends LuceneTestCase {
lastOutput = value;
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value)));
}
new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, doShare).doTest();
}
// PositiveIntOutput (random positive number)
@ -255,7 +256,7 @@ public class TestFSTs extends LuceneTestCase {
for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE));
}
new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// Pair<ord, (random monotonically increasing positive number>
@ -272,7 +273,7 @@ public class TestFSTs extends LuceneTestCase {
outputs.get(o1.get(idx),
o2.get(value))));
}
new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// Sequence-of-bytes
@ -284,7 +285,7 @@ public class TestFSTs extends LuceneTestCase {
final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
pairs.add(new FSTTester.InputOutput<BytesRef>(terms[idx], output));
}
new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// Sequence-of-ints
@ -300,7 +301,7 @@ public class TestFSTs extends LuceneTestCase {
}
pairs.add(new FSTTester.InputOutput<IntsRef>(terms[idx], output));
}
new FSTTester<IntsRef>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<IntsRef>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// Up to two positive ints, shared, generally but not
@ -330,7 +331,7 @@ public class TestFSTs extends LuceneTestCase {
}
pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
}
new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<Object>(random, dir, inputMode, pairs, outputs, false).doTest();
}
}
@ -341,13 +342,15 @@ public class TestFSTs extends LuceneTestCase {
final int inputMode;
final Outputs<T> outputs;
final Directory dir;
final boolean doReverseLookup;
public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs) {
public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs, boolean doReverseLookup) {
this.random = random;
this.dir = dir;
this.inputMode = inputMode;
this.pairs = pairs;
this.outputs = outputs;
this.doReverseLookup = doReverseLookup;
}
private static class InputOutput<T> implements Comparable<InputOutput<T>> {
@ -525,6 +528,26 @@ public class TestFSTs extends LuceneTestCase {
// FST is complete
private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {
final FST<Long> fstLong;
final Set<Long> validOutputs;
long minLong = Long.MAX_VALUE;
long maxLong = Long.MIN_VALUE;
if (doReverseLookup) {
@SuppressWarnings("unchecked") FST<Long> fstLong0 = (FST<Long>) fst;
fstLong = fstLong0;
validOutputs = new HashSet<Long>();
for(InputOutput<T> pair: pairs) {
Long output = (Long) pair.output;
maxLong = Math.max(maxLong, output);
minLong = Math.min(minLong, output);
validOutputs.add(output);
}
} else {
fstLong = null;
validOutputs = null;
}
if (pairs.size() == 0) {
assertNull(fst);
return;
@ -542,7 +565,7 @@ public class TestFSTs extends LuceneTestCase {
assertNotNull(fst);
// visit valid paris in order -- make sure all words
// visit valid pairs in order -- make sure all words
// are accepted, and FSTEnum's next() steps through
// them correctly
if (VERBOSE) {
@ -556,7 +579,6 @@ public class TestFSTs extends LuceneTestCase {
System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
}
Object output = run(fst, term, null);
assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
assertEquals(pair.output, output);
@ -574,6 +596,20 @@ public class TestFSTs extends LuceneTestCase {
termsMap.put(pair.input, pair.output);
}
if (doReverseLookup && maxLong > minLong) {
// Do random lookups so we test null (output doesn't
// exist) case:
assertNull(Util.getByOutput(fstLong, minLong-7));
assertNull(Util.getByOutput(fstLong, maxLong+7));
final int num = atLeast(100);
for(int iter=0;iter<num;iter++) {
Long v = minLong + random.nextLong() % (maxLong - minLong);
IntsRef input = Util.getByOutput(fstLong, v);
assertTrue(validOutputs.contains(v) || input == null);
}
}
// find random matching word and make sure it's valid
if (VERBOSE) {
System.out.println("TEST: verify random accepted terms");
@ -584,6 +620,14 @@ public class TestFSTs extends LuceneTestCase {
T output = randomAcceptedWord(fst, scratch);
assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
assertEquals(termsMap.get(scratch), output);
if (doReverseLookup) {
//System.out.println("lookup output=" + output + " outs=" + fst.outputs);
IntsRef input = Util.getByOutput(fstLong, (Long) output);
assertNotNull(input);
//System.out.println(" got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
assertEquals(scratch, input);
}
}
// test IntsRefFSTEnum.seek:
@ -887,7 +931,7 @@ public class TestFSTs extends LuceneTestCase {
if (VERBOSE) {
System.out.println("TEST: after prune");
for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
System.out.println(" " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
if (ent.getValue().isFinal) {
System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
}
@ -951,7 +995,7 @@ public class TestFSTs extends LuceneTestCase {
//testRandomWords(20, 100);
}
private String inputModeToString(int mode) {
String inputModeToString(int mode) {
if (mode == 0) {
return "utf8";
} else {
@ -995,7 +1039,7 @@ public class TestFSTs extends LuceneTestCase {
testRandomWords(_TestUtil.nextInt(random, 50000, 60000), 1);
}
private static String inputToString(int inputMode, IntsRef term) {
static String inputToString(int inputMode, IntsRef term) {
return inputToString(inputMode, term, true);
}
@ -1422,6 +1466,14 @@ public class TestFSTs extends LuceneTestCase {
assertNotNull(seekResult);
assertEquals(b, seekResult.input);
assertEquals(42, (long) seekResult.output);
assertEquals(Util.toIntsRef(new BytesRef("c"), new IntsRef()),
Util.getByOutput(fst, 13824324872317238L));
assertNull(Util.getByOutput(fst, 47));
assertEquals(Util.toIntsRef(new BytesRef("b"), new IntsRef()),
Util.getByOutput(fst, 42));
assertEquals(Util.toIntsRef(new BytesRef("a"), new IntsRef()),
Util.getByOutput(fst, 17));
}
public void testPrimaryKeys() throws Exception {