LUCENE-3121: add reverse-lookup by long to FST, when output grows strictly monotonically w/ input (eg, ord, address, etc.)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1233381 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-01-19 14:07:40 +00:00
parent 47f474735f
commit 5bc7f5bbd0
7 changed files with 216 additions and 37 deletions

View File

@ -785,6 +785,10 @@ New Features
(i.e. \* or "*") Custom QueryParser subclasses overriding getRangeQuery()
will be passed null for any open endpoint. (Ingo Renner, Adriano
Crestani, yonik, Mike McCandless
* LUCENE-3121: Add sugar reverse lookup (given an output, find the
input mapping to it) for FSTs that have strictly monotonic long
outputs (such as an ord). (Mike McCandless)
Bug fixes

View File

@ -274,7 +274,7 @@ public class FST<T> {
getFirstArc(arc);
final BytesReader in = getBytesReader(0);
if (targetHasArcs(arc)) {
readFirstRealArc(arc.target, arc);
readFirstRealArc(arc.target, arc, in);
while(true) {
assert arc.label != END_LABEL;
if (arc.label < cachedRootArcs.length) {
@ -666,14 +666,12 @@ public class FST<T> {
//System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + arc.isLast() + " output=" + outputs.outputToString(arc.output));
return arc;
} else {
return readFirstRealArc(follow.target, arc);
return readFirstRealArc(follow.target, arc, getBytesReader(0));
}
}
public Arc<T> readFirstRealArc(int address, Arc<T> arc) throws IOException {
final BytesReader in = getBytesReader(address);
public Arc<T> readFirstRealArc(int address, Arc<T> arc, final BytesReader in) throws IOException {
in.pos = address;
arc.flags = in.readByte();
if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) {
@ -715,7 +713,7 @@ public class FST<T> {
// This arc went to virtual final node, ie has no outgoing arcs
return null;
}
return readFirstRealArc(arc.nextArc, arc);
return readFirstRealArc(arc.nextArc, arc, getBytesReader(0));
} else {
return readNextRealArc(arc, getBytesReader(0));
}
@ -989,14 +987,16 @@ public class FST<T> {
public final BytesReader getBytesReader(int pos) {
// TODO: maybe re-use via ThreadLocal?
return new BytesReader(pos);
return new BytesReader(bytes, pos);
}
// Non-static: reads byte[] from FST
final class BytesReader extends DataInput {
/** Expert */
public final static class BytesReader extends DataInput {
final byte[] bytes;
int pos;
public BytesReader(int pos) {
public BytesReader(byte[] bytes, int pos) {
this.bytes = bytes;
this.pos = pos;
}

View File

@ -143,7 +143,7 @@ abstract class FSTEnum<T> {
// Arcs are fixed array -- use binary search to find
// the target.
final FST<T>.BytesReader in = fst.getBytesReader(0);
final FST.BytesReader in = fst.getBytesReader(0);
int low = arc.arcIdx;
int high = arc.numArcs-1;
int mid = 0;
@ -281,7 +281,7 @@ abstract class FSTEnum<T> {
// Arcs are fixed array -- use binary search to find
// the target.
final FST<T>.BytesReader in = fst.getBytesReader(0);
final FST.BytesReader in = fst.getBytesReader(0);
int low = arc.arcIdx;
int high = arc.numArcs-1;
int mid = 0;

View File

@ -34,9 +34,8 @@ final class NodeHash<T> {
this.fst = fst;
}
private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address) throws IOException {
final FST<T>.BytesReader in = fst.getBytesReader(0);
fst.readFirstRealArc(address, scratchArc);
private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address, FST.BytesReader in) throws IOException {
fst.readFirstRealArc(address, scratchArc, in);
if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) {
return false;
}
@ -88,10 +87,10 @@ final class NodeHash<T> {
// hash code for a frozen node
private int hash(int node) throws IOException {
final int PRIME = 31;
final FST<T>.BytesReader in = fst.getBytesReader(0);
final FST.BytesReader in = fst.getBytesReader(0);
//System.out.println("hash frozen");
int h = 0;
fst.readFirstRealArc(node, scratchArc);
fst.readFirstRealArc(node, scratchArc, in);
while(true) {
//System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal());
h = PRIME * h + scratchArc.label;
@ -112,6 +111,7 @@ final class NodeHash<T> {
public int add(Builder.UnCompiledNode<T> node) throws IOException {
// System.out.println("hash: add count=" + count + " vs " + table.length);
final FST.BytesReader in = fst.getBytesReader(0);
final int h = hash(node);
int pos = h & mask;
int c = 0;
@ -128,7 +128,7 @@ final class NodeHash<T> {
rehash();
}
return address;
} else if (nodesEqual(node, v)) {
} else if (nodesEqual(node, v, in)) {
// same node is already here
return v;
}

View File

@ -135,4 +135,9 @@ public final class PositiveIntOutputs extends Outputs<Long> {
public String outputToString(Long output) {
return output.toString();
}
@Override
public String toString() {
return "PositiveIntOutputs(doShare=" + doShare + ")";
}
}

View File

@ -86,6 +86,113 @@ public final class Util {
return output;
}
}
// TODO: parameterize the FST type <T> and allow passing in a
// comparator; eg maybe your output is a PairOutput and
// one of the outputs in the pair is monotonic so you
// compare by that
/** Reverse lookup (lookup by output instead of by input),
* in the special case when your FSTs outputs are
* strictly ascending. This locates the input/output
* pair where the output is equal to the target, and will
* return null if that output does not exist.
*
* <p>NOTE: this only works with FST<Long>, only
* works when the outputs are ascending in order with
* the inputs and only works when you shared
* the outputs (pass doShare=true to {@link
* PositiveIntOutputs#getSingleton}).
* For example, simple ordinals (0, 1,
* 2, ...), or file offets (when appending to a file)
* fit this. */
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {
final FST.BytesReader in = fst.getBytesReader(0);
// TODO: would be nice not to alloc this on every lookup
FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>());
FST.Arc<Long> scratchArc = new FST.Arc<Long>();
final IntsRef result = new IntsRef();
long output = arc.output;
int upto = 0;
//System.out.println("reverseLookup output=" + targetOutput);
while(true) {
if (arc.isFinal()) {
final long finalOutput = output + arc.nextFinalOutput;
//System.out.println(" isFinal finalOutput=" + finalOutput);
if (finalOutput == targetOutput) {
result.length = upto;
//System.out.println(" found!");
return result;
} else if (finalOutput > targetOutput) {
//System.out.println(" not found!");
return null;
}
}
if (fst.targetHasArcs(arc)) {
//System.out.println(" targetHasArcs");
if (result.ints.length == upto) {
result.grow(1+upto);
}
fst.readFirstRealArc(arc.target, arc, in);
FST.Arc<Long> prevArc = null;
// TODO: we could do binary search here if node arcs
// are array'd:
while(true) {
//System.out.println(" cycle label=" + arc.label + " output=" + arc.output);
// This is the min output we'd hit if we follow
// this arc:
final long minArcOutput = output + arc.output;
if (minArcOutput == targetOutput) {
// Recurse on this arc:
//System.out.println(" match! break");
output = minArcOutput;
result.ints[upto++] = arc.label;
break;
} else if (minArcOutput > targetOutput) {
if (prevArc == null) {
// Output doesn't exist
return null;
} else {
// Recurse on previous arc:
arc.copyFrom(prevArc);
result.ints[upto++] = arc.label;
output += arc.output;
//System.out.println(" recurse prev label=" + (char) arc.label + " output=" + output);
break;
}
} else if (arc.isLast()) {
// Recurse on this arc:
output = minArcOutput;
//System.out.println(" recurse last label=" + (char) arc.label + " output=" + output);
result.ints[upto++] = arc.label;
break;
} else {
// Read next arc in this node:
prevArc = scratchArc;
prevArc.copyFrom(arc);
//System.out.println(" after copy label=" + (char) prevArc.label + " vs " + (char) arc.label);
fst.readNextRealArc(arc, in);
}
}
} else {
//System.out.println(" no target arcs; not found!");
return null;
}
}
}
/**
* Dumps an {@link FST} to a GraphViz's <code>dot</code> language description
@ -356,4 +463,15 @@ public final class Util {
scratch.length = input.length;
return scratch;
}
/** Just converts IntsRef to BytesRef; you must ensure the
* int values fit into a byte. */
public static BytesRef toBytesRef(IntsRef input, BytesRef scratch) {
scratch.grow(input.length);
for(int i=0;i<input.length;i++) {
scratch.bytes[i] = (byte) input.ints[i+input.offset];
}
scratch.length = input.length;
return scratch;
}
}

View File

@ -161,7 +161,7 @@ public class TestFSTs extends LuceneTestCase {
for(IntsRef term : terms2) {
pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
}
FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs, false).doTest(0, 0, false);
assertNotNull(fst);
assertEquals(22, fst.getNodeCount());
assertEquals(27, fst.getArcCount());
@ -174,7 +174,7 @@ public class TestFSTs extends LuceneTestCase {
for(int idx=0;idx<terms2.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
}
final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false);
assertNotNull(fst);
assertEquals(22, fst.getNodeCount());
assertEquals(27, fst.getArcCount());
@ -189,7 +189,7 @@ public class TestFSTs extends LuceneTestCase {
final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output));
}
final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs, false).doTest(0, 0, false);
assertNotNull(fst);
assertEquals(24, fst.getNodeCount());
assertEquals(30, fst.getArcCount());
@ -222,7 +222,7 @@ public class TestFSTs extends LuceneTestCase {
for(IntsRef term : terms) {
pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
}
new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<Object>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// PositiveIntOutput (ord)
@ -232,12 +232,13 @@ public class TestFSTs extends LuceneTestCase {
for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx)));
}
new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest();
}
// PositiveIntOutput (random monotonically increasing positive number)
{
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
final boolean doShare = random.nextBoolean();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(doShare);
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
long lastOutput = 0;
for(int idx=0;idx<terms.length;idx++) {
@ -245,7 +246,7 @@ public class TestFSTs extends LuceneTestCase {
lastOutput = value;
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value)));
}
new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, doShare).doTest();
}
// PositiveIntOutput (random positive number)
@ -255,7 +256,7 @@ public class TestFSTs extends LuceneTestCase {
for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE));
}
new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// Pair<ord, (random monotonically increasing positive number>
@ -272,7 +273,7 @@ public class TestFSTs extends LuceneTestCase {
outputs.get(o1.get(idx),
o2.get(value))));
}
new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// Sequence-of-bytes
@ -284,7 +285,7 @@ public class TestFSTs extends LuceneTestCase {
final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
pairs.add(new FSTTester.InputOutput<BytesRef>(terms[idx], output));
}
new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// Sequence-of-ints
@ -300,7 +301,7 @@ public class TestFSTs extends LuceneTestCase {
}
pairs.add(new FSTTester.InputOutput<IntsRef>(terms[idx], output));
}
new FSTTester<IntsRef>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<IntsRef>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// Up to two positive ints, shared, generally but not
@ -330,7 +331,7 @@ public class TestFSTs extends LuceneTestCase {
}
pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
}
new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
new FSTTester<Object>(random, dir, inputMode, pairs, outputs, false).doTest();
}
}
@ -341,13 +342,15 @@ public class TestFSTs extends LuceneTestCase {
final int inputMode;
final Outputs<T> outputs;
final Directory dir;
final boolean doReverseLookup;
public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs) {
public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs, boolean doReverseLookup) {
this.random = random;
this.dir = dir;
this.inputMode = inputMode;
this.pairs = pairs;
this.outputs = outputs;
this.doReverseLookup = doReverseLookup;
}
private static class InputOutput<T> implements Comparable<InputOutput<T>> {
@ -525,6 +528,26 @@ public class TestFSTs extends LuceneTestCase {
// FST is complete
private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {
final FST<Long> fstLong;
final Set<Long> validOutputs;
long minLong = Long.MAX_VALUE;
long maxLong = Long.MIN_VALUE;
if (doReverseLookup) {
@SuppressWarnings("unchecked") FST<Long> fstLong0 = (FST<Long>) fst;
fstLong = fstLong0;
validOutputs = new HashSet<Long>();
for(InputOutput<T> pair: pairs) {
Long output = (Long) pair.output;
maxLong = Math.max(maxLong, output);
minLong = Math.min(minLong, output);
validOutputs.add(output);
}
} else {
fstLong = null;
validOutputs = null;
}
if (pairs.size() == 0) {
assertNull(fst);
return;
@ -542,7 +565,7 @@ public class TestFSTs extends LuceneTestCase {
assertNotNull(fst);
// visit valid paris in order -- make sure all words
// visit valid pairs in order -- make sure all words
// are accepted, and FSTEnum's next() steps through
// them correctly
if (VERBOSE) {
@ -556,7 +579,6 @@ public class TestFSTs extends LuceneTestCase {
System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
}
Object output = run(fst, term, null);
assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
assertEquals(pair.output, output);
@ -574,6 +596,20 @@ public class TestFSTs extends LuceneTestCase {
termsMap.put(pair.input, pair.output);
}
if (doReverseLookup && maxLong > minLong) {
// Do random lookups so we test null (output doesn't
// exist) case:
assertNull(Util.getByOutput(fstLong, minLong-7));
assertNull(Util.getByOutput(fstLong, maxLong+7));
final int num = atLeast(100);
for(int iter=0;iter<num;iter++) {
Long v = minLong + random.nextLong() % (maxLong - minLong);
IntsRef input = Util.getByOutput(fstLong, v);
assertTrue(validOutputs.contains(v) || input == null);
}
}
// find random matching word and make sure it's valid
if (VERBOSE) {
System.out.println("TEST: verify random accepted terms");
@ -584,6 +620,14 @@ public class TestFSTs extends LuceneTestCase {
T output = randomAcceptedWord(fst, scratch);
assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
assertEquals(termsMap.get(scratch), output);
if (doReverseLookup) {
//System.out.println("lookup output=" + output + " outs=" + fst.outputs);
IntsRef input = Util.getByOutput(fstLong, (Long) output);
assertNotNull(input);
//System.out.println(" got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
assertEquals(scratch, input);
}
}
// test IntsRefFSTEnum.seek:
@ -887,7 +931,7 @@ public class TestFSTs extends LuceneTestCase {
if (VERBOSE) {
System.out.println("TEST: after prune");
for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
System.out.println(" " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
if (ent.getValue().isFinal) {
System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
}
@ -951,7 +995,7 @@ public class TestFSTs extends LuceneTestCase {
//testRandomWords(20, 100);
}
private String inputModeToString(int mode) {
String inputModeToString(int mode) {
if (mode == 0) {
return "utf8";
} else {
@ -995,7 +1039,7 @@ public class TestFSTs extends LuceneTestCase {
testRandomWords(_TestUtil.nextInt(random, 50000, 60000), 1);
}
private static String inputToString(int inputMode, IntsRef term) {
static String inputToString(int inputMode, IntsRef term) {
return inputToString(inputMode, term, true);
}
@ -1422,6 +1466,14 @@ public class TestFSTs extends LuceneTestCase {
assertNotNull(seekResult);
assertEquals(b, seekResult.input);
assertEquals(42, (long) seekResult.output);
assertEquals(Util.toIntsRef(new BytesRef("c"), new IntsRef()),
Util.getByOutput(fst, 13824324872317238L));
assertNull(Util.getByOutput(fst, 47));
assertEquals(Util.toIntsRef(new BytesRef("b"), new IntsRef()),
Util.getByOutput(fst, 42));
assertEquals(Util.toIntsRef(new BytesRef("a"), new IntsRef()),
Util.getByOutput(fst, 17));
}
public void testPrimaryKeys() throws Exception {