diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 32c2dff8155..4bc3cf52406 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -785,6 +785,10 @@ New Features (i.e. \* or "*") Custom QueryParser subclasses overriding getRangeQuery() will be passed null for any open endpoint. (Ingo Renner, Adriano Crestani, yonik, Mike McCandless + +* LUCENE-3121: Add sugar reverse lookup (given an output, find the + input mapping to it) for FSTs that have strictly monotonic long + outputs (such as an ord). (Mike McCandless) Bug fixes diff --git a/lucene/src/java/org/apache/lucene/util/fst/FST.java b/lucene/src/java/org/apache/lucene/util/fst/FST.java index 72f7302f07f..109eac83756 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/src/java/org/apache/lucene/util/fst/FST.java @@ -274,7 +274,7 @@ public class FST { getFirstArc(arc); final BytesReader in = getBytesReader(0); if (targetHasArcs(arc)) { - readFirstRealArc(arc.target, arc); + readFirstRealArc(arc.target, arc, in); while(true) { assert arc.label != END_LABEL; if (arc.label < cachedRootArcs.length) { @@ -666,14 +666,12 @@ public class FST { //System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + arc.isLast() + " output=" + outputs.outputToString(arc.output)); return arc; } else { - return readFirstRealArc(follow.target, arc); + return readFirstRealArc(follow.target, arc, getBytesReader(0)); } } - public Arc readFirstRealArc(int address, Arc arc) throws IOException { - - final BytesReader in = getBytesReader(address); - + public Arc readFirstRealArc(int address, Arc arc, final BytesReader in) throws IOException { + in.pos = address; arc.flags = in.readByte(); if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) { @@ -715,7 +713,7 @@ public class FST { // This arc went to virtual final node, ie has no outgoing arcs return null; } - return readFirstRealArc(arc.nextArc, arc); + return readFirstRealArc(arc.nextArc, arc, getBytesReader(0)); } else { return readNextRealArc(arc, getBytesReader(0)); } @@ -989,14 +987,16 @@ public class FST { public final BytesReader getBytesReader(int pos) { // TODO: maybe re-use via ThreadLocal? - return new BytesReader(pos); + return new BytesReader(bytes, pos); } - // Non-static: reads byte[] from FST - final class BytesReader extends DataInput { + /** Expert */ + public final static class BytesReader extends DataInput { + final byte[] bytes; int pos; - public BytesReader(int pos) { + public BytesReader(byte[] bytes, int pos) { + this.bytes = bytes; this.pos = pos; } diff --git a/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java b/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java index a6c4b66c9c1..221e6b2eaf1 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java +++ b/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java @@ -143,7 +143,7 @@ abstract class FSTEnum { // Arcs are fixed array -- use binary search to find // the target. - final FST.BytesReader in = fst.getBytesReader(0); + final FST.BytesReader in = fst.getBytesReader(0); int low = arc.arcIdx; int high = arc.numArcs-1; int mid = 0; @@ -281,7 +281,7 @@ abstract class FSTEnum { // Arcs are fixed array -- use binary search to find // the target. - final FST.BytesReader in = fst.getBytesReader(0); + final FST.BytesReader in = fst.getBytesReader(0); int low = arc.arcIdx; int high = arc.numArcs-1; int mid = 0; diff --git a/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java index 0d854b722f1..da973358442 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java +++ b/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java @@ -34,9 +34,8 @@ final class NodeHash { this.fst = fst; } - private boolean nodesEqual(Builder.UnCompiledNode node, int address) throws IOException { - final FST.BytesReader in = fst.getBytesReader(0); - fst.readFirstRealArc(address, scratchArc); + private boolean nodesEqual(Builder.UnCompiledNode node, int address, FST.BytesReader in) throws IOException { + fst.readFirstRealArc(address, scratchArc, in); if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) { return false; } @@ -88,10 +87,10 @@ final class NodeHash { // hash code for a frozen node private int hash(int node) throws IOException { final int PRIME = 31; - final FST.BytesReader in = fst.getBytesReader(0); + final FST.BytesReader in = fst.getBytesReader(0); //System.out.println("hash frozen"); int h = 0; - fst.readFirstRealArc(node, scratchArc); + fst.readFirstRealArc(node, scratchArc, in); while(true) { //System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal()); h = PRIME * h + scratchArc.label; @@ -112,6 +111,7 @@ final class NodeHash { public int add(Builder.UnCompiledNode node) throws IOException { // System.out.println("hash: add count=" + count + " vs " + table.length); + final FST.BytesReader in = fst.getBytesReader(0); final int h = hash(node); int pos = h & mask; int c = 0; @@ -128,7 +128,7 @@ final class NodeHash { rehash(); } return address; - } else if (nodesEqual(node, v)) { + } else if (nodesEqual(node, v, in)) { // same node is already here return v; } diff --git a/lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java b/lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java index 22734186bbe..c1051152518 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java +++ b/lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java @@ -135,4 +135,9 @@ public final class PositiveIntOutputs extends Outputs { public String outputToString(Long output) { return output.toString(); } + + @Override + public String toString() { + return "PositiveIntOutputs(doShare=" + doShare + ")"; + } } diff --git a/lucene/src/java/org/apache/lucene/util/fst/Util.java b/lucene/src/java/org/apache/lucene/util/fst/Util.java index 37a85a87c17..b5e4e2d6fea 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/src/java/org/apache/lucene/util/fst/Util.java @@ -86,6 +86,113 @@ public final class Util { return output; } } + + // TODO: parameterize the FST type and allow passing in a + // comparator; eg maybe your output is a PairOutput and + // one of the outputs in the pair is monotonic so you + // compare by that + + /** Reverse lookup (lookup by output instead of by input), + * in the special case when your FSTs outputs are + * strictly ascending. This locates the input/output + * pair where the output is equal to the target, and will + * return null if that output does not exist. + * + *

NOTE: this only works with FST, only + * works when the outputs are ascending in order with + * the inputs and only works when you shared + * the outputs (pass doShare=true to {@link + * PositiveIntOutputs#getSingleton}). + * For example, simple ordinals (0, 1, + * 2, ...), or file offets (when appending to a file) + * fit this. */ + public static IntsRef getByOutput(FST fst, long targetOutput) throws IOException { + + final FST.BytesReader in = fst.getBytesReader(0); + + // TODO: would be nice not to alloc this on every lookup + FST.Arc arc = fst.getFirstArc(new FST.Arc()); + + FST.Arc scratchArc = new FST.Arc(); + + final IntsRef result = new IntsRef(); + + long output = arc.output; + int upto = 0; + + //System.out.println("reverseLookup output=" + targetOutput); + + while(true) { + if (arc.isFinal()) { + final long finalOutput = output + arc.nextFinalOutput; + //System.out.println(" isFinal finalOutput=" + finalOutput); + if (finalOutput == targetOutput) { + result.length = upto; + //System.out.println(" found!"); + return result; + } else if (finalOutput > targetOutput) { + //System.out.println(" not found!"); + return null; + } + } + + if (fst.targetHasArcs(arc)) { + //System.out.println(" targetHasArcs"); + if (result.ints.length == upto) { + result.grow(1+upto); + } + + fst.readFirstRealArc(arc.target, arc, in); + + FST.Arc prevArc = null; + + // TODO: we could do binary search here if node arcs + // are array'd: + while(true) { + //System.out.println(" cycle label=" + arc.label + " output=" + arc.output); + + // This is the min output we'd hit if we follow + // this arc: + final long minArcOutput = output + arc.output; + + if (minArcOutput == targetOutput) { + // Recurse on this arc: + //System.out.println(" match! break"); + output = minArcOutput; + result.ints[upto++] = arc.label; + break; + } else if (minArcOutput > targetOutput) { + if (prevArc == null) { + // Output doesn't exist + return null; + } else { + // Recurse on previous arc: + arc.copyFrom(prevArc); + result.ints[upto++] = arc.label; + output += arc.output; + //System.out.println(" recurse prev label=" + (char) arc.label + " output=" + output); + break; + } + } else if (arc.isLast()) { + // Recurse on this arc: + output = minArcOutput; + //System.out.println(" recurse last label=" + (char) arc.label + " output=" + output); + result.ints[upto++] = arc.label; + break; + } else { + // Read next arc in this node: + prevArc = scratchArc; + prevArc.copyFrom(arc); + //System.out.println(" after copy label=" + (char) prevArc.label + " vs " + (char) arc.label); + fst.readNextRealArc(arc, in); + } + } + } else { + //System.out.println(" no target arcs; not found!"); + return null; + } + } + } /** * Dumps an {@link FST} to a GraphViz's dot language description @@ -356,4 +463,15 @@ public final class Util { scratch.length = input.length; return scratch; } + + /** Just converts IntsRef to BytesRef; you must ensure the + * int values fit into a byte. */ + public static BytesRef toBytesRef(IntsRef input, BytesRef scratch) { + scratch.grow(input.length); + for(int i=0;i(term, NO_OUTPUT)); } - FST fst = new FSTTester(random, dir, inputMode, pairs, outputs).doTest(0, 0, false); + FST fst = new FSTTester(random, dir, inputMode, pairs, outputs, false).doTest(0, 0, false); assertNotNull(fst); assertEquals(22, fst.getNodeCount()); assertEquals(27, fst.getArcCount()); @@ -174,7 +174,7 @@ public class TestFSTs extends LuceneTestCase { for(int idx=0;idx(terms2[idx], outputs.get(idx))); } - final FST fst = new FSTTester(random, dir, inputMode, pairs, outputs).doTest(0, 0, false); + final FST fst = new FSTTester(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false); assertNotNull(fst); assertEquals(22, fst.getNodeCount()); assertEquals(27, fst.getArcCount()); @@ -189,7 +189,7 @@ public class TestFSTs extends LuceneTestCase { final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx)); pairs.add(new FSTTester.InputOutput(terms2[idx], output)); } - final FST fst = new FSTTester(random, dir, inputMode, pairs, outputs).doTest(0, 0, false); + final FST fst = new FSTTester(random, dir, inputMode, pairs, outputs, false).doTest(0, 0, false); assertNotNull(fst); assertEquals(24, fst.getNodeCount()); assertEquals(30, fst.getArcCount()); @@ -222,7 +222,7 @@ public class TestFSTs extends LuceneTestCase { for(IntsRef term : terms) { pairs.add(new FSTTester.InputOutput(term, NO_OUTPUT)); } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + new FSTTester(random, dir, inputMode, pairs, outputs, false).doTest(); } // PositiveIntOutput (ord) @@ -232,12 +232,13 @@ public class TestFSTs extends LuceneTestCase { for(int idx=0;idx(terms[idx], outputs.get(idx))); } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + new FSTTester(random, dir, inputMode, pairs, outputs, true).doTest(); } // PositiveIntOutput (random monotonically increasing positive number) { - final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); + final boolean doShare = random.nextBoolean(); + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(doShare); final List> pairs = new ArrayList>(terms.length); long lastOutput = 0; for(int idx=0;idx(terms[idx], outputs.get(value))); } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + new FSTTester(random, dir, inputMode, pairs, outputs, doShare).doTest(); } // PositiveIntOutput (random positive number) @@ -255,7 +256,7 @@ public class TestFSTs extends LuceneTestCase { for(int idx=0;idx(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE)); } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + new FSTTester(random, dir, inputMode, pairs, outputs, false).doTest(); } // Pair @@ -272,7 +273,7 @@ public class TestFSTs extends LuceneTestCase { outputs.get(o1.get(idx), o2.get(value)))); } - new FSTTester>(random, dir, inputMode, pairs, outputs).doTest(); + new FSTTester>(random, dir, inputMode, pairs, outputs, false).doTest(); } // Sequence-of-bytes @@ -284,7 +285,7 @@ public class TestFSTs extends LuceneTestCase { final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx)); pairs.add(new FSTTester.InputOutput(terms[idx], output)); } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + new FSTTester(random, dir, inputMode, pairs, outputs, false).doTest(); } // Sequence-of-ints @@ -300,7 +301,7 @@ public class TestFSTs extends LuceneTestCase { } pairs.add(new FSTTester.InputOutput(terms[idx], output)); } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + new FSTTester(random, dir, inputMode, pairs, outputs, false).doTest(); } // Up to two positive ints, shared, generally but not @@ -330,7 +331,7 @@ public class TestFSTs extends LuceneTestCase { } pairs.add(new FSTTester.InputOutput(terms[idx], output)); } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + new FSTTester(random, dir, inputMode, pairs, outputs, false).doTest(); } } @@ -341,13 +342,15 @@ public class TestFSTs extends LuceneTestCase { final int inputMode; final Outputs outputs; final Directory dir; + final boolean doReverseLookup; - public FSTTester(Random random, Directory dir, int inputMode, List> pairs, Outputs outputs) { + public FSTTester(Random random, Directory dir, int inputMode, List> pairs, Outputs outputs, boolean doReverseLookup) { this.random = random; this.dir = dir; this.inputMode = inputMode; this.pairs = pairs; this.outputs = outputs; + this.doReverseLookup = doReverseLookup; } private static class InputOutput implements Comparable> { @@ -525,6 +528,26 @@ public class TestFSTs extends LuceneTestCase { // FST is complete private void verifyUnPruned(int inputMode, FST fst) throws IOException { + final FST fstLong; + final Set validOutputs; + long minLong = Long.MAX_VALUE; + long maxLong = Long.MIN_VALUE; + + if (doReverseLookup) { + @SuppressWarnings("unchecked") FST fstLong0 = (FST) fst; + fstLong = fstLong0; + validOutputs = new HashSet(); + for(InputOutput pair: pairs) { + Long output = (Long) pair.output; + maxLong = Math.max(maxLong, output); + minLong = Math.min(minLong, output); + validOutputs.add(output); + } + } else { + fstLong = null; + validOutputs = null; + } + if (pairs.size() == 0) { assertNull(fst); return; @@ -542,7 +565,7 @@ public class TestFSTs extends LuceneTestCase { assertNotNull(fst); - // visit valid paris in order -- make sure all words + // visit valid pairs in order -- make sure all words // are accepted, and FSTEnum's next() steps through // them correctly if (VERBOSE) { @@ -556,7 +579,6 @@ public class TestFSTs extends LuceneTestCase { System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output)); } Object output = run(fst, term, null); - assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output); assertEquals(pair.output, output); @@ -574,6 +596,20 @@ public class TestFSTs extends LuceneTestCase { termsMap.put(pair.input, pair.output); } + if (doReverseLookup && maxLong > minLong) { + // Do random lookups so we test null (output doesn't + // exist) case: + assertNull(Util.getByOutput(fstLong, minLong-7)); + assertNull(Util.getByOutput(fstLong, maxLong+7)); + + final int num = atLeast(100); + for(int iter=0;iter> ent : prefixes.entrySet()) { - System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal); + System.out.println(" " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal); if (ent.getValue().isFinal) { System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput)); } @@ -951,7 +995,7 @@ public class TestFSTs extends LuceneTestCase { //testRandomWords(20, 100); } - private String inputModeToString(int mode) { + String inputModeToString(int mode) { if (mode == 0) { return "utf8"; } else { @@ -995,7 +1039,7 @@ public class TestFSTs extends LuceneTestCase { testRandomWords(_TestUtil.nextInt(random, 50000, 60000), 1); } - private static String inputToString(int inputMode, IntsRef term) { + static String inputToString(int inputMode, IntsRef term) { return inputToString(inputMode, term, true); } @@ -1422,6 +1466,14 @@ public class TestFSTs extends LuceneTestCase { assertNotNull(seekResult); assertEquals(b, seekResult.input); assertEquals(42, (long) seekResult.output); + + assertEquals(Util.toIntsRef(new BytesRef("c"), new IntsRef()), + Util.getByOutput(fst, 13824324872317238L)); + assertNull(Util.getByOutput(fst, 47)); + assertEquals(Util.toIntsRef(new BytesRef("b"), new IntsRef()), + Util.getByOutput(fst, 42)); + assertEquals(Util.toIntsRef(new BytesRef("a"), new IntsRef()), + Util.getByOutput(fst, 17)); } public void testPrimaryKeys() throws Exception {