From fe0c042470dc1a1ba7ffd27f91ac7bc96c3254a0 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Thu, 4 Jul 2019 09:45:51 -0400 Subject: [PATCH] LUCENE-8920: remove Arc setters, moving implementations into Arc, or copying data into consumers --- .../blocktreeords/OrdsSegmentTermsEnum.java | 8 +- .../codecs/memory/FSTOrdTermsReader.java | 19 +++-- .../lucene/codecs/memory/FSTTermsReader.java | 29 +++---- .../java/org/apache/lucene/util/fst/FST.java | 77 +++++++++++-------- .../org/apache/lucene/util/fst/FSTEnum.java | 34 ++++---- .../java/org/apache/lucene/util/fst/Util.java | 27 ++----- 6 files changed, 90 insertions(+), 104 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java index 7bfaab55ffa..d6a182b5a73 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java @@ -1115,10 +1115,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum { } } - if (found) { - // Keep recursing - arc.arcIdx(mid - 1); - } else { + if (found == false) { result.setLength(bestUpto); InputOutput io = new InputOutput(); io.input = result.get(); @@ -1127,9 +1124,8 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum { return io; } - fr.index.readNextRealArc(arc, fstReader); - // Recurse on this arc: + fr.index.readArcByIndex(arc, fstReader, mid); result.setIntAt(upto++, arc.label()); output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output()); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java index d653c1209dd..daba6096c93 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java @@ -65,7 +65,7 @@ import org.apache.lucene.util.fst.Util; * FST-based terms dictionary reader. * * The FST index maps each term and its ord, and during seek - * the ord is used fetch metadata from a single block. + * the ord is used to fetch metadata from a single block. * The term dictionary is fully memory resident. * * @lucene.experimental @@ -305,7 +305,7 @@ public class FSTOrdTermsReader extends FieldsProducer { } // Only wraps common operations for PBF interact - abstract class BaseTermsEnum extends org.apache.lucene.index.BaseTermsEnum { + abstract class BaseTermsEnum extends org.apache.lucene.index.BaseTermsEnum { /* Current term's ord, starts from 0 */ long ord; @@ -563,6 +563,8 @@ public class FSTOrdTermsReader extends FieldsProducer { /* fst stats */ FST.Arc arc; + Long output; + /* automaton stats */ int state; @@ -620,9 +622,7 @@ public class FSTOrdTermsReader extends FieldsProducer { @Override void decodeStats() throws IOException { - final FST.Arc arc = topFrame().arc; - assert arc.nextFinalOutput() == fstOutputs.getNoOutput(); - ord = arc.output(); + ord = topFrame().output; super.decodeStats(); } @@ -704,8 +704,7 @@ public class FSTOrdTermsReader extends FieldsProducer { /** Virtual frame, never pop */ Frame loadVirtualFrame(Frame frame) { - frame.arc.output(fstOutputs.getNoOutput()); - frame.arc.nextFinalOutput(fstOutputs.getNoOutput()); + frame.output = fstOutputs.getNoOutput(); frame.state = -1; return frame; } @@ -713,6 +712,7 @@ public class FSTOrdTermsReader extends FieldsProducer { /** Load frame for start arc(node) on fst */ Frame loadFirstFrame(Frame frame) { frame.arc = fst.getFirstArc(frame.arc); + frame.output = frame.arc.output(); frame.state = 0; return frame; } @@ -724,6 +724,7 @@ public class FSTOrdTermsReader extends FieldsProducer { } frame.arc = fst.readFirstRealTargetArc(top.arc.target(), frame.arc, fstReader); frame.state = fsa.step(top.state, frame.arc.label()); + frame.output = frame.arc.output(); //if (TEST) System.out.println(" loadExpand frame="+frame); if (frame.state == -1) { return loadNextFrame(top, frame); @@ -738,6 +739,7 @@ public class FSTOrdTermsReader extends FieldsProducer { } while (!frame.arc.isLast()) { frame.arc = fst.readNextRealArc(frame.arc, fstReader); + frame.output = frame.arc.output(); frame.state = fsa.step(top.state, frame.arc.label()); if (frame.state != -1) { break; @@ -763,6 +765,7 @@ public class FSTOrdTermsReader extends FieldsProducer { if (frame.state == -1) { return loadNextFrame(top, frame); } + frame.output = arc.output(); return frame; } @@ -781,7 +784,7 @@ public class FSTOrdTermsReader extends FieldsProducer { void pushFrame(Frame frame) { final FST.Arc arc = frame.arc; - arc.output(fstOutputs.add(topFrame().arc.output(), arc.output())); + frame.output = fstOutputs.add(topFrame().output, frame.output); term = grow(arc.label()); level++; assert frame == stack[level]; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java index 41a992fafea..a5abfff355a 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java @@ -438,6 +438,8 @@ public class FSTTermsReader extends FieldsProducer { /* fst stats */ FST.Arc fstArc; + FSTTermOutputs.TermData output; + /* automaton stats */ int fsaState; @@ -464,11 +466,9 @@ public class FSTTermsReader extends FieldsProducer { this.stack[i] = new Frame(); } - Frame frame; - frame = loadVirtualFrame(newFrame()); + loadVirtualFrame(newFrame()); this.level++; - frame = loadFirstFrame(newFrame()); - pushFrame(frame); + pushFrame(loadFirstFrame(newFrame())); this.meta = null; this.metaUpto = 1; @@ -502,18 +502,18 @@ public class FSTTermsReader extends FieldsProducer { /** Lazily accumulate meta data, when we got a accepted term */ void loadMetaData() { - FST.Arc last, next; - last = stack[metaUpto].fstArc; + Frame last, next; + last = stack[metaUpto]; while (metaUpto != level) { metaUpto++; - next = stack[metaUpto].fstArc; - next.output(fstOutputs.add(next.output(), last.output())); + next = stack[metaUpto]; + next.output = fstOutputs.add(next.output, last.output); last = next; } - if (last.isFinal()) { - meta = fstOutputs.add(last.output(), last.nextFinalOutput()); + if (last.fstArc.isFinal()) { + meta = fstOutputs.add(last.output, last.fstArc.nextFinalOutput()); } else { - meta = last.output(); + meta = last.output; } state.docFreq = meta.docFreq; state.totalTermFreq = meta.totalTermFreq; @@ -604,8 +604,7 @@ public class FSTTermsReader extends FieldsProducer { /** Virtual frame, never pop */ Frame loadVirtualFrame(Frame frame) { - frame.fstArc.output(fstOutputs.getNoOutput()); - frame.fstArc.nextFinalOutput(fstOutputs.getNoOutput()); + frame.output = fstOutputs.getNoOutput(); frame.fsaState = -1; return frame; } @@ -613,6 +612,7 @@ public class FSTTermsReader extends FieldsProducer { /** Load frame for start arc(node) on fst */ Frame loadFirstFrame(Frame frame) throws IOException { frame.fstArc = fst.getFirstArc(frame.fstArc); + frame.output = frame.fstArc.output(); frame.fsaState = 0; return frame; } @@ -628,6 +628,7 @@ public class FSTTermsReader extends FieldsProducer { if (frame.fsaState == -1) { return loadNextFrame(top, frame); } + frame.output = frame.fstArc.output(); return frame; } @@ -647,6 +648,7 @@ public class FSTTermsReader extends FieldsProducer { if (frame.fsaState == -1) { return null; } + frame.output = frame.fstArc.output(); return frame; } @@ -663,6 +665,7 @@ public class FSTTermsReader extends FieldsProducer { if (frame.fsaState == -1) { return loadNextFrame(top, frame); } + frame.output = frame.fstArc.output(); return frame; } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index f9d0cd51605..cbef9098d08 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -186,7 +186,7 @@ public final class FST implements Accountable { } boolean flag(int flag) { - return FST.flag(flags(), flag); + return FST.flag(flags, flag); } public boolean isLast() { @@ -230,18 +230,10 @@ public final class FST implements Accountable { return label; } - public void label(int label) { - this.label = label; - } - public T output() { return output; } - public void output(T output) { - this.output = output; - } - /** To node (ord or address) */ public long target() { return target; @@ -251,30 +243,14 @@ public final class FST implements Accountable { return flags; } - public void flags(byte flags) { - this.flags = flags; - } - public T nextFinalOutput() { return nextFinalOutput; } - public void nextFinalOutput(T output) { - nextFinalOutput = output; - } - long nextArc() { return nextArc; } - /** - * Set the position of the next arc to read - * @param nextArc the position to set - */ - public void nextArc(long nextArc) { - this.nextArc = nextArc; - } - /** Where the first arc in the array starts; only valid if * bytesPerArc != 0 */ public long posArcsStart() { @@ -298,14 +274,6 @@ public final class FST implements Accountable { return arcIdx; } - /** - * Set the arcIdx - * @param idx the value to set - */ - public void arcIdx(int idx) { - arcIdx = idx; - } - /** How many arc, if bytesPerArc == 0. Otherwise, the size of the arc array. If the array is * direct, this may include holes. Otherwise it is also how many arcs are in the array */ public int numArcs() { @@ -818,7 +786,7 @@ public final class FST implements Accountable { arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; arc.nextFinalOutput = emptyOutput; if (emptyOutput != NO_OUTPUT) { - arc.flags |= BIT_ARC_HAS_FINAL_OUTPUT; + arc.flags = (byte) (arc.flags() | BIT_ARC_HAS_FINAL_OUTPUT); } } else { arc.flags = BIT_LAST_ARC; @@ -1033,6 +1001,27 @@ public final class FST implements Accountable { return readLabel(in); } + public Arc readArcAtPosition(Arc arc, final BytesReader in, long pos) throws IOException { + in.setPosition(pos); + arc.flags = in.readByte(); + arc.nextArc = pos; + while (flag(arc.flags(), BIT_MISSING_ARC)) { + // skip empty arcs + arc.nextArc -= arc.bytesPerArc(); + in.skipBytes(arc.bytesPerArc() - 1); + arc.flags = in.readByte(); + } + return readArc(arc, in); + } + + public Arc readArcByIndex(Arc arc, final BytesReader in, int idx) throws IOException { + arc.arcIdx = idx; + assert arc.arcIdx() < arc.numArcs(); + in.setPosition(arc.posArcsStart() - arc.arcIdx() * arc.bytesPerArc()); + arc.flags = in.readByte(); + return readArc(arc, in); + } + /** Never returns null, but you should never call this if * arc.isLast() is true. */ public Arc readNextRealArc(Arc arc, final BytesReader in) throws IOException { @@ -1064,7 +1053,10 @@ public final class FST implements Accountable { in.setPosition(arc.nextArc()); arc.flags = in.readByte(); } + return readArc(arc, in); + } + private Arc readArc(Arc arc, BytesReader in) throws IOException { arc.label = readLabel(in); if (arc.flag(BIT_ARC_HAS_OUTPUT)) { @@ -1118,6 +1110,23 @@ public final class FST implements Accountable { return arc; } + static Arc readEndArc(Arc follow, Arc arc) { + if (follow.isFinal()) { + if (follow.target() <= 0) { + arc.flags = FST.BIT_LAST_ARC; + } else { + arc.flags = 0; + // NOTE: nextArc is a node (not an address!) in this case: + arc.nextArc = follow.target(); + } + arc.output = follow.nextFinalOutput(); + arc.label = FST.END_LABEL; + return arc; + } else { + return null; + } + } + // LUCENE-5152: called only from asserts, to validate that the // non-cached arc lookup would produce the same result, to // catch callers that illegally modify shared structures with diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java index 1c41d31f400..feeddf3fb6b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java @@ -161,8 +161,7 @@ abstract class FSTEnum { int arcOffset = targetLabel - firstLabel; if (arcOffset >= arc.numArcs()) { // target is beyond the last arc - arc.nextArc(arc.posArcsStart() - (arc.numArcs() - 1) * arc.bytesPerArc()); - fst.readNextRealArc(arc, in); + fst.readArcAtPosition(arc, in, arc.posArcsStart() - (arc.numArcs() - 1) * arc.bytesPerArc()); assert arc.isLast(); // Dead end (target is after the last arc); // rollback to last fork then push @@ -182,12 +181,13 @@ abstract class FSTEnum { } } else { // TODO: if firstLabel == targetLabel + long pos; if (arcOffset >= 0) { - arc.nextArc(arc.posArcsStart() - (arc.bytesPerArc() * arcOffset)); + pos = arc.posArcsStart() - (arc.bytesPerArc() * arcOffset); } else { - arc.nextArc(arc.posArcsStart()); + pos = arc.posArcsStart(); } - fst.readNextRealArc(arc, in); + fst.readArcAtPosition(arc, in, pos); if (arc.label() == targetLabel) { // found -- copy pasta from below output[upto] = fst.outputs.add(output[upto-1], arc.output()); @@ -234,8 +234,7 @@ abstract class FSTEnum { // the outer else clause): if (found) { // Match - arc.arcIdx(mid - 1); - fst.readNextRealArc(arc, in); + fst.readArcByIndex(arc, in, mid); assert arc.arcIdx() == mid; assert arc.label() == targetLabel: "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel + " mid=" + mid; output[upto] = fst.outputs.add(output[upto-1], arc.output()); @@ -247,8 +246,7 @@ abstract class FSTEnum { return fst.readFirstTargetArc(arc, getArc(upto), fstReader); } else if (low == arc.numArcs()) { // Dead end - arc.arcIdx(arc.numArcs() - 2); - fst.readNextRealArc(arc, in); + fst.readArcByIndex(arc, in, arc.numArcs() - 1); assert arc.isLast(); // Dead end (target is after the last arc); // rollback to last fork then push @@ -267,8 +265,7 @@ abstract class FSTEnum { upto--; } } else { - arc.arcIdx(low - 1); - fst.readNextRealArc(arc, in); + fst.readArcByIndex(arc, in, low); assert arc.label() > targetLabel; pushFirst(); return null; @@ -386,15 +383,13 @@ abstract class FSTEnum { } } else { if (targetOffset >= arc.numArcs()) { - arc.nextArc(arc.posArcsStart() - arc.bytesPerArc() * (arc.numArcs() - 1)); - fst.readNextRealArc(arc, in); + fst.readArcAtPosition(arc, in, arc.posArcsStart() - arc.bytesPerArc() * (arc.numArcs() - 1)); assert arc.isLast(); assert arc.label() < targetLabel: "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel; pushLast(); return null; } - arc.nextArc(arc.posArcsStart() - arc.bytesPerArc() * targetOffset); - fst.readNextRealArc(arc, in); + fst.readArcAtPosition(arc, in, arc.posArcsStart() - arc.bytesPerArc() * targetOffset); if (arc.label() == targetLabel) { // found -- copy pasta from below output[upto] = fst.outputs.add(output[upto-1], arc.output()); @@ -408,8 +403,7 @@ abstract class FSTEnum { // Scan backwards to find a floor arc that is not missing for (long arcOffset = arc.posArcsStart() - targetOffset * arc.bytesPerArc(); arcOffset <= arc.posArcsStart(); arcOffset += arc.bytesPerArc()) { // TODO: we can do better here by skipping missing arcs - arc.nextArc(arcOffset); - fst.readNextRealArc(arc, in); + fst.readArcAtPosition(arc, in, arcOffset); if (arc.label() < targetLabel) { assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel; pushLast(); @@ -451,8 +445,7 @@ abstract class FSTEnum { if (found) { // Match -- recurse //System.out.println(" match! arcIdx=" + mid); - arc.arcIdx(mid - 1); - fst.readNextRealArc(arc, in); + fst.readArcByIndex(arc, in, mid); assert arc.arcIdx() == mid; assert arc.label() == targetLabel: "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel + " mid=" + mid; output[upto] = fst.outputs.add(output[upto-1], arc.output()); @@ -491,8 +484,7 @@ abstract class FSTEnum { } } else { // There is a floor arc: - arc.arcIdx(high - 1); - fst.readNextRealArc(arc, in); + fst.readArcByIndex(arc, in, high); assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel; assert arc.label() < targetLabel: "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel; pushLast(); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java index 0b2a0eea503..e033267ffbe 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java @@ -191,8 +191,7 @@ public final class Util { idx = low - 1; } - arc.arcIdx(idx - 1); - fst.readNextRealArc(arc, in); + fst.readArcByIndex(arc, in, idx); result.setIntAt(upto++, arc.label()); output += arc.output(); @@ -934,20 +933,7 @@ public final class Util { */ public static Arc readCeilArc(int label, FST fst, Arc follow, Arc arc, BytesReader in) throws IOException { if (label == FST.END_LABEL) { - if (follow.isFinal()) { - if (follow.target() <= 0) { - arc.flags((byte) FST.BIT_LAST_ARC); - } else { - arc.flags((byte) 0); - // NOTE: nextArc is a node (not an address!) in this case: - arc.nextArc(follow.target()); - } - arc.output(follow.nextFinalOutput()); - arc.label(FST.END_LABEL); - return arc; - } else { - return null; - } + return FST.readEndArc(follow, arc); } if (!FST.targetHasArcs(follow)) { return null; @@ -962,8 +948,7 @@ public final class Util { } else if (offset < 0) { return arc; } else { - arc.nextArc(arc.posArcsStart() - offset * arc.bytesPerArc()); - return fst.readNextRealArc(arc, in); + return fst.readArcAtPosition(arc, in, arc.posArcsStart() - offset * arc.bytesPerArc()); } } // Arcs are packed array -- use binary search to find @@ -987,16 +972,14 @@ public final class Util { } else if (cmp > 0) { high = mid - 1; } else { - arc.arcIdx(mid - 1); - return fst.readNextRealArc(arc, in); + return fst.readArcByIndex(arc, in, mid); } } if (low == arc.numArcs()) { // DEAD END! return null; } - arc.arcIdx(high + 1); - return fst.readNextRealArc(arc, in ); + return fst.readArcByIndex(arc, in , high + 1); } // Linear scan