From 491dc6a7165997b746422272b62b51fd36b4d292 Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Thu, 15 May 2014 22:28:48 +0000
Subject: [PATCH] LUCENE-5675: initial fork of BT with versioning added
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5675@1595064 13f79535-47bb-0310-9956-ffa450edef68
---
.../idversion/IDVersionPostingsReader.java | 2 -
.../idversion/IDVersionPostingsWriter.java | 22 +-
.../idversion/IDVersionSegmentTermsEnum.java | 971 ++++++++++++++
.../IDVersionSegmentTermsEnumFrame.java | 746 +++++++++++
.../codecs/idversion/IDVersionTermState.java | 41 +
.../VersionBlockTreeTermsReader.java | 319 +++++
.../VersionBlockTreeTermsWriter.java | 1192 +++++++++++++++++
.../codecs/idversion/VersionFieldReader.java | 163 +++
.../blocktree/BlockTreeTermsReader.java | 2 +-
.../blocktree/BlockTreeTermsWriter.java | 14 +-
10 files changed, 3441 insertions(+), 31 deletions(-)
create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java
create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java
create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionTermState.java
create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java
create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionFieldReader.java
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java
index 5bc8a640c29..26e300fed7b 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java
@@ -17,8 +17,6 @@ package org.apache.lucene.codecs.idversion;
* limitations under the License.
*/
-import static org.apache.lucene.codecs.idversion.IDVersionPostingsWriter.IDVersionTermState;
-
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java
index d4fdb1fc0d8..e304dc982d7 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java
@@ -43,26 +43,6 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase {
private int lastPosition;
private long lastVersion;
- final static class IDVersionTermState extends BlockTermState {
- long idVersion;
- int docID;
-
- @Override
- public IDVersionTermState clone() {
- IDVersionTermState other = new IDVersionTermState();
- other.copyFrom(this);
- return other;
- }
-
- @Override
- public void copyFrom(TermState _other) {
- super.copyFrom(_other);
- IDVersionTermState other = (IDVersionTermState) _other;
- idVersion = other.idVersion;
- docID = other.docID;
- }
- }
-
@Override
public IDVersionTermState newTermState() {
return new IDVersionTermState();
@@ -144,8 +124,8 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase {
@Override
public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
IDVersionTermState state = (IDVersionTermState) _state;
- // nocommit must send version up to FST somehow ...
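+    // The version is also carried up into the terms index FST, whose outputs
+    // pair the block bytes with a version (see
+    // VersionBlockTreeTermsWriter.getFSTOutputs), letting seeks reject whole
+    // blocks without decoding per-term metadata; persist it per term here: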
out.writeVInt(state.docID);
+ out.writeVLong(state.idVersion);
}
@Override
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java
new file mode 100644
index 00000000000..6320438a06d
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java
@@ -0,0 +1,971 @@
+package org.apache.lucene.codecs.idversion;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.PrintStream;
+
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.Outputs;
+import org.apache.lucene.util.fst.PairOutputs.Pair;
+import org.apache.lucene.util.fst.PairOutputs;
+import org.apache.lucene.util.fst.Util;
+
+/** Iterates through terms in this field */
+final class IDVersionSegmentTermsEnum extends TermsEnum {
+
+  final static Outputs<Pair<BytesRef,Long>> fstOutputs = VersionBlockTreeTermsWriter.getFSTOutputs();
+  final static Pair<BytesRef,Long> NO_OUTPUT = fstOutputs.getNoOutput();
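+  // The index FST's outputs pair the usual block-tree output (file pointer
+  // plus flags, as a BytesRef) with a long that bounds the version of every
+  // term under that prefix (stored inverted; see seekExact below).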
+
+ // Lazy init:
+ IndexInput in;
+
+ private IDVersionSegmentTermsEnumFrame[] stack;
+ private final IDVersionSegmentTermsEnumFrame staticFrame;
+ IDVersionSegmentTermsEnumFrame currentFrame;
+ boolean termExists;
+ final VersionFieldReader fr;
+
+ // nocommit make this public "for casting" and add a getVersion method?
+
+ private int targetBeforeCurrentLength;
+
+ private final ByteArrayDataInput scratchReader = new ByteArrayDataInput();
+
+ // What prefix of the current term was present in the index:
+ private int validIndexPrefix;
+
+ // assert only:
+ private boolean eof;
+
+ final BytesRef term = new BytesRef();
+ private final FST.BytesReader fstReader;
+
+  @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc<Pair<BytesRef,Long>>[] arcs =
+      new FST.Arc[1];
+
+ public IDVersionSegmentTermsEnum(VersionFieldReader fr) throws IOException {
+ this.fr = fr;
+
+ //if (DEBUG) System.out.println("BTTR.init seg=" + segment);
+ stack = new IDVersionSegmentTermsEnumFrame[0];
+
+ // Used to hold seek by TermState, or cached seek
+ staticFrame = new IDVersionSegmentTermsEnumFrame(this, -1);
+
+ if (fr.index == null) {
+ fstReader = null;
+ } else {
+ fstReader = fr.index.getBytesReader();
+ }
+
+ // Init w/ root block; don't use index since it may
+ // not (and need not) have been loaded
+    for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
+      arcs[arcIdx] = new FST.Arc<>();
+    }
+ }
+
+ currentFrame = staticFrame;
+    final FST.Arc<Pair<BytesRef,Long>> arc;
+ if (fr.index != null) {
+ arc = fr.index.getFirstArc(arcs[0]);
+ // Empty string prefix must have an output in the index!
+ assert arc.isFinal();
+ } else {
+ arc = null;
+ }
+ currentFrame = staticFrame;
+ //currentFrame = pushFrame(arc, rootCode, 0);
+ //currentFrame.loadBlock();
+ validIndexPrefix = 0;
+ // if (DEBUG) {
+ // System.out.println("init frame state " + currentFrame.ord);
+ // printSeekState();
+ // }
+
+ //System.out.println();
+ // computeBlockStats().print(System.out);
+ }
+
+ // Not private to avoid synthetic access$NNN methods
+ void initIndexInput() {
+ if (this.in == null) {
+ this.in = fr.parent.in.clone();
+ }
+ }
+
+ private IDVersionSegmentTermsEnumFrame getFrame(int ord) throws IOException {
+ if (ord >= stack.length) {
+ final IDVersionSegmentTermsEnumFrame[] next = new IDVersionSegmentTermsEnumFrame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ System.arraycopy(stack, 0, next, 0, stack.length);
+      for(int stackOrd=stack.length;stackOrd<next.length;stackOrd++) {
+        next[stackOrd] = new IDVersionSegmentTermsEnumFrame(this, stackOrd);
+      }
+      stack = next;
+    }
+    assert stack[ord].ord == ord;
+    return stack[ord];
+  }
+
+  private FST.Arc<Pair<BytesRef,Long>> getArc(int ord) {
+ if (ord >= arcs.length) {
+      @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<Pair<BytesRef,Long>>[] next =
+ new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ System.arraycopy(arcs, 0, next, 0, arcs.length);
+      for(int arcOrd=arcs.length;arcOrd<next.length;arcOrd++) {
+        next[arcOrd] = new FST.Arc<>();
+      }
+ }
+ arcs = next;
+ }
+ return arcs[ord];
+ }
+
+ // Pushes a frame we seek'd to
+  IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc<Pair<BytesRef,Long>> arc, Pair<BytesRef,Long> frameData, int length) throws IOException {
+ scratchReader.reset(frameData.output1.bytes, frameData.output1.offset, frameData.output1.length);
+ final long code = scratchReader.readVLong();
+ final long fpSeek = code >>> VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;
+ final IDVersionSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord);
+ f.hasTerms = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
+ f.hasTermsOrig = f.hasTerms;
+ f.isFloor = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;
+ if (f.isFloor) {
+ f.setFloorData(scratchReader, frameData.output1);
+ }
+ pushFrame(arc, fpSeek, length);
+
+ return f;
+ }
+
+ // Pushes next'd frame or seek'd frame; we later
+ // lazy-load the frame only when needed
+  IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc<Pair<BytesRef,Long>> arc, long fp, int length) throws IOException {
+ final IDVersionSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord);
+ f.arc = arc;
+ if (f.fpOrig == fp && f.nextEnt != -1) {
+ //if (DEBUG) System.out.println(" push reused frame ord=" + f.ord + " fp=" + f.fp + " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + term.length + " vs prefix=" + f.prefix);
+ if (f.prefix > targetBeforeCurrentLength) {
+ f.rewind();
+ } else {
+ // if (DEBUG) {
+ // System.out.println(" skip rewind!");
+ // }
+ }
+ assert length == f.prefix;
+ } else {
+ f.nextEnt = -1;
+ f.prefix = length;
+ f.state.termBlockOrd = 0;
+ f.fpOrig = f.fp = fp;
+ f.lastSubFP = -1;
+ // if (DEBUG) {
+ // final int sav = term.length;
+ // term.length = length;
+ // System.out.println(" push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " pref=" + brToString(term));
+ // term.length = sav;
+ // }
+ }
+
+ return f;
+ }
+
+ // asserts only
+ private boolean clearEOF() {
+ eof = false;
+ return true;
+ }
+
+ // asserts only
+ private boolean setEOF() {
+ eof = true;
+ return true;
+ }
+
+ // nocommit we need a seekExact(BytesRef target, long minVersion) API?
+
+ @Override
+ public boolean seekExact(final BytesRef target) throws IOException {
+ return seekExact(target, 0);
+ }
+
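+  // Hypothetical caller (a sketch, not part of this patch): look up an ID
+  // only if its indexed version is at least some floor:
+  //
+  //   IDVersionSegmentTermsEnum termsEnum = ...;
+  //   if (termsEnum.seekExact(new BytesRef("id17"), 42L)) {
+  //     // term exists and its version is >= 42
+  //   }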
+  /** Returns false if the term does not exist, or it exists but its version is < minIDVersion. */
+ public boolean seekExact(final BytesRef target, long minIDVersion) throws IOException {
+
+ if (fr.index == null) {
+ throw new IllegalStateException("terms index was not loaded");
+ }
+
+ if (term.bytes.length <= target.length) {
+ term.bytes = ArrayUtil.grow(term.bytes, 1+target.length);
+ }
+
+ assert clearEOF();
+
+ // if (DEBUG) {
+ // System.out.println("\nBTTR.seekExact seg=" + segment + " target=" + fieldInfo.name + ":" + brToString(target) + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix);
+ // printSeekState();
+ // }
+
+    FST.Arc<Pair<BytesRef,Long>> arc;
+    int targetUpto;
+    Pair<BytesRef,Long> output;
+
+ targetBeforeCurrentLength = currentFrame.ord;
+
+ if (currentFrame != staticFrame) {
+
+ // We are already seek'd; find the common
+ // prefix of new seek term vs current term and
+ // re-use the corresponding seek state. For
+ // example, if app first seeks to foobar, then
+ // seeks to foobaz, we can re-use the seek state
+ // for the first 5 bytes.
+
+ // if (DEBUG) {
+ // System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix);
+ // }
+
+ arc = arcs[0];
+ assert arc.isFinal();
+ output = arc.output;
+ targetUpto = 0;
+
+ IDVersionSegmentTermsEnumFrame lastFrame = stack[0];
+ assert validIndexPrefix <= term.length;
+
+ final int targetLimit = Math.min(target.length, validIndexPrefix);
+
+ int cmp = 0;
+
+ // TODO: reverse vLong byte order for better FST
+ // prefix output sharing
+
+ // First compare up to valid seek frames:
+ while (targetUpto < targetLimit) {
+ cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
+ // if (DEBUG) {
+ // System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output);
+ // }
+ if (cmp != 0) {
+ break;
+ }
+ arc = arcs[1+targetUpto];
+ //if (arc.label != (target.bytes[target.offset + targetUpto] & 0xFF)) {
+ //System.out.println("FAIL: arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF));
+ //}
+ assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
+ if (arc.output != NO_OUTPUT) {
+ output = fstOutputs.add(output, arc.output);
+ }
+ if (arc.isFinal()) {
+ lastFrame = stack[1+lastFrame.ord];
+ }
+ targetUpto++;
+ }
+
+ if (cmp == 0) {
+ final int targetUptoMid = targetUpto;
+
+ // Second compare the rest of the term, but
+ // don't save arc/output/frame; we only do this
+ // to find out if the target term is before,
+ // equal or after the current term
+ final int targetLimit2 = Math.min(target.length, term.length);
+ while (targetUpto < targetLimit2) {
+ cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
+ // if (DEBUG) {
+ // System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
+ // }
+ if (cmp != 0) {
+ break;
+ }
+ targetUpto++;
+ }
+
+ if (cmp == 0) {
+ cmp = term.length - target.length;
+ }
+ targetUpto = targetUptoMid;
+ }
+
+ if (cmp < 0) {
+ // Common case: target term is after current
+ // term, ie, app is seeking multiple terms
+ // in sorted order
+ // if (DEBUG) {
+ // System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); frame.ord=" + lastFrame.ord);
+ // }
+ currentFrame = lastFrame;
+
+ } else if (cmp > 0) {
+ // Uncommon case: target term
+ // is before current term; this means we can
+ // keep the currentFrame but we must rewind it
+ // (so we scan from the start)
+ targetBeforeCurrentLength = 0;
+ // if (DEBUG) {
+ // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord);
+ // }
+ currentFrame = lastFrame;
+ currentFrame.rewind();
+ } else {
+ // Target is exactly the same as current term
+ assert term.length == target.length;
+ if (termExists) {
+ // if (DEBUG) {
+ // System.out.println(" target is same as current; return true");
+ // }
+ return true;
+ } else {
+ // if (DEBUG) {
+ // System.out.println(" target is same as current but term doesn't exist");
+ // }
+ }
+ //validIndexPrefix = currentFrame.depth;
+ //term.length = target.length;
+ //return termExists;
+ }
+
+ } else {
+
+ targetBeforeCurrentLength = -1;
+ arc = fr.index.getFirstArc(arcs[0]);
+
+ // Empty string prefix must have an output (block) in the index!
+ assert arc.isFinal();
+ assert arc.output != null;
+
+ // if (DEBUG) {
+ // System.out.println(" no seek state; push root frame");
+ // }
+
+ output = arc.output;
+
+ currentFrame = staticFrame;
+
+ //term.length = 0;
+ targetUpto = 0;
+ currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0);
+ }
+
+ // if (DEBUG) {
+ // System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength);
+ // }
+
+ while (targetUpto < target.length) {
+
+ final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
+
+      final FST.Arc<Pair<BytesRef,Long>> nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);
+
+ if (nextArc == null) {
+
+ // Index is exhausted
+ // if (DEBUG) {
+ // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
+ // }
+
+ validIndexPrefix = currentFrame.prefix;
+ //validIndexPrefix = targetUpto;
+
+ currentFrame.scanToFloorFrame(target);
+
+ if (!currentFrame.hasTerms) {
+ termExists = false;
+ term.bytes[targetUpto] = (byte) targetLabel;
+ term.length = 1+targetUpto;
+ // if (DEBUG) {
+ // System.out.println(" FAST NOT_FOUND term=" + brToString(term));
+ // }
+ return false;
+ }
+
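+        // output.output2 holds Long.MAX_VALUE - maxVersion for this subtree,
+        // presumably inverted so the FST can share prefix outputs as minima;
+        // un-invert before comparing against the caller's floor: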
+ if ((Long.MAX_VALUE-output.output2) < minIDVersion) {
+ // The max version for all terms in this block is lower than the minVersion
+ return false;
+ }
+
+ currentFrame.loadBlock();
+
+ final SeekStatus result = currentFrame.scanToTerm(target, true);
+ if (result == SeekStatus.FOUND) {
+ // if (DEBUG) {
+ // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term);
+ // }
+
+ currentFrame.decodeMetaData();
+ if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) {
+ // The max version for this term is lower than the minVersion
+ return false;
+ }
+ return true;
+ } else {
+ // if (DEBUG) {
+ // System.out.println(" got " + result + "; return NOT_FOUND term=" + brToString(term));
+ // }
+ return false;
+ }
+ } else {
+ // Follow this arc
+ arc = nextArc;
+ term.bytes[targetUpto] = (byte) targetLabel;
+ // Aggregate output as we go:
+ assert arc.output != null;
+ if (arc.output != NO_OUTPUT) {
+ output = fstOutputs.add(output, arc.output);
+ }
+
+ // if (DEBUG) {
+ // System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
+ // }
+ targetUpto++;
+
+ if (arc.isFinal()) {
+ //if (DEBUG) System.out.println(" arc is final!");
+ currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto);
+ //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
+ }
+ }
+ }
+
+ //validIndexPrefix = targetUpto;
+ validIndexPrefix = currentFrame.prefix;
+
+ currentFrame.scanToFloorFrame(target);
+
+ // Target term is entirely contained in the index:
+ if (!currentFrame.hasTerms) {
+ termExists = false;
+ term.length = targetUpto;
+ // if (DEBUG) {
+ // System.out.println(" FAST NOT_FOUND term=" + brToString(term));
+ // }
+ return false;
+ }
+
+ currentFrame.loadBlock();
+
+ final SeekStatus result = currentFrame.scanToTerm(target, true);
+ if (result == SeekStatus.FOUND) {
+ // if (DEBUG) {
+ // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term);
+ // }
+ return true;
+ } else {
+ // if (DEBUG) {
+ // System.out.println(" got result " + result + "; return NOT_FOUND term=" + term.utf8ToString());
+ // }
+
+ return false;
+ }
+ }
+
+ @Override
+ public SeekStatus seekCeil(final BytesRef target) throws IOException {
+ if (fr.index == null) {
+ throw new IllegalStateException("terms index was not loaded");
+ }
+
+ if (term.bytes.length <= target.length) {
+ term.bytes = ArrayUtil.grow(term.bytes, 1+target.length);
+ }
+
+ assert clearEOF();
+
+ //if (DEBUG) {
+ //System.out.println("\nBTTR.seekCeil seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix);
+ //printSeekState();
+ //}
+
+    FST.Arc<Pair<BytesRef,Long>> arc;
+    int targetUpto;
+    Pair<BytesRef,Long> output;
+
+ targetBeforeCurrentLength = currentFrame.ord;
+
+ if (currentFrame != staticFrame) {
+
+ // We are already seek'd; find the common
+ // prefix of new seek term vs current term and
+ // re-use the corresponding seek state. For
+ // example, if app first seeks to foobar, then
+ // seeks to foobaz, we can re-use the seek state
+ // for the first 5 bytes.
+
+ //if (DEBUG) {
+ //System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix);
+ //}
+
+ arc = arcs[0];
+ assert arc.isFinal();
+ output = arc.output;
+ targetUpto = 0;
+
+ IDVersionSegmentTermsEnumFrame lastFrame = stack[0];
+ assert validIndexPrefix <= term.length;
+
+ final int targetLimit = Math.min(target.length, validIndexPrefix);
+
+ int cmp = 0;
+
+      // TODO: we should write our vLong backwards (MSB
+ // first) to get better sharing from the FST
+
+ // First compare up to valid seek frames:
+ while (targetUpto < targetLimit) {
+ cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
+ //if (DEBUG) {
+ //System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output);
+ //}
+ if (cmp != 0) {
+ break;
+ }
+ arc = arcs[1+targetUpto];
+ assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
+        // TODO: we could save the outputs in local
+        // byte[][] instead of making new objs every
+ // seek; but, often the FST doesn't have any
+ // shared bytes (but this could change if we
+ // reverse vLong byte order)
+ if (arc.output != NO_OUTPUT) {
+ output = fstOutputs.add(output, arc.output);
+ }
+ if (arc.isFinal()) {
+ lastFrame = stack[1+lastFrame.ord];
+ }
+ targetUpto++;
+ }
+
+
+ if (cmp == 0) {
+ final int targetUptoMid = targetUpto;
+ // Second compare the rest of the term, but
+ // don't save arc/output/frame:
+ final int targetLimit2 = Math.min(target.length, term.length);
+ while (targetUpto < targetLimit2) {
+ cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
+ //if (DEBUG) {
+ //System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
+ //}
+ if (cmp != 0) {
+ break;
+ }
+ targetUpto++;
+ }
+
+ if (cmp == 0) {
+ cmp = term.length - target.length;
+ }
+ targetUpto = targetUptoMid;
+ }
+
+ if (cmp < 0) {
+ // Common case: target term is after current
+ // term, ie, app is seeking multiple terms
+ // in sorted order
+ //if (DEBUG) {
+ //System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); clear frame.scanned ord=" + lastFrame.ord);
+ //}
+ currentFrame = lastFrame;
+
+ } else if (cmp > 0) {
+ // Uncommon case: target term
+ // is before current term; this means we can
+ // keep the currentFrame but we must rewind it
+ // (so we scan from the start)
+ targetBeforeCurrentLength = 0;
+ //if (DEBUG) {
+ //System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord);
+ //}
+ currentFrame = lastFrame;
+ currentFrame.rewind();
+ } else {
+ // Target is exactly the same as current term
+ assert term.length == target.length;
+ if (termExists) {
+ //if (DEBUG) {
+ //System.out.println(" target is same as current; return FOUND");
+ //}
+ return SeekStatus.FOUND;
+ } else {
+ //if (DEBUG) {
+ //System.out.println(" target is same as current but term doesn't exist");
+ //}
+ }
+ }
+
+ } else {
+
+ targetBeforeCurrentLength = -1;
+ arc = fr.index.getFirstArc(arcs[0]);
+
+ // Empty string prefix must have an output (block) in the index!
+ assert arc.isFinal();
+ assert arc.output != null;
+
+ //if (DEBUG) {
+ //System.out.println(" no seek state; push root frame");
+ //}
+
+ output = arc.output;
+
+ currentFrame = staticFrame;
+
+ //term.length = 0;
+ targetUpto = 0;
+ currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0);
+ }
+
+ //if (DEBUG) {
+ //System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord+1=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength);
+ //}
+
+ while (targetUpto < target.length) {
+
+ final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
+
+      final FST.Arc<Pair<BytesRef,Long>> nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);
+
+ if (nextArc == null) {
+
+ // Index is exhausted
+ // if (DEBUG) {
+ // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
+ // }
+
+ validIndexPrefix = currentFrame.prefix;
+ //validIndexPrefix = targetUpto;
+
+ currentFrame.scanToFloorFrame(target);
+
+ currentFrame.loadBlock();
+
+ final SeekStatus result = currentFrame.scanToTerm(target, false);
+ if (result == SeekStatus.END) {
+ term.copyBytes(target);
+ termExists = false;
+
+ if (next() != null) {
+ //if (DEBUG) {
+ //System.out.println(" return NOT_FOUND term=" + brToString(term) + " " + term);
+ //}
+ return SeekStatus.NOT_FOUND;
+ } else {
+ //if (DEBUG) {
+ //System.out.println(" return END");
+ //}
+ return SeekStatus.END;
+ }
+ } else {
+ //if (DEBUG) {
+ //System.out.println(" return " + result + " term=" + brToString(term) + " " + term);
+ //}
+ return result;
+ }
+ } else {
+ // Follow this arc
+ term.bytes[targetUpto] = (byte) targetLabel;
+ arc = nextArc;
+ // Aggregate output as we go:
+ assert arc.output != null;
+ if (arc.output != NO_OUTPUT) {
+ output = fstOutputs.add(output, arc.output);
+ }
+
+ //if (DEBUG) {
+ //System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
+ //}
+ targetUpto++;
+
+ if (arc.isFinal()) {
+ //if (DEBUG) System.out.println(" arc is final!");
+ currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto);
+ //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
+ }
+ }
+ }
+
+ //validIndexPrefix = targetUpto;
+ validIndexPrefix = currentFrame.prefix;
+
+ currentFrame.scanToFloorFrame(target);
+
+ currentFrame.loadBlock();
+
+ final SeekStatus result = currentFrame.scanToTerm(target, false);
+
+ if (result == SeekStatus.END) {
+ term.copyBytes(target);
+ termExists = false;
+ if (next() != null) {
+ //if (DEBUG) {
+ //System.out.println(" return NOT_FOUND term=" + term.utf8ToString() + " " + term);
+ //}
+ return SeekStatus.NOT_FOUND;
+ } else {
+ //if (DEBUG) {
+ //System.out.println(" return END");
+ //}
+ return SeekStatus.END;
+ }
+ } else {
+ return result;
+ }
+ }
+
+ @SuppressWarnings("unused")
+ private void printSeekState(PrintStream out) throws IOException {
+ if (currentFrame == staticFrame) {
+ out.println(" no prior seek");
+ } else {
+ out.println(" prior seek state:");
+ int ord = 0;
+ boolean isSeekFrame = true;
+ while(true) {
+ IDVersionSegmentTermsEnumFrame f = getFrame(ord);
+ assert f != null;
+ final BytesRef prefix = new BytesRef(term.bytes, 0, f.prefix);
+ if (f.nextEnt == -1) {
+          out.println("    frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
+        } else {
+          out.println("    frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + " nextEnt=" + f.nextEnt + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
+        }
+        if (fr.index != null) {
+          assert !isSeekFrame || f.arc != null: "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
+          if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.bytes[f.prefix-1]&0xFF)) {
+            out.println("      broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.bytes[f.prefix-1]&0xFF));
+            throw new RuntimeException("seek state is broken");
+          }
+          Pair<BytesRef,Long> output = Util.get(fr.index, prefix);
+ if (output == null) {
+ out.println(" broken seek state: prefix is not final in index");
+ throw new RuntimeException("seek state is broken");
+ } else if (isSeekFrame && !f.isFloor) {
+ final ByteArrayDataInput reader = new ByteArrayDataInput(output.output1.bytes, output.output1.offset, output.output1.length);
+ final long codeOrig = reader.readVLong();
+ final long code = (f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) | (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0);
+ if (codeOrig != code) {
+ out.println(" broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code);
+ throw new RuntimeException("seek state is broken");
+ }
+ }
+ }
+ if (f == currentFrame) {
+ break;
+ }
+ if (f.prefix == validIndexPrefix) {
+ isSeekFrame = false;
+ }
+ ord++;
+ }
+ }
+ }
+
+ /* Decodes only the term bytes of the next term. If caller then asks for
+ metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily)
+ decode all metadata up to the current term. */
+ @Override
+ public BytesRef next() throws IOException {
+
+ if (in == null) {
+ // Fresh TermsEnum; seek to first term:
+      final FST.Arc<Pair<BytesRef,Long>> arc;
+ if (fr.index != null) {
+ arc = fr.index.getFirstArc(arcs[0]);
+ // Empty string prefix must have an output in the index!
+ assert arc.isFinal();
+ } else {
+ arc = null;
+ }
+ currentFrame = pushFrame(arc, fr.rootCode, 0);
+ currentFrame.loadBlock();
+ }
+
+ targetBeforeCurrentLength = currentFrame.ord;
+
+ assert !eof;
+ //if (DEBUG) {
+ //System.out.println("\nBTTR.next seg=" + segment + " term=" + brToString(term) + " termExists?=" + termExists + " field=" + fieldInfo.name + " termBlockOrd=" + currentFrame.state.termBlockOrd + " validIndexPrefix=" + validIndexPrefix);
+ //printSeekState();
+ //}
+
+ if (currentFrame == staticFrame) {
+ // If seek was previously called and the term was
+ // cached, or seek(TermState) was called, usually
+ // caller is just going to pull a D/&PEnum or get
+ // docFreq, etc. But, if they then call next(),
+ // this method catches up all internal state so next()
+ // works properly:
+ //if (DEBUG) System.out.println(" re-seek to pending term=" + term.utf8ToString() + " " + term);
+ final boolean result = seekExact(term);
+ assert result;
+ }
+
+ // Pop finished blocks
+ while (currentFrame.nextEnt == currentFrame.entCount) {
+ if (!currentFrame.isLastInFloor) {
+ currentFrame.loadNextFloorBlock();
+ } else {
+ //if (DEBUG) System.out.println(" pop frame");
+ if (currentFrame.ord == 0) {
+ //if (DEBUG) System.out.println(" return null");
+ assert setEOF();
+ term.length = 0;
+ validIndexPrefix = 0;
+ currentFrame.rewind();
+ termExists = false;
+ return null;
+ }
+ final long lastFP = currentFrame.fpOrig;
+ currentFrame = stack[currentFrame.ord-1];
+
+ if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) {
+ // We popped into a frame that's not loaded
+ // yet or not scan'd to the right entry
+ currentFrame.scanToFloorFrame(term);
+ currentFrame.loadBlock();
+ currentFrame.scanToSubBlock(lastFP);
+ }
+
+ // Note that the seek state (last seek) has been
+ // invalidated beyond this depth
+ validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix);
+ //if (DEBUG) {
+ //System.out.println(" reset validIndexPrefix=" + validIndexPrefix);
+ //}
+ }
+ }
+
+ while(true) {
+ if (currentFrame.next()) {
+ // Push to new block:
+ //if (DEBUG) System.out.println(" push frame");
+ currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length);
+ // This is a "next" frame -- even if it's
+ // floor'd we must pretend it isn't so we don't
+ // try to scan to the right floor frame:
+ currentFrame.isFloor = false;
+ //currentFrame.hasTerms = true;
+ currentFrame.loadBlock();
+ } else {
+ //if (DEBUG) System.out.println(" return term=" + term.utf8ToString() + " " + term + " currentFrame.ord=" + currentFrame.ord);
+ return term;
+ }
+ }
+ }
+
+ @Override
+ public BytesRef term() {
+ assert !eof;
+ return term;
+ }
+
+ @Override
+ public int docFreq() throws IOException {
+ assert !eof;
+ //if (DEBUG) System.out.println("BTR.docFreq");
+ currentFrame.decodeMetaData();
+ //if (DEBUG) System.out.println(" return " + currentFrame.state.docFreq);
+ return currentFrame.state.docFreq;
+ }
+
+ @Override
+ public long totalTermFreq() throws IOException {
+ assert !eof;
+ currentFrame.decodeMetaData();
+ return currentFrame.state.totalTermFreq;
+ }
+
+ @Override
+ public DocsEnum docs(Bits skipDocs, DocsEnum reuse, int flags) throws IOException {
+ assert !eof;
+ //if (DEBUG) {
+ //System.out.println("BTTR.docs seg=" + segment);
+ //}
+ currentFrame.decodeMetaData();
+ //if (DEBUG) {
+ //System.out.println(" state=" + currentFrame.state);
+ //}
+ return fr.parent.postingsReader.docs(fr.fieldInfo, currentFrame.state, skipDocs, reuse, flags);
+ }
+
+ @Override
+ public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
+ if (fr.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
+ // Positions were not indexed:
+ return null;
+ }
+
+ assert !eof;
+ currentFrame.decodeMetaData();
+ return fr.parent.postingsReader.docsAndPositions(fr.fieldInfo, currentFrame.state, skipDocs, reuse, flags);
+ }
+
+ @Override
+ public void seekExact(BytesRef target, TermState otherState) {
+ // if (DEBUG) {
+ // System.out.println("BTTR.seekExact termState seg=" + segment + " target=" + target.utf8ToString() + " " + target + " state=" + otherState);
+ // }
+ assert clearEOF();
+ if (target.compareTo(term) != 0 || !termExists) {
+ assert otherState != null && otherState instanceof BlockTermState;
+ currentFrame = staticFrame;
+ currentFrame.state.copyFrom(otherState);
+ term.copyBytes(target);
+ currentFrame.metaDataUpto = currentFrame.getTermBlockOrd();
+ assert currentFrame.metaDataUpto > 0;
+ validIndexPrefix = 0;
+ } else {
+ // if (DEBUG) {
+ // System.out.println(" skip seek: already on target state=" + currentFrame.state);
+ // }
+ }
+ }
+
+ @Override
+ public TermState termState() throws IOException {
+ assert !eof;
+ currentFrame.decodeMetaData();
+ TermState ts = currentFrame.state.clone();
+ //if (DEBUG) System.out.println("BTTR.termState seg=" + segment + " state=" + ts);
+ return ts;
+ }
+
+ @Override
+ public void seekExact(long ord) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public long ord() {
+ throw new UnsupportedOperationException();
+ }
+}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java
new file mode 100644
index 00000000000..4fde1466a42
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java
@@ -0,0 +1,746 @@
+package org.apache.lucene.codecs.idversion;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.TermsEnum.SeekStatus;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.PairOutputs.Pair;
+
+final class IDVersionSegmentTermsEnumFrame {
+ // Our index in stack[]:
+ final int ord;
+
+ boolean hasTerms;
+ boolean hasTermsOrig;
+ boolean isFloor;
+
+  FST.Arc<Pair<BytesRef,Long>> arc;
+
+ // File pointer where this block was loaded from
+ long fp;
+ long fpOrig;
+ long fpEnd;
+
+ byte[] suffixBytes = new byte[128];
+ final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();
+
+ byte[] statBytes = new byte[64];
+ final ByteArrayDataInput statsReader = new ByteArrayDataInput();
+
+ byte[] floorData = new byte[32];
+ final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
+
+ // Length of prefix shared by all terms in this block
+ int prefix;
+
+ // Number of entries (term or sub-block) in this block
+ int entCount;
+
+ // Which term we will next read, or -1 if the block
+ // isn't loaded yet
+ int nextEnt;
+
+ // True if this block is either not a floor block,
+ // or, it's the last sub-block of a floor block
+ boolean isLastInFloor;
+
+ // True if all entries are terms
+ boolean isLeafBlock;
+
+ long lastSubFP;
+
+ int nextFloorLabel;
+ int numFollowFloorBlocks;
+
+ // Next term to decode metaData; we decode metaData
+ // lazily so that scanning to find the matching term is
+ // fast and only if you find a match and app wants the
+ // stats or docs/positions enums, will we decode the
+ // metaData
+ int metaDataUpto;
+
+ final BlockTermState state;
+
+ // metadata buffer, holding monotonic values
+ public long[] longs;
+ // metadata buffer, holding general values
+ public byte[] bytes;
+ ByteArrayDataInput bytesReader;
+
+ private final IDVersionSegmentTermsEnum ste;
+
+ public IDVersionSegmentTermsEnumFrame(IDVersionSegmentTermsEnum ste, int ord) throws IOException {
+ this.ste = ste;
+ this.ord = ord;
+ this.state = ste.fr.parent.postingsReader.newTermState();
+ this.state.totalTermFreq = -1;
+ this.longs = new long[ste.fr.longsSize];
+ }
+
+ public void setFloorData(ByteArrayDataInput in, BytesRef source) {
+ final int numBytes = source.length - (in.getPosition() - source.offset);
+ if (numBytes > floorData.length) {
+ floorData = new byte[ArrayUtil.oversize(numBytes, 1)];
+ }
+ System.arraycopy(source.bytes, source.offset+in.getPosition(), floorData, 0, numBytes);
+ floorDataReader.reset(floorData, 0, numBytes);
+ numFollowFloorBlocks = floorDataReader.readVInt();
+ nextFloorLabel = floorDataReader.readByte() & 0xff;
+ //if (DEBUG) {
+ //System.out.println(" setFloorData fpOrig=" + fpOrig + " bytes=" + new BytesRef(source.bytes, source.offset + in.getPosition(), numBytes) + " numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + toHex(nextFloorLabel));
+ //}
+ }
+
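+  // In a leaf block every entry is a term, so nextEnt doubles as the term's
+  // ordinal within the block; non-leaf blocks track it in state.termBlockOrd.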
+ public int getTermBlockOrd() {
+ return isLeafBlock ? nextEnt : state.termBlockOrd;
+ }
+
+ void loadNextFloorBlock() throws IOException {
+ //if (DEBUG) {
+ //System.out.println(" loadNextFloorBlock fp=" + fp + " fpEnd=" + fpEnd);
+ //}
+ assert arc == null || isFloor: "arc=" + arc + " isFloor=" + isFloor;
+ fp = fpEnd;
+ nextEnt = -1;
+ loadBlock();
+ }
+
+ /* Does initial decode of next block of terms; this
+ doesn't actually decode the docFreq, totalTermFreq,
+ postings details (frq/prx offset, etc.) metadata;
+ it just loads them as byte[] blobs which are then
+ decoded on-demand if the metadata is ever requested
+ for any term in this block. This enables terms-only
+ intensive consumes (eg certain MTQs, respelling) to
+ not pay the price of decoding metadata they won't
+ use. */
+ void loadBlock() throws IOException {
+
+ // Clone the IndexInput lazily, so that consumers
+ // that just pull a TermsEnum to
+ // seekExact(TermState) don't pay this cost:
+ ste.initIndexInput();
+
+ if (nextEnt != -1) {
+ // Already loaded
+ return;
+ }
+ //System.out.println("blc=" + blockLoadCount);
+
+ ste.in.seek(fp);
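+    // Block header: a single vInt packs (entCount << 1) | (isLastInFloor ? 1 : 0):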
+ int code = ste.in.readVInt();
+ entCount = code >>> 1;
+ assert entCount > 0;
+ isLastInFloor = (code & 1) != 0;
+ assert arc == null || (isLastInFloor || isFloor);
+
+ // TODO: if suffixes were stored in random-access
+ // array structure, then we could do binary search
+ // instead of linear scan to find target term; eg
+ // we could have simple array of offsets
+
+ // term suffixes:
+ code = ste.in.readVInt();
+ isLeafBlock = (code & 1) != 0;
+ int numBytes = code >>> 1;
+ if (suffixBytes.length < numBytes) {
+ suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ }
+ ste.in.readBytes(suffixBytes, 0, numBytes);
+ suffixesReader.reset(suffixBytes, 0, numBytes);
+
+ /*if (DEBUG) {
+ if (arc == null) {
+ System.out.println(" loadBlock (next) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock);
+ } else {
+ System.out.println(" loadBlock (seek) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock);
+ }
+ }*/
+
+ // stats
+ numBytes = ste.in.readVInt();
+ if (statBytes.length < numBytes) {
+ statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ }
+ ste.in.readBytes(statBytes, 0, numBytes);
+ statsReader.reset(statBytes, 0, numBytes);
+ metaDataUpto = 0;
+
+ state.termBlockOrd = 0;
+ nextEnt = 0;
+ lastSubFP = -1;
+
+ // TODO: we could skip this if !hasTerms; but
+ // that's rare so won't help much
+ // metadata
+ numBytes = ste.in.readVInt();
+ if (bytes == null) {
+ bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ bytesReader = new ByteArrayDataInput();
+ } else if (bytes.length < numBytes) {
+ bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ }
+ ste.in.readBytes(bytes, 0, numBytes);
+ bytesReader.reset(bytes, 0, numBytes);
+
+
+ // Sub-blocks of a single floor block are always
+ // written one after another -- tail recurse:
+ fpEnd = ste.in.getFilePointer();
+ // if (DEBUG) {
+ // System.out.println(" fpEnd=" + fpEnd);
+ // }
+ }
+
+ void rewind() {
+
+ // Force reload:
+ fp = fpOrig;
+ nextEnt = -1;
+ hasTerms = hasTermsOrig;
+ if (isFloor) {
+ floorDataReader.rewind();
+ numFollowFloorBlocks = floorDataReader.readVInt();
+ nextFloorLabel = floorDataReader.readByte() & 0xff;
+ }
+
+ /*
+ //System.out.println("rewind");
+ // Keeps the block loaded, but rewinds its state:
+ if (nextEnt > 0 || fp != fpOrig) {
+ if (DEBUG) {
+ System.out.println(" rewind frame ord=" + ord + " fpOrig=" + fpOrig + " fp=" + fp + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " nextEnt=" + nextEnt + " prefixLen=" + prefix);
+ }
+ if (fp != fpOrig) {
+ fp = fpOrig;
+ nextEnt = -1;
+ } else {
+ nextEnt = 0;
+ }
+ hasTerms = hasTermsOrig;
+ if (isFloor) {
+ floorDataReader.rewind();
+ numFollowFloorBlocks = floorDataReader.readVInt();
+ nextFloorLabel = floorDataReader.readByte() & 0xff;
+ }
+ assert suffixBytes != null;
+ suffixesReader.rewind();
+ assert statBytes != null;
+ statsReader.rewind();
+ metaDataUpto = 0;
+ state.termBlockOrd = 0;
+ // TODO: skip this if !hasTerms? Then postings
+ // impl wouldn't have to write useless 0 byte
+ postingsReader.resetTermsBlock(fieldInfo, state);
+ lastSubFP = -1;
+ } else if (DEBUG) {
+ System.out.println(" skip rewind fp=" + fp + " fpOrig=" + fpOrig + " nextEnt=" + nextEnt + " ord=" + ord);
+ }
+ */
+ }
+
+ public boolean next() {
+ return isLeafBlock ? nextLeaf() : nextNonLeaf();
+ }
+
+ // Decodes next entry; returns true if it's a sub-block
+ public boolean nextLeaf() {
+ //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
+ assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
+ nextEnt++;
+ suffix = suffixesReader.readVInt();
+ startBytePos = suffixesReader.getPosition();
+ ste.term.length = prefix + suffix;
+ if (ste.term.bytes.length < ste.term.length) {
+ ste.term.grow(ste.term.length);
+ }
+ suffixesReader.readBytes(ste.term.bytes, prefix, suffix);
+ // A normal term
+ ste.termExists = true;
+ return false;
+ }
+
+ public boolean nextNonLeaf() {
+ //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
+ assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
+ nextEnt++;
+ final int code = suffixesReader.readVInt();
+ suffix = code >>> 1;
+ startBytePos = suffixesReader.getPosition();
+ ste.term.length = prefix + suffix;
+ if (ste.term.bytes.length < ste.term.length) {
+ ste.term.grow(ste.term.length);
+ }
+ suffixesReader.readBytes(ste.term.bytes, prefix, suffix);
+ if ((code & 1) == 0) {
+ // A normal term
+ ste.termExists = true;
+ subCode = 0;
+ state.termBlockOrd++;
+ return false;
+ } else {
+ // A sub-block; make sub-FP absolute:
+ ste.termExists = false;
+ subCode = suffixesReader.readVLong();
+ lastSubFP = fp - subCode;
+ //if (DEBUG) {
+ //System.out.println(" lastSubFP=" + lastSubFP);
+ //}
+ return true;
+ }
+ }
+
+ // TODO: make this array'd so we can do bin search?
+ // likely not worth it? need to measure how many
+ // floor blocks we "typically" get
+ public void scanToFloorFrame(BytesRef target) {
+
+ if (!isFloor || target.length <= prefix) {
+ // if (DEBUG) {
+ // System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + target.length + " vs prefix=" + prefix);
+ // }
+ return;
+ }
+
+ final int targetLabel = target.bytes[target.offset + prefix] & 0xFF;
+
+ // if (DEBUG) {
+ // System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + toHex(targetLabel) + " vs nextFloorLabel=" + toHex(nextFloorLabel) + " numFollowFloorBlocks=" + numFollowFloorBlocks);
+ // }
+
+ if (targetLabel < nextFloorLabel) {
+ // if (DEBUG) {
+ // System.out.println(" already on correct block");
+ // }
+ return;
+ }
+
+ assert numFollowFloorBlocks != 0;
+
+ long newFP = fpOrig;
+ while (true) {
+ final long code = floorDataReader.readVLong();
+ newFP = fpOrig + (code >>> 1);
+ hasTerms = (code & 1) != 0;
+ // if (DEBUG) {
+ // System.out.println(" label=" + toHex(nextFloorLabel) + " fp=" + newFP + " hasTerms?=" + hasTerms + " numFollowFloor=" + numFollowFloorBlocks);
+ // }
+
+ isLastInFloor = numFollowFloorBlocks == 1;
+ numFollowFloorBlocks--;
+
+ if (isLastInFloor) {
+ nextFloorLabel = 256;
+ // if (DEBUG) {
+ // System.out.println(" stop! last block nextFloorLabel=" + toHex(nextFloorLabel));
+ // }
+ break;
+ } else {
+ nextFloorLabel = floorDataReader.readByte() & 0xff;
+ if (targetLabel < nextFloorLabel) {
+ // if (DEBUG) {
+ // System.out.println(" stop! nextFloorLabel=" + toHex(nextFloorLabel));
+ // }
+ break;
+ }
+ }
+ }
+
+ if (newFP != fp) {
+ // Force re-load of the block:
+ // if (DEBUG) {
+ // System.out.println(" force switch to fp=" + newFP + " oldFP=" + fp);
+ // }
+ nextEnt = -1;
+ fp = newFP;
+ } else {
+ // if (DEBUG) {
+ // System.out.println(" stay on same fp=" + newFP);
+ // }
+ }
+ }
+
+ public void decodeMetaData() throws IOException {
+
+ //if (DEBUG) System.out.println("\nBTTR.decodeMetadata seg=" + segment + " mdUpto=" + metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd);
+
+ // lazily catch up on metadata decode:
+ final int limit = getTermBlockOrd();
+ boolean absolute = metaDataUpto == 0;
+ assert limit > 0;
+
+ // TODO: better API would be "jump straight to term=N"???
+ while (metaDataUpto < limit) {
+
+ // TODO: we could make "tiers" of metadata, ie,
+ // decode docFreq/totalTF but don't decode postings
+ // metadata; this way caller could get
+ // docFreq/totalTF w/o paying decode cost for
+ // postings
+
+ // TODO: if docFreq were bulk decoded we could
+ // just skipN here:
+
+ // stats
+ state.docFreq = statsReader.readVInt();
+ //if (DEBUG) System.out.println(" dF=" + state.docFreq);
+ if (ste.fr.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+ state.totalTermFreq = state.docFreq + statsReader.readVLong();
+ //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
+ }
+ // metadata
+ for (int i = 0; i < ste.fr.longsSize; i++) {
+ longs[i] = bytesReader.readVLong();
+ }
+ ste.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ste.fr.fieldInfo, state, absolute);
+
+ metaDataUpto++;
+ absolute = false;
+ }
+ state.termBlockOrd = metaDataUpto;
+ }
+
+ // Used only by assert
+ private boolean prefixMatches(BytesRef target) {
+    for(int bytePos=0;bytePos<prefix;bytePos++) {
+      if (target.bytes[target.offset+bytePos] != ste.term.bytes[bytePos]) {
+        return false;
+      }
+    }
+    return true;
+  }
+}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
new file mode 100644
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
@@ -0,0 +1,319 @@
+package org.apache.lucene.codecs.idversion;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.TreeMap;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.fst.PairOutputs.Pair;
+import org.apache.lucene.util.fst.PairOutputs;
+
+/** A block-based terms index and dictionary that assigns
+ * terms to variable length blocks according to how they
+ * share prefixes.
+ *
+ * <p><b>NOTE</b>: this terms dictionary supports
+ * min/maxItemsPerBlock during indexing to control how
+ * much memory the terms index uses.
+ *
+ *
+ * <p>The data structure used by this implementation is very
+ * similar to a burst trie
+ * (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499),
+ * but with added logic to break up too-large blocks of all
+ * terms sharing a given prefix into smaller ones.
+ *
+ *
+ * <p>Use {@link org.apache.lucene.index.CheckIndex} with the -verbose
+ * option to see summary statistics on the blocks in the
+ * dictionary.
+ *
+ * See {@link BlockTreeTermsWriter}.
+ *
+ * @lucene.experimental
+ */
+
+final class VersionBlockTreeTermsReader extends FieldsProducer {
+
+ // Open input to the main terms dict file (_X.tiv)
+ final IndexInput in;
+
+ //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
+
+ // Reads the terms dict entries, to gather state to
+ // produce DocsEnum on demand
+ final PostingsReaderBase postingsReader;
+
+  private final TreeMap<String,VersionFieldReader> fields = new TreeMap<>();
+
+ /** File offset where the directory starts in the terms file. */
+ private long dirOffset;
+
+ /** File offset where the directory starts in the index file. */
+ private long indexDirOffset;
+
+ final String segment;
+
+ private final int version;
+
+ /** Sole constructor. */
+ public VersionBlockTreeTermsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo info,
+ PostingsReaderBase postingsReader, IOContext ioContext,
+ String segmentSuffix)
+ throws IOException {
+
+ this.postingsReader = postingsReader;
+
+ this.segment = info.name;
+ in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, VersionBlockTreeTermsWriter.TERMS_EXTENSION),
+ ioContext);
+
+ boolean success = false;
+ IndexInput indexIn = null;
+
+ try {
+ version = readHeader(in);
+ indexIn = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, VersionBlockTreeTermsWriter.TERMS_INDEX_EXTENSION),
+ ioContext);
+ int indexVersion = readIndexHeader(indexIn);
+ if (indexVersion != version) {
+        throw new CorruptIndexException("mismatched version files: " + in + "=" + version + "," + indexIn + "=" + indexVersion);
+ }
+
+ // verify
+ if (version >= VersionBlockTreeTermsWriter.VERSION_CHECKSUM) {
+ CodecUtil.checksumEntireFile(indexIn);
+ }
+
+ // Have PostingsReader init itself
+ postingsReader.init(in);
+
+ // Read per-field details
+ seekDir(in, dirOffset);
+ seekDir(indexIn, indexDirOffset);
+
+ final int numFields = in.readVInt();
+ if (numFields < 0) {
+ throw new CorruptIndexException("invalid numFields: " + numFields + " (resource=" + in + ")");
+ }
+
+      PairOutputs<BytesRef,Long> fstOutputs = VersionBlockTreeTermsWriter.getFSTOutputs();
+
+      for(int i=0;i<numFields;i++) {
+        final int field = in.readVInt();
+        final long numTerms = in.readVLong();
+        assert numTerms >= 0;
+ final int numBytes = in.readVInt();
+ final BytesRef code = new BytesRef(new byte[numBytes]);
+ in.readBytes(code.bytes, 0, numBytes);
+ code.length = numBytes;
+ final long version = in.readVLong();
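+        // Pair the root block's output bytes with the version value the
+        // writer persisted, matching the index FST's Pair<BytesRef,Long> outputs.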
+        final Pair<BytesRef,Long> rootCode = fstOutputs.newPair(code, version);
+ final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ assert fieldInfo != null: "field=" + field;
+ final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
+ final long sumDocFreq = in.readVLong();
+ final int docCount = in.readVInt();
+ final int longsSize = version >= VersionBlockTreeTermsWriter.VERSION_META_ARRAY ? in.readVInt() : 0;
+
+ BytesRef minTerm, maxTerm;
+ if (version >= VersionBlockTreeTermsWriter.VERSION_MIN_MAX_TERMS) {
+ minTerm = readBytesRef(in);
+ maxTerm = readBytesRef(in);
+ } else {
+ minTerm = maxTerm = null;
+ }
+ if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs
+ throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
+ }
+ if (sumDocFreq < docCount) { // #postings must be >= #docs with field
+ throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount + " (resource=" + in + ")");
+ }
+ if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
+ throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + in + ")");
+ }
+ final long indexStartFP = indexIn.readVLong();
+ VersionFieldReader previous = fields.put(fieldInfo.name,
+ new VersionFieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
+ indexStartFP, longsSize, indexIn, minTerm, maxTerm));
+ if (previous != null) {
+ throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")");
+ }
+ }
+ indexIn.close();
+
+ success = true;
+ } finally {
+ if (!success) {
+ // this.close() will close in:
+ IOUtils.closeWhileHandlingException(indexIn, this);
+ }
+ }
+ }
+
+ private static BytesRef readBytesRef(IndexInput in) throws IOException {
+ BytesRef bytes = new BytesRef();
+ bytes.length = in.readVInt();
+ bytes.bytes = new byte[bytes.length];
+ in.readBytes(bytes.bytes, 0, bytes.length);
+ return bytes;
+ }
+
+ /** Reads terms file header. */
+ private int readHeader(IndexInput input) throws IOException {
+ int version = CodecUtil.checkHeader(input, VersionBlockTreeTermsWriter.TERMS_CODEC_NAME,
+ VersionBlockTreeTermsWriter.VERSION_START,
+ VersionBlockTreeTermsWriter.VERSION_CURRENT);
+ if (version < VersionBlockTreeTermsWriter.VERSION_APPEND_ONLY) {
+ dirOffset = input.readLong();
+ }
+ return version;
+ }
+
+ /** Reads index file header. */
+ private int readIndexHeader(IndexInput input) throws IOException {
+ int version = CodecUtil.checkHeader(input, VersionBlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME,
+ VersionBlockTreeTermsWriter.VERSION_START,
+ VersionBlockTreeTermsWriter.VERSION_CURRENT);
+ if (version < VersionBlockTreeTermsWriter.VERSION_APPEND_ONLY) {
+ indexDirOffset = input.readLong();
+ }
+ return version;
+ }
+
+ /** Seek {@code input} to the directory offset. */
+ private void seekDir(IndexInput input, long dirOffset)
+ throws IOException {
+ if (version >= VersionBlockTreeTermsWriter.VERSION_CHECKSUM) {
+ input.seek(input.length() - CodecUtil.footerLength() - 8);
+ dirOffset = input.readLong();
+ } else if (version >= VersionBlockTreeTermsWriter.VERSION_APPEND_ONLY) {
+ input.seek(input.length() - 8);
+ dirOffset = input.readLong();
+ }
+ input.seek(dirOffset);
+ }
+
+ // for debugging
+ // private static String toHex(int v) {
+ // return "0x" + Integer.toHexString(v);
+ // }
+
+ @Override
+ public void close() throws IOException {
+ try {
+ IOUtils.close(in, postingsReader);
+ } finally {
+ // Clear so refs to terms index is GCable even if
+ // app hangs onto us:
+ fields.clear();
+ }
+ }
+
+ @Override
+  public Iterator<String> iterator() {
+ return Collections.unmodifiableSet(fields.keySet()).iterator();
+ }
+
+ @Override
+ public Terms terms(String field) throws IOException {
+ assert field != null;
+ return fields.get(field);
+ }
+
+ @Override
+ public int size() {
+ return fields.size();
+ }
+
+ // for debugging
+ String brToString(BytesRef b) {
+ if (b == null) {
+ return "null";
+ } else {
+ try {
+ return b.utf8ToString() + " " + b;
+ } catch (Throwable t) {
+ // If BytesRef isn't actually UTF8, or it's eg a
+ // prefix of UTF8 that ends mid-unicode-char, we
+ // fallback to hex:
+ return b.toString();
+ }
+ }
+ }
+
+ @Override
+ public long ramBytesUsed() {
+    long sizeInBytes = ((postingsReader!=null) ? postingsReader.ramBytesUsed() : 0);
+    for(VersionFieldReader reader : fields.values()) {
+      sizeInBytes += reader.ramBytesUsed();
+    }
+    return sizeInBytes;
+ }
+
+ @Override
+ public void checkIntegrity() throws IOException {
+ if (version >= VersionBlockTreeTermsWriter.VERSION_CHECKSUM) {
+ // term dictionary
+ CodecUtil.checksumEntireFile(in);
+
+ // postings
+ postingsReader.checkIntegrity();
+ }
+ }
+}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java
new file mode 100644
index 00000000000..d333ba494ef
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java
@@ -0,0 +1,1192 @@
+package org.apache.lucene.codecs.idversion;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.BytesRefFSTEnum;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.NoOutputs;
+import org.apache.lucene.util.fst.Outputs;
+import org.apache.lucene.util.fst.PairOutputs.Pair;
+import org.apache.lucene.util.fst.PairOutputs;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;
+import org.apache.lucene.util.packed.PackedInts;
+
+/*
+ TODO:
+
+ - Currently there is a one-to-one mapping of indexed
+ term to term block, but we could decouple the two, ie,
+ put more terms into the index than there are blocks.
+ The index would take up more RAM but then it'd be able
+ to avoid seeking more often and could make PK/FuzzyQ
+ faster if the additional indexed terms could store
+ the offset into the terms block.
+
+ - The blocks are not written in true depth-first
+ order, meaning if you just next() the file pointer will
+ sometimes jump backwards. For example, block foo* will
+ be written before block f* because it finished before.
+ This could possibly hurt performance if the terms dict is
+ not hot, since OSs anticipate sequential file access. We
+ could fix the writer to re-order the blocks as a 2nd
+ pass.
+
+ - Each block encodes the term suffixes packed
+ sequentially using a separate vInt per term, which is
+ 1) wasteful and 2) slow (must linear scan to find a
+ particular suffix). We should instead 1) make
+ random-access array so we can directly access the Nth
+ suffix, and 2) bulk-encode this array using bulk int[]
+ codecs; then at search time we can binary search when
+ we seek a particular term.
+*/
+
+/**
+ * Block-based terms index and dictionary writer.
+ * <p>
+ * Writes terms dict and index, block-encoding (column
+ * stride) each term's metadata for each set of terms
+ * between two index terms.
+ * <p>
+ * The .tim file contains the list of terms in each
+ * field along with per-term statistics (such as docfreq)
+ * and per-term metadata (typically pointers to the postings list
+ * for that term in the inverted index).
+ * <p>
+ * The .tim is arranged in blocks, with blocks containing
+ * a variable number of entries (by default 25-48), where
+ * each entry is either a term or a reference to a
+ * sub-block.
+ * <p>
+ * NOTE: The term dictionary can plug into different postings implementations:
+ * the postings writer/reader are actually responsible for encoding
+ * and decoding the Postings Metadata and Term Metadata sections.
+ * <ul>
+ *   <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
+ *       for the BlockTree implementation.</li>
+ *   <li>DirOffset is a pointer to the FieldSummary section.</li>
+ *   <li>DocFreq is the count of documents which contain the term.</li>
+ *   <li>TotalTermFreq is the total number of occurrences of the term. This is encoded
+ *       as the difference between the total number of occurrences and the DocFreq.</li>
+ *   <li>FieldNumber is the field's number from {@link FieldInfos}. (.fnm)</li>
+ *   <li>NumTerms is the number of unique terms for the field.</li>
+ *   <li>RootCode points to the root block for the field.</li>
+ *   <li>SumDocFreq is the total number of postings, the number of term-document pairs across
+ *       the entire field.</li>
+ *   <li>DocCount is the number of documents that have at least one posting for this field.</li>
+ *   <li>LongsSize records how many long values the postings writer/reader record per term
+ *       (e.g., to hold freq/prox/doc file offsets).</li>
+ *   <li>MinTerm, MaxTerm are the lowest and highest term in this field.</li>
+ *   <li>PostingsHeader and TermMetadata are plugged into by the specific postings implementation:
+ *       these contain arbitrary per-file data (such as parameters or versioning information)
+ *       and per-term data (such as pointers to inverted files).</li>
+ *   <li>For inner nodes of the tree, every entry will steal one bit to mark whether it points
+ *       to child nodes (sub-blocks). If so, the corresponding TermStats and TermMetadata are omitted.</li>
+ * </ul>
+ * <p>
+ * <b>Term Index</b>
+ * <p>
+ * The .tip file contains an index into the term dictionary, so that it can be
+ * accessed randomly. The index is also used to determine
+ * when a given term cannot exist on disk (in the .tim file), saving a disk seek.
+ * The .tip file contains a separate FST for each
+ * field. The FST maps a term prefix to the on-disk
+ * block that holds all terms starting with that
+ * prefix. Each field's IndexStartFP points to its
+ * FST.
+ * <ul>
+ *   <li>DirOffset is a pointer to the start of the IndexStartFPs
+ *       for all fields.</li>
+ *   <li>It's possible that an on-disk block would contain
+ *       too many terms (more than the allowed maximum
+ *       (default: 48)). When this happens, the block is
+ *       sub-divided into new blocks (called "floor
+ *       blocks"), and then the output in the FST for the
+ *       block's prefix encodes the leading byte of each
+ *       sub-block, and its file pointer.</li>
+ * </ul>
+ *
+ * @see VersionBlockTreeTermsReader
+ * @lucene.experimental
+ */
+// nocommit fix jdocs
+final class VersionBlockTreeTermsWriter extends FieldsConsumer {
+
+ /** Suggested default value for the {@code
+ * minItemsInBlock} parameter to {@link
+   * #VersionBlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */
+ public final static int DEFAULT_MIN_BLOCK_SIZE = 25;
+
+ /** Suggested default value for the {@code
+ * maxItemsInBlock} parameter to {@link
+   * #VersionBlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */
+ public final static int DEFAULT_MAX_BLOCK_SIZE = 48;
+
+ //public final static boolean DEBUG = false;
+ //private final static boolean SAVE_DOT_FILES = false;
+
+ static final int OUTPUT_FLAGS_NUM_BITS = 2;
+ static final int OUTPUT_FLAGS_MASK = 0x3;
+ static final int OUTPUT_FLAG_IS_FLOOR = 0x1;
+ static final int OUTPUT_FLAG_HAS_TERMS = 0x2;
+
+ /** Extension of terms file */
+ static final String TERMS_EXTENSION = "tiv";
+ final static String TERMS_CODEC_NAME = "VERSION_BLOCK_TREE_TERMS_DICT";
+
+ /** Initial terms format. */
+ public static final int VERSION_START = 0;
+
+ // nocommit nuke all these old versions
+
+ /** Append-only */
+ public static final int VERSION_APPEND_ONLY = 1;
+
+ /** Meta data as array */
+ public static final int VERSION_META_ARRAY = 2;
+
+ /** checksums */
+ public static final int VERSION_CHECKSUM = 3;
+
+ /** min/max term */
+ public static final int VERSION_MIN_MAX_TERMS = 4;
+
+ /** Current terms format. */
+ public static final int VERSION_CURRENT = VERSION_MIN_MAX_TERMS;
+
+ /** Extension of terms index file */
+ static final String TERMS_INDEX_EXTENSION = "tip";
+ final static String TERMS_INDEX_CODEC_NAME = "VERSION_BLOCK_TREE_TERMS_INDEX";
+
+ private final IndexOutput out;
+ private final IndexOutput indexOut;
+ final int maxDoc;
+ final int minItemsInBlock;
+ final int maxItemsInBlock;
+
+ final PostingsWriterBase postingsWriter;
+ final FieldInfos fieldInfos;
+
+ private static class FieldMetaData {
+ public final FieldInfo fieldInfo;
+    public final Pair<BytesRef,Long> rootCode;
+ public final long numTerms;
+ public final long indexStartFP;
+ public final long sumTotalTermFreq;
+ public final long sumDocFreq;
+ public final int docCount;
+ private final int longsSize;
+ public final BytesRef minTerm;
+ public final BytesRef maxTerm;
+
+    public FieldMetaData(FieldInfo fieldInfo, Pair<BytesRef,Long> rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize,
+ BytesRef minTerm, BytesRef maxTerm) {
+ assert numTerms > 0;
+ this.fieldInfo = fieldInfo;
+ assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms;
+ this.rootCode = rootCode;
+ this.indexStartFP = indexStartFP;
+ this.numTerms = numTerms;
+ this.sumTotalTermFreq = sumTotalTermFreq;
+ this.sumDocFreq = sumDocFreq;
+ this.docCount = docCount;
+ this.longsSize = longsSize;
+ this.minTerm = minTerm;
+ this.maxTerm = maxTerm;
+ }
+ }
+
+  private final List<FieldMetaData> fields = new ArrayList<>();
+ // private final String segment;
+
+ /** Create a new writer. The number of items (terms or
+ * sub-blocks) per block will aim to be between
+ * minItemsPerBlock and maxItemsPerBlock, though in some
+ * cases the blocks may be smaller than the min. */
+ public VersionBlockTreeTermsWriter(
+ SegmentWriteState state,
+ PostingsWriterBase postingsWriter,
+ int minItemsInBlock,
+ int maxItemsInBlock)
+ throws IOException
+ {
+ if (minItemsInBlock <= 1) {
+ throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock);
+ }
+ if (maxItemsInBlock <= 0) {
+ throw new IllegalArgumentException("maxItemsInBlock must be >= 1; got " + maxItemsInBlock);
+ }
+ if (minItemsInBlock > maxItemsInBlock) {
+ throw new IllegalArgumentException("maxItemsInBlock must be >= minItemsInBlock; got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock);
+ }
+ if (2*(minItemsInBlock-1) > maxItemsInBlock) {
+ throw new IllegalArgumentException("maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock);
+ }
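+    // The final check (maxItemsInBlock >= 2*(minItemsInBlock-1)) ensures,
+    // roughly, that a block which overflows maxItemsInBlock can always be
+    // split into sub-blocks that still meet the minimum size.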
+
+ maxDoc = state.segmentInfo.getDocCount();
+
+ final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
+ out = state.directory.createOutput(termsFileName, state.context);
+ boolean success = false;
+ IndexOutput indexOut = null;
+ try {
+ fieldInfos = state.fieldInfos;
+ this.minItemsInBlock = minItemsInBlock;
+ this.maxItemsInBlock = maxItemsInBlock;
+ writeHeader(out);
+
+ //DEBUG = state.segmentName.equals("_4a");
+
+ final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
+ indexOut = state.directory.createOutput(termsIndexFileName, state.context);
+ writeIndexHeader(indexOut);
+
+ this.postingsWriter = postingsWriter;
+ // segment = state.segmentName;
+
+ // System.out.println("BTW.init seg=" + state.segmentName);
+
+ postingsWriter.init(out); // have consumer write its format/header
+ success = true;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(out, indexOut);
+ }
+ }
+ this.indexOut = indexOut;
+ }
+
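+  // A minimal construction sketch (hypothetical caller; the concrete
+  // PostingsWriterBase and its arguments depend on the postings format,
+  // e.g. the IDVersionPostingsWriter in this package):
+  //
+  //   PostingsWriterBase pw = ...; // format-specific postings writer
+  //   FieldsConsumer consumer = new VersionBlockTreeTermsWriter(
+  //       writeState, pw, DEFAULT_MIN_BLOCK_SIZE, DEFAULT_MAX_BLOCK_SIZE);
+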
+ /** Writes the terms file header. */
+ private void writeHeader(IndexOutput out) throws IOException {
+ CodecUtil.writeHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT);
+ }
+
+ /** Writes the index file header. */
+ private void writeIndexHeader(IndexOutput out) throws IOException {
+ CodecUtil.writeHeader(out, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT);
+ }
+
+ /** Writes the terms file trailer. */
+ private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
+ out.writeLong(dirStart);
+ }
+
+ /** Writes the index file trailer. */
+ private void writeIndexTrailer(IndexOutput indexOut, long dirStart) throws IOException {
+ indexOut.writeLong(dirStart);
+ }
+
+ @Override
+ public void write(Fields fields) throws IOException {
+
+ String lastField = null;
+ for(String field : fields) {
+ assert lastField == null || lastField.compareTo(field) < 0;
+ lastField = field;
+
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+
+ TermsEnum termsEnum = terms.iterator(null);
+
+ TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field));
+ BytesRef minTerm = null;
+ BytesRef maxTerm = new BytesRef();
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+ if (minTerm == null) {
+ minTerm = BytesRef.deepCopyOf(term);
+ }
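+        // Terms arrive in sorted order, so the last term copied here ends
+        // up as the field's max term: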
+ maxTerm.copyBytes(term);
+ termsWriter.write(term, termsEnum);
+ }
+
+ termsWriter.finish(minTerm, minTerm == null ? null : maxTerm);
+ }
+ }
+
+ static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) {
+ assert fp < (1L << 62);
+ return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0);
+ }
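+
+  // Sketch of the inverse decoding (what the reader side is expected to
+  // unpack from the vLong output):
+  //
+  //   long fp          = code >>> OUTPUT_FLAGS_NUM_BITS;
+  //   boolean hasTerms = (code & OUTPUT_FLAG_HAS_TERMS) != 0;
+  //   boolean isFloor  = (code & OUTPUT_FLAG_IS_FLOOR) != 0;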
+
+ private static class PendingEntry {
+ public final boolean isTerm;
+
+ protected PendingEntry(boolean isTerm) {
+ this.isTerm = isTerm;
+ }
+ }
+
+ private static final class PendingTerm extends PendingEntry {
+ public final BytesRef term;
+ // stats + metadata
+ public final BlockTermState state;
+
+ public PendingTerm(BytesRef term, BlockTermState state) {
+ super(true);
+ this.term = term;
+ this.state = state;
+ }
+
+ @Override
+ public String toString() {
+ return term.utf8ToString();
+ }
+ }
+
+ private static final class PendingBlock extends PendingEntry {
+ public final BytesRef prefix;
+ public final long fp;
+    public FST<Pair<BytesRef,Long>> index;
+    public List<FST<Pair<BytesRef,Long>>> subIndices;
+ public final boolean hasTerms;
+ public final boolean isFloor;
+ public final int floorLeadByte;
+ private final IntsRef scratchIntsRef = new IntsRef();
+ /** Max version for all terms in this block. */
+ private final long maxVersion;
+
+    public PendingBlock(BytesRef prefix, long maxVersion, long fp, boolean hasTerms, boolean isFloor, int floorLeadByte, List<FST<Pair<BytesRef,Long>>> subIndices) {
+ super(false);
+ this.prefix = prefix;
+ this.maxVersion = maxVersion;
+ this.fp = fp;
+ this.hasTerms = hasTerms;
+ this.isFloor = isFloor;
+ this.floorLeadByte = floorLeadByte;
+ this.subIndices = subIndices;
+ }
+
+ @Override
+ public String toString() {
+ return "BLOCK: " + prefix.utf8ToString();
+ }
+
+    public void compileIndex(List<PendingBlock> floorBlocks, RAMOutputStream scratchBytes) throws IOException {
+
+ assert (isFloor && floorBlocks != null && floorBlocks.size() != 0) || (!isFloor && floorBlocks == null): "isFloor=" + isFloor + " floorBlocks=" + floorBlocks;
+
+ assert scratchBytes.getFilePointer() == 0;
+
+ long maxVersionIndex = maxVersion;
+
+ // TODO: try writing the leading vLong in MSB order
+ // (opposite of what Lucene does today), for better
+ // outputs sharing in the FST
+ scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
+ if (isFloor) {
+ scratchBytes.writeVInt(floorBlocks.size());
+ for (PendingBlock sub : floorBlocks) {
+ assert sub.floorLeadByte != -1;
+ maxVersionIndex = Math.max(maxVersionIndex, sub.maxVersion);
+ //if (DEBUG) {
+ // System.out.println(" write floorLeadByte=" + Integer.toHexString(sub.floorLeadByte&0xff));
+ //}
+ scratchBytes.writeByte((byte) sub.floorLeadByte);
+ assert sub.fp > fp;
+ scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0));
+ }
+ }
+
+      final PairOutputs<BytesRef,Long> outputs = getFSTOutputs();
+      final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
+ 0, 0, true, false, Integer.MAX_VALUE,
+ outputs, null, false,
+ PackedInts.COMPACT, true, 15);
+ //if (DEBUG) {
+ // System.out.println(" compile index for prefix=" + prefix);
+ //}
+ //indexBuilder.DEBUG = false;
+ final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()];
+ assert bytes.length > 0;
+ scratchBytes.writeTo(bytes, 0);
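+      // The version is added inverted (Long.MAX_VALUE - maxVersionIndex):
+      // the FST shares the minimum common output across a prefix, and the
+      // minimum of the inverted values corresponds to the maximum version
+      // under that prefix, which is what a versioned lookup needs to check.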
+ indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), outputs.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex));
+ scratchBytes.reset();
+
+ // Copy over index for all sub-blocks
+
+ if (subIndices != null) {
+        for(FST<Pair<BytesRef,Long>> subIndex : subIndices) {
+ append(indexBuilder, subIndex);
+ }
+ }
+
+ if (floorBlocks != null) {
+ for (PendingBlock sub : floorBlocks) {
+ if (sub.subIndices != null) {
+            for(FST<Pair<BytesRef,Long>> subIndex : sub.subIndices) {
+ append(indexBuilder, subIndex);
+ }
+ }
+ sub.subIndices = null;
+ }
+ }
+
+ index = indexBuilder.finish();
+ subIndices = null;
+
+ /*
+ Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
+ Util.toDot(index, w, false, false);
+ System.out.println("SAVED to out.dot");
+ w.close();
+ */
+ }
+
+ // TODO: maybe we could add bulk-add method to
+ // Builder? Takes FST and unions it w/ current
+ // FST.
+    private void append(Builder<Pair<BytesRef,Long>> builder, FST<Pair<BytesRef,Long>> subIndex) throws IOException {
+      final BytesRefFSTEnum<Pair<BytesRef,Long>> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
+      BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> indexEnt;
+ while((indexEnt = subIndexEnum.next()) != null) {
+ //if (DEBUG) {
+ // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
+ //}
+ builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
+ }
+ }
+ }
+
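+  // Output type for the terms-index FST: a pair of the block's encoded
+  // bytes (file pointer, flags, and any floor data) and the inverted max
+  // version; the long half uses PositiveIntOutputs, so stored values must
+  // stay non-negative.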
+  static PairOutputs<BytesRef,Long> getFSTOutputs() {
+    return new PairOutputs<>(ByteSequenceOutputs.getSingleton(),
+ PositiveIntOutputs.getSingleton());
+ }
+
+ final RAMOutputStream scratchBytes = new RAMOutputStream();
+
+ class TermsWriter {
+ private final FieldInfo fieldInfo;
+ private final int longsSize;
+ private long numTerms;
+ final FixedBitSet docsSeen;
+ long sumTotalTermFreq;
+ long sumDocFreq;
+ long indexStartFP;
+
+ // Used only to partition terms into the block tree; we
+ // don't pull an FST from this builder:
+ private final NoOutputs noOutputs;
+    private final Builder<Object> blockBuilder;