From 491dc6a7165997b746422272b62b51fd36b4d292 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Thu, 15 May 2014 22:28:48 +0000 Subject: [PATCH] LUCENE-5675: initial fork of BT with versioning added git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5675@1595064 13f79535-47bb-0310-9956-ffa450edef68 --- .../idversion/IDVersionPostingsReader.java | 2 - .../idversion/IDVersionPostingsWriter.java | 22 +- .../idversion/IDVersionSegmentTermsEnum.java | 971 ++++++++++++++ .../IDVersionSegmentTermsEnumFrame.java | 746 +++++++++++ .../codecs/idversion/IDVersionTermState.java | 41 + .../VersionBlockTreeTermsReader.java | 319 +++++ .../VersionBlockTreeTermsWriter.java | 1192 +++++++++++++++++ .../codecs/idversion/VersionFieldReader.java | 163 +++ .../blocktree/BlockTreeTermsReader.java | 2 +- .../blocktree/BlockTreeTermsWriter.java | 14 +- 10 files changed, 3441 insertions(+), 31 deletions(-) create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionTermState.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionFieldReader.java diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java index 5bc8a640c29..26e300fed7b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java @@ -17,8 +17,6 @@ package org.apache.lucene.codecs.idversion; * limitations under the License. 
*/ -import static org.apache.lucene.codecs.idversion.IDVersionPostingsWriter.IDVersionTermState; - import java.io.IOException; import org.apache.lucene.codecs.BlockTermState; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java index d4fdb1fc0d8..e304dc982d7 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java @@ -43,26 +43,6 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase { private int lastPosition; private long lastVersion; - final static class IDVersionTermState extends BlockTermState { - long idVersion; - int docID; - - @Override - public IDVersionTermState clone() { - IDVersionTermState other = new IDVersionTermState(); - other.copyFrom(this); - return other; - } - - @Override - public void copyFrom(TermState _other) { - super.copyFrom(_other); - IDVersionTermState other = (IDVersionTermState) _other; - idVersion = other.idVersion; - docID = other.docID; - } - } - @Override public IDVersionTermState newTermState() { return new IDVersionTermState(); @@ -144,8 +124,8 @@ public final class IDVersionPostingsWriter extends PushPostingsWriterBase { @Override public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { IDVersionTermState state = (IDVersionTermState) _state; - // nocommit must send version up to FST somehow ... out.writeVInt(state.docID); + out.writeVLong(state.idVersion); } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java new file mode 100644 index 00000000000..6320438a06d --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java @@ -0,0 +1,971 @@ +package org.apache.lucene.codecs.idversion; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.PrintStream; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Outputs; +import org.apache.lucene.util.fst.PairOutputs.Pair; +import org.apache.lucene.util.fst.PairOutputs; +import org.apache.lucene.util.fst.Util; + +/** Iterates through terms in this field */ +final class IDVersionSegmentTermsEnum extends TermsEnum { + + final static Outputs> fstOutputs = VersionBlockTreeTermsWriter.getFSTOutputs(); + final static Pair NO_OUTPUT = fstOutputs.getNoOutput(); + + // Lazy init: + IndexInput in; + + private IDVersionSegmentTermsEnumFrame[] stack; + private final IDVersionSegmentTermsEnumFrame staticFrame; + IDVersionSegmentTermsEnumFrame currentFrame; + boolean termExists; + final VersionFieldReader fr; + + // nocommit make this public "for casting" and add a getVersion method? + + private int targetBeforeCurrentLength; + + private final ByteArrayDataInput scratchReader = new ByteArrayDataInput(); + + // What prefix of the current term was present in the index: + private int validIndexPrefix; + + // assert only: + private boolean eof; + + final BytesRef term = new BytesRef(); + private final FST.BytesReader fstReader; + + @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc>[] arcs = + new FST.Arc[1]; + + public IDVersionSegmentTermsEnum(VersionFieldReader fr) throws IOException { + this.fr = fr; + + //if (DEBUG) System.out.println("BTTR.init seg=" + segment); + stack = new IDVersionSegmentTermsEnumFrame[0]; + + // Used to hold seek by TermState, or cached seek + staticFrame = new IDVersionSegmentTermsEnumFrame(this, -1); + + if (fr.index == null) { + fstReader = null; + } else { + fstReader = fr.index.getBytesReader(); + } + + // Init w/ root block; don't use index since it may + // not (and need not) have been loaded + for(int arcIdx=0;arcIdx(); + } + + currentFrame = staticFrame; + final FST.Arc> arc; + if (fr.index != null) { + arc = fr.index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! 
+ assert arc.isFinal(); + } else { + arc = null; + } + currentFrame = staticFrame; + //currentFrame = pushFrame(arc, rootCode, 0); + //currentFrame.loadBlock(); + validIndexPrefix = 0; + // if (DEBUG) { + // System.out.println("init frame state " + currentFrame.ord); + // printSeekState(); + // } + + //System.out.println(); + // computeBlockStats().print(System.out); + } + + // Not private to avoid synthetic access$NNN methods + void initIndexInput() { + if (this.in == null) { + this.in = fr.parent.in.clone(); + } + } + + private IDVersionSegmentTermsEnumFrame getFrame(int ord) throws IOException { + if (ord >= stack.length) { + final IDVersionSegmentTermsEnumFrame[] next = new IDVersionSegmentTermsEnumFrame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, next, 0, stack.length); + for(int stackOrd=stack.length;stackOrd> getArc(int ord) { + if (ord >= arcs.length) { + @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc>[] next = + new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, next, 0, arcs.length); + for(int arcOrd=arcs.length;arcOrd(); + } + arcs = next; + } + return arcs[ord]; + } + + // Pushes a frame we seek'd to + IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc> arc, Pair frameData, int length) throws IOException { + scratchReader.reset(frameData.output1.bytes, frameData.output1.offset, frameData.output1.length); + final long code = scratchReader.readVLong(); + final long fpSeek = code >>> VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; + final IDVersionSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord); + f.hasTerms = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0; + f.hasTermsOrig = f.hasTerms; + f.isFloor = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0; + if (f.isFloor) { + f.setFloorData(scratchReader, frameData.output1); + } + pushFrame(arc, fpSeek, length); + + return f; + } + + // Pushes next'd frame or seek'd frame; we later + // lazy-load the frame only when needed + IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc> arc, long fp, int length) throws IOException { + final IDVersionSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord); + f.arc = arc; + if (f.fpOrig == fp && f.nextEnt != -1) { + //if (DEBUG) System.out.println(" push reused frame ord=" + f.ord + " fp=" + f.fp + " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + term.length + " vs prefix=" + f.prefix); + if (f.prefix > targetBeforeCurrentLength) { + f.rewind(); + } else { + // if (DEBUG) { + // System.out.println(" skip rewind!"); + // } + } + assert length == f.prefix; + } else { + f.nextEnt = -1; + f.prefix = length; + f.state.termBlockOrd = 0; + f.fpOrig = f.fp = fp; + f.lastSubFP = -1; + // if (DEBUG) { + // final int sav = term.length; + // term.length = length; + // System.out.println(" push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " pref=" + brToString(term)); + // term.length = sav; + // } + } + + return f; + } + + // asserts only + private boolean clearEOF() { + eof = false; + return true; + } + + // asserts only + private boolean setEOF() { + eof = true; + return true; + } + + // nocommit we need a seekExact(BytesRef target, long minVersion) API? 
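The versioned seekExact below returns false either when the ID is absent or when its stored version is below minIDVersion, so a caller can use it as a fast optimistic-concurrency check. Note the block-level early exit compares Long.MAX_VALUE - output.output2 against minIDVersion: per the adjacent comment, the long channel of the FST output carries an inverted version, so the value accumulated along a prefix reflects the maximum version of any term below that prefix. A minimal usage sketch (hypothetical caller code, not part of this patch; it assumes the enum is already obtained for the ID field and that the caller can see this package-private class):

    // Decide whether an incoming (id, newVersion) pair must be indexed.
    // seekExact returns false when the ID is absent *or* its recorded
    // version is below minIDVersion; in both cases the incoming version
    // wins and the document should be (re)indexed.
    static boolean shouldIndex(IDVersionSegmentTermsEnum termsEnum,
                               BytesRef id, long newVersion) throws IOException {
      return termsEnum.seekExact(id, newVersion) == false;
    }
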
+ + @Override + public boolean seekExact(final BytesRef target) throws IOException { + return seekExact(target, 0); + } + + /** Returns false if the term deos not exist, or it exists but its version is < minIDVersion. */ + public boolean seekExact(final BytesRef target, long minIDVersion) throws IOException { + + if (fr.index == null) { + throw new IllegalStateException("terms index was not loaded"); + } + + if (term.bytes.length <= target.length) { + term.bytes = ArrayUtil.grow(term.bytes, 1+target.length); + } + + assert clearEOF(); + + // if (DEBUG) { + // System.out.println("\nBTTR.seekExact seg=" + segment + " target=" + fieldInfo.name + ":" + brToString(target) + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix); + // printSeekState(); + // } + + FST.Arc> arc; + int targetUpto; + Pair output; + + targetBeforeCurrentLength = currentFrame.ord; + + if (currentFrame != staticFrame) { + + // We are already seek'd; find the common + // prefix of new seek term vs current term and + // re-use the corresponding seek state. For + // example, if app first seeks to foobar, then + // seeks to foobaz, we can re-use the seek state + // for the first 5 bytes. + + // if (DEBUG) { + // System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); + // } + + arc = arcs[0]; + assert arc.isFinal(); + output = arc.output; + targetUpto = 0; + + IDVersionSegmentTermsEnumFrame lastFrame = stack[0]; + assert validIndexPrefix <= term.length; + + final int targetLimit = Math.min(target.length, validIndexPrefix); + + int cmp = 0; + + // TODO: reverse vLong byte order for better FST + // prefix output sharing + + // First compare up to valid seek frames: + while (targetUpto < targetLimit) { + cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + // if (DEBUG) { + // System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output); + // } + if (cmp != 0) { + break; + } + arc = arcs[1+targetUpto]; + //if (arc.label != (target.bytes[target.offset + targetUpto] & 0xFF)) { + //System.out.println("FAIL: arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF)); + //} + assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + if (arc.output != NO_OUTPUT) { + output = fstOutputs.add(output, arc.output); + } + if (arc.isFinal()) { + lastFrame = stack[1+lastFrame.ord]; + } + targetUpto++; + } + + if (cmp == 0) { + final int targetUptoMid = targetUpto; + + // Second compare the rest of the term, but + // don't save arc/output/frame; we only do this + // to find out if the target term is before, + // equal or after the current term + final int targetLimit2 = Math.min(target.length, term.length); + while (targetUpto < targetLimit2) { + cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + // if (DEBUG) { + // System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); + // } + if (cmp != 0) { + break; + } + targetUpto++; + 
} + + if (cmp == 0) { + cmp = term.length - target.length; + } + targetUpto = targetUptoMid; + } + + if (cmp < 0) { + // Common case: target term is after current + // term, ie, app is seeking multiple terms + // in sorted order + // if (DEBUG) { + // System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); frame.ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + + } else if (cmp > 0) { + // Uncommon case: target term + // is before current term; this means we can + // keep the currentFrame but we must rewind it + // (so we scan from the start) + targetBeforeCurrentLength = 0; + // if (DEBUG) { + // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + currentFrame.rewind(); + } else { + // Target is exactly the same as current term + assert term.length == target.length; + if (termExists) { + // if (DEBUG) { + // System.out.println(" target is same as current; return true"); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" target is same as current but term doesn't exist"); + // } + } + //validIndexPrefix = currentFrame.depth; + //term.length = target.length; + //return termExists; + } + + } else { + + targetBeforeCurrentLength = -1; + arc = fr.index.getFirstArc(arcs[0]); + + // Empty string prefix must have an output (block) in the index! + assert arc.isFinal(); + assert arc.output != null; + + // if (DEBUG) { + // System.out.println(" no seek state; push root frame"); + // } + + output = arc.output; + + currentFrame = staticFrame; + + //term.length = 0; + targetUpto = 0; + currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0); + } + + // if (DEBUG) { + // System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength); + // } + + while (targetUpto < target.length) { + + final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; + + final FST.Arc> nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader); + + if (nextArc == null) { + + // Index is exhausted + // if (DEBUG) { + // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel)); + // } + + validIndexPrefix = currentFrame.prefix; + //validIndexPrefix = targetUpto; + + currentFrame.scanToFloorFrame(target); + + if (!currentFrame.hasTerms) { + termExists = false; + term.bytes[targetUpto] = (byte) targetLabel; + term.length = 1+targetUpto; + // if (DEBUG) { + // System.out.println(" FAST NOT_FOUND term=" + brToString(term)); + // } + return false; + } + + if ((Long.MAX_VALUE-output.output2) < minIDVersion) { + // The max version for all terms in this block is lower than the minVersion + return false; + } + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, true); + if (result == SeekStatus.FOUND) { + // if (DEBUG) { + // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); + // } + + currentFrame.decodeMetaData(); + if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) { + // The max version for this term is lower than the minVersion + return false; + } + return true; + } else { + // if (DEBUG) { + // System.out.println(" got " + result + "; return NOT_FOUND term=" + brToString(term)); + // } + return false; + } + } else { + // Follow this arc + arc = 
nextArc; + term.bytes[targetUpto] = (byte) targetLabel; + // Aggregate output as we go: + assert arc.output != null; + if (arc.output != NO_OUTPUT) { + output = fstOutputs.add(output, arc.output); + } + + // if (DEBUG) { + // System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); + // } + targetUpto++; + + if (arc.isFinal()) { + //if (DEBUG) System.out.println(" arc is final!"); + currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto); + //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms); + } + } + } + + //validIndexPrefix = targetUpto; + validIndexPrefix = currentFrame.prefix; + + currentFrame.scanToFloorFrame(target); + + // Target term is entirely contained in the index: + if (!currentFrame.hasTerms) { + termExists = false; + term.length = targetUpto; + // if (DEBUG) { + // System.out.println(" FAST NOT_FOUND term=" + brToString(term)); + // } + return false; + } + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, true); + if (result == SeekStatus.FOUND) { + // if (DEBUG) { + // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" got result " + result + "; return NOT_FOUND term=" + term.utf8ToString()); + // } + + return false; + } + } + + @Override + public SeekStatus seekCeil(final BytesRef target) throws IOException { + if (fr.index == null) { + throw new IllegalStateException("terms index was not loaded"); + } + + if (term.bytes.length <= target.length) { + term.bytes = ArrayUtil.grow(term.bytes, 1+target.length); + } + + assert clearEOF(); + + //if (DEBUG) { + //System.out.println("\nBTTR.seekCeil seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix); + //printSeekState(); + //} + + FST.Arc> arc; + int targetUpto; + Pair output; + + targetBeforeCurrentLength = currentFrame.ord; + + if (currentFrame != staticFrame) { + + // We are already seek'd; find the common + // prefix of new seek term vs current term and + // re-use the corresponding seek state. For + // example, if app first seeks to foobar, then + // seeks to foobaz, we can re-use the seek state + // for the first 5 bytes. 
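The seek-state reuse described in the comment above amounts to computing a common byte prefix: the number of stack frames that survive a new seek is bounded by how many leading bytes the previous term shares with the new target (and by validIndexPrefix). An illustrative helper, not part of the patch:

    // Counts leading bytes shared by the previous term and the new target;
    // e.g. "foobar" -> "foobaz" shares 5 bytes, so up to 5 positions of
    // seek state (capped by validIndexPrefix) can be reused.
    static int commonPrefixLength(BytesRef prev, BytesRef target) {
      final int limit = Math.min(prev.length, target.length);
      int upto = 0;
      while (upto < limit
             && prev.bytes[prev.offset + upto] == target.bytes[target.offset + upto]) {
        upto++;
      }
      return upto;
    }
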
+ + //if (DEBUG) { + //System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); + //} + + arc = arcs[0]; + assert arc.isFinal(); + output = arc.output; + targetUpto = 0; + + IDVersionSegmentTermsEnumFrame lastFrame = stack[0]; + assert validIndexPrefix <= term.length; + + final int targetLimit = Math.min(target.length, validIndexPrefix); + + int cmp = 0; + + // TOOD: we should write our vLong backwards (MSB + // first) to get better sharing from the FST + + // First compare up to valid seek frames: + while (targetUpto < targetLimit) { + cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + //if (DEBUG) { + //System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output); + //} + if (cmp != 0) { + break; + } + arc = arcs[1+targetUpto]; + assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + // TOOD: we could save the outputs in local + // byte[][] instead of making new objs ever + // seek; but, often the FST doesn't have any + // shared bytes (but this could change if we + // reverse vLong byte order) + if (arc.output != NO_OUTPUT) { + output = fstOutputs.add(output, arc.output); + } + if (arc.isFinal()) { + lastFrame = stack[1+lastFrame.ord]; + } + targetUpto++; + } + + + if (cmp == 0) { + final int targetUptoMid = targetUpto; + // Second compare the rest of the term, but + // don't save arc/output/frame: + final int targetLimit2 = Math.min(target.length, term.length); + while (targetUpto < targetLimit2) { + cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + //if (DEBUG) { + //System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); + //} + if (cmp != 0) { + break; + } + targetUpto++; + } + + if (cmp == 0) { + cmp = term.length - target.length; + } + targetUpto = targetUptoMid; + } + + if (cmp < 0) { + // Common case: target term is after current + // term, ie, app is seeking multiple terms + // in sorted order + //if (DEBUG) { + //System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); clear frame.scanned ord=" + lastFrame.ord); + //} + currentFrame = lastFrame; + + } else if (cmp > 0) { + // Uncommon case: target term + // is before current term; this means we can + // keep the currentFrame but we must rewind it + // (so we scan from the start) + targetBeforeCurrentLength = 0; + //if (DEBUG) { + //System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord); + //} + currentFrame = lastFrame; + currentFrame.rewind(); + } else { + // Target is exactly the same as current term + assert term.length == target.length; + if (termExists) { + //if (DEBUG) { + //System.out.println(" target is same as current; return FOUND"); + //} + return SeekStatus.FOUND; + } else { + //if (DEBUG) { + //System.out.println(" target is same as current but term doesn't exist"); + //} + } + } + + } else { + + targetBeforeCurrentLength = -1; + arc = fr.index.getFirstArc(arcs[0]); + + // Empty string prefix must 
have an output (block) in the index! + assert arc.isFinal(); + assert arc.output != null; + + //if (DEBUG) { + //System.out.println(" no seek state; push root frame"); + //} + + output = arc.output; + + currentFrame = staticFrame; + + //term.length = 0; + targetUpto = 0; + currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0); + } + + //if (DEBUG) { + //System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord+1=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength); + //} + + while (targetUpto < target.length) { + + final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; + + final FST.Arc> nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader); + + if (nextArc == null) { + + // Index is exhausted + // if (DEBUG) { + // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel)); + // } + + validIndexPrefix = currentFrame.prefix; + //validIndexPrefix = targetUpto; + + currentFrame.scanToFloorFrame(target); + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, false); + if (result == SeekStatus.END) { + term.copyBytes(target); + termExists = false; + + if (next() != null) { + //if (DEBUG) { + //System.out.println(" return NOT_FOUND term=" + brToString(term) + " " + term); + //} + return SeekStatus.NOT_FOUND; + } else { + //if (DEBUG) { + //System.out.println(" return END"); + //} + return SeekStatus.END; + } + } else { + //if (DEBUG) { + //System.out.println(" return " + result + " term=" + brToString(term) + " " + term); + //} + return result; + } + } else { + // Follow this arc + term.bytes[targetUpto] = (byte) targetLabel; + arc = nextArc; + // Aggregate output as we go: + assert arc.output != null; + if (arc.output != NO_OUTPUT) { + output = fstOutputs.add(output, arc.output); + } + + //if (DEBUG) { + //System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); + //} + targetUpto++; + + if (arc.isFinal()) { + //if (DEBUG) System.out.println(" arc is final!"); + currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto); + //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms); + } + } + } + + //validIndexPrefix = targetUpto; + validIndexPrefix = currentFrame.prefix; + + currentFrame.scanToFloorFrame(target); + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, false); + + if (result == SeekStatus.END) { + term.copyBytes(target); + termExists = false; + if (next() != null) { + //if (DEBUG) { + //System.out.println(" return NOT_FOUND term=" + term.utf8ToString() + " " + term); + //} + return SeekStatus.NOT_FOUND; + } else { + //if (DEBUG) { + //System.out.println(" return END"); + //} + return SeekStatus.END; + } + } else { + return result; + } + } + + @SuppressWarnings("unused") + private void printSeekState(PrintStream out) throws IOException { + if (currentFrame == staticFrame) { + out.println(" no prior seek"); + } else { + out.println(" prior seek state:"); + int ord = 0; + boolean isSeekFrame = true; + while(true) { + IDVersionSegmentTermsEnumFrame f = getFrame(ord); + assert f != null; + final BytesRef prefix = new BytesRef(term.bytes, 0, f.prefix); + if (f.nextEnt == -1) { + out.println(" frame " + (isSeekFrame ? 
"(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp< output = Util.get(fr.index, prefix); + if (output == null) { + out.println(" broken seek state: prefix is not final in index"); + throw new RuntimeException("seek state is broken"); + } else if (isSeekFrame && !f.isFloor) { + final ByteArrayDataInput reader = new ByteArrayDataInput(output.output1.bytes, output.output1.offset, output.output1.length); + final long codeOrig = reader.readVLong(); + final long code = (f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) | (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0); + if (codeOrig != code) { + out.println(" broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code); + throw new RuntimeException("seek state is broken"); + } + } + } + if (f == currentFrame) { + break; + } + if (f.prefix == validIndexPrefix) { + isSeekFrame = false; + } + ord++; + } + } + } + + /* Decodes only the term bytes of the next term. If caller then asks for + metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily) + decode all metadata up to the current term. */ + @Override + public BytesRef next() throws IOException { + + if (in == null) { + // Fresh TermsEnum; seek to first term: + final FST.Arc> arc; + if (fr.index != null) { + arc = fr.index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + currentFrame = pushFrame(arc, fr.rootCode, 0); + currentFrame.loadBlock(); + } + + targetBeforeCurrentLength = currentFrame.ord; + + assert !eof; + //if (DEBUG) { + //System.out.println("\nBTTR.next seg=" + segment + " term=" + brToString(term) + " termExists?=" + termExists + " field=" + fieldInfo.name + " termBlockOrd=" + currentFrame.state.termBlockOrd + " validIndexPrefix=" + validIndexPrefix); + //printSeekState(); + //} + + if (currentFrame == staticFrame) { + // If seek was previously called and the term was + // cached, or seek(TermState) was called, usually + // caller is just going to pull a D/&PEnum or get + // docFreq, etc. 
But, if they then call next(), + // this method catches up all internal state so next() + // works properly: + //if (DEBUG) System.out.println(" re-seek to pending term=" + term.utf8ToString() + " " + term); + final boolean result = seekExact(term); + assert result; + } + + // Pop finished blocks + while (currentFrame.nextEnt == currentFrame.entCount) { + if (!currentFrame.isLastInFloor) { + currentFrame.loadNextFloorBlock(); + } else { + //if (DEBUG) System.out.println(" pop frame"); + if (currentFrame.ord == 0) { + //if (DEBUG) System.out.println(" return null"); + assert setEOF(); + term.length = 0; + validIndexPrefix = 0; + currentFrame.rewind(); + termExists = false; + return null; + } + final long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord-1]; + + if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) { + // We popped into a frame that's not loaded + // yet or not scan'd to the right entry + currentFrame.scanToFloorFrame(term); + currentFrame.loadBlock(); + currentFrame.scanToSubBlock(lastFP); + } + + // Note that the seek state (last seek) has been + // invalidated beyond this depth + validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix); + //if (DEBUG) { + //System.out.println(" reset validIndexPrefix=" + validIndexPrefix); + //} + } + } + + while(true) { + if (currentFrame.next()) { + // Push to new block: + //if (DEBUG) System.out.println(" push frame"); + currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length); + // This is a "next" frame -- even if it's + // floor'd we must pretend it isn't so we don't + // try to scan to the right floor frame: + currentFrame.isFloor = false; + //currentFrame.hasTerms = true; + currentFrame.loadBlock(); + } else { + //if (DEBUG) System.out.println(" return term=" + term.utf8ToString() + " " + term + " currentFrame.ord=" + currentFrame.ord); + return term; + } + } + } + + @Override + public BytesRef term() { + assert !eof; + return term; + } + + @Override + public int docFreq() throws IOException { + assert !eof; + //if (DEBUG) System.out.println("BTR.docFreq"); + currentFrame.decodeMetaData(); + //if (DEBUG) System.out.println(" return " + currentFrame.state.docFreq); + return currentFrame.state.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + assert !eof; + currentFrame.decodeMetaData(); + return currentFrame.state.totalTermFreq; + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse, int flags) throws IOException { + assert !eof; + //if (DEBUG) { + //System.out.println("BTTR.docs seg=" + segment); + //} + currentFrame.decodeMetaData(); + //if (DEBUG) { + //System.out.println(" state=" + currentFrame.state); + //} + return fr.parent.postingsReader.docs(fr.fieldInfo, currentFrame.state, skipDocs, reuse, flags); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { + if (fr.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { + // Positions were not indexed: + return null; + } + + assert !eof; + currentFrame.decodeMetaData(); + return fr.parent.postingsReader.docsAndPositions(fr.fieldInfo, currentFrame.state, skipDocs, reuse, flags); + } + + @Override + public void seekExact(BytesRef target, TermState otherState) { + // if (DEBUG) { + // System.out.println("BTTR.seekExact termState seg=" + segment + " target=" + target.utf8ToString() + " " + target + " state=" + otherState); + // } + assert clearEOF(); + if 
(target.compareTo(term) != 0 || !termExists) { + assert otherState != null && otherState instanceof BlockTermState; + currentFrame = staticFrame; + currentFrame.state.copyFrom(otherState); + term.copyBytes(target); + currentFrame.metaDataUpto = currentFrame.getTermBlockOrd(); + assert currentFrame.metaDataUpto > 0; + validIndexPrefix = 0; + } else { + // if (DEBUG) { + // System.out.println(" skip seek: already on target state=" + currentFrame.state); + // } + } + } + + @Override + public TermState termState() throws IOException { + assert !eof; + currentFrame.decodeMetaData(); + TermState ts = currentFrame.state.clone(); + //if (DEBUG) System.out.println("BTTR.termState seg=" + segment + " state=" + ts); + return ts; + } + + @Override + public void seekExact(long ord) { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() { + throw new UnsupportedOperationException(); + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java new file mode 100644 index 00000000000..4fde1466a42 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java @@ -0,0 +1,746 @@ +package org.apache.lucene.codecs.idversion; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.PairOutputs.Pair; + +final class IDVersionSegmentTermsEnumFrame { + // Our index in stack[]: + final int ord; + + boolean hasTerms; + boolean hasTermsOrig; + boolean isFloor; + + FST.Arc> arc; + + // File pointer where this block was loaded from + long fp; + long fpOrig; + long fpEnd; + + byte[] suffixBytes = new byte[128]; + final ByteArrayDataInput suffixesReader = new ByteArrayDataInput(); + + byte[] statBytes = new byte[64]; + final ByteArrayDataInput statsReader = new ByteArrayDataInput(); + + byte[] floorData = new byte[32]; + final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); + + // Length of prefix shared by all terms in this block + int prefix; + + // Number of entries (term or sub-block) in this block + int entCount; + + // Which term we will next read, or -1 if the block + // isn't loaded yet + int nextEnt; + + // True if this block is either not a floor block, + // or, it's the last sub-block of a floor block + boolean isLastInFloor; + + // True if all entries are terms + boolean isLeafBlock; + + long lastSubFP; + + int nextFloorLabel; + int numFollowFloorBlocks; + + // Next term to decode metaData; we decode metaData + // lazily so that scanning to find the matching term is + // fast and only if you find a match and app wants the + // stats or docs/positions enums, will we decode the + // metaData + int metaDataUpto; + + final BlockTermState state; + + // metadata buffer, holding monotonic values + public long[] longs; + // metadata buffer, holding general values + public byte[] bytes; + ByteArrayDataInput bytesReader; + + private final IDVersionSegmentTermsEnum ste; + + public IDVersionSegmentTermsEnumFrame(IDVersionSegmentTermsEnum ste, int ord) throws IOException { + this.ste = ste; + this.ord = ord; + this.state = ste.fr.parent.postingsReader.newTermState(); + this.state.totalTermFreq = -1; + this.longs = new long[ste.fr.longsSize]; + } + + public void setFloorData(ByteArrayDataInput in, BytesRef source) { + final int numBytes = source.length - (in.getPosition() - source.offset); + if (numBytes > floorData.length) { + floorData = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + System.arraycopy(source.bytes, source.offset+in.getPosition(), floorData, 0, numBytes); + floorDataReader.reset(floorData, 0, numBytes); + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + //if (DEBUG) { + //System.out.println(" setFloorData fpOrig=" + fpOrig + " bytes=" + new BytesRef(source.bytes, source.offset + in.getPosition(), numBytes) + " numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + toHex(nextFloorLabel)); + //} + } + + public int getTermBlockOrd() { + return isLeafBlock ? 
nextEnt : state.termBlockOrd; + } + + void loadNextFloorBlock() throws IOException { + //if (DEBUG) { + //System.out.println(" loadNextFloorBlock fp=" + fp + " fpEnd=" + fpEnd); + //} + assert arc == null || isFloor: "arc=" + arc + " isFloor=" + isFloor; + fp = fpEnd; + nextEnt = -1; + loadBlock(); + } + + /* Does initial decode of next block of terms; this + doesn't actually decode the docFreq, totalTermFreq, + postings details (frq/prx offset, etc.) metadata; + it just loads them as byte[] blobs which are then + decoded on-demand if the metadata is ever requested + for any term in this block. This enables terms-only + intensive consumes (eg certain MTQs, respelling) to + not pay the price of decoding metadata they won't + use. */ + void loadBlock() throws IOException { + + // Clone the IndexInput lazily, so that consumers + // that just pull a TermsEnum to + // seekExact(TermState) don't pay this cost: + ste.initIndexInput(); + + if (nextEnt != -1) { + // Already loaded + return; + } + //System.out.println("blc=" + blockLoadCount); + + ste.in.seek(fp); + int code = ste.in.readVInt(); + entCount = code >>> 1; + assert entCount > 0; + isLastInFloor = (code & 1) != 0; + assert arc == null || (isLastInFloor || isFloor); + + // TODO: if suffixes were stored in random-access + // array structure, then we could do binary search + // instead of linear scan to find target term; eg + // we could have simple array of offsets + + // term suffixes: + code = ste.in.readVInt(); + isLeafBlock = (code & 1) != 0; + int numBytes = code >>> 1; + if (suffixBytes.length < numBytes) { + suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ste.in.readBytes(suffixBytes, 0, numBytes); + suffixesReader.reset(suffixBytes, 0, numBytes); + + /*if (DEBUG) { + if (arc == null) { + System.out.println(" loadBlock (next) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock); + } else { + System.out.println(" loadBlock (seek) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock); + } + }*/ + + // stats + numBytes = ste.in.readVInt(); + if (statBytes.length < numBytes) { + statBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ste.in.readBytes(statBytes, 0, numBytes); + statsReader.reset(statBytes, 0, numBytes); + metaDataUpto = 0; + + state.termBlockOrd = 0; + nextEnt = 0; + lastSubFP = -1; + + // TODO: we could skip this if !hasTerms; but + // that's rare so won't help much + // metadata + numBytes = ste.in.readVInt(); + if (bytes == null) { + bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + bytesReader = new ByteArrayDataInput(); + } else if (bytes.length < numBytes) { + bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ste.in.readBytes(bytes, 0, numBytes); + bytesReader.reset(bytes, 0, numBytes); + + + // Sub-blocks of a single floor block are always + // written one after another -- tail recurse: + fpEnd = ste.in.getFilePointer(); + // if (DEBUG) { + // System.out.println(" fpEnd=" + fpEnd); + // } + } + + void rewind() { + + // Force reload: + fp = fpOrig; + nextEnt = -1; + hasTerms = hasTermsOrig; + if (isFloor) { + floorDataReader.rewind(); + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + } + + /* + //System.out.println("rewind"); + // Keeps the block loaded, but rewinds its state: + if (nextEnt > 0 || fp != fpOrig) { + if (DEBUG) { + 
System.out.println(" rewind frame ord=" + ord + " fpOrig=" + fpOrig + " fp=" + fp + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " nextEnt=" + nextEnt + " prefixLen=" + prefix); + } + if (fp != fpOrig) { + fp = fpOrig; + nextEnt = -1; + } else { + nextEnt = 0; + } + hasTerms = hasTermsOrig; + if (isFloor) { + floorDataReader.rewind(); + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + } + assert suffixBytes != null; + suffixesReader.rewind(); + assert statBytes != null; + statsReader.rewind(); + metaDataUpto = 0; + state.termBlockOrd = 0; + // TODO: skip this if !hasTerms? Then postings + // impl wouldn't have to write useless 0 byte + postingsReader.resetTermsBlock(fieldInfo, state); + lastSubFP = -1; + } else if (DEBUG) { + System.out.println(" skip rewind fp=" + fp + " fpOrig=" + fpOrig + " nextEnt=" + nextEnt + " ord=" + ord); + } + */ + } + + public boolean next() { + return isLeafBlock ? nextLeaf() : nextNonLeaf(); + } + + // Decodes next entry; returns true if it's a sub-block + public boolean nextLeaf() { + //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + suffix = suffixesReader.readVInt(); + startBytePos = suffixesReader.getPosition(); + ste.term.length = prefix + suffix; + if (ste.term.bytes.length < ste.term.length) { + ste.term.grow(ste.term.length); + } + suffixesReader.readBytes(ste.term.bytes, prefix, suffix); + // A normal term + ste.termExists = true; + return false; + } + + public boolean nextNonLeaf() { + //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + final int code = suffixesReader.readVInt(); + suffix = code >>> 1; + startBytePos = suffixesReader.getPosition(); + ste.term.length = prefix + suffix; + if (ste.term.bytes.length < ste.term.length) { + ste.term.grow(ste.term.length); + } + suffixesReader.readBytes(ste.term.bytes, prefix, suffix); + if ((code & 1) == 0) { + // A normal term + ste.termExists = true; + subCode = 0; + state.termBlockOrd++; + return false; + } else { + // A sub-block; make sub-FP absolute: + ste.termExists = false; + subCode = suffixesReader.readVLong(); + lastSubFP = fp - subCode; + //if (DEBUG) { + //System.out.println(" lastSubFP=" + lastSubFP); + //} + return true; + } + } + + // TODO: make this array'd so we can do bin search? + // likely not worth it? 
need to measure how many + // floor blocks we "typically" get + public void scanToFloorFrame(BytesRef target) { + + if (!isFloor || target.length <= prefix) { + // if (DEBUG) { + // System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + target.length + " vs prefix=" + prefix); + // } + return; + } + + final int targetLabel = target.bytes[target.offset + prefix] & 0xFF; + + // if (DEBUG) { + // System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + toHex(targetLabel) + " vs nextFloorLabel=" + toHex(nextFloorLabel) + " numFollowFloorBlocks=" + numFollowFloorBlocks); + // } + + if (targetLabel < nextFloorLabel) { + // if (DEBUG) { + // System.out.println(" already on correct block"); + // } + return; + } + + assert numFollowFloorBlocks != 0; + + long newFP = fpOrig; + while (true) { + final long code = floorDataReader.readVLong(); + newFP = fpOrig + (code >>> 1); + hasTerms = (code & 1) != 0; + // if (DEBUG) { + // System.out.println(" label=" + toHex(nextFloorLabel) + " fp=" + newFP + " hasTerms?=" + hasTerms + " numFollowFloor=" + numFollowFloorBlocks); + // } + + isLastInFloor = numFollowFloorBlocks == 1; + numFollowFloorBlocks--; + + if (isLastInFloor) { + nextFloorLabel = 256; + // if (DEBUG) { + // System.out.println(" stop! last block nextFloorLabel=" + toHex(nextFloorLabel)); + // } + break; + } else { + nextFloorLabel = floorDataReader.readByte() & 0xff; + if (targetLabel < nextFloorLabel) { + // if (DEBUG) { + // System.out.println(" stop! nextFloorLabel=" + toHex(nextFloorLabel)); + // } + break; + } + } + } + + if (newFP != fp) { + // Force re-load of the block: + // if (DEBUG) { + // System.out.println(" force switch to fp=" + newFP + " oldFP=" + fp); + // } + nextEnt = -1; + fp = newFP; + } else { + // if (DEBUG) { + // System.out.println(" stay on same fp=" + newFP); + // } + } + } + + public void decodeMetaData() throws IOException { + + //if (DEBUG) System.out.println("\nBTTR.decodeMetadata seg=" + segment + " mdUpto=" + metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd); + + // lazily catch up on metadata decode: + final int limit = getTermBlockOrd(); + boolean absolute = metaDataUpto == 0; + assert limit > 0; + + // TODO: better API would be "jump straight to term=N"??? + while (metaDataUpto < limit) { + + // TODO: we could make "tiers" of metadata, ie, + // decode docFreq/totalTF but don't decode postings + // metadata; this way caller could get + // docFreq/totalTF w/o paying decode cost for + // postings + + // TODO: if docFreq were bulk decoded we could + // just skipN here: + + // stats + state.docFreq = statsReader.readVInt(); + //if (DEBUG) System.out.println(" dF=" + state.docFreq); + if (ste.fr.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { + state.totalTermFreq = state.docFreq + statsReader.readVLong(); + //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); + } + // metadata + for (int i = 0; i < ste.fr.longsSize; i++) { + longs[i] = bytesReader.readVLong(); + } + ste.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ste.fr.fieldInfo, state, absolute); + + metaDataUpto++; + absolute = false; + } + state.termBlockOrd = metaDataUpto; + } + + // Used only by assert + private boolean prefixMatches(BytesRef target) { + for(int bytePos=0;bytePosNOTE: this terms dictionary supports + * min/maxItemsPerBlock during indexing to control how + * much memory the terms index uses.

+ *
+ * <p>
The data structure used by this implementation is very + * similar to a burst trie + * (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499), + * but with added logic to break up too-large blocks of all + * terms sharing a given prefix into smaller ones.

+ *
+ * <p>
Use {@link org.apache.lucene.index.CheckIndex} with the -verbose + * option to see summary statistics on the blocks in the + * dictionary. + * + * See {@link BlockTreeTermsWriter}. + * + * @lucene.experimental + */ + +final class VersionBlockTreeTermsReader extends FieldsProducer { + + // Open input to the main terms dict file (_X.tiv) + final IndexInput in; + + //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + // Reads the terms dict entries, to gather state to + // produce DocsEnum on demand + final PostingsReaderBase postingsReader; + + private final TreeMap fields = new TreeMap<>(); + + /** File offset where the directory starts in the terms file. */ + private long dirOffset; + + /** File offset where the directory starts in the index file. */ + private long indexDirOffset; + + final String segment; + + private final int version; + + /** Sole constructor. */ + public VersionBlockTreeTermsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo info, + PostingsReaderBase postingsReader, IOContext ioContext, + String segmentSuffix) + throws IOException { + + this.postingsReader = postingsReader; + + this.segment = info.name; + in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, VersionBlockTreeTermsWriter.TERMS_EXTENSION), + ioContext); + + boolean success = false; + IndexInput indexIn = null; + + try { + version = readHeader(in); + indexIn = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, VersionBlockTreeTermsWriter.TERMS_INDEX_EXTENSION), + ioContext); + int indexVersion = readIndexHeader(indexIn); + if (indexVersion != version) { + throw new CorruptIndexException("mixmatched version files: " + in + "=" + version + "," + indexIn + "=" + indexVersion); + } + + // verify + if (version >= VersionBlockTreeTermsWriter.VERSION_CHECKSUM) { + CodecUtil.checksumEntireFile(indexIn); + } + + // Have PostingsReader init itself + postingsReader.init(in); + + // Read per-field details + seekDir(in, dirOffset); + seekDir(indexIn, indexDirOffset); + + final int numFields = in.readVInt(); + if (numFields < 0) { + throw new CorruptIndexException("invalid numFields: " + numFields + " (resource=" + in + ")"); + } + + PairOutputs fstOutputs = VersionBlockTreeTermsWriter.getFSTOutputs(); + + for(int i=0;i= 0; + final int numBytes = in.readVInt(); + final BytesRef code = new BytesRef(new byte[numBytes]); + in.readBytes(code.bytes, 0, numBytes); + code.length = numBytes; + final long version = in.readVLong(); + final Pair rootCode = fstOutputs.newPair(code, version); + final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + assert fieldInfo != null: "field=" + field; + final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong(); + final long sumDocFreq = in.readVLong(); + final int docCount = in.readVInt(); + final int longsSize = version >= VersionBlockTreeTermsWriter.VERSION_META_ARRAY ? 
in.readVInt() : 0; + + BytesRef minTerm, maxTerm; + if (version >= VersionBlockTreeTermsWriter.VERSION_MIN_MAX_TERMS) { + minTerm = readBytesRef(in); + maxTerm = readBytesRef(in); + } else { + minTerm = maxTerm = null; + } + if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs + throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")"); + } + if (sumDocFreq < docCount) { // #postings must be >= #docs with field + throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount + " (resource=" + in + ")"); + } + if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings + throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + in + ")"); + } + final long indexStartFP = indexIn.readVLong(); + VersionFieldReader previous = fields.put(fieldInfo.name, + new VersionFieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, + indexStartFP, longsSize, indexIn, minTerm, maxTerm)); + if (previous != null) { + throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")"); + } + } + indexIn.close(); + + success = true; + } finally { + if (!success) { + // this.close() will close in: + IOUtils.closeWhileHandlingException(indexIn, this); + } + } + } + + private static BytesRef readBytesRef(IndexInput in) throws IOException { + BytesRef bytes = new BytesRef(); + bytes.length = in.readVInt(); + bytes.bytes = new byte[bytes.length]; + in.readBytes(bytes.bytes, 0, bytes.length); + return bytes; + } + + /** Reads terms file header. */ + private int readHeader(IndexInput input) throws IOException { + int version = CodecUtil.checkHeader(input, VersionBlockTreeTermsWriter.TERMS_CODEC_NAME, + VersionBlockTreeTermsWriter.VERSION_START, + VersionBlockTreeTermsWriter.VERSION_CURRENT); + if (version < VersionBlockTreeTermsWriter.VERSION_APPEND_ONLY) { + dirOffset = input.readLong(); + } + return version; + } + + /** Reads index file header. */ + private int readIndexHeader(IndexInput input) throws IOException { + int version = CodecUtil.checkHeader(input, VersionBlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME, + VersionBlockTreeTermsWriter.VERSION_START, + VersionBlockTreeTermsWriter.VERSION_CURRENT); + if (version < VersionBlockTreeTermsWriter.VERSION_APPEND_ONLY) { + indexDirOffset = input.readLong(); + } + return version; + } + + /** Seek {@code input} to the directory offset. 
*/ + private void seekDir(IndexInput input, long dirOffset) + throws IOException { + if (version >= VersionBlockTreeTermsWriter.VERSION_CHECKSUM) { + input.seek(input.length() - CodecUtil.footerLength() - 8); + dirOffset = input.readLong(); + } else if (version >= VersionBlockTreeTermsWriter.VERSION_APPEND_ONLY) { + input.seek(input.length() - 8); + dirOffset = input.readLong(); + } + input.seek(dirOffset); + } + + // for debugging + // private static String toHex(int v) { + // return "0x" + Integer.toHexString(v); + // } + + @Override + public void close() throws IOException { + try { + IOUtils.close(in, postingsReader); + } finally { + // Clear so refs to terms index is GCable even if + // app hangs onto us: + fields.clear(); + } + } + + @Override + public Iterator iterator() { + return Collections.unmodifiableSet(fields.keySet()).iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + assert field != null; + return fields.get(field); + } + + @Override + public int size() { + return fields.size(); + } + + // for debugging + String brToString(BytesRef b) { + if (b == null) { + return "null"; + } else { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: + return b.toString(); + } + } + } + + @Override + public long ramBytesUsed() { + long sizeInByes = ((postingsReader!=null) ? postingsReader.ramBytesUsed() : 0); + for(VersionFieldReader reader : fields.values()) { + sizeInByes += reader.ramBytesUsed(); + } + return sizeInByes; + } + + @Override + public void checkIntegrity() throws IOException { + if (version >= VersionBlockTreeTermsWriter.VERSION_CHECKSUM) { + // term dictionary + CodecUtil.checksumEntireFile(in); + + // postings + postingsReader.checkIntegrity(); + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java new file mode 100644 index 00000000000..d333ba494ef --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java @@ -0,0 +1,1192 @@ +package org.apache.lucene.codecs.idversion; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMOutputStream; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.BytesRefFSTEnum; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.NoOutputs; +import org.apache.lucene.util.fst.Outputs; +import org.apache.lucene.util.fst.PairOutputs.Pair; +import org.apache.lucene.util.fst.PairOutputs; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.packed.PackedInts; + +/* + TODO: + + - Currently there is a one-to-one mapping of indexed + term to term block, but we could decouple the two, ie, + put more terms into the index than there are blocks. + The index would take up more RAM but then it'd be able + to avoid seeking more often and could make PK/FuzzyQ + faster if the additional indexed terms could store + the offset into the terms block. + + - The blocks are not written in true depth-first + order, meaning if you just next() the file pointer will + sometimes jump backwards. For example, block foo* will + be written before block f* because it finished before. + This could possibly hurt performance if the terms dict is + not hot, since OSs anticipate sequential file access. We + could fix the writer to re-order the blocks as a 2nd + pass. + + - Each block encodes the term suffixes packed + sequentially using a separate vInt per term, which is + 1) wasteful and 2) slow (must linear scan to find a + particular suffix). We should instead 1) make + random-access array so we can directly access the Nth + suffix, and 2) bulk-encode this array using bulk int[] + codecs; then at search time we can binary search when + we seek a particular term. +*/ + +/** + * Block-based terms index and dictionary writer. + *
+ * <p>
+ * Writes terms dict and index, block-encoding (column
+ * stride) each term's metadata for each set of terms
+ * between two index terms.
+ * <p>
+ * Files:
+ * <ul>
+ *   <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
+ *   <li><tt>.tip</tt>: <a href="#Termindex">Term Index</a></li>
+ * </ul>
+ * <p>
+ * <a name="Termdictionary" id="Termdictionary"></a>
+ * <h3>Term Dictionary</h3>
+ *
+ * <p>The .tim file contains the list of terms in each
+ * field along with per-term statistics (such as docfreq)
+ * and per-term metadata (typically pointers to the postings list
+ * for that term in the inverted index).
+ * </p>
+ *
+ * <p>The .tim is arranged in blocks: with blocks containing
+ * a variable number of entries (by default 25-48), where
+ * each entry is either a term or a reference to a
+ * sub-block.</p>
+ *
+ * <p>NOTE: The term dictionary can plug into different postings implementations:
+ * the postings writer/reader are actually responsible for encoding
+ * and decoding the Postings Metadata and Term Metadata sections.</p>
+ *
+ * <ul>
+ *    <li>TermsDict (.tim) --&gt; Header, <i>PostingsHeader</i>, NodeBlock<sup>NumBlocks</sup>,
+ *                               FieldSummary, DirOffset, Footer</li>
+ *    <li>NodeBlock --&gt; (OuterNode | InnerNode)</li>
+ *    <li>OuterNode --&gt; EntryCount, SuffixLength, Byte<sup>SuffixLength</sup>, StatsLength,
+ *        &lt; TermStats &gt;<sup>EntryCount</sup>, MetaLength, &lt;<i>TermMetadata</i>&gt;<sup>EntryCount</sup></li>
+ *    <li>InnerNode --&gt; EntryCount, SuffixLength[,Sub?], Byte<sup>SuffixLength</sup>, StatsLength,
+ *        &lt; TermStats ? &gt;<sup>EntryCount</sup>, MetaLength, &lt;<i>TermMetadata ? </i>&gt;<sup>EntryCount</sup></li>
+ *    <li>TermStats --&gt; DocFreq, TotalTermFreq</li>
+ *    <li>FieldSummary --&gt; NumFields, &lt;FieldNumber, NumTerms, RootCodeLength, Byte<sup>RootCodeLength</sup>,
+ *                            SumTotalTermFreq?, SumDocFreq, DocCount, LongsSize, MinTerm, MaxTerm&gt;<sup>NumFields</sup></li>
+ *    <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ *    <li>DirOffset --&gt; {@link DataOutput#writeLong Uint64}</li>
+ *    <li>MinTerm,MaxTerm --&gt; {@link DataOutput#writeVInt VInt} length followed by the byte[]</li>
+ *    <li>EntryCount,SuffixLength,StatsLength,DocFreq,MetaLength,NumFields,
+ *        FieldNumber,RootCodeLength,DocCount,LongsSize --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *    <li>TotalTermFreq,NumTerms,SumTotalTermFreq,SumDocFreq --&gt;
+ *        {@link DataOutput#writeVLong VLong}</li>
+ *    <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ *    <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
+ *        for the BlockTree implementation.</li>
+ *    <li>DirOffset is a pointer to the FieldSummary section.</li>
+ *    <li>DocFreq is the count of documents which contain the term.</li>
+ *    <li>TotalTermFreq is the total number of occurrences of the term. This is encoded
+ *        as the difference between the total number of occurrences and the DocFreq.</li>
+ *    <li>FieldNumber is the fields number from {@link FieldInfos}. (.fnm)</li>
+ *    <li>NumTerms is the number of unique terms for the field.</li>
+ *    <li>RootCode points to the root block for the field.</li>
+ *    <li>SumDocFreq is the total number of postings, the number of term-document pairs across
+ *        the entire field.</li>
+ *    <li>DocCount is the number of documents that have at least one posting for this field.</li>
+ *    <li>LongsSize records how many long values the postings writer/reader record per term
+ *        (e.g., to hold freq/prox/doc file offsets).</li>
+ *    <li>MinTerm, MaxTerm are the lowest and highest term in this field.</li>
+ *    <li>PostingsHeader and TermMetadata are plugged into by the specific postings implementation:
+ *        these contain arbitrary per-file data (such as parameters or versioning information)
+ *        and per-term data (such as pointers to inverted files).</li>
+ *    <li>For inner nodes of the tree, every entry will steal one bit to mark whether it points
+ *        to child nodes (sub-block). If so, the corresponding TermStats and TermMetadata are omitted.</li>
+ * </ul>
+ * <a name="Termindex" id="Termindex"></a>
+ * <h3>Term Index</h3>
+ * <p>The .tip file contains an index into the term dictionary, so that it can be
+ * accessed randomly.  The index is also used to determine
+ * when a given term cannot exist on disk (in the .tim file), saving a disk seek.</p>
+ * <ul>
+ *   <li>TermsIndex (.tip) --&gt; Header, FSTIndex<sup>NumFields</sup>
+ *                                &lt;IndexStartFP&gt;<sup>NumFields</sup>, DirOffset, Footer</li>
+ *   <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ *   <li>DirOffset --&gt; {@link DataOutput#writeLong Uint64}</li>
+ *   <li>IndexStartFP --&gt; {@link DataOutput#writeVLong VLong}</li>
+ *   <li>FSTIndex --&gt; {@link FST FST&lt;byte[]&gt;}</li>
+ *   <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ *   <li>The .tip file contains a separate FST for each
+ *       field.  The FST maps a term prefix to the on-disk
+ *       block that holds all terms starting with that
+ *       prefix.  Each field's IndexStartFP points to its
+ *       FST.</li>
+ *   <li>DirOffset is a pointer to the start of the IndexStartFPs
+ *       for all fields.</li>
+ *   <li>It's possible that an on-disk block would contain
+ *       too many terms (more than the allowed maximum
+ *       (default: 48)).  When this happens, the block is
+ *       sub-divided into new blocks (called "floor
+ *       blocks"), and then the output in the FST for the
+ *       block's prefix encodes the leading byte of each
+ *       sub-block, and its file pointer.</li>
+ * </ul>
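+ * <p>In this versioned fork the index FST output is a pair rather than a plain
+ * byte[]: alongside the usual encoded block pointer it carries
+ * {@code Long.MAX_VALUE - maxVersion} for the block (see
+ * {@code PendingBlock#compileIndex}), the intent being that a lookup which
+ * requires a newer version can skip blocks whose terms are all too old. The
+ * packed block pointer itself is built as in {@code encodeOutput}:
+ * <pre>
+ *   output = (fp &lt;&lt; 2) | (hasTerms ? 0x2 : 0) | (isFloor ? 0x1 : 0)
+ * </pre>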
+ * + * @see BlockTreeTermsReader + * @lucene.experimental + */ +// nocommit fix jdocs +final class VersionBlockTreeTermsWriter extends FieldsConsumer { + + /** Suggested default value for the {@code + * minItemsInBlock} parameter to {@link + * #BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */ + public final static int DEFAULT_MIN_BLOCK_SIZE = 25; + + /** Suggested default value for the {@code + * maxItemsInBlock} parameter to {@link + * #BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */ + public final static int DEFAULT_MAX_BLOCK_SIZE = 48; + + //public final static boolean DEBUG = false; + //private final static boolean SAVE_DOT_FILES = false; + + static final int OUTPUT_FLAGS_NUM_BITS = 2; + static final int OUTPUT_FLAGS_MASK = 0x3; + static final int OUTPUT_FLAG_IS_FLOOR = 0x1; + static final int OUTPUT_FLAG_HAS_TERMS = 0x2; + + /** Extension of terms file */ + static final String TERMS_EXTENSION = "tiv"; + final static String TERMS_CODEC_NAME = "VERSION_BLOCK_TREE_TERMS_DICT"; + + /** Initial terms format. */ + public static final int VERSION_START = 0; + + // nocommit nuke all these old versions + + /** Append-only */ + public static final int VERSION_APPEND_ONLY = 1; + + /** Meta data as array */ + public static final int VERSION_META_ARRAY = 2; + + /** checksums */ + public static final int VERSION_CHECKSUM = 3; + + /** min/max term */ + public static final int VERSION_MIN_MAX_TERMS = 4; + + /** Current terms format. */ + public static final int VERSION_CURRENT = VERSION_MIN_MAX_TERMS; + + /** Extension of terms index file */ + static final String TERMS_INDEX_EXTENSION = "tip"; + final static String TERMS_INDEX_CODEC_NAME = "VERSION_BLOCK_TREE_TERMS_INDEX"; + + private final IndexOutput out; + private final IndexOutput indexOut; + final int maxDoc; + final int minItemsInBlock; + final int maxItemsInBlock; + + final PostingsWriterBase postingsWriter; + final FieldInfos fieldInfos; + + private static class FieldMetaData { + public final FieldInfo fieldInfo; + public final Pair rootCode; + public final long numTerms; + public final long indexStartFP; + public final long sumTotalTermFreq; + public final long sumDocFreq; + public final int docCount; + private final int longsSize; + public final BytesRef minTerm; + public final BytesRef maxTerm; + + public FieldMetaData(FieldInfo fieldInfo, Pair rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, + BytesRef minTerm, BytesRef maxTerm) { + assert numTerms > 0; + this.fieldInfo = fieldInfo; + assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms; + this.rootCode = rootCode; + this.indexStartFP = indexStartFP; + this.numTerms = numTerms; + this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; + this.docCount = docCount; + this.longsSize = longsSize; + this.minTerm = minTerm; + this.maxTerm = maxTerm; + } + } + + private final List fields = new ArrayList<>(); + // private final String segment; + + /** Create a new writer. The number of items (terms or + * sub-blocks) per block will aim to be between + * minItemsPerBlock and maxItemsPerBlock, though in some + * cases the blocks may be smaller than the min. 
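+   *  A minimal usage sketch (the wiring of {@code state} and the postings
+   *  writer is assumed here, not shown):
+   *  <pre>
+   *    PostingsWriterBase pw = ...; // e.g. an IDVersionPostingsWriter
+   *    FieldsConsumer terms = new VersionBlockTreeTermsWriter(state, pw,
+   *        DEFAULT_MIN_BLOCK_SIZE, DEFAULT_MAX_BLOCK_SIZE);
+   *  </pre>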
*/ + public VersionBlockTreeTermsWriter( + SegmentWriteState state, + PostingsWriterBase postingsWriter, + int minItemsInBlock, + int maxItemsInBlock) + throws IOException + { + if (minItemsInBlock <= 1) { + throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock); + } + if (maxItemsInBlock <= 0) { + throw new IllegalArgumentException("maxItemsInBlock must be >= 1; got " + maxItemsInBlock); + } + if (minItemsInBlock > maxItemsInBlock) { + throw new IllegalArgumentException("maxItemsInBlock must be >= minItemsInBlock; got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock); + } + if (2*(minItemsInBlock-1) > maxItemsInBlock) { + throw new IllegalArgumentException("maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock); + } + + maxDoc = state.segmentInfo.getDocCount(); + + final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION); + out = state.directory.createOutput(termsFileName, state.context); + boolean success = false; + IndexOutput indexOut = null; + try { + fieldInfos = state.fieldInfos; + this.minItemsInBlock = minItemsInBlock; + this.maxItemsInBlock = maxItemsInBlock; + writeHeader(out); + + //DEBUG = state.segmentName.equals("_4a"); + + final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION); + indexOut = state.directory.createOutput(termsIndexFileName, state.context); + writeIndexHeader(indexOut); + + this.postingsWriter = postingsWriter; + // segment = state.segmentName; + + // System.out.println("BTW.init seg=" + state.segmentName); + + postingsWriter.init(out); // have consumer write its format/header + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(out, indexOut); + } + } + this.indexOut = indexOut; + } + + /** Writes the terms file header. */ + private void writeHeader(IndexOutput out) throws IOException { + CodecUtil.writeHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT); + } + + /** Writes the index file header. */ + private void writeIndexHeader(IndexOutput out) throws IOException { + CodecUtil.writeHeader(out, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT); + } + + /** Writes the terms file trailer. */ + private void writeTrailer(IndexOutput out, long dirStart) throws IOException { + out.writeLong(dirStart); + } + + /** Writes the index file trailer. */ + private void writeIndexTrailer(IndexOutput indexOut, long dirStart) throws IOException { + indexOut.writeLong(dirStart); + } + + @Override + public void write(Fields fields) throws IOException { + + String lastField = null; + for(String field : fields) { + assert lastField == null || lastField.compareTo(field) < 0; + lastField = field; + + Terms terms = fields.terms(field); + if (terms == null) { + continue; + } + + TermsEnum termsEnum = terms.iterator(null); + + TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field)); + BytesRef minTerm = null; + BytesRef maxTerm = new BytesRef(); + while (true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + if (minTerm == null) { + minTerm = BytesRef.deepCopyOf(term); + } + maxTerm.copyBytes(term); + termsWriter.write(term, termsEnum); + } + + termsWriter.finish(minTerm, minTerm == null ? 
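+      // a null minTerm means no terms were accepted for this field, so pass
+      // null for the max as well; otherwise maxTerm holds the last term seen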
null : maxTerm); + } + } + + static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) { + assert fp < (1L << 62); + return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0); + } + + private static class PendingEntry { + public final boolean isTerm; + + protected PendingEntry(boolean isTerm) { + this.isTerm = isTerm; + } + } + + private static final class PendingTerm extends PendingEntry { + public final BytesRef term; + // stats + metadata + public final BlockTermState state; + + public PendingTerm(BytesRef term, BlockTermState state) { + super(true); + this.term = term; + this.state = state; + } + + @Override + public String toString() { + return term.utf8ToString(); + } + } + + private static final class PendingBlock extends PendingEntry { + public final BytesRef prefix; + public final long fp; + public FST> index; + public List>> subIndices; + public final boolean hasTerms; + public final boolean isFloor; + public final int floorLeadByte; + private final IntsRef scratchIntsRef = new IntsRef(); + /** Max version for all terms in this block. */ + private final long maxVersion; + + public PendingBlock(BytesRef prefix, long maxVersion, long fp, boolean hasTerms, boolean isFloor, int floorLeadByte, List>> subIndices) { + super(false); + this.prefix = prefix; + this.maxVersion = maxVersion; + this.fp = fp; + this.hasTerms = hasTerms; + this.isFloor = isFloor; + this.floorLeadByte = floorLeadByte; + this.subIndices = subIndices; + } + + @Override + public String toString() { + return "BLOCK: " + prefix.utf8ToString(); + } + + public void compileIndex(List floorBlocks, RAMOutputStream scratchBytes) throws IOException { + + assert (isFloor && floorBlocks != null && floorBlocks.size() != 0) || (!isFloor && floorBlocks == null): "isFloor=" + isFloor + " floorBlocks=" + floorBlocks; + + assert scratchBytes.getFilePointer() == 0; + + long maxVersionIndex = maxVersion; + + // TODO: try writing the leading vLong in MSB order + // (opposite of what Lucene does today), for better + // outputs sharing in the FST + scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor)); + if (isFloor) { + scratchBytes.writeVInt(floorBlocks.size()); + for (PendingBlock sub : floorBlocks) { + assert sub.floorLeadByte != -1; + maxVersionIndex = Math.max(maxVersionIndex, sub.maxVersion); + //if (DEBUG) { + // System.out.println(" write floorLeadByte=" + Integer.toHexString(sub.floorLeadByte&0xff)); + //} + scratchBytes.writeByte((byte) sub.floorLeadByte); + assert sub.fp > fp; + scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 
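+          // low bit of this vLong: whether the floor block has terms of its
+          // own; remaining bits: the file-pointer delta (sub.fp - fp) from
+          // the primary block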
1 : 0)); + } + } + + final PairOutputs outputs = getFSTOutputs(); + final Builder> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, + 0, 0, true, false, Integer.MAX_VALUE, + outputs, null, false, + PackedInts.COMPACT, true, 15); + //if (DEBUG) { + // System.out.println(" compile index for prefix=" + prefix); + //} + //indexBuilder.DEBUG = false; + final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()]; + assert bytes.length > 0; + scratchBytes.writeTo(bytes, 0); + indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), outputs.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex)); + scratchBytes.reset(); + + // Copy over index for all sub-blocks + + if (subIndices != null) { + for(FST> subIndex : subIndices) { + append(indexBuilder, subIndex); + } + } + + if (floorBlocks != null) { + for (PendingBlock sub : floorBlocks) { + if (sub.subIndices != null) { + for(FST> subIndex : sub.subIndices) { + append(indexBuilder, subIndex); + } + } + sub.subIndices = null; + } + } + + index = indexBuilder.finish(); + subIndices = null; + + /* + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); + Util.toDot(index, w, false, false); + System.out.println("SAVED to out.dot"); + w.close(); + */ + } + + // TODO: maybe we could add bulk-add method to + // Builder? Takes FST and unions it w/ current + // FST. + private void append(Builder> builder, FST> subIndex) throws IOException { + final BytesRefFSTEnum> subIndexEnum = new BytesRefFSTEnum<>(subIndex); + BytesRefFSTEnum.InputOutput> indexEnt; + while((indexEnt = subIndexEnum.next()) != null) { + //if (DEBUG) { + // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); + //} + builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); + } + } + } + + static PairOutputs getFSTOutputs() { + return new PairOutputs<>(ByteSequenceOutputs.getSingleton(), + PositiveIntOutputs.getSingleton()); + } + + final RAMOutputStream scratchBytes = new RAMOutputStream(); + + class TermsWriter { + private final FieldInfo fieldInfo; + private final int longsSize; + private long numTerms; + final FixedBitSet docsSeen; + long sumTotalTermFreq; + long sumDocFreq; + long indexStartFP; + + // Used only to partition terms into the block tree; we + // don't pull an FST from this builder: + private final NoOutputs noOutputs; + private final Builder blockBuilder; + + // PendingTerm or PendingBlock: + private final List pending = new ArrayList<>(); + + // Index into pending of most recently written block + private int lastBlockIndex = -1; + + // Re-used when segmenting a too-large block into floor + // blocks: + private int[] subBytes = new int[10]; + private int[] subTermCounts = new int[10]; + private int[] subTermCountSums = new int[10]; + private int[] subSubCounts = new int[10]; + + // This class assigns terms to blocks "naturally", ie, + // according to the number of terms under a given prefix + // that we encounter: + private class FindBlocks extends Builder.FreezeTail { + + @Override + public void freeze(final Builder.UnCompiledNode[] frontier, int prefixLenPlus1, final IntsRef lastInput) throws IOException { + + //if (DEBUG) System.out.println(" freeze prefixLenPlus1=" + prefixLenPlus1); + + for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) { + final Builder.UnCompiledNode node = frontier[idx]; + + long totCount = 0; + + if (node.isFinal) { + totCount++; + } + + for(int arcIdx=0;arcIdx target = (Builder.UnCompiledNode) node.arcs[arcIdx].target; 
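+          // totCount accumulates the entries (terms plus sub-blocks) under
+          // this about-to-be-frozen suffix node; once it reaches
+          // minItemsInBlock (checked below) we cut one or more blocks at
+          // this prefix via writeBlocks()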
+ totCount += target.inputCount; + target.clear(); + node.arcs[arcIdx].target = null; + } + node.numArcs = 0; + + if (totCount >= minItemsInBlock || idx == 0) { + // We are on a prefix node that has enough + // entries (terms or sub-blocks) under it to let + // us write a new block or multiple blocks (main + // block + follow on floor blocks): + //if (DEBUG) { + // if (totCount < minItemsInBlock && idx != 0) { + // System.out.println(" force block has terms"); + // } + //} + writeBlocks(lastInput, idx, (int) totCount); + node.inputCount = 1; + } else { + // stragglers! carry count upwards + node.inputCount = totCount; + } + frontier[idx] = new Builder.UnCompiledNode<>(blockBuilder, idx); + } + } + } + + // Write the top count entries on the pending stack as + // one or more blocks. Returns how many blocks were + // written. If the entry count is <= maxItemsPerBlock + // we just write a single block; else we break into + // primary (initial) block and then one or more + // following floor blocks: + + void writeBlocks(IntsRef prevTerm, int prefixLength, int count) throws IOException { + if (prefixLength == 0 || count <= maxItemsInBlock) { + // Easy case: not floor block. Eg, prefix is "foo", + // and we found 30 terms/sub-blocks starting w/ that + // prefix, and minItemsInBlock <= 30 <= + // maxItemsInBlock. + final PendingBlock nonFloorBlock = writeBlock(prevTerm, prefixLength, prefixLength, count, count, 0, false, -1, true); + nonFloorBlock.compileIndex(null, scratchBytes); + pending.add(nonFloorBlock); + } else { + // Floor block case. Eg, prefix is "foo" but we + // have 100 terms/sub-blocks starting w/ that + // prefix. We segment the entries into a primary + // block and following floor blocks using the first + // label in the suffix to assign to floor blocks. + + // TODO: we could store min & max suffix start byte + // in each block, to make floor blocks authoritative + + //if (DEBUG) { + // final BytesRef prefix = new BytesRef(prefixLength); + // for(int m=0;m= minItemsInBlock) { + final int curPrefixLength; + if (startLabel == -1) { + curPrefixLength = prefixLength; + } else { + curPrefixLength = 1+prefixLength; + // floor term: + prevTerm.ints[prevTerm.offset + prefixLength] = startLabel; + } + //System.out.println(" " + subCount + " subs"); + final PendingBlock floorBlock = writeBlock(prevTerm, prefixLength, curPrefixLength, curStart, pendingCount, subTermCountSums[1+sub], true, startLabel, curStart == pendingCount); + if (firstBlock == null) { + firstBlock = floorBlock; + } else { + floorBlocks.add(floorBlock); + } + curStart -= pendingCount; + //System.out.println(" = " + pendingCount); + pendingCount = 0; + + assert minItemsInBlock == 1 || subCount > 1: "minItemsInBlock=" + minItemsInBlock + " subCount=" + subCount + " sub=" + sub + " of " + numSubs + " subTermCount=" + subTermCountSums[sub] + " subSubCount=" + subSubCounts[sub] + " depth=" + prefixLength; + subCount = 0; + startLabel = subBytes[sub+1]; + + if (curStart == 0) { + break; + } + + if (curStart <= maxItemsInBlock) { + // remainder is small enough to fit into a + // block. 
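+              // Illustrative only: with prefix "foo", 100 entries, and
+              // maxItemsInBlock=48, the primary block and first floor
+              // block(s) consume most entries and the final <= 48 land here.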
NOTE that this may be too small (< + // minItemsInBlock); need a true segmenter + // here + assert startLabel != -1; + assert firstBlock != null; + prevTerm.ints[prevTerm.offset + prefixLength] = startLabel; + //System.out.println(" final " + (numSubs-sub-1) + " subs"); + /* + for(sub++;sub < numSubs;sub++) { + System.out.println(" " + (subTermCounts[sub] + subSubCounts[sub])); + } + System.out.println(" = " + curStart); + if (curStart < minItemsInBlock) { + System.out.println(" **"); + } + */ + floorBlocks.add(writeBlock(prevTerm, prefixLength, prefixLength+1, curStart, curStart, 0, true, startLabel, true)); + break; + } + } + } + + prevTerm.ints[prevTerm.offset + prefixLength] = savLabel; + + assert firstBlock != null; + firstBlock.compileIndex(floorBlocks, scratchBytes); + + pending.add(firstBlock); + //if (DEBUG) System.out.println(" done pending.size()=" + pending.size()); + } + lastBlockIndex = pending.size()-1; + } + + // for debugging + @SuppressWarnings("unused") + private String toString(BytesRef b) { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: + return b.toString(); + } + } + + // Writes all entries in the pending slice as a single + // block: + private PendingBlock writeBlock(IntsRef prevTerm, int prefixLength, int indexPrefixLength, int startBackwards, int length, + int futureTermCount, boolean isFloor, int floorLeadByte, boolean isLastInFloor) throws IOException { + + assert length > 0; + + final int start = pending.size()-startBackwards; + + assert start >= 0: "pending.size()=" + pending.size() + " startBackwards=" + startBackwards + " length=" + length; + + final List slice = pending.subList(start, start + length); + + final long startFP = out.getFilePointer(); + + final BytesRef prefix = new BytesRef(indexPrefixLength); + for(int m=0;m= 0; + metaWriter.writeVLong(longs[pos]); + } + bytesWriter.writeTo(metaWriter); + bytesWriter.reset(); + absolute = false; + } + termCount = length; + } else { + subIndices = new ArrayList<>(); + termCount = 0; + for (PendingEntry ent : slice) { + if (ent.isTerm) { + PendingTerm term = (PendingTerm) ent; + BlockTermState state = term.state; + maxVersionInBlock = Math.max(maxVersionInBlock, ((IDVersionTermState) state).idVersion); + final int suffix = term.term.length - prefixLength; + // if (DEBUG) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write term suffix=" + suffixBytes); + // } + // For non-leaf block we borrow 1 bit to record + // if entry is term or sub-block + suffixWriter.writeVInt(suffix<<1); + suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix); + + // Write term stats, to separate byte[] blob: + statsWriter.writeVInt(state.docFreq); + if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { + assert state.totalTermFreq >= state.docFreq; + statsWriter.writeVLong(state.totalTermFreq - state.docFreq); + } + + // TODO: now that terms dict "sees" these longs, + // we can explore better column-stride encodings + // to encode all long[0]s for this block at + // once, all long[1]s, etc., e.g. using + // Simple64. Alternatively, we could interleave + // stats + meta ... 
no reason to have them + // separate anymore: + + // Write term meta data + postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); + for (int pos = 0; pos < longsSize; pos++) { + assert longs[pos] >= 0; + metaWriter.writeVLong(longs[pos]); + } + bytesWriter.writeTo(metaWriter); + bytesWriter.reset(); + absolute = false; + + termCount++; + } else { + PendingBlock block = (PendingBlock) ent; + maxVersionInBlock = Math.max(maxVersionInBlock, block.maxVersion); + final int suffix = block.prefix.length - prefixLength; + + assert suffix > 0; + + // For non-leaf block we borrow 1 bit to record + // if entry is term or sub-block + suffixWriter.writeVInt((suffix<<1)|1); + suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix); + assert block.fp < startFP; + + // if (DEBUG) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write sub-block suffix=" + toString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor); + // } + + suffixWriter.writeVLong(startFP - block.fp); + subIndices.add(block.index); + } + } + + assert subIndices.size() != 0; + } + + // TODO: we could block-write the term suffix pointers; + // this would take more space but would enable binary + // search on lookup + + // Write suffixes byte[] blob to terms dict output: + out.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0)); + suffixWriter.writeTo(out); + suffixWriter.reset(); + + // Write term stats byte[] blob + out.writeVInt((int) statsWriter.getFilePointer()); + statsWriter.writeTo(out); + statsWriter.reset(); + + // Write term meta data byte[] blob + out.writeVInt((int) metaWriter.getFilePointer()); + metaWriter.writeTo(out); + metaWriter.reset(); + + // Remove slice replaced by block: + slice.clear(); + + if (lastBlockIndex >= start) { + if (lastBlockIndex < start+length) { + lastBlockIndex = start; + } else { + lastBlockIndex -= length; + } + } + + // if (DEBUG) { + // System.out.println(" fpEnd=" + out.getFilePointer()); + // } + + return new PendingBlock(prefix, maxVersionInBlock, startFP, termCount != 0, isFloor, floorLeadByte, subIndices); + } + + TermsWriter(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + docsSeen = new FixedBitSet(maxDoc); + + noOutputs = NoOutputs.getSingleton(); + + // This Builder is just used transiently to fragment + // terms into "good" blocks; we don't save the + // resulting FST: + blockBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, + 0, 0, true, + true, Integer.MAX_VALUE, + noOutputs, + new FindBlocks(), false, + PackedInts.COMPACT, + true, 15); + + this.longsSize = postingsWriter.setField(fieldInfo); + } + + private final IntsRef scratchIntsRef = new IntsRef(); + + /** Writes one term's worth of postings. 
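+     *  Terms must arrive in sorted order.  Each accepted term is buffered
+     *  as a {@code PendingTerm} and also fed to {@code blockBuilder}, whose
+     *  {@code FindBlocks} freeze hook decides where block boundaries fall.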
*/ + public void write(BytesRef text, TermsEnum termsEnum) throws IOException { + + BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen); + if (state != null) { + assert state.docFreq != 0; + assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter; + sumDocFreq += state.docFreq; + sumTotalTermFreq += state.totalTermFreq; + blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput()); + + PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state); + pending.add(term); + numTerms++; + } + } + + // Finishes all terms in this field + public void finish(BytesRef minTerm, BytesRef maxTerm) throws IOException { + if (numTerms > 0) { + blockBuilder.finish(); + + // We better have one final "root" block: + assert pending.size() == 1 && !pending.get(0).isTerm: "pending.size()=" + pending.size() + " pending=" + pending; + final PendingBlock root = (PendingBlock) pending.get(0); + assert root.prefix.length == 0; + assert root.index.getEmptyOutput() != null; + + // Write FST to index + indexStartFP = indexOut.getFilePointer(); + root.index.save(indexOut); + //System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name); + + // if (SAVE_DOT_FILES || DEBUG) { + // final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + // Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + // Util.toDot(root.index, w, false, false); + // System.out.println("SAVED to " + dotFileName); + // w.close(); + // } + + fields.add(new FieldMetaData(fieldInfo, + ((PendingBlock) pending.get(0)).index.getEmptyOutput(), + numTerms, + indexStartFP, + sumTotalTermFreq, + sumDocFreq, + docsSeen.cardinality(), + longsSize, + minTerm, maxTerm)); + } else { + assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1; + assert sumDocFreq == 0; + assert docsSeen.cardinality() == 0; + } + } + + private final RAMOutputStream suffixWriter = new RAMOutputStream(); + private final RAMOutputStream statsWriter = new RAMOutputStream(); + private final RAMOutputStream metaWriter = new RAMOutputStream(); + private final RAMOutputStream bytesWriter = new RAMOutputStream(); + } + + @Override + public void close() throws IOException { + + boolean success = false; + try { + + final long dirStart = out.getFilePointer(); + final long indexDirStart = indexOut.getFilePointer(); + + out.writeVInt(fields.size()); + + for(FieldMetaData field : fields) { + //System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms"); + out.writeVInt(field.fieldInfo.number); + assert field.numTerms > 0; + out.writeVLong(field.numTerms); + out.writeVInt(field.rootCode.output1.length); + out.writeBytes(field.rootCode.output1.bytes, field.rootCode.output1.offset, field.rootCode.output1.length); + out.writeVLong(field.rootCode.output2); + if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { + out.writeVLong(field.sumTotalTermFreq); + } + out.writeVLong(field.sumDocFreq); + out.writeVInt(field.docCount); + out.writeVInt(field.longsSize); + indexOut.writeVLong(field.indexStartFP); + writeBytesRef(out, field.minTerm); + writeBytesRef(out, field.maxTerm); + } + writeTrailer(out, dirStart); + CodecUtil.writeFooter(out); + writeIndexTrailer(indexOut, indexDirStart); + CodecUtil.writeFooter(indexOut); + success = true; + } finally { + if (success) { + IOUtils.close(out, indexOut, postingsWriter); + } else { + 
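+        // an exception is already propagating; close quietly so a secondary
+        // failure from close() does not mask the original one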
IOUtils.closeWhileHandlingException(out, indexOut, postingsWriter); + } + } + } + + private static void writeBytesRef(IndexOutput out, BytesRef bytes) throws IOException { + out.writeVInt(bytes.length); + out.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionFieldReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionFieldReader.java new file mode 100644 index 00000000000..2b5dd4239f6 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/idversion/VersionFieldReader.java @@ -0,0 +1,163 @@ +package org.apache.lucene.codecs.idversion; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.PairOutputs.Pair; + +/** BlockTree's implementation of {@link Terms}. 
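+ *  Holds the per-field statistics read from the terms dictionary, plus the
+ *  field's in-RAM FST index; its iterator is an
+ *  {@code IDVersionSegmentTermsEnum} over that index.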
*/ +// public for CheckIndex: +final class VersionFieldReader extends Terms { + final long numTerms; + final FieldInfo fieldInfo; + final long sumTotalTermFreq; + final long sumDocFreq; + final int docCount; + final long indexStartFP; + final long rootBlockFP; + final Pair rootCode; + final BytesRef minTerm; + final BytesRef maxTerm; + final int longsSize; + final VersionBlockTreeTermsReader parent; + + final FST> index; + //private boolean DEBUG; + + VersionFieldReader(VersionBlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, Pair rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, + long indexStartFP, int longsSize, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException { + assert numTerms > 0; + this.fieldInfo = fieldInfo; + //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); + this.parent = parent; + this.numTerms = numTerms; + this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; + this.docCount = docCount; + this.indexStartFP = indexStartFP; + this.rootCode = rootCode; + this.longsSize = longsSize; + this.minTerm = minTerm; + this.maxTerm = maxTerm; + // if (DEBUG) { + // System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor); + // } + + rootBlockFP = (new ByteArrayDataInput(rootCode.output1.bytes, rootCode.output1.offset, rootCode.output1.length)).readVLong() >>> VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; + + if (indexIn != null) { + final IndexInput clone = indexIn.clone(); + //System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name); + clone.seek(indexStartFP); + index = new FST<>(clone, VersionBlockTreeTermsWriter.getFSTOutputs()); + + /* + if (false) { + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + Util.toDot(index, w, false, false); + System.out.println("FST INDEX: SAVED to " + dotFileName); + w.close(); + } + */ + } else { + index = null; + } + } + + @Override + public BytesRef getMin() throws IOException { + if (minTerm == null) { + // Older index that didn't store min/maxTerm + return super.getMin(); + } else { + return minTerm; + } + } + + @Override + public BytesRef getMax() throws IOException { + if (maxTerm == null) { + // Older index that didn't store min/maxTerm + return super.getMax(); + } else { + return maxTerm; + } + } + + @Override + public boolean hasFreqs() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + } + + @Override + public boolean hasOffsets() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + } + + @Override + public boolean hasPositions() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + } + + @Override + public boolean hasPayloads() { + return fieldInfo.hasPayloads(); + } + + @Override + public TermsEnum iterator(TermsEnum reuse) throws IOException { + return new IDVersionSegmentTermsEnum(this); + } + + @Override + public long size() { + return numTerms; + } + + @Override + public long getSumTotalTermFreq() { + return sumTotalTermFreq; + } + + @Override + public long getSumDocFreq() { + return sumDocFreq; + } + + @Override + public int getDocCount() { + return docCount; + } + + /** Returns approximate RAM bytes used */ + public long ramBytesUsed() { + return ((index!=null)? 
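+    // only the in-heap FST index is charged to this reader; the terms
+    // dictionary itself stays on disk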
index.sizeInBytes() : 0); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java index 5d02490bf99..5ef41bc4596 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java @@ -86,7 +86,7 @@ import org.apache.lucene.util.fst.Util; * @lucene.experimental */ -public class BlockTreeTermsReader extends FieldsProducer { +public final class BlockTreeTermsReader extends FieldsProducer { // Open input to the main terms dict file (_X.tib) final IndexInput in; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java index 2f8e8f3c328..4182fdc5cf4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java @@ -187,7 +187,7 @@ import org.apache.lucene.util.packed.PackedInts; * @see BlockTreeTermsReader * @lucene.experimental */ -public class BlockTreeTermsWriter extends FieldsConsumer { +public final class BlockTreeTermsWriter extends FieldsConsumer { /** Suggested default value for the {@code * minItemsInBlock} parameter to {@link @@ -1045,12 +1045,12 @@ public class BlockTreeTermsWriter extends FieldsConsumer { // terms into "good" blocks; we don't save the // resulting FST: blockBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, - 0, 0, true, - true, Integer.MAX_VALUE, - noOutputs, - new FindBlocks(), false, - PackedInts.COMPACT, - true, 15); + 0, 0, true, + true, Integer.MAX_VALUE, + noOutputs, + new FindBlocks(), false, + PackedInts.COMPACT, + true, 15); this.longsSize = postingsWriter.setField(fieldInfo); }