diff --git a/build.xml b/build.xml
index 150211cb1e7..e0808c625cc 100644
--- a/build.xml
+++ b/build.xml
@@ -100,6 +100,7 @@
+
@@ -112,7 +113,7 @@
- The following files contain @author tags, tabs, svn keywords or nocommits:${line.separator}${validate.patternsFound}
+ The following files contain @author tags, tabs, TODOs, svn keywords or nocommits:${line.separator}${validate.patternsFound}
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 23579c419b4..aece36e406a 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -75,7 +75,7 @@ New Features
 * LUCENE-5969: Lucene 5.0 has a new index format with mismatched file detection,
   improved exception handling, and indirect norms encoding for sparse fields.
-  (Mike McCandless, Robert Muir)
+  (Mike McCandless, Ryan Ernst, Robert Muir)
 
 API Changes
 
@@ -148,7 +148,10 @@ API Changes
 * LUCENE-5969: Add Codec.compoundFormat, which handles the encoding of compound
   files. Add getMergeInstance() to codec producer APIs, which can be overridden
-  to return an instance optimized for merging instead of searching.
+  to return an instance optimized for merging instead of searching. Add
+  Terms.getStats() which can return additional codec-specific statistics about a field.
+  Change instance method SegmentInfos.read() to two static methods: SegmentInfos.readCommit()
+  and SegmentInfos.readLatestCommit().
   (Mike McCandless, Robert Muir)
 
 * LUCENE-5992: Remove FieldInfos from SegmentInfosWriter.write API.
   (Robert Muir, Mike McCandless)
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40BlockTreeTermsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40BlockTreeTermsReader.java
new file mode 100644
index 00000000000..562c8e38eac
--- /dev/null
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40BlockTreeTermsReader.java
@@ -0,0 +1,358 @@
+package org.apache.lucene.codecs.blocktree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.TreeMap;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.Accountables;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.Outputs;
+
+/** A block-based terms index and dictionary that assigns
+ * terms to variable length blocks according to how they
+ * share prefixes. The terms index is a prefix trie
+ * whose leaves are term blocks. The advantage of this
+ * approach is that seekExact is often able to
+ * determine a term cannot exist without doing any IO, and
+ * intersection with Automata is very fast. Note that this
+ * terms dictionary has its own fixed terms index (ie, it
+ * does not support a pluggable terms index
+ * implementation).
+ *
+ *

+ * NOTE: this terms dictionary supports
+ * min/maxItemsPerBlock during indexing to control how
+ * much memory the terms index uses.

+ * + *

+ * The data structure used by this implementation is very
+ * similar to a burst trie
+ * (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499),
+ * but with added logic to break up too-large blocks of all
+ * terms sharing a given prefix into smaller ones.

+ * + *

Use {@link org.apache.lucene.index.CheckIndex} with the -verbose + * option to see summary statistics on the blocks in the + * dictionary. + * + * @lucene.experimental + * @deprecated Only for 4.x backcompat + */ +@Deprecated +public final class Lucene40BlockTreeTermsReader extends FieldsProducer { + + /** Extension of terms file */ + static final String TERMS_EXTENSION = "tim"; + final static String TERMS_CODEC_NAME = "BLOCK_TREE_TERMS_DICT"; + + /** Initial terms format. */ + public static final int VERSION_START = 0; + + /** Append-only */ + public static final int VERSION_APPEND_ONLY = 1; + + /** Meta data as array */ + public static final int VERSION_META_ARRAY = 2; + + /** checksums */ + public static final int VERSION_CHECKSUM = 3; + + /** min/max term */ + public static final int VERSION_MIN_MAX_TERMS = 4; + + /** Current terms format. */ + public static final int VERSION_CURRENT = VERSION_MIN_MAX_TERMS; + + /** Extension of terms index file */ + static final String TERMS_INDEX_EXTENSION = "tip"; + final static String TERMS_INDEX_CODEC_NAME = "BLOCK_TREE_TERMS_INDEX"; + static final int OUTPUT_FLAGS_NUM_BITS = 2; + static final int OUTPUT_FLAGS_MASK = 0x3; + static final int OUTPUT_FLAG_IS_FLOOR = 0x1; + static final int OUTPUT_FLAG_HAS_TERMS = 0x2; + static final Outputs FST_OUTPUTS = ByteSequenceOutputs.getSingleton(); + static final BytesRef NO_OUTPUT = FST_OUTPUTS.getNoOutput(); + + // Open input to the main terms dict file (_X.tib) + final IndexInput in; + + //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + // Reads the terms dict entries, to gather state to + // produce DocsEnum on demand + final PostingsReaderBase postingsReader; + + private final TreeMap fields = new TreeMap<>(); + + /** File offset where the directory starts in the terms file. */ + private long dirOffset; + + /** File offset where the directory starts in the index file. */ + private long indexDirOffset; + + final String segment; + + private final int version; + + /** Sole constructor. */ + public Lucene40BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state) + throws IOException { + + this.postingsReader = postingsReader; + + this.segment = state.segmentInfo.name; + String termsFileName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION); + in = state.directory.openInput(termsFileName, state.context); + + boolean success = false; + IndexInput indexIn = null; + + try { + version = readHeader(in); + String indexFileName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_INDEX_EXTENSION); + indexIn = state.directory.openInput(indexFileName, state.context); + int indexVersion = readIndexHeader(indexIn); + if (indexVersion != version) { + throw new CorruptIndexException("mixmatched version files: " + in + "=" + version + "," + indexIn + "=" + indexVersion, indexIn); + } + + // verify + if (version >= VERSION_CHECKSUM) { + CodecUtil.checksumEntireFile(indexIn); + } + + // Have PostingsReader init itself + postingsReader.init(in, state); + + + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. 
+ if (version >= VERSION_CHECKSUM) { + CodecUtil.retrieveChecksum(in); + } + + // Read per-field details + seekDir(in, dirOffset); + seekDir(indexIn, indexDirOffset); + + final int numFields = in.readVInt(); + if (numFields < 0) { + throw new CorruptIndexException("invalid numFields: " + numFields, in); + } + + for(int i=0;i= VERSION_META_ARRAY ? in.readVInt() : 0; + if (longsSize < 0) { + throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, in); + } + BytesRef minTerm, maxTerm; + if (version >= VERSION_MIN_MAX_TERMS) { + minTerm = readBytesRef(in); + maxTerm = readBytesRef(in); + } else { + minTerm = maxTerm = null; + } + if (docCount < 0 || docCount > state.segmentInfo.getDocCount()) { // #docs with field must be <= #docs + throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.getDocCount(), in); + } + if (sumDocFreq < docCount) { // #postings must be >= #docs with field + throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, in); + } + if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings + throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, in); + } + final long indexStartFP = indexIn.readVLong(); + Lucene40FieldReader previous = fields.put(fieldInfo.name, + new Lucene40FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, + indexStartFP, longsSize, indexIn, minTerm, maxTerm)); + if (previous != null) { + throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in); + } + } + indexIn.close(); + + success = true; + } finally { + if (!success) { + // this.close() will close in: + IOUtils.closeWhileHandlingException(indexIn, this); + } + } + } + + private static BytesRef readBytesRef(IndexInput in) throws IOException { + BytesRef bytes = new BytesRef(); + bytes.length = in.readVInt(); + bytes.bytes = new byte[bytes.length]; + in.readBytes(bytes.bytes, 0, bytes.length); + return bytes; + } + + /** Reads terms file header. */ + private int readHeader(IndexInput input) throws IOException { + int version = CodecUtil.checkHeader(input, TERMS_CODEC_NAME, + VERSION_START, + VERSION_CURRENT); + if (version < VERSION_APPEND_ONLY) { + dirOffset = input.readLong(); + } + return version; + } + + /** Reads index file header. */ + private int readIndexHeader(IndexInput input) throws IOException { + int version = CodecUtil.checkHeader(input, TERMS_INDEX_CODEC_NAME, + VERSION_START, + VERSION_CURRENT); + if (version < VERSION_APPEND_ONLY) { + indexDirOffset = input.readLong(); + } + return version; + } + + /** Seek {@code input} to the directory offset. 
*/ + private void seekDir(IndexInput input, long dirOffset) + throws IOException { + if (version >= VERSION_CHECKSUM) { + input.seek(input.length() - CodecUtil.footerLength() - 8); + dirOffset = input.readLong(); + } else if (version >= VERSION_APPEND_ONLY) { + input.seek(input.length() - 8); + dirOffset = input.readLong(); + } + input.seek(dirOffset); + } + + // for debugging + // private static String toHex(int v) { + // return "0x" + Integer.toHexString(v); + // } + + @Override + public void close() throws IOException { + try { + IOUtils.close(in, postingsReader); + } finally { + // Clear so refs to terms index is GCable even if + // app hangs onto us: + fields.clear(); + } + } + + @Override + public Iterator iterator() { + return Collections.unmodifiableSet(fields.keySet()).iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + assert field != null; + return fields.get(field); + } + + @Override + public int size() { + return fields.size(); + } + + // for debugging + String brToString(BytesRef b) { + if (b == null) { + return "null"; + } else { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: + return b.toString(); + } + } + } + + @Override + public long ramBytesUsed() { + long sizeInBytes = postingsReader.ramBytesUsed(); + for(Lucene40FieldReader reader : fields.values()) { + sizeInBytes += reader.ramBytesUsed(); + } + return sizeInBytes; + } + + @Override + public Iterable getChildResources() { + List resources = new ArrayList<>(); + resources.addAll(Accountables.namedAccountables("field", fields)); + resources.add(Accountables.namedAccountable("delegate", postingsReader)); + return Collections.unmodifiableList(resources); + } + + @Override + public void checkIntegrity() throws IOException { + if (version >= VERSION_CHECKSUM) { + // term dictionary + CodecUtil.checksumEntireFile(in); + + // postings + postingsReader.checkIntegrity(); + } + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(fields=" + fields.size() + ",delegate=" + postingsReader + ")"; + } +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40FieldReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40FieldReader.java new file mode 100644 index 00000000000..bed4d8ce7e9 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40FieldReader.java @@ -0,0 +1,202 @@ +package org.apache.lucene.codecs.blocktree; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Collections; + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.Accountables; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; + +/** + * BlockTree's implementation of {@link Terms}. + * @deprecated Only for 4.x backcompat + */ +@Deprecated +final class Lucene40FieldReader extends Terms implements Accountable { + + private static final long BASE_RAM_BYTES_USED = + RamUsageEstimator.shallowSizeOfInstance(Lucene40FieldReader.class) + + 3 * RamUsageEstimator.shallowSizeOfInstance(BytesRef.class); + + final long numTerms; + final FieldInfo fieldInfo; + final long sumTotalTermFreq; + final long sumDocFreq; + final int docCount; + final long indexStartFP; + final long rootBlockFP; + final BytesRef rootCode; + final BytesRef minTerm; + final BytesRef maxTerm; + final int longsSize; + final Lucene40BlockTreeTermsReader parent; + + final FST index; + //private boolean DEBUG; + + Lucene40FieldReader(Lucene40BlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, + long indexStartFP, int longsSize, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException { + assert numTerms > 0; + this.fieldInfo = fieldInfo; + //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); + this.parent = parent; + this.numTerms = numTerms; + this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; + this.docCount = docCount; + this.indexStartFP = indexStartFP; + this.rootCode = rootCode; + this.longsSize = longsSize; + this.minTerm = minTerm; + this.maxTerm = maxTerm; + // if (DEBUG) { + // System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor); + // } + + rootBlockFP = (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() >>> Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; + + if (indexIn != null) { + final IndexInput clone = indexIn.clone(); + //System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name); + clone.seek(indexStartFP); + index = new FST<>(clone, ByteSequenceOutputs.getSingleton()); + + /* + if (false) { + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + Util.toDot(index, w, false, false); + System.out.println("FST INDEX: SAVED to " + dotFileName); + w.close(); + } + */ + } else { + index = null; + } + } + + @Override + public BytesRef getMin() throws IOException { + if (minTerm == null) { + // Older index that didn't store min/maxTerm + return super.getMin(); + } else { + return minTerm; + } + } + + @Override + public BytesRef getMax() throws IOException { + if (maxTerm == null) { + // Older index that didn't store min/maxTerm + return super.getMax(); + } else { + return maxTerm; + } + } + + /** For debugging -- used by CheckIndex too*/ + @Override + public Lucene40Stats getStats() throws IOException { + return new 
Lucene40SegmentTermsEnum(this).computeBlockStats(); + } + + @Override + public boolean hasFreqs() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + } + + @Override + public boolean hasOffsets() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + } + + @Override + public boolean hasPositions() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + } + + @Override + public boolean hasPayloads() { + return fieldInfo.hasPayloads(); + } + + @Override + public TermsEnum iterator(TermsEnum reuse) throws IOException { + return new Lucene40SegmentTermsEnum(this); + } + + @Override + public long size() { + return numTerms; + } + + @Override + public long getSumTotalTermFreq() { + return sumTotalTermFreq; + } + + @Override + public long getSumDocFreq() { + return sumDocFreq; + } + + @Override + public int getDocCount() { + return docCount; + } + + @Override + public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } + return new Lucene40IntersectTermsEnum(this, compiled, startTerm); + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES_USED + ((index!=null)? index.ramBytesUsed() : 0); + } + + @Override + public Iterable getChildResources() { + if (index == null) { + return Collections.emptyList(); + } else { + return Collections.singleton(Accountables.namedAccountable("term index", index)); + } + } + + @Override + public String toString() { + return "BlockTreeTerms(terms=" + numTerms + ",postings=" + sumDocFreq + ",positions=" + sumTotalTermFreq + ",docs=" + docCount + ")"; + } +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40IntersectTermsEnum.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40IntersectTermsEnum.java new file mode 100644 index 00000000000..292b57aa5dd --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40IntersectTermsEnum.java @@ -0,0 +1,490 @@ +package org.apache.lucene.codecs.blocktree; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Outputs; + +// NOTE: cannot seek! + +/** + * @deprecated Only for 4.x backcompat + */ +@Deprecated +final class Lucene40IntersectTermsEnum extends TermsEnum { + final IndexInput in; + final static Outputs fstOutputs = ByteSequenceOutputs.getSingleton(); + + private Lucene40IntersectTermsEnumFrame[] stack; + + @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc[] arcs = new FST.Arc[5]; + + final RunAutomaton runAutomaton; + final CompiledAutomaton compiledAutomaton; + + private Lucene40IntersectTermsEnumFrame currentFrame; + + private final BytesRef term = new BytesRef(); + + private final FST.BytesReader fstReader; + + final Lucene40FieldReader fr; + + private BytesRef savedStartTerm; + + // TODO: in some cases we can filter by length? eg + // regexp foo*bar must be at least length 6 bytes + public Lucene40IntersectTermsEnum(Lucene40FieldReader fr, CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + // if (DEBUG) { + // System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef)); + // } + this.fr = fr; + runAutomaton = compiled.runAutomaton; + compiledAutomaton = compiled; + in = fr.parent.in.clone(); + stack = new Lucene40IntersectTermsEnumFrame[5]; + for(int idx=0;idx(); + } + + if (fr.index == null) { + fstReader = null; + } else { + fstReader = fr.index.getBytesReader(); + } + + // TODO: if the automaton is "smallish" we really + // should use the terms index to seek at least to + // the initial term and likely to subsequent terms + // (or, maybe just fallback to ATE for such cases). + // Else the seek cost of loading the frames will be + // too costly. + + final FST.Arc arc = fr.index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + + // Special pushFrame since it's the first one: + final Lucene40IntersectTermsEnumFrame f = stack[0]; + f.fp = f.fpOrig = fr.rootBlockFP; + f.prefix = 0; + f.setState(runAutomaton.getInitialState()); + f.arc = arc; + f.outputPrefix = arc.output; + f.load(fr.rootCode); + + // for assert: + assert setSavedStartTerm(startTerm); + + currentFrame = f; + if (startTerm != null) { + seekToStartTerm(startTerm); + } + } + + // only for assert: + private boolean setSavedStartTerm(BytesRef startTerm) { + savedStartTerm = startTerm == null ? 
null : BytesRef.deepCopyOf(startTerm); + return true; + } + + @Override + public TermState termState() throws IOException { + currentFrame.decodeMetaData(); + return currentFrame.termState.clone(); + } + + private Lucene40IntersectTermsEnumFrame getFrame(int ord) throws IOException { + if (ord >= stack.length) { + final Lucene40IntersectTermsEnumFrame[] next = new Lucene40IntersectTermsEnumFrame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, next, 0, stack.length); + for(int stackOrd=stack.length;stackOrd getArc(int ord) { + if (ord >= arcs.length) { + @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc[] next = + new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, next, 0, arcs.length); + for(int arcOrd=arcs.length;arcOrd(); + } + arcs = next; + } + return arcs[ord]; + } + + private Lucene40IntersectTermsEnumFrame pushFrame(int state) throws IOException { + final Lucene40IntersectTermsEnumFrame f = getFrame(currentFrame == null ? 0 : 1+currentFrame.ord); + + f.fp = f.fpOrig = currentFrame.lastSubFP; + f.prefix = currentFrame.prefix + currentFrame.suffix; + // if (DEBUG) System.out.println(" pushFrame state=" + state + " prefix=" + f.prefix); + f.setState(state); + + // Walk the arc through the index -- we only + // "bother" with this so we can get the floor data + // from the index and skip floor blocks when + // possible: + FST.Arc arc = currentFrame.arc; + int idx = currentFrame.prefix; + assert currentFrame.suffix > 0; + BytesRef output = currentFrame.outputPrefix; + while (idx < f.prefix) { + final int target = term.bytes[idx] & 0xff; + // TODO: we could be more efficient for the next() + // case by using current arc as starting point, + // passed to findTargetArc + arc = fr.index.findTargetArc(target, arc, getArc(1+idx), fstReader); + assert arc != null; + output = fstOutputs.add(output, arc.output); + idx++; + } + + f.arc = arc; + f.outputPrefix = output; + assert arc.isFinal(); + f.load(fstOutputs.add(output, arc.nextFinalOutput)); + return f; + } + + @Override + public BytesRef term() { + return term; + } + + @Override + public int docFreq() throws IOException { + //if (DEBUG) System.out.println("BTIR.docFreq"); + currentFrame.decodeMetaData(); + //if (DEBUG) System.out.println(" return " + currentFrame.termState.docFreq); + return currentFrame.termState.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + currentFrame.decodeMetaData(); + return currentFrame.termState.totalTermFreq; + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse, int flags) throws IOException { + currentFrame.decodeMetaData(); + return fr.parent.postingsReader.docs(fr.fieldInfo, currentFrame.termState, skipDocs, reuse, flags); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { + if (fr.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { + // Positions were not indexed: + return null; + } + + currentFrame.decodeMetaData(); + return fr.parent.postingsReader.docsAndPositions(fr.fieldInfo, currentFrame.termState, skipDocs, reuse, flags); + } + + private int getState() { + int state = currentFrame.state; + for(int idx=0;idx 0) { + // A prefix of the common suffix overlaps with + // the suffix of the block prefix so we first + // test whether the prefix part matches: + final byte[] termBytes = term.bytes; + int termBytesPos = 
currentFrame.prefix - lenInPrefix; + assert termBytesPos >= 0; + final int termBytesPosEnd = currentFrame.prefix; + while (termBytesPos < termBytesPosEnd) { + if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { + // if (DEBUG) { + // System.out.println(" skip: common suffix mismatch (in prefix)"); + // } + continue nextTerm; + } + } + suffixBytesPos = currentFrame.startBytePos; + } else { + suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - compiledAutomaton.commonSuffixRef.length; + } + + // Test overlapping suffix part: + final int commonSuffixBytesPosEnd = compiledAutomaton.commonSuffixRef.length; + while (commonSuffixBytesPos < commonSuffixBytesPosEnd) { + if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { + // if (DEBUG) { + // System.out.println(" skip: common suffix mismatch"); + // } + continue nextTerm; + } + } + } + + // TODO: maybe we should do the same linear test + // that AutomatonTermsEnum does, so that if we + // reach a part of the automaton where .* is + // "temporarily" accepted, we just blindly .next() + // until the limit + + // See if the term prefix matches the automaton: + int state = currentFrame.state; + for (int idx=0;idx arc; + + final BlockTermState termState; + + // metadata buffer, holding monotonic values + public long[] longs; + // metadata buffer, holding general values + public byte[] bytes; + ByteArrayDataInput bytesReader; + + // Cumulative output so far + BytesRef outputPrefix; + + int startBytePos; + int suffix; + + private final Lucene40IntersectTermsEnum ite; + + public Lucene40IntersectTermsEnumFrame(Lucene40IntersectTermsEnum ite, int ord) throws IOException { + this.ite = ite; + this.ord = ord; + this.termState = ite.fr.parent.postingsReader.newTermState(); + this.termState.totalTermFreq = -1; + this.longs = new long[ite.fr.longsSize]; + } + + void loadNextFloorBlock() throws IOException { + assert numFollowFloorBlocks > 0; + //if (DEBUG) System.out.println(" loadNextFoorBlock trans=" + transitions[transitionIndex]); + + do { + fp = fpOrig + (floorDataReader.readVLong() >>> 1); + numFollowFloorBlocks--; + // if (DEBUG) System.out.println(" skip floor block2! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[transitionIndex].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks); + if (numFollowFloorBlocks != 0) { + nextFloorLabel = floorDataReader.readByte() & 0xff; + } else { + nextFloorLabel = 256; + } + // if (DEBUG) System.out.println(" nextFloorLabel=" + (char) nextFloorLabel); + } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min); + + load(null); + } + + public void setState(int state) { + this.state = state; + transitionIndex = 0; + transitionCount = ite.compiledAutomaton.automaton.getNumTransitions(state); + if (transitionCount != 0) { + ite.compiledAutomaton.automaton.initTransition(state, transition); + ite.compiledAutomaton.automaton.getNextTransition(transition); + curTransitionMax = transition.max; + } else { + curTransitionMax = -1; + } + } + + void load(BytesRef frameIndexData) throws IOException { + + // if (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? 
transitions[0] : "n/a" + " state=" + state)); + + if (frameIndexData != null && transitionCount != 0) { + // Floor frame + if (floorData.length < frameIndexData.length) { + this.floorData = new byte[ArrayUtil.oversize(frameIndexData.length, 1)]; + } + System.arraycopy(frameIndexData.bytes, frameIndexData.offset, floorData, 0, frameIndexData.length); + floorDataReader.reset(floorData, 0, frameIndexData.length); + // Skip first long -- has redundant fp, hasTerms + // flag, isFloor flag + final long code = floorDataReader.readVLong(); + if ((code & Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) { + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + // if (DEBUG) System.out.println(" numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel); + + // If current state is accept, we must process + // first block in case it has empty suffix: + if (!ite.runAutomaton.isAccept(state)) { + // Maybe skip floor blocks: + assert transitionIndex == 0: "transitionIndex=" + transitionIndex; + while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min) { + fp = fpOrig + (floorDataReader.readVLong() >>> 1); + numFollowFloorBlocks--; + // if (DEBUG) System.out.println(" skip floor block! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks); + if (numFollowFloorBlocks != 0) { + nextFloorLabel = floorDataReader.readByte() & 0xff; + } else { + nextFloorLabel = 256; + } + } + } + } + } + + ite.in.seek(fp); + int code = ite.in.readVInt(); + entCount = code >>> 1; + assert entCount > 0; + isLastInFloor = (code & 1) != 0; + + // term suffixes: + code = ite.in.readVInt(); + isLeafBlock = (code & 1) != 0; + int numBytes = code >>> 1; + // if (DEBUG) System.out.println(" entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes); + if (suffixBytes.length < numBytes) { + suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ite.in.readBytes(suffixBytes, 0, numBytes); + suffixesReader.reset(suffixBytes, 0, numBytes); + + // stats + numBytes = ite.in.readVInt(); + if (statBytes.length < numBytes) { + statBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ite.in.readBytes(statBytes, 0, numBytes); + statsReader.reset(statBytes, 0, numBytes); + metaDataUpto = 0; + + termState.termBlockOrd = 0; + nextEnt = 0; + + // metadata + numBytes = ite.in.readVInt(); + if (bytes == null) { + bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + bytesReader = new ByteArrayDataInput(); + } else if (bytes.length < numBytes) { + bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ite.in.readBytes(bytes, 0, numBytes); + bytesReader.reset(bytes, 0, numBytes); + + if (!isLastInFloor) { + // Sub-blocks of a single floor block are always + // written one after another -- tail recurse: + fpEnd = ite.in.getFilePointer(); + } + } + + // TODO: maybe add scanToLabel; should give perf boost + + public boolean next() { + return isLeafBlock ? 
nextLeaf() : nextNonLeaf(); + } + + // Decodes next entry; returns true if it's a sub-block + public boolean nextLeaf() { + //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + suffix = suffixesReader.readVInt(); + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffix); + return false; + } + + public boolean nextNonLeaf() { + //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + final int code = suffixesReader.readVInt(); + suffix = code >>> 1; + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffix); + if ((code & 1) == 0) { + // A normal term + termState.termBlockOrd++; + return false; + } else { + // A sub-block; make sub-FP absolute: + lastSubFP = fp - suffixesReader.readVLong(); + return true; + } + } + + public int getTermBlockOrd() { + return isLeafBlock ? nextEnt : termState.termBlockOrd; + } + + public void decodeMetaData() throws IOException { + + // lazily catch up on metadata decode: + final int limit = getTermBlockOrd(); + boolean absolute = metaDataUpto == 0; + assert limit > 0; + + // TODO: better API would be "jump straight to term=N"??? + while (metaDataUpto < limit) { + + // TODO: we could make "tiers" of metadata, ie, + // decode docFreq/totalTF but don't decode postings + // metadata; this way caller could get + // docFreq/totalTF w/o paying decode cost for + // postings + + // TODO: if docFreq were bulk decoded we could + // just skipN here: + + // stats + termState.docFreq = statsReader.readVInt(); + //if (DEBUG) System.out.println(" dF=" + state.docFreq); + if (ite.fr.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { + termState.totalTermFreq = termState.docFreq + statsReader.readVLong(); + //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); + } + // metadata + for (int i = 0; i < ite.fr.longsSize; i++) { + longs[i] = bytesReader.readVLong(); + } + ite.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ite.fr.fieldInfo, termState, absolute); + + metaDataUpto++; + absolute = false; + } + termState.termBlockOrd = metaDataUpto; + } +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40SegmentTermsEnum.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40SegmentTermsEnum.java new file mode 100644 index 00000000000..fb8f0999c9f --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40SegmentTermsEnum.java @@ -0,0 +1,1051 @@ +package org.apache.lucene.codecs.blocktree; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.PrintStream; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Util; + +/** + * Iterates through terms in this field + * @deprecated Only for 4.x backcompat + */ +@Deprecated +final class Lucene40SegmentTermsEnum extends TermsEnum { + + // Lazy init: + IndexInput in; + + private Lucene40SegmentTermsEnumFrame[] stack; + private final Lucene40SegmentTermsEnumFrame staticFrame; + Lucene40SegmentTermsEnumFrame currentFrame; + boolean termExists; + final Lucene40FieldReader fr; + + private int targetBeforeCurrentLength; + + // static boolean DEBUG = false; + + private final ByteArrayDataInput scratchReader = new ByteArrayDataInput(); + + // What prefix of the current term was present in the index; when we only next() through the index, this stays at 0. It's only set when + // we seekCeil/Exact: + private int validIndexPrefix; + + // assert only: + private boolean eof; + + final BytesRefBuilder term = new BytesRefBuilder(); + private final FST.BytesReader fstReader; + + @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc[] arcs = new FST.Arc[1]; + + public Lucene40SegmentTermsEnum(Lucene40FieldReader fr) throws IOException { + this.fr = fr; + + // if (DEBUG) { + // System.out.println("BTTR.init seg=" + fr.parent.segment); + // } + stack = new Lucene40SegmentTermsEnumFrame[0]; + + // Used to hold seek by TermState, or cached seek + staticFrame = new Lucene40SegmentTermsEnumFrame(this, -1); + + if (fr.index == null) { + fstReader = null; + } else { + fstReader = fr.index.getBytesReader(); + } + + // Init w/ root block; don't use index since it may + // not (and need not) have been loaded + for(int arcIdx=0;arcIdx(); + } + + currentFrame = staticFrame; + final FST.Arc arc; + if (fr.index != null) { + arc = fr.index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + //currentFrame = pushFrame(arc, rootCode, 0); + //currentFrame.loadBlock(); + validIndexPrefix = 0; + // if (DEBUG) { + // System.out.println("init frame state " + currentFrame.ord); + // printSeekState(); + // } + + //System.out.println(); + // computeBlockStats().print(System.out); + } + + // Not private to avoid synthetic access$NNN methods + void initIndexInput() { + if (this.in == null) { + this.in = fr.parent.in.clone(); + } + } + + /** Runs next() through the entire terms dict, + * computing aggregate statistics. 
*/ + public Lucene40Stats computeBlockStats() throws IOException { + + Lucene40Stats stats = new Lucene40Stats(fr.parent.segment, fr.fieldInfo.name); + if (fr.index != null) { + stats.indexNodeCount = fr.index.getNodeCount(); + stats.indexArcCount = fr.index.getArcCount(); + stats.indexNumBytes = fr.index.ramBytesUsed(); + } + + currentFrame = staticFrame; + FST.Arc arc; + if (fr.index != null) { + arc = fr.index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + + // Empty string prefix must have an output in the + // index! + currentFrame = pushFrame(arc, fr.rootCode, 0); + currentFrame.fpOrig = currentFrame.fp; + currentFrame.loadBlock(); + validIndexPrefix = 0; + + stats.startBlock(currentFrame, !currentFrame.isLastInFloor); + + allTerms: + while (true) { + + // Pop finished blocks + while (currentFrame.nextEnt == currentFrame.entCount) { + stats.endBlock(currentFrame); + if (!currentFrame.isLastInFloor) { + currentFrame.loadNextFloorBlock(); + stats.startBlock(currentFrame, true); + } else { + if (currentFrame.ord == 0) { + break allTerms; + } + final long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord-1]; + assert lastFP == currentFrame.lastSubFP; + // if (DEBUG) { + // System.out.println(" reset validIndexPrefix=" + validIndexPrefix); + // } + } + } + + while(true) { + if (currentFrame.next()) { + // Push to new block: + currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length()); + currentFrame.fpOrig = currentFrame.fp; + // This is a "next" frame -- even if it's + // floor'd we must pretend it isn't so we don't + // try to scan to the right floor frame: + currentFrame.isFloor = false; + //currentFrame.hasTerms = true; + currentFrame.loadBlock(); + stats.startBlock(currentFrame, !currentFrame.isLastInFloor); + } else { + stats.term(term.get()); + break; + } + } + } + + stats.finish(); + + // Put root frame back: + currentFrame = staticFrame; + if (fr.index != null) { + arc = fr.index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! 
+ assert arc.isFinal(); + } else { + arc = null; + } + currentFrame = pushFrame(arc, fr.rootCode, 0); + currentFrame.rewind(); + currentFrame.loadBlock(); + validIndexPrefix = 0; + term.clear(); + + return stats; + } + + private Lucene40SegmentTermsEnumFrame getFrame(int ord) throws IOException { + if (ord >= stack.length) { + final Lucene40SegmentTermsEnumFrame[] next = new Lucene40SegmentTermsEnumFrame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, next, 0, stack.length); + for(int stackOrd=stack.length;stackOrd getArc(int ord) { + if (ord >= arcs.length) { + @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc[] next = + new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, next, 0, arcs.length); + for(int arcOrd=arcs.length;arcOrd(); + } + arcs = next; + } + return arcs[ord]; + } + + // Pushes a frame we seek'd to + Lucene40SegmentTermsEnumFrame pushFrame(FST.Arc arc, BytesRef frameData, int length) throws IOException { + scratchReader.reset(frameData.bytes, frameData.offset, frameData.length); + final long code = scratchReader.readVLong(); + final long fpSeek = code >>> Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; + final Lucene40SegmentTermsEnumFrame f = getFrame(1+currentFrame.ord); + f.hasTerms = (code & Lucene40BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0; + f.hasTermsOrig = f.hasTerms; + f.isFloor = (code & Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0; + if (f.isFloor) { + f.setFloorData(scratchReader, frameData); + } + pushFrame(arc, fpSeek, length); + + return f; + } + + // Pushes next'd frame or seek'd frame; we later + // lazy-load the frame only when needed + Lucene40SegmentTermsEnumFrame pushFrame(FST.Arc arc, long fp, int length) throws IOException { + final Lucene40SegmentTermsEnumFrame f = getFrame(1+currentFrame.ord); + f.arc = arc; + if (f.fpOrig == fp && f.nextEnt != -1) { + //if (DEBUG) System.out.println(" push reused frame ord=" + f.ord + " fp=" + f.fp + " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + term.length + " vs prefix=" + f.prefix); + //if (f.prefix > targetBeforeCurrentLength) { + if (f.ord > targetBeforeCurrentLength) { + f.rewind(); + } else { + // if (DEBUG) { + // System.out.println(" skip rewind!"); + // } + } + assert length == f.prefix; + } else { + f.nextEnt = -1; + f.prefix = length; + f.state.termBlockOrd = 0; + f.fpOrig = f.fp = fp; + f.lastSubFP = -1; + // if (DEBUG) { + // final int sav = term.length; + // term.length = length; + // System.out.println(" push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " pref=" + brToString(term)); + // term.length = sav; + // } + } + + return f; + } + + // asserts only + private boolean clearEOF() { + eof = false; + return true; + } + + // asserts only + private boolean setEOF() { + eof = true; + return true; + } + + // for debugging + @SuppressWarnings("unused") + static String brToString(BytesRef b) { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: + return b.toString(); + } + } + + @Override + public boolean seekExact(final BytesRef target) throws IOException { + + if (fr.index == null) { + throw new IllegalStateException("terms index was not loaded"); + } 
+ + term.grow(1 + target.length); + + assert clearEOF(); + + // if (DEBUG) { + // System.out.println("\nBTTR.seekExact seg=" + fr.parent.segment + " target=" + fr.fieldInfo.name + ":" + brToString(target) + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix); + // printSeekState(System.out); + // } + + FST.Arc arc; + int targetUpto; + BytesRef output; + + targetBeforeCurrentLength = currentFrame.ord; + + if (currentFrame != staticFrame) { + + // We are already seek'd; find the common + // prefix of new seek term vs current term and + // re-use the corresponding seek state. For + // example, if app first seeks to foobar, then + // seeks to foobaz, we can re-use the seek state + // for the first 5 bytes. + + // if (DEBUG) { + // System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); + // } + + arc = arcs[0]; + assert arc.isFinal(); + output = arc.output; + targetUpto = 0; + + Lucene40SegmentTermsEnumFrame lastFrame = stack[0]; + assert validIndexPrefix <= term.length(); + + final int targetLimit = Math.min(target.length, validIndexPrefix); + + int cmp = 0; + + // TODO: reverse vLong byte order for better FST + // prefix output sharing + + // First compare up to valid seek frames: + while (targetUpto < targetLimit) { + cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + // if (DEBUG) { + // System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output); + // } + if (cmp != 0) { + break; + } + arc = arcs[1+targetUpto]; + assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + if (arc.output != Lucene40BlockTreeTermsReader.NO_OUTPUT) { + output = Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output); + } + if (arc.isFinal()) { + lastFrame = stack[1+lastFrame.ord]; + } + targetUpto++; + } + + if (cmp == 0) { + final int targetUptoMid = targetUpto; + + // Second compare the rest of the term, but + // don't save arc/output/frame; we only do this + // to find out if the target term is before, + // equal or after the current term + final int targetLimit2 = Math.min(target.length, term.length()); + while (targetUpto < targetLimit2) { + cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + // if (DEBUG) { + // System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); + // } + if (cmp != 0) { + break; + } + targetUpto++; + } + + if (cmp == 0) { + cmp = term.length() - target.length; + } + targetUpto = targetUptoMid; + } + + if (cmp < 0) { + // Common case: target term is after current + // term, ie, app is seeking multiple terms + // in sorted order + // if (DEBUG) { + // System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); frame.ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + + } else if (cmp > 0) { + // Uncommon case: target term + // is before current term; this means we can + // keep the currentFrame but we must rewind it + // (so we scan from the start) + 
targetBeforeCurrentLength = lastFrame.ord; + // if (DEBUG) { + // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + currentFrame.rewind(); + } else { + // Target is exactly the same as current term + assert term.length() == target.length; + if (termExists) { + // if (DEBUG) { + // System.out.println(" target is same as current; return true"); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" target is same as current but term doesn't exist"); + // } + } + //validIndexPrefix = currentFrame.depth; + //term.length = target.length; + //return termExists; + } + + } else { + + targetBeforeCurrentLength = -1; + arc = fr.index.getFirstArc(arcs[0]); + + // Empty string prefix must have an output (block) in the index! + assert arc.isFinal(); + assert arc.output != null; + + // if (DEBUG) { + // System.out.println(" no seek state; push root frame"); + // } + + output = arc.output; + + currentFrame = staticFrame; + + //term.length = 0; + targetUpto = 0; + currentFrame = pushFrame(arc, Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0); + } + + // if (DEBUG) { + // System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength); + // } + + // We are done sharing the common prefix with the incoming target and where we are currently seek'd; now continue walking the index: + while (targetUpto < target.length) { + + final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; + + final FST.Arc nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader); + + if (nextArc == null) { + + // Index is exhausted + // if (DEBUG) { + // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel)); + // } + + validIndexPrefix = currentFrame.prefix; + //validIndexPrefix = targetUpto; + + currentFrame.scanToFloorFrame(target); + + if (!currentFrame.hasTerms) { + termExists = false; + term.setByteAt(targetUpto, (byte) targetLabel); + term.setLength(1+targetUpto); + // if (DEBUG) { + // System.out.println(" FAST NOT_FOUND term=" + brToString(term)); + // } + return false; + } + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, true); + if (result == SeekStatus.FOUND) { + // if (DEBUG) { + // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" got " + result + "; return NOT_FOUND term=" + brToString(term)); + // } + return false; + } + } else { + // Follow this arc + arc = nextArc; + term.setByteAt(targetUpto, (byte) targetLabel); + // Aggregate output as we go: + assert arc.output != null; + if (arc.output != Lucene40BlockTreeTermsReader.NO_OUTPUT) { + output = Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output); + } + + // if (DEBUG) { + // System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); + // } + targetUpto++; + + if (arc.isFinal()) { + //if (DEBUG) System.out.println(" arc is final!"); + currentFrame = pushFrame(arc, Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto); + //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + 
currentFrame.hasTerms); + } + } + } + + //validIndexPrefix = targetUpto; + validIndexPrefix = currentFrame.prefix; + + currentFrame.scanToFloorFrame(target); + + // Target term is entirely contained in the index: + if (!currentFrame.hasTerms) { + termExists = false; + term.setLength(targetUpto); + // if (DEBUG) { + // System.out.println(" FAST NOT_FOUND term=" + brToString(term)); + // } + return false; + } + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, true); + if (result == SeekStatus.FOUND) { + // if (DEBUG) { + // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" got result " + result + "; return NOT_FOUND term=" + term.utf8ToString()); + // } + + return false; + } + } + + @Override + public SeekStatus seekCeil(final BytesRef target) throws IOException { + if (fr.index == null) { + throw new IllegalStateException("terms index was not loaded"); + } + + term.grow(1 + target.length); + + assert clearEOF(); + + // if (DEBUG) { + // System.out.println("\nBTTR.seekCeil seg=" + fr.parent.segment + " target=" + fr.fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix); + // printSeekState(System.out); + // } + + FST.Arc arc; + int targetUpto; + BytesRef output; + + targetBeforeCurrentLength = currentFrame.ord; + + if (currentFrame != staticFrame) { + + // We are already seek'd; find the common + // prefix of new seek term vs current term and + // re-use the corresponding seek state. For + // example, if app first seeks to foobar, then + // seeks to foobaz, we can re-use the seek state + // for the first 5 bytes. 
+ + //if (DEBUG) { + //System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); + //} + + arc = arcs[0]; + assert arc.isFinal(); + output = arc.output; + targetUpto = 0; + + Lucene40SegmentTermsEnumFrame lastFrame = stack[0]; + assert validIndexPrefix <= term.length(); + + final int targetLimit = Math.min(target.length, validIndexPrefix); + + int cmp = 0; + + // TODO: we should write our vLong backwards (MSB + // first) to get better sharing from the FST + + // First compare up to valid seek frames: + while (targetUpto < targetLimit) { + cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + //if (DEBUG) { + //System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output); + //} + if (cmp != 0) { + break; + } + arc = arcs[1+targetUpto]; + assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + // TODO: we could save the outputs in local + // byte[][] instead of making new objs ever + // seek; but, often the FST doesn't have any + // shared bytes (but this could change if we + // reverse vLong byte order) + if (arc.output != Lucene40BlockTreeTermsReader.NO_OUTPUT) { + output = Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output); + } + if (arc.isFinal()) { + lastFrame = stack[1+lastFrame.ord]; + } + targetUpto++; + } + + + if (cmp == 0) { + final int targetUptoMid = targetUpto; + // Second compare the rest of the term, but + // don't save arc/output/frame: + final int targetLimit2 = Math.min(target.length, term.length()); + while (targetUpto < targetLimit2) { + cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + //if (DEBUG) { + //System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); + //} + if (cmp != 0) { + break; + } + targetUpto++; + } + + if (cmp == 0) { + cmp = term.length() - target.length; + } + targetUpto = targetUptoMid; + } + + if (cmp < 0) { + // Common case: target term is after current + // term, ie, app is seeking multiple terms + // in sorted order + //if (DEBUG) { + //System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); clear frame.scanned ord=" + lastFrame.ord); + //} + currentFrame = lastFrame; + + } else if (cmp > 0) { + // Uncommon case: target term + // is before current term; this means we can + // keep the currentFrame but we must rewind it + // (so we scan from the start) + targetBeforeCurrentLength = 0; + //if (DEBUG) { + //System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord); + //} + currentFrame = lastFrame; + currentFrame.rewind(); + } else { + // Target is exactly the same as current term + assert term.length() == target.length; + if (termExists) { + //if (DEBUG) { + //System.out.println(" target is same as current; return FOUND"); + //} + return SeekStatus.FOUND; + } else { + //if (DEBUG) { + //System.out.println(" target is same as current but term doesn't exist"); + //} + } + } + + } else { + + targetBeforeCurrentLength = -1; + arc 
= fr.index.getFirstArc(arcs[0]); + + // Empty string prefix must have an output (block) in the index! + assert arc.isFinal(); + assert arc.output != null; + + //if (DEBUG) { + //System.out.println(" no seek state; push root frame"); + //} + + output = arc.output; + + currentFrame = staticFrame; + + //term.length = 0; + targetUpto = 0; + currentFrame = pushFrame(arc, Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0); + } + + //if (DEBUG) { + //System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord+1=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength); + //} + + // We are done sharing the common prefix with the incoming target and where we are currently seek'd; now continue walking the index: + while (targetUpto < target.length) { + + final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; + + final FST.Arc nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader); + + if (nextArc == null) { + + // Index is exhausted + // if (DEBUG) { + // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel)); + // } + + validIndexPrefix = currentFrame.prefix; + //validIndexPrefix = targetUpto; + + currentFrame.scanToFloorFrame(target); + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, false); + if (result == SeekStatus.END) { + term.copyBytes(target); + termExists = false; + + if (next() != null) { + //if (DEBUG) { + //System.out.println(" return NOT_FOUND term=" + brToString(term) + " " + term); + //} + return SeekStatus.NOT_FOUND; + } else { + //if (DEBUG) { + //System.out.println(" return END"); + //} + return SeekStatus.END; + } + } else { + //if (DEBUG) { + //System.out.println(" return " + result + " term=" + brToString(term) + " " + term); + //} + return result; + } + } else { + // Follow this arc + term.setByteAt(targetUpto, (byte) targetLabel); + arc = nextArc; + // Aggregate output as we go: + assert arc.output != null; + if (arc.output != Lucene40BlockTreeTermsReader.NO_OUTPUT) { + output = Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output); + } + + //if (DEBUG) { + //System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); + //} + targetUpto++; + + if (arc.isFinal()) { + //if (DEBUG) System.out.println(" arc is final!"); + currentFrame = pushFrame(arc, Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto); + //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms); + } + } + } + + //validIndexPrefix = targetUpto; + validIndexPrefix = currentFrame.prefix; + + currentFrame.scanToFloorFrame(target); + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, false); + + if (result == SeekStatus.END) { + term.copyBytes(target); + termExists = false; + if (next() != null) { + //if (DEBUG) { + //System.out.println(" return NOT_FOUND term=" + term.utf8ToString() + " " + term); + //} + return SeekStatus.NOT_FOUND; + } else { + //if (DEBUG) { + //System.out.println(" return END"); + //} + return SeekStatus.END; + } + } else { + return result; + } + } + + @SuppressWarnings("unused") + private void printSeekState(PrintStream out) throws IOException { + if (currentFrame == staticFrame) { + out.println(" no prior seek"); + } else { + 
out.println(" prior seek state:"); + int ord = 0; + boolean isSeekFrame = true; + while(true) { + Lucene40SegmentTermsEnumFrame f = getFrame(ord); + assert f != null; + final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefix); + if (f.nextEnt == -1) { + out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<< Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? Lucene40BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd()); + } else { + out.println(" frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + " nextEnt=" + f.nextEnt + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<< Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? Lucene40BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR:0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd()); + } + if (fr.index != null) { + assert !isSeekFrame || f.arc != null: "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc; + if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix-1)&0xFF)) { + out.println(" broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix-1)&0xFF)); + throw new RuntimeException("seek state is broken"); + } + BytesRef output = Util.get(fr.index, prefix); + if (output == null) { + out.println(" broken seek state: prefix is not final in index"); + throw new RuntimeException("seek state is broken"); + } else if (isSeekFrame && !f.isFloor) { + final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length); + final long codeOrig = reader.readVLong(); + final long code = (f.fp << Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? Lucene40BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS:0) | (f.isFloor ? Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR:0); + if (codeOrig != code) { + out.println(" broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code); + throw new RuntimeException("seek state is broken"); + } + } + } + if (f == currentFrame) { + break; + } + if (f.prefix == validIndexPrefix) { + isSeekFrame = false; + } + ord++; + } + } + } + + /* Decodes only the term bytes of the next term. If caller then asks for + metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily) + decode all metadata up to the current term. */ + @Override + public BytesRef next() throws IOException { + if (in == null) { + // Fresh TermsEnum; seek to first term: + final FST.Arc arc; + if (fr.index != null) { + arc = fr.index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! 
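+        // (every field's terms index maps the empty prefix to that field's
+        // root block, so the first arc must be final)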
+ assert arc.isFinal(); + } else { + arc = null; + } + currentFrame = pushFrame(arc, fr.rootCode, 0); + currentFrame.loadBlock(); + } + + targetBeforeCurrentLength = currentFrame.ord; + + assert !eof; + // if (DEBUG) { + // System.out.println("\nBTTR.next seg=" + fr.parent.segment + " term=" + brToString(term) + " termExists?=" + termExists + " field=" + fr.fieldInfo.name + " termBlockOrd=" + currentFrame.state.termBlockOrd + " validIndexPrefix=" + validIndexPrefix); + // printSeekState(System.out); + // } + + if (currentFrame == staticFrame) { + // If seek was previously called and the term was + // cached, or seek(TermState) was called, usually + // caller is just going to pull a D/&PEnum or get + // docFreq, etc. But, if they then call next(), + // this method catches up all internal state so next() + // works properly: + //if (DEBUG) System.out.println(" re-seek to pending term=" + term.utf8ToString() + " " + term); + final boolean result = seekExact(term.get()); + assert result; + } + + // Pop finished blocks + while (currentFrame.nextEnt == currentFrame.entCount) { + if (!currentFrame.isLastInFloor) { + currentFrame.loadNextFloorBlock(); + } else { + //if (DEBUG) System.out.println(" pop frame"); + if (currentFrame.ord == 0) { + //if (DEBUG) System.out.println(" return null"); + assert setEOF(); + term.clear(); + validIndexPrefix = 0; + currentFrame.rewind(); + termExists = false; + return null; + } + final long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord-1]; + + if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) { + // We popped into a frame that's not loaded + // yet or not scan'd to the right entry + currentFrame.scanToFloorFrame(term.get()); + currentFrame.loadBlock(); + currentFrame.scanToSubBlock(lastFP); + } + + // Note that the seek state (last seek) has been + // invalidated beyond this depth + validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix); + //if (DEBUG) { + //System.out.println(" reset validIndexPrefix=" + validIndexPrefix); + //} + } + } + + while(true) { + if (currentFrame.next()) { + // Push to new block: + //if (DEBUG) System.out.println(" push frame"); + currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length()); + // This is a "next" frame -- even if it's + // floor'd we must pretend it isn't so we don't + // try to scan to the right floor frame: + currentFrame.isFloor = false; + //currentFrame.hasTerms = true; + currentFrame.loadBlock(); + } else { + //if (DEBUG) System.out.println(" return term=" + term.utf8ToString() + " " + term + " currentFrame.ord=" + currentFrame.ord); + return term.get(); + } + } + } + + @Override + public BytesRef term() { + assert !eof; + return term.get(); + } + + @Override + public int docFreq() throws IOException { + assert !eof; + //if (DEBUG) System.out.println("BTR.docFreq"); + currentFrame.decodeMetaData(); + //if (DEBUG) System.out.println(" return " + currentFrame.state.docFreq); + return currentFrame.state.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + assert !eof; + currentFrame.decodeMetaData(); + return currentFrame.state.totalTermFreq; + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse, int flags) throws IOException { + assert !eof; + //if (DEBUG) { + //System.out.println("BTTR.docs seg=" + segment); + //} + currentFrame.decodeMetaData(); + //if (DEBUG) { + //System.out.println(" state=" + currentFrame.state); + //} + return fr.parent.postingsReader.docs(fr.fieldInfo, currentFrame.state, skipDocs, 
reuse, flags); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { + if (fr.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { + // Positions were not indexed: + return null; + } + + assert !eof; + currentFrame.decodeMetaData(); + return fr.parent.postingsReader.docsAndPositions(fr.fieldInfo, currentFrame.state, skipDocs, reuse, flags); + } + + @Override + public void seekExact(BytesRef target, TermState otherState) { + // if (DEBUG) { + // System.out.println("BTTR.seekExact termState seg=" + segment + " target=" + target.utf8ToString() + " " + target + " state=" + otherState); + // } + assert clearEOF(); + if (target.compareTo(term.get()) != 0 || !termExists) { + assert otherState != null && otherState instanceof BlockTermState; + currentFrame = staticFrame; + currentFrame.state.copyFrom(otherState); + term.copyBytes(target); + currentFrame.metaDataUpto = currentFrame.getTermBlockOrd(); + assert currentFrame.metaDataUpto > 0; + validIndexPrefix = 0; + } else { + // if (DEBUG) { + // System.out.println(" skip seek: already on target state=" + currentFrame.state); + // } + } + } + + @Override + public TermState termState() throws IOException { + assert !eof; + currentFrame.decodeMetaData(); + TermState ts = currentFrame.state.clone(); + //if (DEBUG) System.out.println("BTTR.termState seg=" + segment + " state=" + ts); + return ts; + } + + @Override + public void seekExact(long ord) { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() { + throw new UnsupportedOperationException(); + } +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40SegmentTermsEnumFrame.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40SegmentTermsEnumFrame.java new file mode 100644 index 00000000000..1cb35271de8 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/Lucene40SegmentTermsEnumFrame.java @@ -0,0 +1,732 @@ +package org.apache.lucene.codecs.blocktree; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.FST; + +/** + * @deprecated Only for 4.x backcompat + */ +@Deprecated +final class Lucene40SegmentTermsEnumFrame { + // Our index in stack[]: + final int ord; + + boolean hasTerms; + boolean hasTermsOrig; + boolean isFloor; + + FST.Arc arc; + + // File pointer where this block was loaded from + long fp; + long fpOrig; + long fpEnd; + + byte[] suffixBytes = new byte[128]; + final ByteArrayDataInput suffixesReader = new ByteArrayDataInput(); + + byte[] statBytes = new byte[64]; + final ByteArrayDataInput statsReader = new ByteArrayDataInput(); + + byte[] floorData = new byte[32]; + final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); + + // Length of prefix shared by all terms in this block + int prefix; + + // Number of entries (term or sub-block) in this block + int entCount; + + // Which term we will next read, or -1 if the block + // isn't loaded yet + int nextEnt; + + // True if this block is either not a floor block, + // or, it's the last sub-block of a floor block + boolean isLastInFloor; + + // True if all entries are terms + boolean isLeafBlock; + + long lastSubFP; + + int nextFloorLabel; + int numFollowFloorBlocks; + + // Next term to decode metaData; we decode metaData + // lazily so that scanning to find the matching term is + // fast and only if you find a match and app wants the + // stats or docs/positions enums, will we decode the + // metaData + int metaDataUpto; + + final BlockTermState state; + + // metadata buffer, holding monotonic values + public long[] longs; + // metadata buffer, holding general values + public byte[] bytes; + ByteArrayDataInput bytesReader; + + private final Lucene40SegmentTermsEnum ste; + + public Lucene40SegmentTermsEnumFrame(Lucene40SegmentTermsEnum ste, int ord) throws IOException { + this.ste = ste; + this.ord = ord; + this.state = ste.fr.parent.postingsReader.newTermState(); + this.state.totalTermFreq = -1; + this.longs = new long[ste.fr.longsSize]; + } + + public void setFloorData(ByteArrayDataInput in, BytesRef source) { + final int numBytes = source.length - (in.getPosition() - source.offset); + if (numBytes > floorData.length) { + floorData = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + System.arraycopy(source.bytes, source.offset+in.getPosition(), floorData, 0, numBytes); + floorDataReader.reset(floorData, 0, numBytes); + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + //if (DEBUG) { + //System.out.println(" setFloorData fpOrig=" + fpOrig + " bytes=" + new BytesRef(source.bytes, source.offset + in.getPosition(), numBytes) + " numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + toHex(nextFloorLabel)); + //} + } + + public int getTermBlockOrd() { + return isLeafBlock ? 
nextEnt : state.termBlockOrd; + } + + void loadNextFloorBlock() throws IOException { + //if (DEBUG) { + //System.out.println(" loadNextFloorBlock fp=" + fp + " fpEnd=" + fpEnd); + //} + assert arc == null || isFloor: "arc=" + arc + " isFloor=" + isFloor; + fp = fpEnd; + nextEnt = -1; + loadBlock(); + } + + /* Does initial decode of next block of terms; this + doesn't actually decode the docFreq, totalTermFreq, + postings details (frq/prx offset, etc.) metadata; + it just loads them as byte[] blobs which are then + decoded on-demand if the metadata is ever requested + for any term in this block. This enables terms-only + intensive consumes (eg certain MTQs, respelling) to + not pay the price of decoding metadata they won't + use. */ + void loadBlock() throws IOException { + + // Clone the IndexInput lazily, so that consumers + // that just pull a TermsEnum to + // seekExact(TermState) don't pay this cost: + ste.initIndexInput(); + + if (nextEnt != -1) { + // Already loaded + return; + } + //System.out.println("blc=" + blockLoadCount); + + ste.in.seek(fp); + int code = ste.in.readVInt(); + entCount = code >>> 1; + assert entCount > 0; + isLastInFloor = (code & 1) != 0; + + assert arc == null || (isLastInFloor || isFloor): "fp=" + fp + " arc=" + arc + " isFloor=" + isFloor + " isLastInFloor=" + isLastInFloor; + + // TODO: if suffixes were stored in random-access + // array structure, then we could do binary search + // instead of linear scan to find target term; eg + // we could have simple array of offsets + + // term suffixes: + code = ste.in.readVInt(); + isLeafBlock = (code & 1) != 0; + int numBytes = code >>> 1; + if (suffixBytes.length < numBytes) { + suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ste.in.readBytes(suffixBytes, 0, numBytes); + suffixesReader.reset(suffixBytes, 0, numBytes); + + /*if (DEBUG) { + if (arc == null) { + System.out.println(" loadBlock (next) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock); + } else { + System.out.println(" loadBlock (seek) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock); + } + }*/ + + // stats + numBytes = ste.in.readVInt(); + if (statBytes.length < numBytes) { + statBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ste.in.readBytes(statBytes, 0, numBytes); + statsReader.reset(statBytes, 0, numBytes); + metaDataUpto = 0; + + state.termBlockOrd = 0; + nextEnt = 0; + lastSubFP = -1; + + // TODO: we could skip this if !hasTerms; but + // that's rare so won't help much + // metadata + numBytes = ste.in.readVInt(); + if (bytes == null) { + bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + bytesReader = new ByteArrayDataInput(); + } else if (bytes.length < numBytes) { + bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ste.in.readBytes(bytes, 0, numBytes); + bytesReader.reset(bytes, 0, numBytes); + + + // Sub-blocks of a single floor block are always + // written one after another -- tail recurse: + fpEnd = ste.in.getFilePointer(); + // if (DEBUG) { + // System.out.println(" fpEnd=" + fpEnd); + // } + } + + void rewind() { + + // Force reload: + fp = fpOrig; + nextEnt = -1; + hasTerms = hasTermsOrig; + if (isFloor) { + floorDataReader.rewind(); + numFollowFloorBlocks = floorDataReader.readVInt(); + assert numFollowFloorBlocks > 0; + nextFloorLabel = floorDataReader.readByte() & 0xff; + } + + /* + 
//System.out.println("rewind"); + // Keeps the block loaded, but rewinds its state: + if (nextEnt > 0 || fp != fpOrig) { + if (DEBUG) { + System.out.println(" rewind frame ord=" + ord + " fpOrig=" + fpOrig + " fp=" + fp + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " nextEnt=" + nextEnt + " prefixLen=" + prefix); + } + if (fp != fpOrig) { + fp = fpOrig; + nextEnt = -1; + } else { + nextEnt = 0; + } + hasTerms = hasTermsOrig; + if (isFloor) { + floorDataReader.rewind(); + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + } + assert suffixBytes != null; + suffixesReader.rewind(); + assert statBytes != null; + statsReader.rewind(); + metaDataUpto = 0; + state.termBlockOrd = 0; + // TODO: skip this if !hasTerms? Then postings + // impl wouldn't have to write useless 0 byte + postingsReader.resetTermsBlock(fieldInfo, state); + lastSubFP = -1; + } else if (DEBUG) { + System.out.println(" skip rewind fp=" + fp + " fpOrig=" + fpOrig + " nextEnt=" + nextEnt + " ord=" + ord); + } + */ + } + + public boolean next() { + return isLeafBlock ? nextLeaf() : nextNonLeaf(); + } + + // Decodes next entry; returns true if it's a sub-block + public boolean nextLeaf() { + //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + suffix = suffixesReader.readVInt(); + startBytePos = suffixesReader.getPosition(); + ste.term.setLength(prefix + suffix); + ste.term.grow(ste.term.length()); + suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); + // A normal term + ste.termExists = true; + return false; + } + + public boolean nextNonLeaf() { + //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + final int code = suffixesReader.readVInt(); + suffix = code >>> 1; + startBytePos = suffixesReader.getPosition(); + ste.term.setLength(prefix + suffix); + ste.term.grow(ste.term.length()); + suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); + if ((code & 1) == 0) { + // A normal term + ste.termExists = true; + subCode = 0; + state.termBlockOrd++; + return false; + } else { + // A sub-block; make sub-FP absolute: + ste.termExists = false; + subCode = suffixesReader.readVLong(); + lastSubFP = fp - subCode; + //if (DEBUG) { + //System.out.println(" lastSubFP=" + lastSubFP); + //} + return true; + } + } + + // TODO: make this array'd so we can do bin search? + // likely not worth it? 
need to measure how many + // floor blocks we "typically" get + public void scanToFloorFrame(BytesRef target) { + + if (!isFloor || target.length <= prefix) { + // if (DEBUG) { + // System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + target.length + " vs prefix=" + prefix); + // } + return; + } + + final int targetLabel = target.bytes[target.offset + prefix] & 0xFF; + + // if (DEBUG) { + // System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + toHex(targetLabel) + " vs nextFloorLabel=" + toHex(nextFloorLabel) + " numFollowFloorBlocks=" + numFollowFloorBlocks); + // } + + if (targetLabel < nextFloorLabel) { + // if (DEBUG) { + // System.out.println(" already on correct block"); + // } + return; + } + + assert numFollowFloorBlocks != 0; + + long newFP = fpOrig; + while (true) { + final long code = floorDataReader.readVLong(); + newFP = fpOrig + (code >>> 1); + hasTerms = (code & 1) != 0; + // if (DEBUG) { + // System.out.println(" label=" + toHex(nextFloorLabel) + " fp=" + newFP + " hasTerms?=" + hasTerms + " numFollowFloor=" + numFollowFloorBlocks); + // } + + isLastInFloor = numFollowFloorBlocks == 1; + numFollowFloorBlocks--; + + if (isLastInFloor) { + nextFloorLabel = 256; + // if (DEBUG) { + // System.out.println(" stop! last block nextFloorLabel=" + toHex(nextFloorLabel)); + // } + break; + } else { + nextFloorLabel = floorDataReader.readByte() & 0xff; + if (targetLabel < nextFloorLabel) { + // if (DEBUG) { + // System.out.println(" stop! nextFloorLabel=" + toHex(nextFloorLabel)); + // } + break; + } + } + } + + if (newFP != fp) { + // Force re-load of the block: + // if (DEBUG) { + // System.out.println(" force switch to fp=" + newFP + " oldFP=" + fp); + // } + nextEnt = -1; + fp = newFP; + } else { + // if (DEBUG) { + // System.out.println(" stay on same fp=" + newFP); + // } + } + } + + public void decodeMetaData() throws IOException { + + //if (DEBUG) System.out.println("\nBTTR.decodeMetadata seg=" + segment + " mdUpto=" + metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd); + + // lazily catch up on metadata decode: + final int limit = getTermBlockOrd(); + boolean absolute = metaDataUpto == 0; + assert limit > 0; + + // TODO: better API would be "jump straight to term=N"??? 
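+    // Catch up one entry at a time: each iteration reads docFreq (and, for
+    // fields indexed with more than DOCS_ONLY, the totalTermFreq delta) from
+    // the stats blob, then asks the postings reader to decode its per-term
+    // metadata from the metadata blob. Only the first entry decoded after a
+    // block load is absolute; later entries are deltas against the previous
+    // one.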
+ while (metaDataUpto < limit) { + + // TODO: we could make "tiers" of metadata, ie, + // decode docFreq/totalTF but don't decode postings + // metadata; this way caller could get + // docFreq/totalTF w/o paying decode cost for + // postings + + // TODO: if docFreq were bulk decoded we could + // just skipN here: + + // stats + state.docFreq = statsReader.readVInt(); + //if (DEBUG) System.out.println(" dF=" + state.docFreq); + if (ste.fr.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { + state.totalTermFreq = state.docFreq + statsReader.readVLong(); + //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); + } + // metadata + for (int i = 0; i < ste.fr.longsSize; i++) { + longs[i] = bytesReader.readVLong(); + } + ste.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ste.fr.fieldInfo, state, absolute); + + metaDataUpto++; + absolute = false; + } + state.termBlockOrd = metaDataUpto; + } + + // Used only by assert + private boolean prefixMatches(BytesRef target) { + for(int bytePos=0;bytePos 0 : "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd; + totalBlockOtherBytes += otherBytes; + } + + void term(BytesRef term) { + totalTermBytes += term.length; + } + + void finish() { + assert startBlockCount == endBlockCount: "startBlockCount=" + startBlockCount + " endBlockCount=" + endBlockCount; + assert totalBlockCount == floorSubBlockCount + nonFloorBlockCount: "floorSubBlockCount=" + floorSubBlockCount + " nonFloorBlockCount=" + nonFloorBlockCount + " totalBlockCount=" + totalBlockCount; + assert totalBlockCount == mixedBlockCount + termsOnlyBlockCount + subBlocksOnlyBlockCount: "totalBlockCount=" + totalBlockCount + " mixedBlockCount=" + mixedBlockCount + " subBlocksOnlyBlockCount=" + subBlocksOnlyBlockCount + " termsOnlyBlockCount=" + termsOnlyBlockCount; + } + + @Override + public String toString() { + final ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); + PrintStream out; + try { + out = new PrintStream(bos, false, IOUtils.UTF_8); + } catch (UnsupportedEncodingException bogus) { + throw new RuntimeException(bogus); + } + + out.println(" index FST:"); + out.println(" " + indexNodeCount + " nodes"); + out.println(" " + indexArcCount + " arcs"); + out.println(" " + indexNumBytes + " bytes"); + out.println(" terms:"); + out.println(" " + totalTermCount + " terms"); + out.println(" " + totalTermBytes + " bytes" + (totalTermCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalTermBytes)/totalTermCount) + " bytes/term)" : "")); + out.println(" blocks:"); + out.println(" " + totalBlockCount + " blocks"); + out.println(" " + termsOnlyBlockCount + " terms-only blocks"); + out.println(" " + subBlocksOnlyBlockCount + " sub-block-only blocks"); + out.println(" " + mixedBlockCount + " mixed blocks"); + out.println(" " + floorBlockCount + " floor blocks"); + out.println(" " + (totalBlockCount-floorSubBlockCount) + " non-floor blocks"); + out.println(" " + floorSubBlockCount + " floor sub-blocks"); + out.println(" " + totalBlockSuffixBytes + " term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockSuffixBytes)/totalBlockCount) + " suffix-bytes/block)" : "")); + out.println(" " + totalBlockStatsBytes + " term stats bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : "")); + out.println(" " + totalBlockOtherBytes + " other bytes" + (totalBlockCount != 0 ? 
" (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockOtherBytes)/totalBlockCount) + " other-bytes/block)" : "")); + if (totalBlockCount != 0) { + out.println(" by prefix length:"); + int total = 0; + for(int prefix=0;prefix + + + + + + + +BlockTree terms dictionary from Lucene 4.0-4.10 + + diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsBaseFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsBaseFormat.java deleted file mode 100644 index 5d278db4209..00000000000 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsBaseFormat.java +++ /dev/null @@ -1,48 +0,0 @@ -package org.apache.lucene.codecs.lucene40; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.codecs.PostingsBaseFormat; -import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SegmentWriteState; - -/** - * PostingsReaderBase for 4.0 segments - * @deprecated Only for reading old 4.0 segments */ -@Deprecated -final class Lucene40PostingsBaseFormat extends PostingsBaseFormat { - - /** Sole constructor. 
*/ - Lucene40PostingsBaseFormat() { - super("Lucene40"); - } - - @Override - public PostingsReaderBase postingsReaderBase(SegmentReadState state) throws IOException { - return new Lucene40PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix); - } - - @Override - public PostingsWriterBase postingsWriterBase(SegmentWriteState state) throws IOException { - throw new UnsupportedOperationException("this codec can only be used for reading"); - } -} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java index e6e3eeaf53b..eeb7ca09b59 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java @@ -23,7 +23,7 @@ import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader; +import org.apache.lucene.codecs.blocktree.Lucene40BlockTreeTermsReader; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -51,13 +51,7 @@ public class Lucene40PostingsFormat extends PostingsFormat { boolean success = false; try { - FieldsProducer ret = new BlockTreeTermsReader( - state.directory, - state.fieldInfos, - state.segmentInfo, - postings, - state.context, - state.segmentSuffix); + FieldsProducer ret = new Lucene40BlockTreeTermsReader(postings, state); success = true; return ret; } finally { diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java index 05ffeda2591..d60b15b15f1 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java @@ -31,6 +31,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.TermState; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.Directory; @@ -104,7 +105,7 @@ final class Lucene40PostingsReader extends PostingsReaderBase { } @Override - public void init(IndexInput termsIn) throws IOException { + public void init(IndexInput termsIn, SegmentReadState state) throws IOException { // Make sure we are talking to the matching past writer CodecUtil.checkHeader(termsIn, TERMS_CODEC, VERSION_START, VERSION_CURRENT); diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfoFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfoFormat.java index ee3a20b9e1e..6ec50210a7e 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfoFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfoFormat.java @@ -45,7 +45,7 @@ public class Lucene40SegmentInfoFormat extends SegmentInfoFormat { } @Override - public final SegmentInfo read(Directory 
dir, String segment, IOContext context) throws IOException { + public final SegmentInfo read(Directory dir, String segment, byte segmentID[], IOContext context) throws IOException { final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene40SegmentInfoFormat.SI_EXTENSION); final IndexInput input = dir.openInput(fileName, context); boolean success = false; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java new file mode 100644 index 00000000000..cf76197f9ee --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java @@ -0,0 +1,247 @@ +package org.apache.lucene.codecs.lucene41; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.packed.PackedInts.Decoder; +import org.apache.lucene.util.packed.PackedInts.FormatAndBits; +import org.apache.lucene.util.packed.PackedInts; + +import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE; + +/** + * Lucene 4.1 postings format. + * @deprecated only for reading old 4.x segments + */ +@Deprecated +final class ForUtil { + + /** + * Special number of bits per value used whenever all values to encode are equal. + */ + private static final int ALL_VALUES_EQUAL = 0; + + /** + * Upper limit of the number of bytes that might be required to stored + * BLOCK_SIZE encoded values. + */ + static final int MAX_ENCODED_SIZE = BLOCK_SIZE * 4; + + /** + * Upper limit of the number of values that might be decoded in a single call to + * {@link #readBlock(IndexInput, byte[], int[])}. Although values after + * BLOCK_SIZE are garbage, it is necessary to allocate value buffers + * whose size is >= MAX_DATA_SIZE to avoid {@link ArrayIndexOutOfBoundsException}s. + */ + static final int MAX_DATA_SIZE; + static { + int maxDataSize = 0; + for(int version=PackedInts.VERSION_START;version<=PackedInts.VERSION_CURRENT;version++) { + for (PackedInts.Format format : PackedInts.Format.values()) { + for (int bpv = 1; bpv <= 32; ++bpv) { + if (!format.isSupported(bpv)) { + continue; + } + final PackedInts.Decoder decoder = PackedInts.getDecoder(format, version, bpv); + final int iterations = computeIterations(decoder); + maxDataSize = Math.max(maxDataSize, iterations * decoder.byteValueCount()); + } + } + } + MAX_DATA_SIZE = maxDataSize; + } + + /** + * Compute the number of iterations required to decode BLOCK_SIZE + * values with the provided {@link Decoder}. 
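+   * For instance, with BLOCK_SIZE = 128 and a decoder whose
+   * byteValueCount() is 32, four iterations are required.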
+ */ + private static int computeIterations(PackedInts.Decoder decoder) { + return (int) Math.ceil((float) BLOCK_SIZE / decoder.byteValueCount()); + } + + /** + * Compute the number of bytes required to encode a block of values that require + * bitsPerValue bits per value with format format. + */ + private static int encodedSize(PackedInts.Format format, int packedIntsVersion, int bitsPerValue) { + final long byteCount = format.byteCount(packedIntsVersion, BLOCK_SIZE, bitsPerValue); + assert byteCount >= 0 && byteCount <= Integer.MAX_VALUE : byteCount; + return (int) byteCount; + } + + private final int[] encodedSizes; + private final PackedInts.Encoder[] encoders; + private final PackedInts.Decoder[] decoders; + private final int[] iterations; + + /** + * Create a new {@link ForUtil} instance and save state into out. + */ + ForUtil(float acceptableOverheadRatio, DataOutput out) throws IOException { + out.writeVInt(PackedInts.VERSION_CURRENT); + encodedSizes = new int[33]; + encoders = new PackedInts.Encoder[33]; + decoders = new PackedInts.Decoder[33]; + iterations = new int[33]; + + for (int bpv = 1; bpv <= 32; ++bpv) { + final FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits( + BLOCK_SIZE, bpv, acceptableOverheadRatio); + assert formatAndBits.format.isSupported(formatAndBits.bitsPerValue); + assert formatAndBits.bitsPerValue <= 32; + encodedSizes[bpv] = encodedSize(formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue); + encoders[bpv] = PackedInts.getEncoder( + formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue); + decoders[bpv] = PackedInts.getDecoder( + formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue); + iterations[bpv] = computeIterations(decoders[bpv]); + + out.writeVInt(formatAndBits.format.getId() << 5 | (formatAndBits.bitsPerValue - 1)); + } + } + + /** + * Restore a {@link ForUtil} from a {@link DataInput}. + */ + ForUtil(DataInput in) throws IOException { + int packedIntsVersion = in.readVInt(); + PackedInts.checkVersion(packedIntsVersion); + encodedSizes = new int[33]; + encoders = new PackedInts.Encoder[33]; + decoders = new PackedInts.Decoder[33]; + iterations = new int[33]; + + for (int bpv = 1; bpv <= 32; ++bpv) { + final int code = in.readVInt(); + final int formatId = code >>> 5; + final int bitsPerValue = (code & 31) + 1; + + final PackedInts.Format format = PackedInts.Format.byId(formatId); + assert format.isSupported(bitsPerValue); + encodedSizes[bpv] = encodedSize(format, packedIntsVersion, bitsPerValue); + encoders[bpv] = PackedInts.getEncoder( + format, packedIntsVersion, bitsPerValue); + decoders[bpv] = PackedInts.getDecoder( + format, packedIntsVersion, bitsPerValue); + iterations[bpv] = computeIterations(decoders[bpv]); + } + } + + /** + * Write a block of data (For format). 
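+   * When all values in the block are equal, only a marker byte (0) and the
+   * shared value are written; otherwise the number of bits per value is
+   * written, followed by the packed representation of the block.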
+ * + * @param data the data to write + * @param encoded a buffer to use to encode data + * @param out the destination output + * @throws IOException If there is a low-level I/O error + */ + void writeBlock(int[] data, byte[] encoded, IndexOutput out) throws IOException { + if (isAllEqual(data)) { + out.writeByte((byte) ALL_VALUES_EQUAL); + out.writeVInt(data[0]); + return; + } + + final int numBits = bitsRequired(data); + assert numBits > 0 && numBits <= 32 : numBits; + final PackedInts.Encoder encoder = encoders[numBits]; + final int iters = iterations[numBits]; + assert iters * encoder.byteValueCount() >= BLOCK_SIZE; + final int encodedSize = encodedSizes[numBits]; + assert iters * encoder.byteBlockCount() >= encodedSize; + + out.writeByte((byte) numBits); + + encoder.encode(data, 0, encoded, 0, iters); + out.writeBytes(encoded, encodedSize); + } + + /** + * Read the next block of data (For format). + * + * @param in the input to use to read data + * @param encoded a buffer that can be used to store encoded data + * @param decoded where to write decoded data + * @throws IOException If there is a low-level I/O error + */ + void readBlock(IndexInput in, byte[] encoded, int[] decoded) throws IOException { + final int numBits = in.readByte(); + assert numBits <= 32 : numBits; + + if (numBits == ALL_VALUES_EQUAL) { + final int value = in.readVInt(); + Arrays.fill(decoded, 0, BLOCK_SIZE, value); + return; + } + + final int encodedSize = encodedSizes[numBits]; + in.readBytes(encoded, 0, encodedSize); + + final PackedInts.Decoder decoder = decoders[numBits]; + final int iters = iterations[numBits]; + assert iters * decoder.byteValueCount() >= BLOCK_SIZE; + + decoder.decode(encoded, 0, decoded, 0, iters); + } + + /** + * Skip the next block of data. + * + * @param in the input where to read data + * @throws IOException If there is a low-level I/O error + */ + void skipBlock(IndexInput in) throws IOException { + final int numBits = in.readByte(); + if (numBits == ALL_VALUES_EQUAL) { + in.readVInt(); + return; + } + assert numBits > 0 && numBits <= 32 : numBits; + final int encodedSize = encodedSizes[numBits]; + in.seek(in.getFilePointer() + encodedSize); + } + + private static boolean isAllEqual(final int[] data) { + final int v = data[0]; + for (int i = 1; i < BLOCK_SIZE; ++i) { + if (data[i] != v) { + return false; + } + } + return true; + } + + /** + * Compute the number of bits required to serialize any of the longs in + * data. + */ + private static int bitsRequired(final int[] data) { + long or = 0; + for (int i = 0; i < BLOCK_SIZE; ++i) { + assert data[i] >= 0; + or |= data[i]; + } + return PackedInts.bitsRequired(or); + } + +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/IntBlockTermState.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/IntBlockTermState.java new file mode 100644 index 00000000000..b5add1e7ee7 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/IntBlockTermState.java @@ -0,0 +1,62 @@ +package org.apache.lucene.codecs.lucene41; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.index.TermState; + +/** + * term state for Lucene 4.1 postings format + * @deprecated only for reading old 4.x segments + */ +@Deprecated +final class IntBlockTermState extends BlockTermState { + long docStartFP = 0; + long posStartFP = 0; + long payStartFP = 0; + long skipOffset = -1; + long lastPosBlockOffset = -1; + // docid when there is a single pulsed posting, otherwise -1 + // freq is always implicitly totalTermFreq in this case. + int singletonDocID = -1; + + @Override + public IntBlockTermState clone() { + IntBlockTermState other = new IntBlockTermState(); + other.copyFrom(this); + return other; + } + + @Override + public void copyFrom(TermState _other) { + super.copyFrom(_other); + IntBlockTermState other = (IntBlockTermState) _other; + docStartFP = other.docStartFP; + posStartFP = other.posStartFP; + payStartFP = other.payStartFP; + lastPosBlockOffset = other.lastPosBlockOffset; + skipOffset = other.skipOffset; + singletonDocID = other.singletonDocID; + } + + + @Override + public String toString() { + return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID; + } +} \ No newline at end of file diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java new file mode 100644 index 00000000000..24217dd117b --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java @@ -0,0 +1,113 @@ +package org.apache.lucene.codecs.lucene41; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.codecs.blocktree.Lucene40BlockTreeTermsReader; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.IOUtils; + +/** + * Lucene 4.1 postings format. 
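+ * Postings are stored in packed blocks of BLOCK_SIZE (128) integers in the
+ * .doc file (and, where the field indexes them, the .pos and .pay files),
+ * while the terms dictionary is read by Lucene40BlockTreeTermsReader.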
+ * @deprecated only for reading old 4.x segments + */ +@Deprecated +public class Lucene41PostingsFormat extends PostingsFormat { + /** + * Filename extension for document number, frequencies, and skip data. + * See chapter: Frequencies and Skip Data + */ + public static final String DOC_EXTENSION = "doc"; + + /** + * Filename extension for positions. + * See chapter: Positions + */ + public static final String POS_EXTENSION = "pos"; + + /** + * Filename extension for payloads and offsets. + * See chapter: Payloads and Offsets + */ + public static final String PAY_EXTENSION = "pay"; + + /** + * Expert: The maximum number of skip levels. Smaller values result in + * slightly smaller indexes, but slower skipping in big posting lists. + */ + static final int maxSkipLevels = 10; + + final static String TERMS_CODEC = "Lucene41PostingsWriterTerms"; + final static String DOC_CODEC = "Lucene41PostingsWriterDoc"; + final static String POS_CODEC = "Lucene41PostingsWriterPos"; + final static String PAY_CODEC = "Lucene41PostingsWriterPay"; + + // Increment version to change it + final static int VERSION_START = 0; + final static int VERSION_META_ARRAY = 1; + final static int VERSION_CHECKSUM = 2; + final static int VERSION_CURRENT = VERSION_CHECKSUM; + + /** + * Fixed packed block size, number of integers encoded in + * a single packed block. + */ + // NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding + public final static int BLOCK_SIZE = 128; + + /** Creates {@code Lucene41PostingsFormat} with default + * settings. */ + public Lucene41PostingsFormat() { + super("Lucene41"); + } + + @Override + public String toString() { + return getName() + "(blocksize=" + BLOCK_SIZE + ")"; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + throw new UnsupportedOperationException("this codec can only be used for reading"); + } + + @Override + public final FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory, + state.fieldInfos, + state.segmentInfo, + state.context, + state.segmentSuffix); + boolean success = false; + try { + FieldsProducer ret = new Lucene40BlockTreeTermsReader(postingsReader, state); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsReader); + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java similarity index 97% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java index 26cb34b8823..2b7eb57616f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java @@ -20,7 +20,6 @@ package org.apache.lucene.codecs.lucene41; import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE; import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_DATA_SIZE; import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_ENCODED_SIZE; -import static org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter.IntBlockTermState; import java.io.IOException; import java.util.Arrays; @@ -32,6 +31,7 @@ import 
org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; @@ -48,12 +48,10 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; /** - * Concrete class that reads docId(maybe frq,pos,offset,payloads) list - * with postings format. - * - * @see Lucene41SkipReader for details - * @lucene.experimental + * Lucene 4.1 postings format. + * @deprecated only for reading old 4.x segments */ +@Deprecated public final class Lucene41PostingsReader extends PostingsReaderBase { private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(Lucene41PostingsReader.class); @@ -77,12 +75,12 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { docIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.DOC_EXTENSION), ioContext); version = CodecUtil.checkHeader(docIn, - Lucene41PostingsWriter.DOC_CODEC, - Lucene41PostingsWriter.VERSION_START, - Lucene41PostingsWriter.VERSION_CURRENT); + Lucene41PostingsFormat.DOC_CODEC, + Lucene41PostingsFormat.VERSION_START, + Lucene41PostingsFormat.VERSION_CURRENT); forUtil = new ForUtil(docIn); - if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) { + if (version >= Lucene41PostingsFormat.VERSION_CHECKSUM) { // NOTE: data file is too costly to verify checksum against all the bytes on open, // but for now we at least verify proper structure of the checksum footer: which looks // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption @@ -93,9 +91,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { if (fieldInfos.hasProx()) { posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.POS_EXTENSION), ioContext); - CodecUtil.checkHeader(posIn, Lucene41PostingsWriter.POS_CODEC, version, version); + CodecUtil.checkHeader(posIn, Lucene41PostingsFormat.POS_CODEC, version, version); - if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) { + if (version >= Lucene41PostingsFormat.VERSION_CHECKSUM) { // NOTE: data file is too costly to verify checksum against all the bytes on open, // but for now we at least verify proper structure of the checksum footer: which looks // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption @@ -106,9 +104,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { if (fieldInfos.hasPayloads() || fieldInfos.hasOffsets()) { payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.PAY_EXTENSION), ioContext); - CodecUtil.checkHeader(payIn, Lucene41PostingsWriter.PAY_CODEC, version, version); + CodecUtil.checkHeader(payIn, Lucene41PostingsFormat.PAY_CODEC, version, version); - if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) { + if (version >= Lucene41PostingsFormat.VERSION_CHECKSUM) { // NOTE: data file is too costly to verify checksum against all the bytes on open, // but for now we at least verify proper structure of the checksum footer: which looks // for FOOTER_MAGIC + algorithmID. 
This is cheap and can detect some forms of corruption @@ -130,12 +128,12 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { } @Override - public void init(IndexInput termsIn) throws IOException { + public void init(IndexInput termsIn, SegmentReadState state) throws IOException { // Make sure we are talking to the matching postings writer CodecUtil.checkHeader(termsIn, - Lucene41PostingsWriter.TERMS_CODEC, - Lucene41PostingsWriter.VERSION_START, - Lucene41PostingsWriter.VERSION_CURRENT); + Lucene41PostingsFormat.TERMS_CODEC, + Lucene41PostingsFormat.VERSION_START, + Lucene41PostingsFormat.VERSION_CURRENT); final int indexBlockSize = termsIn.readVInt(); if (indexBlockSize != BLOCK_SIZE) { throw new IllegalStateException("index-time BLOCK_SIZE (" + indexBlockSize + ") != read-time BLOCK_SIZE (" + BLOCK_SIZE + ")"); @@ -187,7 +185,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { termState.posStartFP = 0; termState.payStartFP = 0; } - if (version < Lucene41PostingsWriter.VERSION_META_ARRAY) { // backward compatibility + if (version < Lucene41PostingsFormat.VERSION_META_ARRAY) { // backward compatibility _decodeTerm(in, fieldInfo, termState); return; } @@ -488,7 +486,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { if (skipper == null) { // Lazy init: first time this enum has ever been used for skipping skipper = new Lucene41SkipReader(docIn.clone(), - Lucene41PostingsWriter.maxSkipLevels, + Lucene41PostingsFormat.maxSkipLevels, BLOCK_SIZE, indexHasPos, indexHasOffsets, @@ -821,7 +819,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { // System.out.println(" create skipper"); // } skipper = new Lucene41SkipReader(docIn.clone(), - Lucene41PostingsWriter.maxSkipLevels, + Lucene41PostingsFormat.maxSkipLevels, BLOCK_SIZE, true, indexHasOffsets, @@ -1347,7 +1345,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { // System.out.println(" create skipper"); // } skipper = new Lucene41SkipReader(docIn.clone(), - Lucene41PostingsWriter.maxSkipLevels, + Lucene41PostingsFormat.maxSkipLevels, BLOCK_SIZE, true, indexHasOffsets, @@ -1590,7 +1588,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { @Override public void checkIntegrity() throws IOException { - if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) { + if (version >= Lucene41PostingsFormat.VERSION_CHECKSUM) { if (docIn != null) { CodecUtil.checksumEntireFile(docIn); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41SkipReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/Lucene41SkipReader.java similarity index 84% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41SkipReader.java rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/Lucene41SkipReader.java index 483b0ec21df..4f57430bdc7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41SkipReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene41/Lucene41SkipReader.java @@ -24,32 +24,10 @@ import org.apache.lucene.codecs.MultiLevelSkipListReader; import org.apache.lucene.store.IndexInput; /** - * Implements the skip list reader for block postings format - * that stores positions and payloads. - * - * Although this skipper uses MultiLevelSkipListReader as an interface, - * its definition of skip position will be a little different. 
- * - * For example, when skipInterval = blockSize = 3, df = 2*skipInterval = 6, - * - * 0 1 2 3 4 5 - * d d d d d d (posting list) - * ^ ^ (skip point in MultiLeveSkipWriter) - * ^ (skip point in Lucene41SkipWriter) - * - * In this case, MultiLevelSkipListReader will use the last document as a skip point, - * while Lucene41SkipReader should assume no skip point will comes. - * - * If we use the interface directly in Lucene41SkipReader, it may silly try to read - * another skip data after the only skip point is loaded. - * - * To illustrate this, we can call skipTo(d[5]), since skip point d[3] has smaller docId, - * and numSkipped+blockSize== df, the MultiLevelSkipListReader will assume the skip list - * isn't exhausted yet, and try to load a non-existed skip point - * - * Therefore, we'll trim df before passing it to the interface. see trim(int) - * + * Lucene 4.1 skiplist format. + * @deprecated only for reading old 4.x segments */ +@Deprecated final class Lucene41SkipReader extends MultiLevelSkipListReader { // private boolean DEBUG = Lucene41PostingsReader.DEBUG; private final int blockSize; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene410/Lucene410Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene410/Lucene410Codec.java index 8455cd1f765..6c40db914b9 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene410/Lucene410Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene410/Lucene410Codec.java @@ -74,12 +74,12 @@ public class Lucene410Codec extends Codec { } @Override - public final StoredFieldsFormat storedFieldsFormat() { + public StoredFieldsFormat storedFieldsFormat() { return fieldsFormat; } @Override - public final TermVectorsFormat termVectorsFormat() { + public TermVectorsFormat termVectorsFormat() { return vectorsFormat; } @@ -94,7 +94,7 @@ public class Lucene410Codec extends Codec { } @Override - public final SegmentInfoFormat segmentInfoFormat() { + public SegmentInfoFormat segmentInfoFormat() { return segmentInfosFormat; } @@ -127,7 +127,7 @@ public class Lucene410Codec extends Codec { } @Override - public final DocValuesFormat docValuesFormat() { + public DocValuesFormat docValuesFormat() { return docValuesFormat; } @@ -142,7 +142,7 @@ public class Lucene410Codec extends Codec { }; @Override - public final NormsFormat normsFormat() { + public NormsFormat normsFormat() { return normsFormat; } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesConsumer.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesConsumer.java similarity index 97% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesConsumer.java rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesConsumer.java index 301ee04448a..c230d33c538 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesConsumer.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesConsumer.java @@ -28,6 +28,7 @@ import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.FieldInfo.DocValuesType; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.BytesRef; @@ -41,7 +42,11 @@ import 
org.apache.lucene.util.packed.DirectWriter; import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; import org.apache.lucene.util.packed.PackedInts; -/** writer for {@link Lucene410DocValuesFormat} */ +/** + * writer for 4.10 docvalues format + * @deprecated only for old 4.x segments + */ +@Deprecated class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable { static final int BLOCK_SIZE = 16384; @@ -108,6 +113,7 @@ class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable @Override public void addNumericField(FieldInfo field, Iterable values) throws IOException { + checkCanWrite(field); addNumericField(field, values, true); } @@ -262,6 +268,7 @@ class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable @Override public void addBinaryField(FieldInfo field, Iterable values) throws IOException { + checkCanWrite(field); // write the byte[] data meta.writeVInt(field.number); meta.writeByte(Lucene410DocValuesFormat.BINARY); @@ -466,6 +473,7 @@ class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable @Override public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException { + checkCanWrite(field); meta.writeVInt(field.number); meta.writeByte(Lucene410DocValuesFormat.SORTED); addTermsDict(field, values); @@ -474,6 +482,7 @@ class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable @Override public void addSortedNumericField(FieldInfo field, final Iterable docToValueCount, final Iterable values) throws IOException { + checkCanWrite(field); meta.writeVInt(field.number); meta.writeByte(Lucene410DocValuesFormat.SORTED_NUMERIC); if (isSingleValued(docToValueCount)) { @@ -491,6 +500,7 @@ class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable @Override public void addSortedSetField(FieldInfo field, Iterable values, final Iterable docToOrdCount, final Iterable ords) throws IOException { + checkCanWrite(field); meta.writeVInt(field.number); meta.writeByte(Lucene410DocValuesFormat.SORTED_SET); @@ -556,4 +566,14 @@ class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable meta = data = null; } } + + void checkCanWrite(FieldInfo field) { + if ((field.getDocValuesType() == DocValuesType.NUMERIC || + field.getDocValuesType() == DocValuesType.BINARY) && + field.getDocValuesGen() != -1) { + // ok + } else { + throw new UnsupportedOperationException("this codec can only be used for reading"); + } + } } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesFormat.java new file mode 100644 index 00000000000..472ff9ab085 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesFormat.java @@ -0,0 +1,61 @@ +package org.apache.lucene.codecs.lucene410; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; + +/** + * 4.10 docvalues format + * @deprecated only for old 4.x segments + */ +@Deprecated +public class Lucene410DocValuesFormat extends DocValuesFormat { + + /** Sole Constructor */ + public Lucene410DocValuesFormat() { + super("Lucene410"); + } + + @Override + public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + return new Lucene410DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + } + + @Override + public final DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { + return new Lucene410DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + } + + static final String DATA_CODEC = "Lucene410DocValuesData"; + static final String DATA_EXTENSION = "dvd"; + static final String META_CODEC = "Lucene410ValuesMetadata"; + static final String META_EXTENSION = "dvm"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + static final byte NUMERIC = 0; + static final byte BINARY = 1; + static final byte SORTED = 2; + static final byte SORTED_SET = 3; + static final byte SORTED_NUMERIC = 4; +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesProducer.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesProducer.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesProducer.java rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesProducer.java index d68f4b0b71c..fafea3a8443 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesProducer.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesProducer.java @@ -74,7 +74,11 @@ import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.packed.DirectReader; import org.apache.lucene.util.packed.MonotonicBlockPackedReader; -/** reader for {@link Lucene410DocValuesFormat} */ +/** + * reader for 4.10 docvalues format + * @deprecated only for old 4.x segments + */ +@Deprecated class Lucene410DocValuesProducer extends DocValuesProducer implements Closeable { private final Map numerics = new HashMap<>(); private final Map binaries = new HashMap<>(); diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoFormat.java index 361aa5accef..e6baa9760e9 100755 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoFormat.java @@ -44,7 +44,7 @@ public class 
Lucene46SegmentInfoFormat extends SegmentInfoFormat { } @Override - public SegmentInfo read(Directory dir, String segment, IOContext context) throws IOException { + public SegmentInfo read(Directory dir, String segment, byte segmentID[], IOContext context) throws IOException { final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene46SegmentInfoFormat.SI_EXTENSION); try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) { int codecVersion = CodecUtil.checkHeader(input, Lucene46SegmentInfoFormat.CODEC_NAME, diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat index 01ce305b441..574c9c564a2 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat @@ -16,3 +16,5 @@ org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat org.apache.lucene.codecs.lucene49.Lucene49DocValuesFormat +org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat + diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 112a1698302..023d9c9e1a6 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -14,3 +14,4 @@ # limitations under the License. org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat +org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/blocktree/Lucene40BlockTreeTermsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/blocktree/Lucene40BlockTreeTermsWriter.java new file mode 100644 index 00000000000..53a644825c6 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/blocktree/Lucene40BlockTreeTermsWriter.java @@ -0,0 +1,1022 @@ +package org.apache.lucene.codecs.blocktree; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMOutputStream; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.BytesRefFSTEnum; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Outputs; +import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.packed.PackedInts; + +/* + TODO: + + - Currently there is a one-to-one mapping of indexed + term to term block, but we could decouple the two, ie, + put more terms into the index than there are blocks. + The index would take up more RAM but then it'd be able + to avoid seeking more often and could make PK/FuzzyQ + faster if the additional indexed terms could store + the offset into the terms block. + + - The blocks are not written in true depth-first + order, meaning if you just next() the file pointer will + sometimes jump backwards. For example, block foo* will + be written before block f* because it finished before. + This could possibly hurt performance if the terms dict is + not hot, since OSs anticipate sequential file access. We + could fix the writer to re-order the blocks as a 2nd + pass. + + - Each block encodes the term suffixes packed + sequentially using a separate vInt per term, which is + 1) wasteful and 2) slow (must linear scan to find a + particular suffix). We should instead 1) make + random-access array so we can directly access the Nth + suffix, and 2) bulk-encode this array using bulk int[] + codecs; then at search time we can binary search when + we seek a particular term. +*/ + +/** + * Block-based terms index and dictionary writer. + *
+ * <p>
+ * Writes terms dict and index, block-encoding (column
+ * stride) each term's metadata for each set of terms
+ * between two index terms.
+ * <p>
+ * Files:
+ * <ul>
+ *   <li><tt>.tim</tt>: Term Dictionary</li>
+ *   <li><tt>.tip</tt>: Term Index</li>
+ * </ul>
+ *
+ * <h3>Term Dictionary</h3>
+ *
+ * <p>The .tim file contains the list of terms in each
+ * field along with per-term statistics (such as docfreq)
+ * and per-term metadata (typically pointers to the postings list
+ * for that term in the inverted index).</p>
+ *
+ * <p>The .tim is arranged in blocks: with blocks containing
+ * a variable number of entries (by default 25-48), where
+ * each entry is either a term or a reference to a
+ * sub-block.</p>
+ *
+ * <p>NOTE: The term dictionary can plug into different postings implementations:
+ * the postings writer/reader are actually responsible for encoding
+ * and decoding the Postings Metadata and Term Metadata sections.</p>
+ *
+ * <ul>
+ *   <li>TermsDict (.tim) --> Header, PostingsHeader, NodeBlock<sup>NumBlocks</sup>,
+ *       FieldSummary, DirOffset, Footer</li>
+ *   <li>NodeBlock --> (OuterNode | InnerNode)</li>
+ *   <li>OuterNode --> EntryCount, SuffixLength, Byte<sup>SuffixLength</sup>, StatsLength,
+ *       &lt;TermStats&gt;<sup>EntryCount</sup>, MetaLength, &lt;TermMetadata&gt;<sup>EntryCount</sup></li>
+ *   <li>InnerNode --> EntryCount, SuffixLength[,Sub?], Byte<sup>SuffixLength</sup>, StatsLength,
+ *       &lt;TermStats ?&gt;<sup>EntryCount</sup>, MetaLength, &lt;TermMetadata ?&gt;<sup>EntryCount</sup></li>
+ *   <li>TermStats --> DocFreq, TotalTermFreq</li>
+ *   <li>FieldSummary --> NumFields, &lt;FieldNumber, NumTerms, RootCodeLength, Byte<sup>RootCodeLength</sup>,
+ *       SumTotalTermFreq?, SumDocFreq, DocCount, LongsSize, MinTerm, MaxTerm&gt;<sup>NumFields</sup></li>
+ *   <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ *   <li>DirOffset --> {@link DataOutput#writeLong Uint64}</li>
+ *   <li>MinTerm,MaxTerm --> {@link DataOutput#writeVInt VInt} length followed by the byte[]</li>
+ *   <li>EntryCount,SuffixLength,StatsLength,DocFreq,MetaLength,NumFields,
+ *       FieldNumber,RootCodeLength,DocCount,LongsSize --> {@link DataOutput#writeVInt VInt}</li>
+ *   <li>TotalTermFreq,NumTerms,SumTotalTermFreq,SumDocFreq -->
+ *       {@link DataOutput#writeVLong VLong}</li>
+ *   <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ *   <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
+ *       for the BlockTree implementation.</li>
+ *   <li>DirOffset is a pointer to the FieldSummary section.</li>
+ *   <li>DocFreq is the count of documents which contain the term.</li>
+ *   <li>TotalTermFreq is the total number of occurrences of the term. This is encoded
+ *       as the difference between the total number of occurrences and the DocFreq.</li>
+ *   <li>FieldNumber is the fields number from {@link FieldInfos}. (.fnm)</li>
+ *   <li>NumTerms is the number of unique terms for the field.</li>
+ *   <li>RootCode points to the root block for the field.</li>
+ *   <li>SumDocFreq is the total number of postings, the number of term-document pairs across
+ *       the entire field.</li>
+ *   <li>DocCount is the number of documents that have at least one posting for this field.</li>
+ *   <li>LongsSize records how many long values the postings writer/reader record per term
+ *       (e.g., to hold freq/prox/doc file offsets).</li>
+ *   <li>MinTerm, MaxTerm are the lowest and highest term in this field.</li>
+ *   <li>PostingsHeader and TermMetadata are plugged into by the specific postings implementation:
+ *       these contain arbitrary per-file data (such as parameters or versioning information)
+ *       and per-term data (such as pointers to inverted files).</li>
+ *   <li>For inner nodes of the tree, every entry will steal one bit to mark whether it points
+ *       to child nodes (sub-block). If so, the corresponding TermStats and TermMetadata are omitted.</li>
+ * </ul>
+ *
+ * <h3>Term Index</h3>
+ *
+ * <p>The .tip file contains an index into the term dictionary, so that it can be
+ * accessed randomly. The index is also used to determine
+ * when a given term cannot exist on disk (in the .tim file), saving a disk seek.</p>
+ * <ul>
+ *   <li>TermsIndex (.tip) --> Header, FSTIndex<sup>NumFields</sup>
+ *       &lt;IndexStartFP&gt;<sup>NumFields</sup>, DirOffset, Footer</li>
+ *   <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ *   <li>DirOffset --> {@link DataOutput#writeLong Uint64}</li>
+ *   <li>IndexStartFP --> {@link DataOutput#writeVLong VLong}</li>
+ *   <li>FSTIndex --> {@link FST FST&lt;byte[]&gt;}</li>
+ *   <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ *   <li>The .tip file contains a separate FST for each
+ *       field. The FST maps a term prefix to the on-disk
+ *       block that holds all terms starting with that
+ *       prefix. Each field's IndexStartFP points to its
+ *       FST.</li>
+ *   <li>DirOffset is a pointer to the start of the IndexStartFPs
+ *       for all fields.</li>
+ *   <li>It's possible that an on-disk block would contain
+ *       too many terms (more than the allowed maximum
+ *       (default: 48)). When this happens, the block is
+ *       sub-divided into new blocks (called "floor
+ *       blocks"), and then the output in the FST for the
+ *       block's prefix encodes the leading byte of each
+ *       sub-block, and its file pointer.</li>
+ * </ul>
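[Editorial aside, not part of this patch: the FST output described above is the block's file pointer packed with the hasTerms/isFloor flag bits, exactly as the encodeOutput helper further down in this file does; for floor blocks, compileIndex then appends each sub-block's leading byte and file-pointer delta after this packed long. A minimal sketch, where the flag values are assumptions chosen to mirror Lucene40BlockTreeTermsReader's OUTPUT_FLAG_* constants:]

// Illustrative sketch only -- mirrors encodeOutput() below; not part of the patch.
class BlockOutputSketch {
  // Assumed to match Lucene40BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS / OUTPUT_FLAG_IS_FLOOR:
  static final int HAS_TERMS = 0x2;
  static final int IS_FLOOR = 0x1;

  // Pack a block's file pointer plus two flag bits into a single FST output value:
  static long encode(long fp, boolean hasTerms, boolean isFloor) {
    assert fp < (1L << 62);
    return (fp << 2) | (hasTerms ? HAS_TERMS : 0) | (isFloor ? IS_FLOOR : 0);
  }

  // The reading side reverses it: the low two bits are the flags, the rest is the file pointer.
  static void decode(long code) {
    long fp = code >>> 2;
    boolean hasTerms = (code & HAS_TERMS) != 0;
    boolean isFloor = (code & IS_FLOOR) != 0;
    System.out.println("fp=" + fp + " hasTerms=" + hasTerms + " isFloor=" + isFloor);
  }
}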
+ * + * @see Lucene40BlockTreeTermsReader + * @lucene.experimental + * @deprecated Only for 4.x backcompat + */ +@Deprecated +public final class Lucene40BlockTreeTermsWriter extends FieldsConsumer { + + /** Suggested default value for the {@code + * minItemsInBlock} parameter to {@link + * #Lucene40BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */ + public final static int DEFAULT_MIN_BLOCK_SIZE = 25; + + /** Suggested default value for the {@code + * maxItemsInBlock} parameter to {@link + * #Lucene40BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */ + public final static int DEFAULT_MAX_BLOCK_SIZE = 48; + + // public final static boolean DEBUG = false; + //private final static boolean SAVE_DOT_FILES = false; + + private final IndexOutput out; + private final IndexOutput indexOut; + final int maxDoc; + final int minItemsInBlock; + final int maxItemsInBlock; + + final PostingsWriterBase postingsWriter; + final FieldInfos fieldInfos; + + private static class FieldMetaData { + public final FieldInfo fieldInfo; + public final BytesRef rootCode; + public final long numTerms; + public final long indexStartFP; + public final long sumTotalTermFreq; + public final long sumDocFreq; + public final int docCount; + private final int longsSize; + public final BytesRef minTerm; + public final BytesRef maxTerm; + + public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, + BytesRef minTerm, BytesRef maxTerm) { + assert numTerms > 0; + this.fieldInfo = fieldInfo; + assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms; + this.rootCode = rootCode; + this.indexStartFP = indexStartFP; + this.numTerms = numTerms; + this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; + this.docCount = docCount; + this.longsSize = longsSize; + this.minTerm = minTerm; + this.maxTerm = maxTerm; + } + } + + private final List fields = new ArrayList<>(); + + // private final String segment; + + /** Create a new writer. The number of items (terms or + * sub-blocks) per block will aim to be between + * minItemsPerBlock and maxItemsPerBlock, though in some + * cases the blocks may be smaller than the min. 
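[Editorial aside, not part of this patch: this writer is only constructed from test code. A minimal sketch of the wiring, mirroring the Lucene41RWPostingsFormat.fieldsConsumer() implementation later in this patch, using the suggested 25/48 block sizes:]

// Illustrative sketch only -- mirrors Lucene41RWPostingsFormat.fieldsConsumer() in this patch.
import java.io.IOException;

import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blocktree.Lucene40BlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

class BlockTreeWiringSketch {
  static FieldsConsumer newFieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state);
    boolean success = false;
    try {
      // 25 and 48 are DEFAULT_MIN_BLOCK_SIZE / DEFAULT_MAX_BLOCK_SIZE of this writer.
      FieldsConsumer ret = new Lucene40BlockTreeTermsWriter(state, postingsWriter, 25, 48);
      success = true;
      return ret;
    } finally {
      if (!success) {
        // On failure the terms writer never took ownership, so close the postings writer here.
        IOUtils.closeWhileHandlingException(postingsWriter);
      }
    }
  }
}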
*/ + public Lucene40BlockTreeTermsWriter( + SegmentWriteState state, + PostingsWriterBase postingsWriter, + int minItemsInBlock, + int maxItemsInBlock) + throws IOException + { + if (minItemsInBlock <= 1) { + throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock); + } + if (maxItemsInBlock <= 0) { + throw new IllegalArgumentException("maxItemsInBlock must be >= 1; got " + maxItemsInBlock); + } + if (minItemsInBlock > maxItemsInBlock) { + throw new IllegalArgumentException("maxItemsInBlock must be >= minItemsInBlock; got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock); + } + if (2*(minItemsInBlock-1) > maxItemsInBlock) { + throw new IllegalArgumentException("maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock); + } + + maxDoc = state.segmentInfo.getDocCount(); + + final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40BlockTreeTermsReader.TERMS_EXTENSION); + out = state.directory.createOutput(termsFileName, state.context); + boolean success = false; + IndexOutput indexOut = null; + try { + fieldInfos = state.fieldInfos; + this.minItemsInBlock = minItemsInBlock; + this.maxItemsInBlock = maxItemsInBlock; + writeHeader(out); + + //DEBUG = state.segmentName.equals("_4a"); + + final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40BlockTreeTermsReader.TERMS_INDEX_EXTENSION); + indexOut = state.directory.createOutput(termsIndexFileName, state.context); + writeIndexHeader(indexOut); + + this.postingsWriter = postingsWriter; + // segment = state.segmentInfo.name; + + // System.out.println("BTW.init seg=" + state.segmentName); + + postingsWriter.init(out, state); // have consumer write its format/header + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(out, indexOut); + } + } + this.indexOut = indexOut; + } + + /** Writes the terms file header. */ + private void writeHeader(IndexOutput out) throws IOException { + CodecUtil.writeHeader(out, Lucene40BlockTreeTermsReader.TERMS_CODEC_NAME, Lucene40BlockTreeTermsReader.VERSION_CURRENT); + } + + /** Writes the index file header. */ + private void writeIndexHeader(IndexOutput out) throws IOException { + CodecUtil.writeHeader(out, Lucene40BlockTreeTermsReader.TERMS_INDEX_CODEC_NAME, Lucene40BlockTreeTermsReader.VERSION_CURRENT); + } + + /** Writes the terms file trailer. */ + private void writeTrailer(IndexOutput out, long dirStart) throws IOException { + out.writeLong(dirStart); + } + + /** Writes the index file trailer. */ + private void writeIndexTrailer(IndexOutput indexOut, long dirStart) throws IOException { + indexOut.writeLong(dirStart); + } + + @Override + public void write(Fields fields) throws IOException { + + String lastField = null; + for(String field : fields) { + assert lastField == null || lastField.compareTo(field) < 0; + lastField = field; + + Terms terms = fields.terms(field); + if (terms == null) { + continue; + } + + TermsEnum termsEnum = terms.iterator(null); + + TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field)); + while (true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + termsWriter.write(term, termsEnum); + } + + termsWriter.finish(); + } + } + + static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) { + assert fp < (1L << 62); + return (fp << 2) | (hasTerms ? 
Lucene40BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0); + } + + private static class PendingEntry { + public final boolean isTerm; + + protected PendingEntry(boolean isTerm) { + this.isTerm = isTerm; + } + } + + private static final class PendingTerm extends PendingEntry { + public final byte[] termBytes; + // stats + metadata + public final BlockTermState state; + + public PendingTerm(BytesRef term, BlockTermState state) { + super(true); + this.termBytes = new byte[term.length]; + System.arraycopy(term.bytes, term.offset, termBytes, 0, term.length); + this.state = state; + } + + @Override + public String toString() { + return brToString(termBytes); + } + } + + // for debugging + @SuppressWarnings("unused") + static String brToString(BytesRef b) { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: + return b.toString(); + } + } + + // for debugging + @SuppressWarnings("unused") + static String brToString(byte[] b) { + return brToString(new BytesRef(b)); + } + + private static final class PendingBlock extends PendingEntry { + public final BytesRef prefix; + public final long fp; + public FST index; + public List> subIndices; + public final boolean hasTerms; + public final boolean isFloor; + public final int floorLeadByte; + + public PendingBlock(BytesRef prefix, long fp, boolean hasTerms, boolean isFloor, int floorLeadByte, List> subIndices) { + super(false); + this.prefix = prefix; + this.fp = fp; + this.hasTerms = hasTerms; + this.isFloor = isFloor; + this.floorLeadByte = floorLeadByte; + this.subIndices = subIndices; + } + + @Override + public String toString() { + return "BLOCK: " + brToString(prefix); + } + + public void compileIndex(List blocks, RAMOutputStream scratchBytes, IntsRefBuilder scratchIntsRef) throws IOException { + + assert (isFloor && blocks.size() > 1) || (isFloor == false && blocks.size() == 1): "isFloor=" + isFloor + " blocks=" + blocks; + assert this == blocks.get(0); + + assert scratchBytes.getFilePointer() == 0; + + // TODO: try writing the leading vLong in MSB order + // (opposite of what Lucene does today), for better + // outputs sharing in the FST + scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor)); + if (isFloor) { + scratchBytes.writeVInt(blocks.size()-1); + for (int i=1;i builder, FST subIndex, IntsRefBuilder scratchIntsRef) throws IOException { + final BytesRefFSTEnum subIndexEnum = new BytesRefFSTEnum<>(subIndex); + BytesRefFSTEnum.InputOutput indexEnt; + while((indexEnt = subIndexEnum.next()) != null) { + //if (DEBUG) { + // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); + //} + builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); + } + } + } + + private final RAMOutputStream scratchBytes = new RAMOutputStream(); + private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); + + class TermsWriter { + private final FieldInfo fieldInfo; + private final int longsSize; + private long numTerms; + final FixedBitSet docsSeen; + long sumTotalTermFreq; + long sumDocFreq; + long indexStartFP; + + // Records index into pending where the current prefix at that + // length "started"; for example, if current term starts with 't', + // startsByPrefix[0] is the index into pending for the first + // term/sub-block starting with 't'. 
We use this to figure out when + // to write a new block: + private final BytesRefBuilder lastTerm = new BytesRefBuilder(); + private int[] prefixStarts = new int[8]; + + private final long[] longs; + + // Pending stack of terms and blocks. As terms arrive (in sorted order) + // we append to this stack, and once the top of the stack has enough + // terms starting with a common prefix, we write a new block with + // those terms and replace those terms in the stack with a new block: + private final List pending = new ArrayList<>(); + + // Reused in writeBlocks: + private final List newBlocks = new ArrayList<>(); + + private PendingTerm firstPendingTerm; + private PendingTerm lastPendingTerm; + + /** Writes the top count entries in pending, using prevTerm to compute the prefix. */ + void writeBlocks(int prefixLength, int count) throws IOException { + + assert count > 0; + + /* + if (DEBUG) { + BytesRef br = new BytesRef(lastTerm.bytes); + br.offset = lastTerm.offset; + br.length = prefixLength; + System.out.println("writeBlocks: " + br.utf8ToString() + " count=" + count); + } + */ + + // Root block better write all remaining pending entries: + assert prefixLength > 0 || count == pending.size(); + + int lastSuffixLeadLabel = -1; + + // True if we saw at least one term in this block (we record if a block + // only points to sub-blocks in the terms index so we can avoid seeking + // to it when we are looking for a term): + boolean hasTerms = false; + boolean hasSubBlocks = false; + + int start = pending.size()-count; + int end = pending.size(); + int nextBlockStart = start; + int nextFloorLeadLabel = -1; + + for (int i=start; i prefixLength; + suffixLeadLabel = block.prefix.bytes[block.prefix.offset + prefixLength] & 0xff; + } + // if (DEBUG) System.out.println(" i=" + i + " ent=" + ent + " suffixLeadLabel=" + suffixLeadLabel); + + if (suffixLeadLabel != lastSuffixLeadLabel) { + int itemsInBlock = i - nextBlockStart; + if (itemsInBlock >= minItemsInBlock && end-nextBlockStart > maxItemsInBlock) { + // The count is too large for one block, so we must break it into "floor" blocks, where we record + // the leading label of the suffix of the first term in each floor block, so at search time we can + // jump to the right floor block. We just use a naive greedy segmenter here: make a new floor + // block as soon as we have at least minItemsInBlock. 
This is not always best: it often produces + // a too-small block as the final block: + boolean isFloor = itemsInBlock < count; + newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, i, hasTerms, hasSubBlocks)); + + hasTerms = false; + hasSubBlocks = false; + nextFloorLeadLabel = suffixLeadLabel; + nextBlockStart = i; + } + + lastSuffixLeadLabel = suffixLeadLabel; + } + + if (ent.isTerm) { + hasTerms = true; + } else { + hasSubBlocks = true; + } + } + + // Write last block, if any: + if (nextBlockStart < end) { + int itemsInBlock = end - nextBlockStart; + boolean isFloor = itemsInBlock < count; + newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, end, hasTerms, hasSubBlocks)); + } + + assert newBlocks.isEmpty() == false; + + PendingBlock firstBlock = newBlocks.get(0); + + assert firstBlock.isFloor || newBlocks.size() == 1; + + firstBlock.compileIndex(newBlocks, scratchBytes, scratchIntsRef); + + // Remove slice from the top of the pending stack, that we just wrote: + pending.subList(pending.size()-count, pending.size()).clear(); + + // Append new block + pending.add(firstBlock); + + newBlocks.clear(); + } + + /** Writes the specified slice (start is inclusive, end is exclusive) + * from pending stack as a new block. If isFloor is true, there + * were too many (more than maxItemsInBlock) entries sharing the + * same prefix, and so we broke it into multiple floor blocks where + * we record the starting label of the suffix of each floor block. */ + private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLeadLabel, int start, int end, boolean hasTerms, boolean hasSubBlocks) throws IOException { + + assert end > start; + + long startFP = out.getFilePointer(); + + boolean hasFloorLeadLabel = isFloor && floorLeadLabel != -1; + + final BytesRef prefix = new BytesRef(prefixLength + (hasFloorLeadLabel ? 1 : 0)); + System.arraycopy(lastTerm.get().bytes, 0, prefix.bytes, 0, prefixLength); + prefix.length = prefixLength; + + // Write block header: + int numEntries = end - start; + int code = numEntries << 1; + if (end == pending.size()) { + // Last block: + code |= 1; + } + out.writeVInt(code); + + /* + if (DEBUG) { + System.out.println(" writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + brToString(prefix) + " entCount=" + (end-start+1) + " startFP=" + startFP + (isFloor ? (" floorLeadLabel=" + Integer.toHexString(floorLeadLabel)) : "")); + } + */ + + // 1st pass: pack term suffix bytes into byte[] blob + // TODO: cutover to bulk int codec... simple64? 
+ + // We optimize the leaf block case (block has only terms), writing a more + // compact format in this case: + boolean isLeafBlock = hasSubBlocks == false; + + final List> subIndices; + + boolean absolute = true; + + if (isLeafBlock) { + // Only terms: + subIndices = null; + for (int i=start;i= 0; + metaWriter.writeVLong(longs[pos]); + } + bytesWriter.writeTo(metaWriter); + bytesWriter.reset(); + absolute = false; + } + } else { + // Mixed terms and sub-blocks: + subIndices = new ArrayList<>(); + for (int i=start;i= 0; + metaWriter.writeVLong(longs[pos]); + } + bytesWriter.writeTo(metaWriter); + bytesWriter.reset(); + absolute = false; + } else { + PendingBlock block = (PendingBlock) ent; + assert StringHelper.startsWith(block.prefix, prefix); + final int suffix = block.prefix.length - prefixLength; + + assert suffix > 0; + + // For non-leaf block we borrow 1 bit to record + // if entry is term or sub-block + suffixWriter.writeVInt((suffix<<1)|1); + suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix); + + assert floorLeadLabel == -1 || (block.prefix.bytes[prefixLength] & 0xff) >= floorLeadLabel; + + assert block.fp < startFP; + + /* + if (DEBUG) { + BytesRef suffixBytes = new BytesRef(suffix); + System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix); + suffixBytes.length = suffix; + System.out.println(" write sub-block suffix=" + brToString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor); + } + */ + + suffixWriter.writeVLong(startFP - block.fp); + subIndices.add(block.index); + } + } + + assert subIndices.size() != 0; + } + + // TODO: we could block-write the term suffix pointers; + // this would take more space but would enable binary + // search on lookup + + // Write suffixes byte[] blob to terms dict output: + out.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0)); + suffixWriter.writeTo(out); + suffixWriter.reset(); + + // Write term stats byte[] blob + out.writeVInt((int) statsWriter.getFilePointer()); + statsWriter.writeTo(out); + statsWriter.reset(); + + // Write term meta data byte[] blob + out.writeVInt((int) metaWriter.getFilePointer()); + metaWriter.writeTo(out); + metaWriter.reset(); + + // if (DEBUG) { + // System.out.println(" fpEnd=" + out.getFilePointer()); + // } + + if (hasFloorLeadLabel) { + // We already allocated to length+1 above: + prefix.bytes[prefix.length++] = (byte) floorLeadLabel; + } + + return new PendingBlock(prefix, startFP, hasTerms, isFloor, floorLeadLabel, subIndices); + } + + TermsWriter(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + docsSeen = new FixedBitSet(maxDoc); + + this.longsSize = postingsWriter.setField(fieldInfo); + this.longs = new long[longsSize]; + } + + /** Writes one term's worth of postings. 
*/ + public void write(BytesRef text, TermsEnum termsEnum) throws IOException { + /* + if (DEBUG) { + int[] tmp = new int[lastTerm.length]; + System.arraycopy(prefixStarts, 0, tmp, 0, tmp.length); + System.out.println("BTTW: write term=" + brToString(text) + " prefixStarts=" + Arrays.toString(tmp) + " pending.size()=" + pending.size()); + } + */ + + BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen); + if (state != null) { + assert state.docFreq != 0; + assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter; + sumDocFreq += state.docFreq; + sumTotalTermFreq += state.totalTermFreq; + pushTerm(text); + + PendingTerm term = new PendingTerm(text, state); + pending.add(term); + numTerms++; + if (firstPendingTerm == null) { + firstPendingTerm = term; + } + lastPendingTerm = term; + } + } + + /** Pushes the new term to the top of the stack, and writes new blocks. */ + private void pushTerm(BytesRef text) throws IOException { + int limit = Math.min(lastTerm.length(), text.length); + + // Find common prefix between last term and current term: + int pos = 0; + while (pos < limit && lastTerm.byteAt(pos) == text.bytes[text.offset+pos]) { + pos++; + } + + // if (DEBUG) System.out.println(" shared=" + pos + " lastTerm.length=" + lastTerm.length); + + // Close the "abandoned" suffix now: + for(int i=lastTerm.length()-1;i>=pos;i--) { + + // How many items on top of the stack share the current suffix + // we are closing: + int prefixTopSize = pending.size() - prefixStarts[i]; + if (prefixTopSize >= minItemsInBlock) { + // if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInBlock); + writeBlocks(i+1, prefixTopSize); + prefixStarts[i] -= prefixTopSize-1; + } + } + + if (prefixStarts.length < text.length) { + prefixStarts = ArrayUtil.grow(prefixStarts, text.length); + } + + // Init new tail: + for(int i=pos;i 0) { + // if (DEBUG) System.out.println("BTTW: finish prefixStarts=" + Arrays.toString(prefixStarts)); + + // Add empty term to force closing of all final blocks: + pushTerm(new BytesRef()); + + // TODO: if pending.size() is already 1 with a non-zero prefix length + // we can save writing a "degenerate" root block, but we have to + // fix all the places that assume the root block's prefix is the empty string: + writeBlocks(0, pending.size()); + + // We better have one final "root" block: + assert pending.size() == 1 && !pending.get(0).isTerm: "pending.size()=" + pending.size() + " pending=" + pending; + final PendingBlock root = (PendingBlock) pending.get(0); + assert root.prefix.length == 0; + assert root.index.getEmptyOutput() != null; + + // Write FST to index + indexStartFP = indexOut.getFilePointer(); + root.index.save(indexOut); + //System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name); + + /* + if (DEBUG) { + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + Util.toDot(root.index, w, false, false); + System.out.println("SAVED to " + dotFileName); + w.close(); + } + */ + assert firstPendingTerm != null; + BytesRef minTerm = new BytesRef(firstPendingTerm.termBytes); + + assert lastPendingTerm != null; + BytesRef maxTerm = new BytesRef(lastPendingTerm.termBytes); + + fields.add(new FieldMetaData(fieldInfo, + ((PendingBlock) pending.get(0)).index.getEmptyOutput(), + numTerms, + indexStartFP, + sumTotalTermFreq, + 
sumDocFreq, + docsSeen.cardinality(), + longsSize, + minTerm, maxTerm)); + } else { + assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1; + assert sumDocFreq == 0; + assert docsSeen.cardinality() == 0; + } + } + + private final RAMOutputStream suffixWriter = new RAMOutputStream(); + private final RAMOutputStream statsWriter = new RAMOutputStream(); + private final RAMOutputStream metaWriter = new RAMOutputStream(); + private final RAMOutputStream bytesWriter = new RAMOutputStream(); + } + + @Override + public void close() throws IOException { + + boolean success = false; + try { + + final long dirStart = out.getFilePointer(); + final long indexDirStart = indexOut.getFilePointer(); + + out.writeVInt(fields.size()); + + for(FieldMetaData field : fields) { + //System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms"); + out.writeVInt(field.fieldInfo.number); + assert field.numTerms > 0; + out.writeVLong(field.numTerms); + out.writeVInt(field.rootCode.length); + out.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length); + if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { + out.writeVLong(field.sumTotalTermFreq); + } + out.writeVLong(field.sumDocFreq); + out.writeVInt(field.docCount); + out.writeVInt(field.longsSize); + indexOut.writeVLong(field.indexStartFP); + writeBytesRef(out, field.minTerm); + writeBytesRef(out, field.maxTerm); + } + writeTrailer(out, dirStart); + CodecUtil.writeFooter(out); + writeIndexTrailer(indexOut, indexDirStart); + CodecUtil.writeFooter(indexOut); + success = true; + } finally { + if (success) { + IOUtils.close(out, indexOut, postingsWriter); + } else { + IOUtils.closeWhileHandlingException(out, indexOut, postingsWriter); + } + } + } + + private static void writeBytesRef(IndexOutput out, BytesRef bytes) throws IOException { + out.writeVInt(bytes.length); + out.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/blocktree/TestLucene40BlockFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/blocktree/TestLucene40BlockFormat.java new file mode 100644 index 00000000000..2e863a87ea6 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/blocktree/TestLucene40BlockFormat.java @@ -0,0 +1,67 @@ +package org.apache.lucene.codecs.blocktree; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.blocktree.Lucene40FieldReader; +import org.apache.lucene.codecs.blocktree.Lucene40Stats; +import org.apache.lucene.codecs.lucene41.Lucene41RWCodec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.BasePostingsFormatTestCase; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; + +/** + * Tests BlockPostingsFormat + */ +public class TestLucene40BlockFormat extends BasePostingsFormatTestCase { + private final Codec codec = new Lucene41RWCodec(); + + @Override + protected Codec getCodec() { + return codec; + } + + /** Make sure the final sub-block(s) are not skipped. */ + public void testFinalBlock() throws Exception { + Directory d = newDirectory(); + IndexWriter w = new IndexWriter(d, new IndexWriterConfig(new MockAnalyzer(random()))); + for(int i=0;i<25;i++) { + Document doc = new Document(); + doc.add(newStringField("field", Character.toString((char) (97+i)), Field.Store.NO)); + doc.add(newStringField("field", "z" + Character.toString((char) (97+i)), Field.Store.NO)); + w.addDocument(doc); + } + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w, true); + assertEquals(1, r.leaves().size()); + Lucene40FieldReader field = (Lucene40FieldReader) r.leaves().get(0).reader().fields().terms("field"); + // We should see exactly two blocks: one root block (prefix empty string) and one block for z* terms (prefix z): + Lucene40Stats stats = field.getStats(); + assertEquals(0, stats.floorBlockCount); + assertEquals(2, stats.nonFloorBlockCount); + r.close(); + w.close(); + d.close(); + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java index cff96cf38bf..bac0e4bd6f1 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java @@ -129,7 +129,7 @@ final class Lucene40PostingsWriter extends PushPostingsWriterBase { } @Override - public void init(IndexOutput termsOut) throws IOException { + public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException { CodecUtil.writeHeader(termsOut, Lucene40PostingsReader.TERMS_CODEC, Lucene40PostingsReader.VERSION_CURRENT); termsOut.writeInt(skipInterval); // write skipInterval termsOut.writeInt(maxSkipLevels); // write maxSkipLevels diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene40/Lucene40RWPostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene40/Lucene40RWPostingsFormat.java index 965b1bf8d3e..4b69af8fa80 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene40/Lucene40RWPostingsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene40/Lucene40RWPostingsFormat.java @@ -21,7 +21,7 @@ import java.io.IOException; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter; +import org.apache.lucene.codecs.blocktree.Lucene40BlockTreeTermsWriter; import org.apache.lucene.index.SegmentWriteState; /** @@ 
-46,7 +46,7 @@ public final class Lucene40RWPostingsFormat extends Lucene40PostingsFormat { // Or... you must make a new Codec for this? boolean success = false; try { - FieldsConsumer ret = new BlockTreeTermsWriter(state, docs, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE); + FieldsConsumer ret = new Lucene40BlockTreeTermsWriter(state, docs, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE); success = true; return ret; } finally { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java similarity index 87% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java index 40147176584..41d735aaad5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java @@ -26,7 +26,6 @@ import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.index.TermState; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; @@ -38,35 +37,13 @@ import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_DATA_SIZE; import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_ENCODED_SIZE; import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE; - /** - * Concrete class that writes docId(maybe frq,pos,offset,payloads) list - * with postings format. - * - * Postings list for each term will be stored separately. - * - * @see Lucene41SkipWriter for details about skipping setting and postings layout. - * @lucene.experimental + * Writes 4.1 postings for testing + * @deprecated for test purposes only */ +@Deprecated public final class Lucene41PostingsWriter extends PushPostingsWriterBase { - /** - * Expert: The maximum number of skip levels. Smaller values result in - * slightly smaller indexes, but slower skipping in big posting lists. 
- */ - static final int maxSkipLevels = 10; - - final static String TERMS_CODEC = "Lucene41PostingsWriterTerms"; - final static String DOC_CODEC = "Lucene41PostingsWriterDoc"; - final static String POS_CODEC = "Lucene41PostingsWriterPos"; - final static String PAY_CODEC = "Lucene41PostingsWriterPay"; - - // Increment version to change it - final static int VERSION_START = 0; - final static int VERSION_META_ARRAY = 1; - final static int VERSION_CHECKSUM = 2; - final static int VERSION_CURRENT = VERSION_CHECKSUM; - IndexOutput docOut; IndexOutput posOut; IndexOutput payOut; @@ -119,13 +96,13 @@ public final class Lucene41PostingsWriter extends PushPostingsWriterBase { IndexOutput payOut = null; boolean success = false; try { - CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT); + CodecUtil.writeHeader(docOut, Lucene41PostingsFormat.DOC_CODEC, Lucene41PostingsFormat.VERSION_CURRENT); forUtil = new ForUtil(acceptableOverheadRatio, docOut); if (state.fieldInfos.hasProx()) { posDeltaBuffer = new int[MAX_DATA_SIZE]; posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene41PostingsFormat.POS_EXTENSION), state.context); - CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT); + CodecUtil.writeHeader(posOut, Lucene41PostingsFormat.POS_CODEC, Lucene41PostingsFormat.VERSION_CURRENT); if (state.fieldInfos.hasPayloads()) { payloadBytes = new byte[128]; @@ -146,7 +123,7 @@ public final class Lucene41PostingsWriter extends PushPostingsWriterBase { if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene41PostingsFormat.PAY_EXTENSION), state.context); - CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT); + CodecUtil.writeHeader(payOut, Lucene41PostingsFormat.PAY_CODEC, Lucene41PostingsFormat.VERSION_CURRENT); } } else { posDeltaBuffer = null; @@ -168,7 +145,7 @@ public final class Lucene41PostingsWriter extends PushPostingsWriterBase { freqBuffer = new int[MAX_DATA_SIZE]; // TODO: should we try skipping every 2/4 blocks...? - skipWriter = new Lucene41SkipWriter(maxSkipLevels, + skipWriter = new Lucene41SkipWriter(Lucene41PostingsFormat.maxSkipLevels, BLOCK_SIZE, state.segmentInfo.getDocCount(), docOut, @@ -183,50 +160,14 @@ public final class Lucene41PostingsWriter extends PushPostingsWriterBase { this(state, PackedInts.COMPACT); } - final static class IntBlockTermState extends BlockTermState { - long docStartFP = 0; - long posStartFP = 0; - long payStartFP = 0; - long skipOffset = -1; - long lastPosBlockOffset = -1; - // docid when there is a single pulsed posting, otherwise -1 - // freq is always implicitly totalTermFreq in this case. 
- int singletonDocID = -1; - - @Override - public IntBlockTermState clone() { - IntBlockTermState other = new IntBlockTermState(); - other.copyFrom(this); - return other; - } - - @Override - public void copyFrom(TermState _other) { - super.copyFrom(_other); - IntBlockTermState other = (IntBlockTermState) _other; - docStartFP = other.docStartFP; - posStartFP = other.posStartFP; - payStartFP = other.payStartFP; - lastPosBlockOffset = other.lastPosBlockOffset; - skipOffset = other.skipOffset; - singletonDocID = other.singletonDocID; - } - - - @Override - public String toString() { - return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID; - } - } - @Override public IntBlockTermState newTermState() { return new IntBlockTermState(); } @Override - public void init(IndexOutput termsOut) throws IOException { - CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT); + public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException { + CodecUtil.writeHeader(termsOut, Lucene41PostingsFormat.TERMS_CODEC, Lucene41PostingsFormat.VERSION_CURRENT); termsOut.writeVInt(BLOCK_SIZE); } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41RWCodec.java index d07e370ad57..16779c86e14 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41RWCodec.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41RWCodec.java @@ -3,6 +3,7 @@ package org.apache.lucene.codecs.lucene41; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; @@ -40,6 +41,12 @@ public final class Lucene41RWCodec extends Lucene41Codec { private final DocValuesFormat docValues = new Lucene40RWDocValuesFormat(); private final NormsFormat norms = new Lucene40RWNormsFormat(); private final TermVectorsFormat vectors = new Lucene40RWTermVectorsFormat(); + private final PostingsFormat postings = new Lucene41RWPostingsFormat(); + + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return postings; + } @Override public FieldInfosFormat fieldInfosFormat() { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41RWPostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41RWPostingsFormat.java new file mode 100644 index 00000000000..0e320126a70 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41RWPostingsFormat.java @@ -0,0 +1,56 @@ +package org.apache.lucene.codecs.lucene41; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.blocktree.Lucene40BlockTreeTermsWriter; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.IOUtils; + +/** + * Read-write version of 4.1 postings format for testing + * @deprecated for test purposes only + */ +@Deprecated +public class Lucene41RWPostingsFormat extends Lucene41PostingsFormat { + + static final int MIN_BLOCK_SIZE = 25; + static final int MAX_BLOCK_SIZE = 48; + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state); + + boolean success = false; + try { + FieldsConsumer ret = new Lucene40BlockTreeTermsWriter(state, + postingsWriter, + MIN_BLOCK_SIZE, + MAX_BLOCK_SIZE); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsWriter); + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41SkipWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41SkipWriter.java similarity index 86% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41SkipWriter.java rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41SkipWriter.java index fdb65d73d8b..fd1b61c8928 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41SkipWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41SkipWriter.java @@ -24,25 +24,10 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.codecs.MultiLevelSkipListWriter; /** - * Write skip lists with multiple levels, and support skip within block ints. - * - * Assume that docFreq = 28, skipInterval = blockSize = 12 - * - * | block#0 | | block#1 | |vInts| - * d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list) - * ^ ^ (level 0 skip point) - * - * Note that skipWriter will ignore first document in block#0, since - * it is useless as a skip point. Also, we'll never skip into the vInts - * block, only record skip data at the start its start point(if it exist). - * - * For each skip point, we will record: - * 1. docID in former position, i.e. for position 12, record docID[11], etc. - * 2. its related file points(position, payload), - * 3. related numbers or uptos(position, payload). - * 4. start offset. 
- * + * Writes 4.1 skiplists for testing + * @deprecated for test purposes only */ +@Deprecated final class Lucene41SkipWriter extends MultiLevelSkipListWriter { // private boolean DEBUG = Lucene41PostingsReader.DEBUG; diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/TestLucene41ForUtil.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/TestLucene41ForUtil.java new file mode 100644 index 00000000000..437758d777b --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/TestLucene41ForUtil.java @@ -0,0 +1,94 @@ +package org.apache.lucene.codecs.lucene41; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_DATA_SIZE; +import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_ENCODED_SIZE; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.packed.PackedInts; + +import com.carrotsearch.randomizedtesting.generators.RandomInts; + +public class TestLucene41ForUtil extends LuceneTestCase { + + public void testEncodeDecode() throws IOException { + final int iterations = RandomInts.randomIntBetween(random(), 1, 1000); + final float acceptableOverheadRatio = random().nextFloat(); + final int[] values = new int[(iterations - 1) * BLOCK_SIZE + ForUtil.MAX_DATA_SIZE]; + for (int i = 0; i < iterations; ++i) { + final int bpv = random().nextInt(32); + if (bpv == 0) { + final int value = RandomInts.randomIntBetween(random(), 0, Integer.MAX_VALUE); + for (int j = 0; j < BLOCK_SIZE; ++j) { + values[i * BLOCK_SIZE + j] = value; + } + } else { + for (int j = 0; j < BLOCK_SIZE; ++j) { + values[i * BLOCK_SIZE + j] = RandomInts.randomIntBetween(random(), + 0, (int) PackedInts.maxValue(bpv)); + } + } + } + + final Directory d = new RAMDirectory(); + final long endPointer; + + { + // encode + IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT); + final ForUtil forUtil = new ForUtil(acceptableOverheadRatio, out); + + for (int i = 0; i < iterations; ++i) { + forUtil.writeBlock( + Arrays.copyOfRange(values, i * BLOCK_SIZE, values.length), + new byte[MAX_ENCODED_SIZE], out); + } + endPointer = out.getFilePointer(); + out.close(); + } + + { + // decode + IndexInput in = d.openInput("test.bin", IOContext.READONCE); + final ForUtil forUtil = new ForUtil(in); + for (int i = 0; i < iterations; ++i) { + if (random().nextBoolean()) { + 
forUtil.skipBlock(in); + continue; + } + final int[] restored = new int[MAX_DATA_SIZE]; + forUtil.readBlock(in, new byte[MAX_ENCODED_SIZE], restored); + assertArrayEquals(Arrays.copyOfRange(values, i * BLOCK_SIZE, (i + 1) * BLOCK_SIZE), + Arrays.copyOf(restored, BLOCK_SIZE)); + } + assertEquals(endPointer, in.getFilePointer()); + in.close(); + } + } + +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/TestLucene41PostingsFormat2.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/TestLucene41PostingsFormat2.java new file mode 100644 index 00000000000..8ebd506a4da --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/TestLucene41PostingsFormat2.java @@ -0,0 +1,132 @@ +package org.apache.lucene.codecs.lucene41; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +/** + * Tests special cases of BlockPostingsFormat + */ + +public class TestLucene41PostingsFormat2 extends LuceneTestCase { + Directory dir; + RandomIndexWriter iw; + + @Override + public void setUp() throws Exception { + super.setUp(); + dir = newFSDirectory(createTempDir("testDFBlockSize")); + IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); + iwc.setCodec(new Lucene41RWCodec()); + iw = new RandomIndexWriter(random(), dir, iwc); + iw.setDoRandomForceMerge(false); // we will ourselves + } + + @Override + public void tearDown() throws Exception { + iw.close(); + TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge + IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); + iwc.setCodec(new Lucene41RWCodec()); + iwc.setOpenMode(OpenMode.APPEND); + IndexWriter iw = new IndexWriter(dir, iwc); + iw.forceMerge(1); + iw.close(); + dir.close(); // just force a checkindex for now + super.tearDown(); + } + + private Document newDocument() { + Document doc = new Document(); + for (IndexOptions option : FieldInfo.IndexOptions.values()) { + FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); + // turn on tvs for a cross-check, since we rely upon checkindex in this test (for now) 
+ ft.setStoreTermVectors(true); + ft.setStoreTermVectorOffsets(true); + ft.setStoreTermVectorPositions(true); + ft.setStoreTermVectorPayloads(true); + ft.setIndexOptions(option); + doc.add(new Field(option.toString(), "", ft)); + } + return doc; + } + + /** tests terms with df = blocksize */ + public void testDFBlockSize() throws Exception { + Document doc = newDocument(); + for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE; i++) { + for (Field f : doc.getFields()) { + f.setStringValue(f.name() + " " + f.name() + "_2"); + } + iw.addDocument(doc); + } + } + + /** tests terms with df % blocksize = 0 */ + public void testDFBlockSizeMultiple() throws Exception { + Document doc = newDocument(); + for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE * 16; i++) { + for (Field f : doc.getFields()) { + f.setStringValue(f.name() + " " + f.name() + "_2"); + } + iw.addDocument(doc); + } + } + + /** tests terms with ttf = blocksize */ + public void testTTFBlockSize() throws Exception { + Document doc = newDocument(); + for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE/2; i++) { + for (Field f : doc.getFields()) { + f.setStringValue(f.name() + " " + f.name() + " " + f.name() + "_2 " + f.name() + "_2"); + } + iw.addDocument(doc); + } + } + + /** tests terms with ttf % blocksize = 0 */ + public void testTTFBlockSizeMultiple() throws Exception { + Document doc = newDocument(); + for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE/2; i++) { + for (Field f : doc.getFields()) { + String proto = (f.name() + " " + f.name() + " " + f.name() + " " + f.name() + " " + + f.name() + "_2 " + f.name() + "_2 " + f.name() + "_2 " + f.name() + "_2"); + StringBuilder val = new StringBuilder(); + for (int j = 0; j < 16; j++) { + val.append(proto); + val.append(" "); + } + f.setStringValue(val.toString()); + } + iw.addDocument(doc); + } + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/TestLucene41PostingsFormat3.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/TestLucene41PostingsFormat3.java new file mode 100644 index 00000000000..30a92734b9f --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/TestLucene41PostingsFormat3.java @@ -0,0 +1,521 @@ +package org.apache.lucene.codecs.lucene41; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.Random; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockFixedLengthPayloadFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.MockVariableLengthPayloadFilter; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.English; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; +import org.apache.lucene.util.automaton.AutomatonTestUtil; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RegExp; + +/** + * Tests partial enumeration (only pulling a subset of the indexed data) + */ +public class TestLucene41PostingsFormat3 extends LuceneTestCase { + static final int MAXDOC = Lucene41PostingsFormat.BLOCK_SIZE * 20; + + // creates 8 fields with different options and does "duels" of fields against each other + public void test() throws Exception { + Directory dir = newDirectory(); + Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(); + if (fieldName.contains("payloadsFixed")) { + TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1); + return new TokenStreamComponents(tokenizer, filter); + } else if (fieldName.contains("payloadsVariable")) { + TokenFilter filter = new MockVariableLengthPayloadFilter(new Random(0), tokenizer); + return new TokenStreamComponents(tokenizer, filter); + } else { + return new TokenStreamComponents(tokenizer); + } + } + }; + IndexWriterConfig iwc = newIndexWriterConfig(analyzer); + iwc.setCodec(new Lucene41RWCodec()); + // TODO we could actually add more fields implemented with different PFs + // or, just put this test into the usual rotation? 
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + Document doc = new Document(); + FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED); + // turn this on for a cross-check + docsOnlyType.setStoreTermVectors(true); + docsOnlyType.setIndexOptions(IndexOptions.DOCS_ONLY); + + FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED); + // turn this on for a cross-check + docsAndFreqsType.setStoreTermVectors(true); + docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS); + + FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED); + // turn these on for a cross-check + positionsType.setStoreTermVectors(true); + positionsType.setStoreTermVectorPositions(true); + positionsType.setStoreTermVectorOffsets(true); + positionsType.setStoreTermVectorPayloads(true); + FieldType offsetsType = new FieldType(positionsType); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field field1 = new Field("field1docs", "", docsOnlyType); + Field field2 = new Field("field2freqs", "", docsAndFreqsType); + Field field3 = new Field("field3positions", "", positionsType); + Field field4 = new Field("field4offsets", "", offsetsType); + Field field5 = new Field("field5payloadsFixed", "", positionsType); + Field field6 = new Field("field6payloadsVariable", "", positionsType); + Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType); + Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType); + doc.add(field1); + doc.add(field2); + doc.add(field3); + doc.add(field4); + doc.add(field5); + doc.add(field6); + doc.add(field7); + doc.add(field8); + for (int i = 0; i < MAXDOC; i++) { + String stringValue = Integer.toString(i) + " verycommon " + English.intToEnglish(i).replace('-', ' ') + " " + TestUtil.randomSimpleString(random()); + field1.setStringValue(stringValue); + field2.setStringValue(stringValue); + field3.setStringValue(stringValue); + field4.setStringValue(stringValue); + field5.setStringValue(stringValue); + field6.setStringValue(stringValue); + field7.setStringValue(stringValue); + field8.setStringValue(stringValue); + iw.addDocument(doc); + } + iw.close(); + verify(dir); + TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge + iwc = newIndexWriterConfig(analyzer); + iwc.setCodec(new Lucene41RWCodec()); + iwc.setOpenMode(OpenMode.APPEND); + IndexWriter iw2 = new IndexWriter(dir, iwc); + iw2.forceMerge(1); + iw2.close(); + verify(dir); + dir.close(); + } + + private void verify(Directory dir) throws Exception { + DirectoryReader ir = DirectoryReader.open(dir); + for (LeafReaderContext leaf : ir.leaves()) { + LeafReader leafReader = leaf.reader(); + assertTerms(leafReader.terms("field1docs"), leafReader.terms("field2freqs"), true); + assertTerms(leafReader.terms("field3positions"), leafReader.terms("field4offsets"), true); + assertTerms(leafReader.terms("field4offsets"), leafReader.terms("field5payloadsFixed"), true); + assertTerms(leafReader.terms("field5payloadsFixed"), leafReader.terms("field6payloadsVariable"), true); + assertTerms(leafReader.terms("field6payloadsVariable"), leafReader.terms("field7payloadsFixedOffsets"), true); + assertTerms(leafReader.terms("field7payloadsFixedOffsets"), leafReader.terms("field8payloadsVariableOffsets"), true); + } + ir.close(); + } + + // following code is almost an exact dup of code from TestDuelingCodecs: sorry! 
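// A condensed, illustrative sketch of the "duel" idiom the helpers below implement:
// walk two TermsEnums in lockstep and require identical terms and identical doc ids.
// (Variable names here are made up; the enum/flag calls are the 4.x APIs already used in this file.)
//
//   TermsEnum left = leftTerms.iterator(null);
//   TermsEnum right = rightTerms.iterator(null);
//   BytesRef term;
//   while ((term = left.next()) != null) {
//     assertEquals(term, right.next());
//     DocsEnum leftDocs = left.docs(null, null, DocsEnum.FLAG_NONE);
//     DocsEnum rightDocs = right.docs(null, null, DocsEnum.FLAG_NONE);
//     int doc;
//     while ((doc = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
//       assertEquals(doc, rightDocs.nextDoc());
//     }
//     assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
//   }
//   assertNull(right.next());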
+ + public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception { + if (leftTerms == null || rightTerms == null) { + assertNull(leftTerms); + assertNull(rightTerms); + return; + } + assertTermsStatistics(leftTerms, rightTerms); + + // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different + + TermsEnum leftTermsEnum = leftTerms.iterator(null); + TermsEnum rightTermsEnum = rightTerms.iterator(null); + assertTermsEnum(leftTermsEnum, rightTermsEnum, true); + + assertTermsSeeking(leftTerms, rightTerms); + + if (deep) { + int numIntersections = atLeast(3); + for (int i = 0; i < numIntersections; i++) { + String re = AutomatonTestUtil.randomRegexp(random()); + CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton()); + if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + // TODO: test start term too + TermsEnum leftIntersection = leftTerms.intersect(automaton, null); + TermsEnum rightIntersection = rightTerms.intersect(automaton, null); + assertTermsEnum(leftIntersection, rightIntersection, rarely()); + } + } + } + } + + private void assertTermsSeeking(Terms leftTerms, Terms rightTerms) throws Exception { + TermsEnum leftEnum = null; + TermsEnum rightEnum = null; + + // just an upper bound + int numTests = atLeast(20); + Random random = random(); + + // collect this number of terms from the left side + HashSet tests = new HashSet<>(); + int numPasses = 0; + while (numPasses < 10 && tests.size() < numTests) { + leftEnum = leftTerms.iterator(leftEnum); + BytesRef term = null; + while ((term = leftEnum.next()) != null) { + int code = random.nextInt(10); + if (code == 0) { + // the term + tests.add(BytesRef.deepCopyOf(term)); + } else if (code == 1) { + // truncated subsequence of term + term = BytesRef.deepCopyOf(term); + if (term.length > 0) { + // truncate it + term.length = random.nextInt(term.length); + } + } else if (code == 2) { + // term, but ensure a non-zero offset + byte newbytes[] = new byte[term.length+5]; + System.arraycopy(term.bytes, term.offset, newbytes, 5, term.length); + tests.add(new BytesRef(newbytes, 5, term.length)); + } + } + numPasses++; + } + + ArrayList shuffledTests = new ArrayList<>(tests); + Collections.shuffle(shuffledTests, random); + + for (BytesRef b : shuffledTests) { + leftEnum = leftTerms.iterator(leftEnum); + rightEnum = rightTerms.iterator(rightEnum); + + assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b)); + assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b)); + + SeekStatus leftStatus; + SeekStatus rightStatus; + + leftStatus = leftEnum.seekCeil(b); + rightStatus = rightEnum.seekCeil(b); + assertEquals(leftStatus, rightStatus); + if (leftStatus != SeekStatus.END) { + assertEquals(leftEnum.term(), rightEnum.term()); + } + + leftStatus = leftEnum.seekCeil(b); + rightStatus = rightEnum.seekCeil(b); + assertEquals(leftStatus, rightStatus); + if (leftStatus != SeekStatus.END) { + assertEquals(leftEnum.term(), rightEnum.term()); + } + } + } + + /** + * checks collection-level statistics on Terms + */ + public void assertTermsStatistics(Terms leftTerms, Terms rightTerms) throws Exception { + if (leftTerms.getDocCount() != -1 && rightTerms.getDocCount() != -1) { + assertEquals(leftTerms.getDocCount(), rightTerms.getDocCount()); + } + if (leftTerms.getSumDocFreq() != -1 && rightTerms.getSumDocFreq() != -1) { + assertEquals(leftTerms.getSumDocFreq(), rightTerms.getSumDocFreq()); + } + if (leftTerms.getSumTotalTermFreq() != -1 && 
rightTerms.getSumTotalTermFreq() != -1) { + assertEquals(leftTerms.getSumTotalTermFreq(), rightTerms.getSumTotalTermFreq()); + } + if (leftTerms.size() != -1 && rightTerms.size() != -1) { + assertEquals(leftTerms.size(), rightTerms.size()); + } + } + + /** + * checks the terms enum sequentially + * if deep is false, it does a 'shallow' test that doesnt go down to the docsenums + */ + public void assertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, boolean deep) throws Exception { + BytesRef term; + Bits randomBits = new RandomBits(MAXDOC, random().nextDouble(), random()); + DocsAndPositionsEnum leftPositions = null; + DocsAndPositionsEnum rightPositions = null; + DocsEnum leftDocs = null; + DocsEnum rightDocs = null; + + while ((term = leftTermsEnum.next()) != null) { + assertEquals(term, rightTermsEnum.next()); + assertTermStats(leftTermsEnum, rightTermsEnum); + if (deep) { + // with payloads + off + assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions), + rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions)); + assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions), + rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions)); + + assertPositionsSkipping(leftTermsEnum.docFreq(), + leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions), + rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions)); + assertPositionsSkipping(leftTermsEnum.docFreq(), + leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions), + rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions)); + // with payloads only + assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), + rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS)); + assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), + rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS)); + + assertPositionsSkipping(leftTermsEnum.docFreq(), + leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), + rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS)); + assertPositionsSkipping(leftTermsEnum.docFreq(), + leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), + rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS)); + + // with offsets only + assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), + rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS)); + assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), + rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS)); + + assertPositionsSkipping(leftTermsEnum.docFreq(), + leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), + rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, 
DocsAndPositionsEnum.FLAG_OFFSETS)); + assertPositionsSkipping(leftTermsEnum.docFreq(), + leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), + rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS)); + + // with positions only + assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsEnum.FLAG_NONE), + rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, DocsEnum.FLAG_NONE)); + assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsEnum.FLAG_NONE), + rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsEnum.FLAG_NONE)); + + assertPositionsSkipping(leftTermsEnum.docFreq(), + leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsEnum.FLAG_NONE), + rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, DocsEnum.FLAG_NONE)); + assertPositionsSkipping(leftTermsEnum.docFreq(), + leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsEnum.FLAG_NONE), + rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsEnum.FLAG_NONE)); + + // with freqs: + assertDocsEnum(leftDocs = leftTermsEnum.docs(null, leftDocs), + rightDocs = rightTermsEnum.docs(null, rightDocs)); + assertDocsEnum(leftDocs = leftTermsEnum.docs(randomBits, leftDocs), + rightDocs = rightTermsEnum.docs(randomBits, rightDocs)); + + // w/o freqs: + assertDocsEnum(leftDocs = leftTermsEnum.docs(null, leftDocs, DocsEnum.FLAG_NONE), + rightDocs = rightTermsEnum.docs(null, rightDocs, DocsEnum.FLAG_NONE)); + assertDocsEnum(leftDocs = leftTermsEnum.docs(randomBits, leftDocs, DocsEnum.FLAG_NONE), + rightDocs = rightTermsEnum.docs(randomBits, rightDocs, DocsEnum.FLAG_NONE)); + + // with freqs: + assertDocsSkipping(leftTermsEnum.docFreq(), + leftDocs = leftTermsEnum.docs(null, leftDocs), + rightDocs = rightTermsEnum.docs(null, rightDocs)); + assertDocsSkipping(leftTermsEnum.docFreq(), + leftDocs = leftTermsEnum.docs(randomBits, leftDocs), + rightDocs = rightTermsEnum.docs(randomBits, rightDocs)); + + // w/o freqs: + assertDocsSkipping(leftTermsEnum.docFreq(), + leftDocs = leftTermsEnum.docs(null, leftDocs, DocsEnum.FLAG_NONE), + rightDocs = rightTermsEnum.docs(null, rightDocs, DocsEnum.FLAG_NONE)); + assertDocsSkipping(leftTermsEnum.docFreq(), + leftDocs = leftTermsEnum.docs(randomBits, leftDocs, DocsEnum.FLAG_NONE), + rightDocs = rightTermsEnum.docs(randomBits, rightDocs, DocsEnum.FLAG_NONE)); + } + } + assertNull(rightTermsEnum.next()); + } + + /** + * checks term-level statistics + */ + public void assertTermStats(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum) throws Exception { + assertEquals(leftTermsEnum.docFreq(), rightTermsEnum.docFreq()); + if (leftTermsEnum.totalTermFreq() != -1 && rightTermsEnum.totalTermFreq() != -1) { + assertEquals(leftTermsEnum.totalTermFreq(), rightTermsEnum.totalTermFreq()); + } + } + + /** + * checks docs + freqs + positions + payloads, sequentially + */ + public void assertDocsAndPositionsEnum(DocsAndPositionsEnum leftDocs, DocsAndPositionsEnum rightDocs) throws Exception { + if (leftDocs == null || rightDocs == null) { + assertNull(leftDocs); + assertNull(rightDocs); + return; + } + assertEquals(-1, leftDocs.docID()); + assertEquals(-1, rightDocs.docID()); + int docid; + while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + assertEquals(docid, 
rightDocs.nextDoc()); + int freq = leftDocs.freq(); + assertEquals(freq, rightDocs.freq()); + for (int i = 0; i < freq; i++) { + assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition()); + // we don't assert offsets/payloads, they are allowed to be different + } + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc()); + } + + /** + * checks docs + freqs, sequentially + */ + public void assertDocsEnum(DocsEnum leftDocs, DocsEnum rightDocs) throws Exception { + if (leftDocs == null) { + assertNull(rightDocs); + return; + } + assertEquals(-1, leftDocs.docID()); + assertEquals(-1, rightDocs.docID()); + int docid; + while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + assertEquals(docid, rightDocs.nextDoc()); + // we don't assert freqs, they are allowed to be different + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc()); + } + + /** + * checks advancing docs + */ + public void assertDocsSkipping(int docFreq, DocsEnum leftDocs, DocsEnum rightDocs) throws Exception { + if (leftDocs == null) { + assertNull(rightDocs); + return; + } + int docid = -1; + int averageGap = MAXDOC / (1+docFreq); + int skipInterval = 16; + + while (true) { + if (random().nextBoolean()) { + // nextDoc() + docid = leftDocs.nextDoc(); + assertEquals(docid, rightDocs.nextDoc()); + } else { + // advance() + int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap)); + docid = leftDocs.advance(skip); + assertEquals(docid, rightDocs.advance(skip)); + } + + if (docid == DocIdSetIterator.NO_MORE_DOCS) { + return; + } + // we don't assert freqs, they are allowed to be different + } + } + + /** + * checks advancing docs + positions + */ + public void assertPositionsSkipping(int docFreq, DocsAndPositionsEnum leftDocs, DocsAndPositionsEnum rightDocs) throws Exception { + if (leftDocs == null || rightDocs == null) { + assertNull(leftDocs); + assertNull(rightDocs); + return; + } + + int docid = -1; + int averageGap = MAXDOC / (1+docFreq); + int skipInterval = 16; + + while (true) { + if (random().nextBoolean()) { + // nextDoc() + docid = leftDocs.nextDoc(); + assertEquals(docid, rightDocs.nextDoc()); + } else { + // advance() + int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap)); + docid = leftDocs.advance(skip); + assertEquals(docid, rightDocs.advance(skip)); + } + + if (docid == DocIdSetIterator.NO_MORE_DOCS) { + return; + } + int freq = leftDocs.freq(); + assertEquals(freq, rightDocs.freq()); + for (int i = 0; i < freq; i++) { + assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition()); + // we don't compare the payloads, its allowed that one is empty etc + } + } + } + + private static class RandomBits implements Bits { + FixedBitSet bits; + + RandomBits(int maxDoc, double pctLive, Random random) { + bits = new FixedBitSet(maxDoc); + for (int i = 0; i < maxDoc; i++) { + if (random.nextDouble() <= pctLive) { + bits.set(i); + } + } + } + + @Override + public boolean get(int index) { + return bits.get(index); + } + + @Override + public int length() { + return bits.length(); + } + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/Lucene410RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/Lucene410RWCodec.java new file mode 100644 index 00000000000..c8da82d5c27 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/Lucene410RWCodec.java @@ -0,0 +1,80 @@ +package 
org.apache.lucene.codecs.lucene410; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene41.Lucene41RWPostingsFormat; +import org.apache.lucene.codecs.lucene41.Lucene41RWStoredFieldsFormat; +import org.apache.lucene.codecs.lucene42.Lucene42RWTermVectorsFormat; +import org.apache.lucene.codecs.lucene46.Lucene46RWSegmentInfoFormat; +import org.apache.lucene.codecs.lucene49.Lucene49RWNormsFormat; + +/** + * Read-Write version of 4.10 codec for testing + * @deprecated for test purposes only + */ +@Deprecated +public final class Lucene410RWCodec extends Lucene410Codec { + + private final PostingsFormat postings = new Lucene41RWPostingsFormat(); + + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return postings; + } + + private static final DocValuesFormat docValues = new Lucene410RWDocValuesFormat(); + + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return docValues; + } + + private static final NormsFormat norms = new Lucene49RWNormsFormat(); + + @Override + public NormsFormat normsFormat() { + return norms; + } + + private static final SegmentInfoFormat segmentInfos = new Lucene46RWSegmentInfoFormat(); + + @Override + public SegmentInfoFormat segmentInfoFormat() { + return segmentInfos; + } + + private static final StoredFieldsFormat storedFields = new Lucene41RWStoredFieldsFormat(); + + @Override + public StoredFieldsFormat storedFieldsFormat() { + return storedFields; + } + + private final TermVectorsFormat vectorsFormat = new Lucene42RWTermVectorsFormat(); + + @Override + public TermVectorsFormat termVectorsFormat() { + return vectorsFormat; + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/Lucene410RWDocValuesFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/Lucene410RWDocValuesFormat.java new file mode 100644 index 00000000000..02d7b1e16c1 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/Lucene410RWDocValuesFormat.java @@ -0,0 +1,42 @@ +package org.apache.lucene.codecs.lucene410; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.SegmentWriteState; + +/** + * Read-Write version of 4.10 docvalues format for testing + * @deprecated for test purposes only + */ +class Lucene410RWDocValuesFormat extends Lucene410DocValuesFormat { + + @Override + public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + return new Lucene410DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION) { + @Override + void checkCanWrite(FieldInfo field) { + // allow writing all fields + } + }; + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene410/TestLucene410DocValuesFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/TestLucene410DocValuesFormat.java similarity index 93% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene410/TestLucene410DocValuesFormat.java rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/TestLucene410DocValuesFormat.java index ca17aa89a1d..baa0eb9bd7f 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene410/TestLucene410DocValuesFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/TestLucene410DocValuesFormat.java @@ -26,8 +26,6 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.asserting.AssertingCodec; -import org.apache.lucene.codecs.blocktreeords.Ords41PostingsFormat; -import org.apache.lucene.codecs.lucene41ords.Lucene41WithOrds; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.SortedSetDocValuesField; @@ -51,7 +49,7 @@ import org.apache.lucene.util.TestUtil; * Tests Lucene410DocValuesFormat */ public class TestLucene410DocValuesFormat extends BaseCompressingDocValuesFormatTestCase { - private final Codec codec = TestUtil.alwaysDocValuesFormat(new Lucene410DocValuesFormat()); + private final Codec codec = new Lucene410RWCodec(); @Override protected Codec getCodec() { @@ -121,18 +119,8 @@ public class TestLucene410DocValuesFormat extends BaseCompressingDocValuesFormat IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); conf.setMergeScheduler(new SerialMergeScheduler()); // set to duel against a codec which has ordinals: - final PostingsFormat pf; - switch (random().nextInt(2)) { - case 0: pf = new Lucene41WithOrds(); - break; - case 1: pf = new Ords41PostingsFormat(); - break; - // TODO: these don't actually support ords! 
- //case 2: pf = new FSTOrdPostingsFormat(); - // break; - default: throw new AssertionError(); - } - final DocValuesFormat dv = new Lucene410DocValuesFormat(); + final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random()); + final DocValuesFormat dv = new Lucene410RWDocValuesFormat(); conf.setCodec(new AssertingCodec() { @Override public PostingsFormat getPostingsFormatForField(String field) { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene42/Lucene42RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene42/Lucene42RWCodec.java index c708d114a21..9c465714c3f 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene42/Lucene42RWCodec.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene42/Lucene42RWCodec.java @@ -20,10 +20,12 @@ package org.apache.lucene.codecs.lucene42; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.lucene40.Lucene40RWSegmentInfoFormat; +import org.apache.lucene.codecs.lucene41.Lucene41RWPostingsFormat; import org.apache.lucene.codecs.lucene41.Lucene41RWStoredFieldsFormat; /** @@ -37,6 +39,13 @@ public final class Lucene42RWCodec extends Lucene42Codec { private static final NormsFormat norms = new Lucene42RWNormsFormat(); private static final StoredFieldsFormat storedFields = new Lucene41RWStoredFieldsFormat(); private static final FieldInfosFormat fieldInfosFormat = new Lucene42RWFieldInfosFormat(); + + private final PostingsFormat postings = new Lucene41RWPostingsFormat(); + + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return postings; + } @Override public DocValuesFormat getDocValuesFormatForField(String field) { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene45/Lucene45RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene45/Lucene45RWCodec.java index a2b2ef45964..6f42569abc5 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene45/Lucene45RWCodec.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene45/Lucene45RWCodec.java @@ -20,10 +20,12 @@ package org.apache.lucene.codecs.lucene45; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.lucene40.Lucene40RWSegmentInfoFormat; +import org.apache.lucene.codecs.lucene41.Lucene41RWPostingsFormat; import org.apache.lucene.codecs.lucene41.Lucene41RWStoredFieldsFormat; import org.apache.lucene.codecs.lucene42.Lucene42RWFieldInfosFormat; import org.apache.lucene.codecs.lucene42.Lucene42RWNormsFormat; @@ -31,10 +33,18 @@ import org.apache.lucene.codecs.lucene42.Lucene42RWTermVectorsFormat; /** * Read-write version of {@link Lucene45Codec} for testing. 
+ * @deprecated for test purposes only */ -@SuppressWarnings("deprecation") +@Deprecated public final class Lucene45RWCodec extends Lucene45Codec { + private final PostingsFormat postings = new Lucene41RWPostingsFormat(); + + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return postings; + } + private static final FieldInfosFormat fieldInfosFormat = new Lucene42RWFieldInfosFormat(); @Override diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene46/Lucene46RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene46/Lucene46RWCodec.java index ea9e00be014..62631c9e539 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene46/Lucene46RWCodec.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene46/Lucene46RWCodec.java @@ -19,9 +19,11 @@ package org.apache.lucene.codecs.lucene46; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene41.Lucene41RWPostingsFormat; import org.apache.lucene.codecs.lucene41.Lucene41RWStoredFieldsFormat; import org.apache.lucene.codecs.lucene42.Lucene42RWNormsFormat; import org.apache.lucene.codecs.lucene42.Lucene42RWTermVectorsFormat; @@ -34,6 +36,13 @@ import org.apache.lucene.codecs.lucene45.Lucene45RWDocValuesFormat; @Deprecated public final class Lucene46RWCodec extends Lucene46Codec { + private final PostingsFormat postings = new Lucene41RWPostingsFormat(); + + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return postings; + } + private static final DocValuesFormat docValues = new Lucene45RWDocValuesFormat(); @Override diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene49/Lucene49RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene49/Lucene49RWCodec.java index 3f24c13b3f8..c000488ef0b 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene49/Lucene49RWCodec.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene49/Lucene49RWCodec.java @@ -19,9 +19,11 @@ package org.apache.lucene.codecs.lucene49; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene41.Lucene41RWPostingsFormat; import org.apache.lucene.codecs.lucene41.Lucene41RWStoredFieldsFormat; import org.apache.lucene.codecs.lucene42.Lucene42RWTermVectorsFormat; import org.apache.lucene.codecs.lucene46.Lucene46RWSegmentInfoFormat; @@ -33,6 +35,13 @@ import org.apache.lucene.codecs.lucene46.Lucene46RWSegmentInfoFormat; @Deprecated public final class Lucene49RWCodec extends Lucene49Codec { + private final PostingsFormat postings = new Lucene41RWPostingsFormat(); + + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return postings; + } + private static final DocValuesFormat docValues = new Lucene49RWDocValuesFormat(); @Override diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java 
b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index 240c7c9bfab..f131d3f3e37 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -1075,8 +1075,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase { } private int checkAllSegmentsUpgraded(Directory dir) throws IOException { - final SegmentInfos infos = new SegmentInfos(); - infos.read(dir); + final SegmentInfos infos = SegmentInfos.readLatestCommit(dir); if (VERBOSE) { System.out.println("checkAllSegmentsUpgraded: " + infos); } @@ -1087,8 +1086,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase { } private int getNumberOfSegments(Directory dir) throws IOException { - final SegmentInfos infos = new SegmentInfos(); - infos.read(dir); + final SegmentInfos infos = SegmentInfos.readLatestCommit(dir); return infos.size(); } @@ -1306,7 +1304,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase { writer.forceMerge(1); writer.commit(); writer.rollback(); - new SegmentInfos().read(dir); + SegmentInfos.readLatestCommit(dir); dir.close(); } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/TestDocValuesUpdatesOnOldSegments.java b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestDocValuesUpdatesOnOldSegments.java index 46f983b51ae..188bc0d4ba2 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/index/TestDocValuesUpdatesOnOldSegments.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestDocValuesUpdatesOnOldSegments.java @@ -32,7 +32,10 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; - +/** + * Tests performing docvalues updates against versions of lucene + * that did not support it. 
+ */ public class TestDocValuesUpdatesOnOldSegments extends LuceneTestCase { static long getValue(BinaryDocValues bdv, int idx) { @@ -60,56 +63,62 @@ public class TestDocValuesUpdatesOnOldSegments extends LuceneTestCase { public void testBinaryUpdates() throws Exception { Codec[] oldCodecs = new Codec[] { new Lucene40RWCodec(), new Lucene41RWCodec(), new Lucene42RWCodec(), new Lucene45RWCodec() }; - Directory dir = newDirectory(); - - // create a segment with an old Codec - IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); - conf.setCodec(oldCodecs[random().nextInt(oldCodecs.length)]); - IndexWriter writer = new IndexWriter(dir, conf); - Document doc = new Document(); - doc.add(new StringField("id", "doc", Store.NO)); - doc.add(new BinaryDocValuesField("f", toBytes(5L))); - writer.addDocument(doc); - writer.close(); - - conf = newIndexWriterConfig(new MockAnalyzer(random())); - writer = new IndexWriter(dir, conf); - writer.updateBinaryDocValue(new Term("id", "doc"), "f", toBytes(4L)); - try { + + for (Codec codec : oldCodecs) { + Directory dir = newDirectory(); + + // create a segment with an old Codec + IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); + conf.setCodec(codec); + IndexWriter writer = new IndexWriter(dir, conf); + Document doc = new Document(); + doc.add(new StringField("id", "doc", Store.NO)); + doc.add(new BinaryDocValuesField("f", toBytes(5L))); + writer.addDocument(doc); writer.close(); - fail("should not have succeeded to update a segment written with an old Codec"); - } catch (UnsupportedOperationException e) { - writer.rollback(); + + conf = newIndexWriterConfig(new MockAnalyzer(random())); + writer = new IndexWriter(dir, conf); + writer.updateBinaryDocValue(new Term("id", "doc"), "f", toBytes(4L)); + try { + writer.close(); + fail("should not have succeeded to update a segment written with an old Codec"); + } catch (UnsupportedOperationException e) { + writer.rollback(); + } + + dir.close(); } - - dir.close(); } public void testNumericUpdates() throws Exception { Codec[] oldCodecs = new Codec[] { new Lucene40RWCodec(), new Lucene41RWCodec(), new Lucene42RWCodec(), new Lucene45RWCodec() }; - Directory dir = newDirectory(); - - // create a segment with an old Codec - IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); - conf.setCodec(oldCodecs[random().nextInt(oldCodecs.length)]); - IndexWriter writer = new IndexWriter(dir, conf); - Document doc = new Document(); - doc.add(new StringField("id", "doc", Store.NO)); - doc.add(new NumericDocValuesField("f", 5)); - writer.addDocument(doc); - writer.close(); - - conf = newIndexWriterConfig(new MockAnalyzer(random())); - writer = new IndexWriter(dir, conf); - writer.updateNumericDocValue(new Term("id", "doc"), "f", 4L); - try { + + for (Codec codec : oldCodecs) { + Directory dir = newDirectory(); + + // create a segment with an old Codec + IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); + conf.setCodec(codec); + IndexWriter writer = new IndexWriter(dir, conf); + Document doc = new Document(); + doc.add(new StringField("id", "doc", Store.NO)); + doc.add(new NumericDocValuesField("f", 5)); + writer.addDocument(doc); writer.close(); - fail("should not have succeeded to update a segment written with an old Codec"); - } catch (UnsupportedOperationException e) { - writer.rollback(); + + conf = newIndexWriterConfig(new MockAnalyzer(random())); + writer = new IndexWriter(dir, conf); + writer.updateNumericDocValue(new Term("id", "doc"), 
"f", 4L); + try { + writer.close(); + fail("should not have succeeded to update a segment written with an old Codec"); + } catch (UnsupportedOperationException e) { + writer.rollback(); + } + + dir.close(); } - - dir.close(); } } diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java index fb8cf26360c..61ed62a91d6 100644 --- a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java +++ b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java @@ -833,8 +833,7 @@ public class TestPerfTasksLogic extends BenchmarkTestCase { ir.close(); // Make sure we have 3 segments: - SegmentInfos infos = new SegmentInfos(); - infos.read(benchmark.getRunData().getDirectory()); + SegmentInfos infos = SegmentInfos.readLatestCommit(benchmark.getRunData().getDirectory()); assertEquals(3, infos.size()); } diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CommitIndexTaskTest.java b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CommitIndexTaskTest.java index 02409880529..57846d434c0 100644 --- a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CommitIndexTaskTest.java +++ b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CommitIndexTaskTest.java @@ -50,8 +50,7 @@ public class CommitIndexTaskTest extends BenchmarkTestCase { CommitIndexTask task = new CommitIndexTask(runData); task.setParams("params"); task.doLogic(); - SegmentInfos infos = new SegmentInfos(); - infos.read(runData.getDirectory()); + SegmentInfos infos = SegmentInfos.readLatestCommit(runData.getDirectory()); assertEquals("params", infos.getUserData().get(OpenReaderTask.USER_DATA)); new CloseIndexTask(runData).doLogic(); } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java index 6cf56ab9e7e..36d73ca6751 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java @@ -113,13 +113,13 @@ public class BlockTermsReader extends FieldsProducer { boolean success = false; try { - CodecUtil.checkSegmentHeader(in, BlockTermsWriter.CODEC_NAME, + CodecUtil.checkIndexHeader(in, BlockTermsWriter.CODEC_NAME, BlockTermsWriter.VERSION_START, BlockTermsWriter.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); // Have PostingsReader init itself - postingsReader.init(in); + postingsReader.init(in, state); // NOTE: data file is too costly to verify checksum against all the bytes on open, // but for now we at least verify proper structure of the checksum footer: which looks diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java index c504751d333..9d143c11848 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java @@ -110,14 +110,14 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { boolean success = false; try { fieldInfos = state.fieldInfos; - CodecUtil.writeSegmentHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + 
CodecUtil.writeIndexHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); currentField = null; this.postingsWriter = postingsWriter; // segment = state.segmentName; //System.out.println("BTW.init seg=" + state.segmentName); - postingsWriter.init(out); // have consumer write its format/header + postingsWriter.init(out, state); // have consumer write its format/header success = true; } finally { if (!success) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java index bd7852081b9..1087fa9d4c9 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java @@ -74,7 +74,7 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase { try { - CodecUtil.checkSegmentHeader(in, FixedGapTermsIndexWriter.CODEC_NAME, + CodecUtil.checkIndexHeader(in, FixedGapTermsIndexWriter.CODEC_NAME, FixedGapTermsIndexWriter.VERSION_CURRENT, FixedGapTermsIndexWriter.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java index dd2008ed234..01625762a54 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java @@ -72,7 +72,7 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase { out = state.directory.createOutput(indexFileName, state.context); boolean success = false; try { - CodecUtil.writeSegmentHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); out.writeVInt(termIndexInterval); out.writeVInt(PackedInts.VERSION_CURRENT); out.writeVInt(BLOCKSIZE); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java index 7456cc5f042..9a40baca269 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java @@ -54,7 +54,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase { try { - CodecUtil.checkSegmentHeader(in, VariableGapTermsIndexWriter.CODEC_NAME, + CodecUtil.checkIndexHeader(in, VariableGapTermsIndexWriter.CODEC_NAME, VariableGapTermsIndexWriter.VERSION_START, VariableGapTermsIndexWriter.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java index d161ecf154c..b213558aeaf 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java @@ -182,7 +182,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase { try { fieldInfos = state.fieldInfos; 
this.policy = policy; - CodecUtil.writeSegmentHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); success = true; } finally { if (!success) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/Ords41PostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/BlockTreeOrdsPostingsFormat.java similarity index 79% rename from lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/Ords41PostingsFormat.java rename to lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/BlockTreeOrdsPostingsFormat.java index 797fe6b5e73..61e8f84cbcb 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/Ords41PostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/BlockTreeOrdsPostingsFormat.java @@ -24,14 +24,14 @@ import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; -/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene41PostingsWriter}. */ -public class Ords41PostingsFormat extends PostingsFormat { +/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene50PostingsWriter}. */ +public class BlockTreeOrdsPostingsFormat extends PostingsFormat { private final int minTermBlockSize; private final int maxTermBlockSize; @@ -45,7 +45,7 @@ public class Ords41PostingsFormat extends PostingsFormat { /** Creates {@code Lucene41PostingsFormat} with default * settings. */ - public Ords41PostingsFormat() { + public BlockTreeOrdsPostingsFormat() { this(OrdsBlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, OrdsBlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); } @@ -53,8 +53,8 @@ public class Ords41PostingsFormat extends PostingsFormat { * values for {@code minBlockSize} and {@code * maxBlockSize} passed to block terms dictionary. 
* @see OrdsBlockTreeTermsWriter#OrdsBlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) */ - public Ords41PostingsFormat(int minTermBlockSize, int maxTermBlockSize) { - super("OrdsLucene41"); + public BlockTreeOrdsPostingsFormat(int minTermBlockSize, int maxTermBlockSize) { + super("BlockTreeOrds"); this.minTermBlockSize = minTermBlockSize; assert minTermBlockSize > 1; this.maxTermBlockSize = maxTermBlockSize; @@ -68,7 +68,7 @@ public class Ords41PostingsFormat extends PostingsFormat { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state); boolean success = false; try { @@ -87,11 +87,7 @@ public class Ords41PostingsFormat extends PostingsFormat { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory, - state.fieldInfos, - state.segmentInfo, - state.context, - state.segmentSuffix); + PostingsReaderBase postingsReader = new Lucene50PostingsReader(state); boolean success = false; try { FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java index b45b90bb618..b34899bfa86 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java @@ -73,7 +73,7 @@ public final class OrdsBlockTreeTermsReader extends FieldsProducer { IndexInput indexIn = null; try { - int version = CodecUtil.checkSegmentHeader(in, OrdsBlockTreeTermsWriter.TERMS_CODEC_NAME, + int version = CodecUtil.checkIndexHeader(in, OrdsBlockTreeTermsWriter.TERMS_CODEC_NAME, OrdsBlockTreeTermsWriter.VERSION_START, OrdsBlockTreeTermsWriter.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); @@ -82,7 +82,7 @@ public final class OrdsBlockTreeTermsReader extends FieldsProducer { state.segmentSuffix, OrdsBlockTreeTermsWriter.TERMS_INDEX_EXTENSION); indexIn = state.directory.openInput(indexFile, state.context); - int indexVersion = CodecUtil.checkSegmentHeader(indexIn, OrdsBlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME, + int indexVersion = CodecUtil.checkIndexHeader(indexIn, OrdsBlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME, OrdsBlockTreeTermsWriter.VERSION_START, OrdsBlockTreeTermsWriter.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); @@ -94,7 +94,7 @@ public final class OrdsBlockTreeTermsReader extends FieldsProducer { CodecUtil.checksumEntireFile(indexIn); // Have PostingsReader init itself - postingsReader.init(in); + postingsReader.init(in, state); // NOTE: data file is too costly to verify checksum against all the bytes on open, diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java index ffa991044c4..a3104220745 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java @@ -203,18 +203,18 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { 
fieldInfos = state.fieldInfos; this.minItemsInBlock = minItemsInBlock; this.maxItemsInBlock = maxItemsInBlock; - CodecUtil.writeSegmentHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION); indexOut = state.directory.createOutput(termsIndexFileName, state.context); - CodecUtil.writeSegmentHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); this.postingsWriter = postingsWriter; // segment = state.segmentInfo.name; // System.out.println("BTW.init seg=" + state.segmentName); - postingsWriter.init(out); // have consumer write its format/header + postingsWriter.init(out, state); // have consumer write its format/header success = true; } finally { if (!success) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java index 87216d6d04b..75064c21ae4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java @@ -541,7 +541,7 @@ public final class OrdsSegmentTermsEnum extends TermsEnum { int cmp = 0; - // TOOD: we should write our vLong backwards (MSB + // TODO: we should write our vLong backwards (MSB // first) to get better sharing from the FST // First compare up to valid seek frames: @@ -555,7 +555,7 @@ public final class OrdsSegmentTermsEnum extends TermsEnum { } arc = arcs[1+targetUpto]; assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); - // TOOD: we could save the outputs in local + // TODO: we could save the outputs in local // byte[][] instead of making new objs ever // seek; but, often the FST doesn't have any // shared bytes (but this could change if we diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java index d84d6378646..fc292b62f3e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java @@ -72,7 +72,7 @@ import org.apache.lucene.util.automaton.CompiledAutomaton; * NumFilteredFields, FilterNumFilteredFields, Footer *
  • Filter --> FieldNumber, FuzzySet
  • *
  • FuzzySet --> See {@link FuzzySet#serialize(DataOutput)}
-  • Header --> {@link CodecUtil#writeSegmentHeader SegmentHeader}
+  • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
  • *
  • DelegatePostingsFormatName --> {@link DataOutput#writeString(String) * String} The name of a ServiceProvider registered {@link PostingsFormat}
  • *
  • NumFilteredFields --> {@link DataOutput#writeInt Uint32}
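As an aside to the layout above: a minimal sketch of reading such a header-plus-footer metadata file with the renamed CodecUtil methods. This is illustrative only; the SketchMetaReader class, the "SketchMeta" codec name and the "skm" extension are hypothetical placeholders, not code from this patch.

// Illustrative sketch (not part of the patch): a small per-segment metadata file
// that starts with the new index header and ends with a codec footer.
import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.ChecksumIndexInput;

class SketchMetaReader {
  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = 0;

  void read(SegmentReadState state) throws IOException {
    String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, "skm");
    try (ChecksumIndexInput in = state.directory.openChecksumInput(fileName, state.context)) {
      // checkIndexHeader verifies codec name, version range, segment ID and suffix.
      CodecUtil.checkIndexHeader(in, "SketchMeta", VERSION_START, VERSION_CURRENT,
                                 state.segmentInfo.getId(), state.segmentSuffix);
      // ... read the per-field entries here ...
      CodecUtil.checkFooter(in); // validates the checksum footer
    }
  }
}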
  • @@ -166,7 +166,7 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat { boolean success = false; try { bloomIn = state.directory.openChecksumInput(bloomFileName, state.context); - CodecUtil.checkSegmentHeader(bloomIn, BLOOM_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.checkIndexHeader(bloomIn, BLOOM_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); // // Load the hash function used in the BloomFilter // hashFunction = HashFunction.forName(bloomIn.readString()); // Load the delegate postings format @@ -502,7 +502,7 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat { IndexOutput bloomOutput = null; try { bloomOutput = state.directory.createOutput(bloomFileName, state.context); - CodecUtil.writeSegmentHeader(bloomOutput, BLOOM_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(bloomOutput, BLOOM_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); // remember the name of the postings format we will delegate to bloomOutput.writeString(delegatePostingsFormat.getName()); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesConsumer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesConsumer.java index 33b8b15a9d8..38b97f7d5e9 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesConsumer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesConsumer.java @@ -52,10 +52,10 @@ class DirectDocValuesConsumer extends DocValuesConsumer { try { String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); data = state.directory.createOutput(dataName, state.context); - CodecUtil.writeSegmentHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); meta = state.directory.createOutput(metaName, state.context); - CodecUtil.writeSegmentHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); success = true; } finally { if (!success) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesProducer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesProducer.java index 206e91f9c07..1673efebd43 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesProducer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesProducer.java @@ -122,7 +122,7 @@ class DirectDocValuesProducer extends DocValuesProducer { ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass())); boolean success = false; try { - version = CodecUtil.checkSegmentHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, + version = CodecUtil.checkIndexHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); numEntries = readFields(in, state.fieldInfos); @@ -140,7 +140,7 @@ class DirectDocValuesProducer extends DocValuesProducer { this.data = state.directory.openInput(dataName, state.context); success = 
false; try { - final int version2 = CodecUtil.checkSegmentHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, + final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); if (version != version2) { throw new CorruptIndexException("Format versions mismatch: meta=" + version + ", data=" + version2, data); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java index 418ae8c6834..d2a7624ce11 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java @@ -26,7 +26,7 @@ import java.util.TreeMap; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; // javadocs +import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat; // javadocs import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo.IndexOptions; @@ -54,7 +54,7 @@ import org.apache.lucene.util.automaton.Transition; // - build depth-N prefix hash? // - or: longer dense skip lists than just next byte? -/** Wraps {@link Lucene41PostingsFormat} format for on-disk +/** Wraps {@link Lucene50PostingsFormat} format for on-disk * storage, but then at read time loads and stores all * terms & postings directly in RAM as byte[], int[]. * @@ -102,12 +102,12 @@ public final class DirectPostingsFormat extends PostingsFormat { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - return PostingsFormat.forName("Lucene41").fieldsConsumer(state); + return PostingsFormat.forName("Lucene50").fieldsConsumer(state); } @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - FieldsProducer postings = PostingsFormat.forName("Lucene41").fieldsProducer(state); + FieldsProducer postings = PostingsFormat.forName("Lucene50").fieldsProducer(state); if (state.context.context != IOContext.Context.MERGE) { FieldsProducer loadedPostings; try { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java index 516ee84d132..98a293bbbd8 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java @@ -25,19 +25,19 @@ import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; /** - * FSTOrd term dict + Lucene41PBF + * FSTOrd term dict + Lucene50PBF */ public final class FSTOrdPostingsFormat extends PostingsFormat { public FSTOrdPostingsFormat() { - super("FSTOrd41"); + 
super("FSTOrd50"); } @Override @@ -47,7 +47,7 @@ public final class FSTOrdPostingsFormat extends PostingsFormat { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state); boolean success = false; try { @@ -63,11 +63,7 @@ public final class FSTOrdPostingsFormat extends PostingsFormat { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory, - state.fieldInfos, - state.segmentInfo, - state.context, - state.segmentSuffix); + PostingsReaderBase postingsReader = new Lucene50PostingsReader(state); boolean success = false; try { FieldsProducer ret = new FSTOrdTermsReader(state, postingsReader); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java index 6537794d06b..65352e7e6f4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java @@ -88,11 +88,11 @@ public class FSTOrdTermsReader extends FieldsProducer { try { indexIn = state.directory.openChecksumInput(termsIndexFileName, state.context); blockIn = state.directory.openInput(termsBlockFileName, state.context); - int version = CodecUtil.checkSegmentHeader(indexIn, FSTOrdTermsWriter.TERMS_INDEX_CODEC_NAME, + int version = CodecUtil.checkIndexHeader(indexIn, FSTOrdTermsWriter.TERMS_INDEX_CODEC_NAME, FSTOrdTermsWriter.VERSION_START, FSTOrdTermsWriter.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); - int version2 = CodecUtil.checkSegmentHeader(blockIn, FSTOrdTermsWriter.TERMS_CODEC_NAME, + int version2 = CodecUtil.checkIndexHeader(blockIn, FSTOrdTermsWriter.TERMS_CODEC_NAME, FSTOrdTermsWriter.VERSION_START, FSTOrdTermsWriter.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); @@ -103,7 +103,7 @@ public class FSTOrdTermsReader extends FieldsProducer { CodecUtil.checksumEntireFile(blockIn); - this.postingsReader.init(blockIn); + this.postingsReader.init(blockIn, state); seekDir(blockIn); final FieldInfos fieldInfos = state.fieldInfos; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java index 5d495bc3836..5549417d72b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java @@ -75,7 +75,7 @@ import org.apache.lucene.util.fst.Util; *
      *
    • TermIndex(.tix) --> Header, TermFSTNumFields, Footer
    • *
    • TermFST --> {@link FST FST<long>}
-    • Header --> {@link CodecUtil#writeSegmentHeader SegmentHeader}
+    • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
    • *
    • Footer --> {@link CodecUtil#writeFooter CodecFooter}
    • *
    * @@ -113,7 +113,7 @@ import org.apache.lucene.util.fst.Util; *
  • StatsBlock --> < DocFreq[Same?], (TotalTermFreq-DocFreq) ? > NumTerms *
  • MetaLongsBlock --> < LongDeltaLongsSize, BytesSize > NumTerms *
  • MetaBytesBlock --> Byte MetaBytesBlockLength
-  • Header --> {@link CodecUtil#writeSegmentHeader CodecHeader}
+  • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
  • *
  • DirOffset --> {@link DataOutput#writeLong Uint64}
  • *
  • NumFields, FieldNumber, DocCount, DocFreq, LongsSize, * FieldNumber, DocCount --> {@link DataOutput#writeVInt VInt}
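The Header entry above is the new index header. A minimal writer-side sketch of the pattern this change applies to each terms dictionary (the SketchTermsWriter class, the "SketchTerms" codec name, the "skt" extension and VERSION_CURRENT are hypothetical placeholders, not code from this patch):

// Illustrative sketch (not part of the patch): open a terms file, stamp it with
// the new index header, then hand it to the postings writer.
import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;

class SketchTermsWriter {
  static final int VERSION_CURRENT = 0;

  void open(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
    String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, "skt");
    IndexOutput out = state.directory.createOutput(fileName, state.context);
    // writeIndexHeader replaces writeSegmentHeader: same layout, but the ID and
    // suffix identify any index file instance, not only per-segment files.
    CodecUtil.writeIndexHeader(out, "SketchTerms", VERSION_CURRENT,
                               state.segmentInfo.getId(), state.segmentSuffix);
    // The postings writer now also receives the SegmentWriteState.
    postingsWriter.init(out, state);
  }
}

Tying the file to its segment ID and suffix at write time means a stale or misplaced file fails fast when the header is checked on open, instead of surfacing later as silent corruption.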
  • @@ -174,11 +174,11 @@ public class FSTOrdTermsWriter extends FieldsConsumer { try { this.indexOut = state.directory.createOutput(termsIndexFileName, state.context); this.blockOut = state.directory.createOutput(termsBlockFileName, state.context); - CodecUtil.writeSegmentHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT, + CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); - CodecUtil.writeSegmentHeader(blockOut, TERMS_CODEC_NAME, VERSION_CURRENT, + CodecUtil.writeIndexHeader(blockOut, TERMS_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); - this.postingsWriter.init(blockOut); + this.postingsWriter.init(blockOut, state); success = true; } finally { if (!success) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java index 4b317830265..f25f3337940 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java @@ -25,19 +25,19 @@ import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; /** - * FST term dict + Lucene41PBF + * FST term dict + Lucene50PBF */ public final class FSTPostingsFormat extends PostingsFormat { public FSTPostingsFormat() { - super("FST41"); + super("FST50"); } @Override @@ -47,7 +47,7 @@ public final class FSTPostingsFormat extends PostingsFormat { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state); boolean success = false; try { @@ -63,11 +63,7 @@ public final class FSTPostingsFormat extends PostingsFormat { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory, - state.fieldInfos, - state.segmentInfo, - state.context, - state.segmentSuffix); + PostingsReaderBase postingsReader = new Lucene50PostingsReader(state); boolean success = false; try { FieldsProducer ret = new FSTTermsReader(state, postingsReader); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java index 4297550e635..f96897810f2 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java @@ -81,12 +81,12 @@ public class FSTTermsReader extends FieldsProducer { boolean success = false; try { - CodecUtil.checkSegmentHeader(in, FSTTermsWriter.TERMS_CODEC_NAME, + CodecUtil.checkIndexHeader(in, FSTTermsWriter.TERMS_CODEC_NAME, FSTTermsWriter.TERMS_VERSION_START, FSTTermsWriter.TERMS_VERSION_CURRENT, state.segmentInfo.getId(), 
state.segmentSuffix); CodecUtil.checksumEntireFile(in); - this.postingsReader.init(in); + this.postingsReader.init(in, state); seekDir(in); final FieldInfos fieldInfos = state.fieldInfos; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java index 2d908e5caeb..836ea70943e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java @@ -90,7 +90,7 @@ import org.apache.lucene.util.fst.Util; *
  • TermFST --> {@link FST FST<TermData>}
  • *
  • TermData --> Flag, BytesSize?, LongDeltaLongsSize?, ByteBytesSize?, * < DocFreq[Same?], (TotalTermFreq-DocFreq) > ?
-  • Header --> {@link CodecUtil#writeSegmentHeader SegmentHeader}
+  • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
  • *
  • DirOffset --> {@link DataOutput#writeLong Uint64}
  • *
  • DocFreq, LongsSize, BytesSize, NumFields, * FieldNumber, DocCount --> {@link DataOutput#writeVInt VInt}
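The DirOffset and Footer entries above pair up at the tail of the file: the directory offset is written immediately before the codec footer, and readers seek to length() - footerLength() - 8 to recover it (see the seekDir changes further down). A minimal sketch, with a hypothetical helper class not taken from this patch:

// Illustrative sketch (not part of the patch): the tail-of-file layout implied by
// DirOffset and Footer above.
import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

class SketchDirOffset {
  // Writer side: the field directory offset goes right before the footer.
  static void finish(IndexOutput out, long dirOffset) throws IOException {
    out.writeLong(dirOffset);     // DirOffset --> Uint64
    CodecUtil.writeFooter(out);   // Footer --> CodecFooter (magic + checksum)
  }

  // Reader side: seek just past the footer to recover the directory offset.
  static long readDirOffset(IndexInput in) throws IOException {
    in.seek(in.length() - CodecUtil.footerLength() - 8);
    return in.readLong();
  }
}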
  • @@ -142,10 +142,10 @@ public class FSTTermsWriter extends FieldsConsumer { boolean success = false; try { - CodecUtil.writeSegmentHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT, + CodecUtil.writeIndexHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); - this.postingsWriter.init(out); + this.postingsWriter.init(out, state); success = true; } finally { if (!success) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java index 33e81af995d..b6e5b1e5bf3 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java @@ -74,10 +74,10 @@ class MemoryDocValuesConsumer extends DocValuesConsumer { try { String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); data = state.directory.createOutput(dataName, state.context); - CodecUtil.writeSegmentHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); meta = state.directory.createOutput(metaName, state.context); - CodecUtil.writeSegmentHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); success = true; } finally { if (!success) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java index 568f13f0d5b..8e2189bf8d6 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java @@ -146,7 +146,7 @@ class MemoryDocValuesProducer extends DocValuesProducer { ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context); boolean success = false; try { - version = CodecUtil.checkSegmentHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, + version = CodecUtil.checkIndexHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); numEntries = readFields(in, state.fieldInfos); CodecUtil.checkFooter(in); @@ -164,7 +164,7 @@ class MemoryDocValuesProducer extends DocValuesProducer { this.data = state.directory.openInput(dataName, state.context); success = false; try { - final int version2 = CodecUtil.checkSegmentHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, + final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); if (version != version2) { throw new CorruptIndexException("Format versions mismatch: meta=" + version + ", data=" + version2, data); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java index 846c34abfb1..e5c93225787 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java +++ 
b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java @@ -288,7 +288,7 @@ public final class MemoryPostingsFormat extends PostingsFormat { out = state.directory.createOutput(fileName, state.context); boolean success = false; try { - CodecUtil.writeSegmentHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); success = true; } finally { if (!success) { @@ -981,7 +981,7 @@ public final class MemoryPostingsFormat extends PostingsFormat { try (ChecksumIndexInput in = state.directory.openChecksumInput(fileName, IOContext.READONCE)) { Throwable priorE = null; try { - CodecUtil.checkSegmentHeader(in, CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.checkIndexHeader(in, CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); while(true) { final int termCount = in.readVInt(); if (termCount == 0) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java index 0b7e37c54a7..a20896ec8e4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java @@ -60,7 +60,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { public static final String SI_EXTENSION = "si"; @Override - public SegmentInfo read(Directory directory, String segmentName, IOContext context) throws IOException { + public SegmentInfo read(Directory directory, String segmentName, byte[] segmentID, IOContext context) throws IOException { BytesRefBuilder scratch = new BytesRefBuilder(); String segFileName = IndexFileNames.segmentFileName(segmentName, "", SimpleTextSegmentInfoFormat.SI_EXTENSION); ChecksumIndexInput input = directory.openChecksumInput(segFileName, context); @@ -114,6 +114,11 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { SimpleTextUtil.readLine(input, scratch); assert StringHelper.startsWith(scratch.get(), SI_ID); final byte[] id = Arrays.copyOfRange(scratch.bytes(), SI_ID.length, scratch.length()); + + if (!Arrays.equals(segmentID, id)) { + throw new CorruptIndexException("file mismatch, expected: " + StringHelper.idToString(segmentID) + + ", got: " + StringHelper.idToString(id), input); + } SimpleTextUtil.checkFooter(input); diff --git a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index a7cf270af48..726d55e5704 100644 --- a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
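The SPI registration below now lists the renamed BlockTreeOrdsPostingsFormat; together with the format-name change shown earlier (super("BlockTreeOrds") instead of "OrdsLucene41"), any code that resolves this format by name must use the new string. A minimal, hypothetical lookup sketch, not code from this patch:

// Illustrative sketch only: resolving the renamed format through the SPI registry.
// FormatLookupSketch is a hypothetical helper.
import org.apache.lucene.codecs.PostingsFormat;

class FormatLookupSketch {
  static PostingsFormat ordsFormat() {
    // "OrdsLucene41" no longer resolves after this rename; use the new name.
    return PostingsFormat.forName("BlockTreeOrds");
  }
}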
-org.apache.lucene.codecs.blocktreeords.Ords41PostingsFormat +org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat org.apache.lucene.codecs.memory.DirectPostingsFormat org.apache.lucene.codecs.memory.FSTOrdPostingsFormat diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/blockterms/TestFixedGapPostingsFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/blockterms/TestFixedGapPostingsFormat.java index 4a4a7c20da6..9cd5e85cb30 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/blockterms/TestFixedGapPostingsFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/blockterms/TestFixedGapPostingsFormat.java @@ -18,7 +18,6 @@ package org.apache.lucene.codecs.blockterms; */ import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene41ords.Lucene41WithOrds; import org.apache.lucene.index.BasePostingsFormatTestCase; import org.apache.lucene.util.TestUtil; @@ -26,7 +25,7 @@ import org.apache.lucene.util.TestUtil; * Basic tests of a PF using FixedGap terms dictionary */ public class TestFixedGapPostingsFormat extends BasePostingsFormatTestCase { - private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene41WithOrds(TestUtil.nextInt(random(), 1, 1000))); + private final Codec codec = TestUtil.alwaysPostingsFormat(new LuceneFixedGap(TestUtil.nextInt(random(), 1, 1000))); @Override protected Codec getCodec() { diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/blockterms/TestVarGapDocFreqIntervalPostingsFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/blockterms/TestVarGapDocFreqIntervalPostingsFormat.java index 59608f86a16..b70dc6ec574 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/blockterms/TestVarGapDocFreqIntervalPostingsFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/blockterms/TestVarGapDocFreqIntervalPostingsFormat.java @@ -18,7 +18,7 @@ package org.apache.lucene.codecs.blockterms; */ import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapFixedInterval; +import org.apache.lucene.codecs.blockterms.LuceneVarGapFixedInterval; import org.apache.lucene.index.BasePostingsFormatTestCase; import org.apache.lucene.util.TestUtil; @@ -26,7 +26,7 @@ import org.apache.lucene.util.TestUtil; * Basic tests of a PF using VariableGap terms dictionary (fixed interval) */ public class TestVarGapDocFreqIntervalPostingsFormat extends BasePostingsFormatTestCase { - private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene41VarGapFixedInterval(TestUtil.nextInt(random(), 1, 1000))); + private final Codec codec = TestUtil.alwaysPostingsFormat(new LuceneVarGapFixedInterval(TestUtil.nextInt(random(), 1, 1000))); @Override protected Codec getCodec() { diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/blockterms/TestVarGapFixedIntervalPostingsFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/blockterms/TestVarGapFixedIntervalPostingsFormat.java index d0935a1c33b..8ea4d1755c0 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/blockterms/TestVarGapFixedIntervalPostingsFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/blockterms/TestVarGapFixedIntervalPostingsFormat.java @@ -18,7 +18,7 @@ package org.apache.lucene.codecs.blockterms; */ import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapDocFreqInterval; +import org.apache.lucene.codecs.blockterms.LuceneVarGapDocFreqInterval; import 
org.apache.lucene.index.BasePostingsFormatTestCase; import org.apache.lucene.util.TestUtil; @@ -26,7 +26,7 @@ import org.apache.lucene.util.TestUtil; * Basic tests of a PF using VariableGap terms dictionary (fixed interval, docFreq threshold) */ public class TestVarGapFixedIntervalPostingsFormat extends BasePostingsFormatTestCase { - private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene41VarGapDocFreqInterval(TestUtil.nextInt(random(), 1, 100), TestUtil.nextInt(random(), 1, 1000))); + private final Codec codec = TestUtil.alwaysPostingsFormat(new LuceneVarGapDocFreqInterval(TestUtil.nextInt(random(), 1, 100), TestUtil.nextInt(random(), 1, 1000))); @Override protected Codec getCodec() { diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/blocktreeords/TestOrdsBlockTree.java b/lucene/codecs/src/test/org/apache/lucene/codecs/blocktreeords/TestOrdsBlockTree.java index 9c418c55038..341b8a39e93 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/blocktreeords/TestOrdsBlockTree.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/blocktreeords/TestOrdsBlockTree.java @@ -39,7 +39,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.TestUtil; public class TestOrdsBlockTree extends BasePostingsFormatTestCase { - private final Codec codec = TestUtil.alwaysPostingsFormat(new Ords41PostingsFormat()); + private final Codec codec = TestUtil.alwaysPostingsFormat(new BlockTreeOrdsPostingsFormat()); @Override protected Codec getCodec() { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java index c9fb98e3873..269d48d1d0b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java @@ -94,46 +94,48 @@ public final class CodecUtil { } /** - * Writes a codec header for a per-segment, which records both a string to - * identify the file, a version number, and the unique ID of the segment. - * This header can be parsed and validated with - * {@link #checkSegmentHeader(DataInput, String, int, int, byte[], String) checkSegmentHeader()}. + * Writes a codec header for an index file, which records both a string to + * identify the format of the file, a version number, and data to identify + * the file instance (ID and auxiliary suffix such as generation). *

    - * CodecSegmentHeader --> CodecHeader,SegmentID,SegmentSuffix + * This header can be parsed and validated with + * {@link #checkIndexHeader(DataInput, String, int, int, byte[], String) checkIndexHeader()}. + *

    + * IndexHeader --> CodecHeader,ObjectID,ObjectSuffix *

      *
    • CodecHeader --> {@link #writeHeader}
-    • SegmentID --> {@link DataOutput#writeByte byte}16
-    • SegmentSuffix --> SuffixLength,SuffixBytes
+    • ObjectID --> {@link DataOutput#writeByte byte}16
+    • ObjectSuffix --> SuffixLength,SuffixBytes
    • SuffixLength --> {@link DataOutput#writeByte byte} *
    • SuffixBytes --> {@link DataOutput#writeByte byte}SuffixLength *
    *

    - * Note that the length of a segment header depends only upon the + * Note that the length of an index header depends only upon the * name of the codec and suffix, so this length can be computed at any time - * with {@link #segmentHeaderLength(String,String)}. + * with {@link #indexHeaderLength(String,String)}. * * @param out Output stream - * @param codec String to identify this file. It should be simple ASCII, + * @param codec String to identify the format of this file. It should be simple ASCII, * less than 128 characters in length. - * @param segmentID Unique identifier for the segment - * @param segmentSuffix auxiliary suffix for the file. It should be simple ASCII, + * @param id Unique identifier for this particular file instance. + * @param suffix auxiliary suffix information for the file. It should be simple ASCII, * less than 256 characters in length. * @param version Version number * @throws IOException If there is an I/O error writing to the underlying medium. * @throws IllegalArgumentException If the codec name is not simple ASCII, or - * is more than 127 characters in length, or if segmentID is invalid, - * or if the segmentSuffix is not simple ASCII, or more than 255 characters + * is more than 127 characters in length, or if id is invalid, + * or if the suffix is not simple ASCII, or more than 255 characters * in length. */ - public static void writeSegmentHeader(DataOutput out, String codec, int version, byte[] segmentID, String segmentSuffix) throws IOException { - if (segmentID.length != StringHelper.ID_LENGTH) { - throw new IllegalArgumentException("Invalid id: " + StringHelper.idToString(segmentID)); + public static void writeIndexHeader(DataOutput out, String codec, int version, byte[] id, String suffix) throws IOException { + if (id.length != StringHelper.ID_LENGTH) { + throw new IllegalArgumentException("Invalid id: " + StringHelper.idToString(id)); } writeHeader(out, codec, version); - out.writeBytes(segmentID, 0, segmentID.length); - BytesRef suffixBytes = new BytesRef(segmentSuffix); - if (suffixBytes.length != segmentSuffix.length() || suffixBytes.length >= 256) { - throw new IllegalArgumentException("codec must be simple ASCII, less than 256 characters in length [got " + segmentSuffix + "]"); + out.writeBytes(id, 0, id.length); + BytesRef suffixBytes = new BytesRef(suffix); + if (suffixBytes.length != suffix.length() || suffixBytes.length >= 256) { + throw new IllegalArgumentException("codec must be simple ASCII, less than 256 characters in length [got " + suffix + "]"); } out.writeByte((byte)suffixBytes.length); out.writeBytes(suffixBytes.bytes, suffixBytes.offset, suffixBytes.length); @@ -151,14 +153,14 @@ public final class CodecUtil { } /** - * Computes the length of a segment header. + * Computes the length of an index header. * * @param codec Codec name. - * @return length of the entire segment header. - * @see #writeSegmentHeader(DataOutput, String, int, byte[], String) + * @return length of the entire index header. 
+ * @see #writeIndexHeader(DataOutput, String, int, byte[], String) */ - public static int segmentHeaderLength(String codec, String segmentSuffix) { - return headerLength(codec) + StringHelper.ID_LENGTH + 1 + segmentSuffix.length(); + public static int indexHeaderLength(String codec, String suffix) { + return headerLength(codec) + StringHelper.ID_LENGTH + 1 + suffix.length(); } /** @@ -220,11 +222,11 @@ public final class CodecUtil { /** * Reads and validates a header previously written with - * {@link #writeSegmentHeader(DataOutput, String, int, byte[], String)}. + * {@link #writeIndexHeader(DataOutput, String, int, byte[], String)}. *

    * When reading a file, supply the expected codec, * expected version range (minVersion to maxVersion), - * and segment ID. + * and object ID and suffix. * * @param in Input stream, positioned at the point where the * header was previously written. Typically this is located @@ -232,41 +234,53 @@ public final class CodecUtil { * @param codec The expected codec name. * @param minVersion The minimum supported expected version number. * @param maxVersion The maximum supported expected version number. - * @param segmentID The expected segment this file belongs to. - * @param segmentSuffix The expected auxiliary segment suffix for this file. + * @param expectedID The expected object identifier for this file. + * @param expectedSuffix The expected auxiliary suffix for this file. * @return The actual version found, when a valid header is found * that matches codec, with an actual version * where minVersion <= actual <= maxVersion, - * and matching segmentID + * and matching expectedID and expectedSuffix * Otherwise an exception is thrown. * @throws CorruptIndexException If the first four bytes are not * {@link #CODEC_MAGIC}, or if the actual codec found is - * not codec, or if the segmentID - * or segmentSuffix do not match. + * not codec, or if the expectedID + * or expectedSuffix do not match. * @throws IndexFormatTooOldException If the actual version is less * than minVersion. * @throws IndexFormatTooNewException If the actual version is greater * than maxVersion. * @throws IOException If there is an I/O error reading from the underlying medium. - * @see #writeSegmentHeader(DataOutput, String, int, byte[],String) + * @see #writeIndexHeader(DataOutput, String, int, byte[],String) */ - public static int checkSegmentHeader(DataInput in, String codec, int minVersion, int maxVersion, byte[] segmentID, String segmentSuffix) throws IOException { + public static int checkIndexHeader(DataInput in, String codec, int minVersion, int maxVersion, byte[] expectedID, String expectedSuffix) throws IOException { int version = checkHeader(in, codec, minVersion, maxVersion); + checkIndexHeaderID(in, expectedID); + checkIndexHeaderSuffix(in, expectedSuffix); + return version; + } + + /** Expert: just reads and verifies the object ID of an index header */ + public static byte[] checkIndexHeaderID(DataInput in, byte[] expectedID) throws IOException { byte id[] = new byte[StringHelper.ID_LENGTH]; in.readBytes(id, 0, id.length); - if (!Arrays.equals(id, segmentID)) { - throw new CorruptIndexException("file mismatch, expected segment id=" + StringHelper.idToString(segmentID) - + ", got=" + StringHelper.idToString(id), in); + if (!Arrays.equals(id, expectedID)) { + throw new CorruptIndexException("file mismatch, expected id=" + StringHelper.idToString(expectedID) + + ", got=" + StringHelper.idToString(id), in); } + return id; + } + + /** Expert: just reads and verifies the suffix of an index header */ + public static String checkIndexHeaderSuffix(DataInput in, String expectedSuffix) throws IOException { int suffixLength = in.readByte() & 0xFF; byte suffixBytes[] = new byte[suffixLength]; in.readBytes(suffixBytes, 0, suffixBytes.length); String suffix = new String(suffixBytes, 0, suffixBytes.length, StandardCharsets.UTF_8); - if (!suffix.equals(segmentSuffix)) { - throw new CorruptIndexException("file mismatch, expected segment suffix=" + segmentSuffix - + ", got=" + suffix, in); + if (!suffix.equals(expectedSuffix)) { + throw new CorruptIndexException("file mismatch, expected suffix=" + expectedSuffix + + ", got=" + 
suffix, in); } - return version; + return suffix; } /** diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsBaseFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsBaseFormat.java deleted file mode 100644 index 198c2082bec..00000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsBaseFormat.java +++ /dev/null @@ -1,55 +0,0 @@ -package org.apache.lucene.codecs; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.index.SegmentReadState; - -/** - * Provides a {@link PostingsReaderBase} and {@link - * PostingsWriterBase}. - * - * @lucene.experimental */ - -// TODO: find a better name; this defines the API that the -// terms dict impls use to talk to a postings impl. -// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer - -// can we clean this up and do this some other way? -// refactor some of these classes and use covariant return? -public abstract class PostingsBaseFormat { - - /** Unique name that's used to retrieve this codec when - * reading the index */ - public final String name; - - /** Sole constructor. */ - protected PostingsBaseFormat(String name) { - this.name = name; - } - - /** Creates the {@link PostingsReaderBase} for this - * format. */ - public abstract PostingsReaderBase postingsReaderBase(SegmentReadState state) throws IOException; - - /** Creates the {@link PostingsWriterBase} for this - * format. */ - public abstract PostingsWriterBase postingsWriterBase(SegmentWriteState state) throws IOException; -} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java index 7145fe9a810..5681c1904bc 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java @@ -23,6 +23,7 @@ import java.io.IOException; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Accountable; @@ -53,7 +54,7 @@ public abstract class PostingsReaderBase implements Closeable, Accountable { /** Performs any initialization, such as reading and * verifying the header from the provided terms * dictionary {@link IndexInput}. 
*/ - public abstract void init(IndexInput termsIn) throws IOException; + public abstract void init(IndexInput termsIn, SegmentReadState state) throws IOException; /** Return a newly created empty TermState */ public abstract BlockTermState newTermState() throws IOException; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java index 6e083164bec..0dc7bb5b41c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java @@ -24,6 +24,7 @@ import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter; import org.apache.lucene.index.DocsAndPositionsEnum; // javadocs import org.apache.lucene.index.DocsEnum; // javadocs import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; @@ -50,7 +51,7 @@ public abstract class PostingsWriterBase implements Closeable { /** Called once after startup, before any terms have been * added. Implementations typically write a header to * the provided {@code termsOut}. */ - public abstract void init(IndexOutput termsOut) throws IOException; + public abstract void init(IndexOutput termsOut, SegmentWriteState state) throws IOException; /** Write all postings for one term; use the provided * {@link TermsEnum} to pull a {@link DocsEnum} or {@link diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java index a310cf65abf..0ce70324612 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java @@ -24,8 +24,6 @@ import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.store.DataOutput; -import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; @@ -72,11 +70,6 @@ public abstract class PushPostingsWriterBase extends PostingsWriterBase { protected PushPostingsWriterBase() { } - /** Called once after startup, before any terms have been - * added. Implementations typically write a header to - * the provided {@code termsOut}. */ - public abstract void init(IndexOutput termsOut) throws IOException; - /** Return a newly created empty TermState */ public abstract BlockTermState newTermState() throws IOException; @@ -90,26 +83,11 @@ public abstract class PushPostingsWriterBase extends PostingsWriterBase { * and will holds metadata from PBF when returned */ public abstract void finishTerm(BlockTermState state) throws IOException; - /** - * Encode metadata as long[] and byte[]. {@code absolute} controls whether - * current term is delta encoded according to latest term. - * Usually elements in {@code longs} are file pointers, so each one always - * increases when a new term is consumed. {@code out} is used to write generic - * bytes, which are not monotonic. - * - * NOTE: sometimes long[] might contain "don't care" values that are unused, e.g. - * the pointer to postings list may not be defined for some terms but is defined - * for others, if it is designed to inline some postings data in term dictionary. 
- * In this case, the postings writer should always use the last value, so that each - * element in metadata long[] remains monotonic. - */ - public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException; - /** * Sets the current field for writing, and returns the * fixed length of long[] metadata (which is fixed per * field), called when the writing switches to another field. */ - // TODO: better name? + @Override public int setField(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; indexOptions = fieldInfo.getIndexOptions(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfoFormat.java index 82af5551269..524c41a2de8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfoFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfoFormat.java @@ -41,10 +41,11 @@ public abstract class SegmentInfoFormat { * Read {@link SegmentInfo} data from a directory. * @param directory directory to read from * @param segmentName name of the segment to read + * @param segmentID expected identifier for the segment * @return infos instance to be populated with data * @throws IOException If an I/O error occurs */ - public abstract SegmentInfo read(Directory directory, String segmentName, IOContext context) throws IOException; + public abstract SegmentInfo read(Directory directory, String segmentName, byte segmentID[], IOContext context) throws IOException; /** * Write {@link SegmentInfo} data. diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java index b72d086b6f5..e01288eccbe 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java @@ -30,17 +30,16 @@ import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.Terms; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.Accountables; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.Outputs; /** A block-based terms index and dictionary that assigns * terms to variable length blocks according to how they @@ -74,8 +73,31 @@ import org.apache.lucene.util.IOUtils; public final class BlockTreeTermsReader extends FieldsProducer { + static final Outputs FST_OUTPUTS = ByteSequenceOutputs.getSingleton(); + + static final BytesRef NO_OUTPUT = FST_OUTPUTS.getNoOutput(); + + static final int OUTPUT_FLAGS_NUM_BITS = 2; + static final int OUTPUT_FLAGS_MASK = 0x3; + static final int OUTPUT_FLAG_IS_FLOOR = 0x1; + static final int OUTPUT_FLAG_HAS_TERMS = 0x2; + + /** Extension of terms file */ + static final String TERMS_EXTENSION = "tim"; + final static String TERMS_CODEC_NAME = "BlockTreeTermsDict"; + + /** 
Initial terms format. */ + public static final int VERSION_START = 0; + + /** Current terms format. */ + public static final int VERSION_CURRENT = VERSION_START; + + /** Extension of terms index file */ + static final String TERMS_INDEX_EXTENSION = "tip"; + final static String TERMS_INDEX_CODEC_NAME = "BlockTreeTermsIndex"; + // Open input to the main terms dict file (_X.tib) - final IndexInput in; + final IndexInput termsIn; //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; @@ -96,105 +118,86 @@ public final class BlockTreeTermsReader extends FieldsProducer { private final int version; /** Sole constructor. */ - public BlockTreeTermsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo info, - PostingsReaderBase postingsReader, IOContext ioContext, - String segmentSuffix) - throws IOException { - - this.postingsReader = postingsReader; - - this.segment = info.name; - in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, BlockTreeTermsWriter.TERMS_EXTENSION), - ioContext); - + public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state) throws IOException { boolean success = false; IndexInput indexIn = null; - + + this.postingsReader = postingsReader; + this.segment = state.segmentInfo.name; + + String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION); try { - version = readHeader(in); - indexIn = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, BlockTreeTermsWriter.TERMS_INDEX_EXTENSION), - ioContext); - int indexVersion = readIndexHeader(indexIn); - if (indexVersion != version) { - throw new CorruptIndexException("mixmatched version files: " + in + "=" + version + "," + indexIn + "=" + indexVersion, indexIn); - } + termsIn = state.directory.openInput(termsName, state.context); + version = CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); - // verify - if (version >= BlockTreeTermsWriter.VERSION_CHECKSUM) { - CodecUtil.checksumEntireFile(indexIn); - } + String indexName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_INDEX_EXTENSION); + indexIn = state.directory.openInput(indexName, state.context); + CodecUtil.checkIndexHeader(indexIn, TERMS_INDEX_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.checksumEntireFile(indexIn); // Have PostingsReader init itself - postingsReader.init(in); - + postingsReader.init(termsIn, state); // NOTE: data file is too costly to verify checksum against all the bytes on open, // but for now we at least verify proper structure of the checksum footer: which looks // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption // such as file truncation. - if (version >= BlockTreeTermsWriter.VERSION_CHECKSUM) { - CodecUtil.retrieveChecksum(in); - } + CodecUtil.retrieveChecksum(termsIn); // Read per-field details - seekDir(in, dirOffset); + seekDir(termsIn, dirOffset); seekDir(indexIn, indexDirOffset); - final int numFields = in.readVInt(); + final int numFields = termsIn.readVInt(); if (numFields < 0) { - throw new CorruptIndexException("invalid numFields: " + numFields, in); + throw new CorruptIndexException("invalid numFields: " + numFields, termsIn); } - for(int i=0;i= BlockTreeTermsWriter.VERSION_META_ARRAY ? in.readVInt() : 0; + final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? 
-1 : termsIn.readVLong(); + final long sumDocFreq = termsIn.readVLong(); + final int docCount = termsIn.readVInt(); + final int longsSize = termsIn.readVInt(); if (longsSize < 0) { - throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, in); + throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn); } - BytesRef minTerm, maxTerm; - if (version >= BlockTreeTermsWriter.VERSION_MIN_MAX_TERMS) { - minTerm = readBytesRef(in); - maxTerm = readBytesRef(in); - } else { - minTerm = maxTerm = null; - } - if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs - throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount(), in); + BytesRef minTerm = readBytesRef(termsIn); + BytesRef maxTerm = readBytesRef(termsIn); + if (docCount < 0 || docCount > state.segmentInfo.getDocCount()) { // #docs with field must be <= #docs + throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.getDocCount(), termsIn); } if (sumDocFreq < docCount) { // #postings must be >= #docs with field - throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, in); + throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsIn); } if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings - throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, in); + throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsIn); } final long indexStartFP = indexIn.readVLong(); FieldReader previous = fields.put(fieldInfo.name, new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, indexStartFP, longsSize, indexIn, minTerm, maxTerm)); if (previous != null) { - throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in); + throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn); } } + indexIn.close(); - success = true; } finally { if (!success) { @@ -212,38 +215,11 @@ public final class BlockTreeTermsReader extends FieldsProducer { return bytes; } - /** Reads terms file header. */ - private int readHeader(IndexInput input) throws IOException { - int version = CodecUtil.checkHeader(input, BlockTreeTermsWriter.TERMS_CODEC_NAME, - BlockTreeTermsWriter.VERSION_START, - BlockTreeTermsWriter.VERSION_CURRENT); - if (version < BlockTreeTermsWriter.VERSION_APPEND_ONLY) { - dirOffset = input.readLong(); - } - return version; - } - - /** Reads index file header. */ - private int readIndexHeader(IndexInput input) throws IOException { - int version = CodecUtil.checkHeader(input, BlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME, - BlockTreeTermsWriter.VERSION_START, - BlockTreeTermsWriter.VERSION_CURRENT); - if (version < BlockTreeTermsWriter.VERSION_APPEND_ONLY) { - indexDirOffset = input.readLong(); - } - return version; - } - /** Seek {@code input} to the directory offset. 
*/ private void seekDir(IndexInput input, long dirOffset) throws IOException { - if (version >= BlockTreeTermsWriter.VERSION_CHECKSUM) { - input.seek(input.length() - CodecUtil.footerLength() - 8); - dirOffset = input.readLong(); - } else if (version >= BlockTreeTermsWriter.VERSION_APPEND_ONLY) { - input.seek(input.length() - 8); - dirOffset = input.readLong(); - } + input.seek(input.length() - CodecUtil.footerLength() - 8); + dirOffset = input.readLong(); input.seek(dirOffset); } @@ -255,7 +231,7 @@ public final class BlockTreeTermsReader extends FieldsProducer { @Override public void close() throws IOException { try { - IOUtils.close(in, postingsReader); + IOUtils.close(termsIn, postingsReader); } finally { // Clear so refs to terms index is GCable even if // app hangs onto us: @@ -313,14 +289,12 @@ public final class BlockTreeTermsReader extends FieldsProducer { } @Override - public void checkIntegrity() throws IOException { - if (version >= BlockTreeTermsWriter.VERSION_CHECKSUM) { - // term dictionary - CodecUtil.checksumEntireFile(in); + public void checkIntegrity() throws IOException { + // term dictionary + CodecUtil.checksumEntireFile(termsIn); - // postings - postingsReader.checkIntegrity(); - } + // postings + postingsReader.checkIntegrity(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java index 39a1becfd01..d9b36762ea2 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java @@ -41,14 +41,12 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.Outputs; import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.packed.PackedInts; @@ -192,10 +190,6 @@ import org.apache.lucene.util.packed.PackedInts; */ public final class BlockTreeTermsWriter extends FieldsConsumer { - static final Outputs FST_OUTPUTS = ByteSequenceOutputs.getSingleton(); - - static final BytesRef NO_OUTPUT = FST_OUTPUTS.getNoOutput(); - /** Suggested default value for the {@code * minItemsInBlock} parameter to {@link * #BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */ @@ -209,38 +203,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { // public final static boolean DEBUG = false; //private final static boolean SAVE_DOT_FILES = false; - static final int OUTPUT_FLAGS_NUM_BITS = 2; - static final int OUTPUT_FLAGS_MASK = 0x3; - static final int OUTPUT_FLAG_IS_FLOOR = 0x1; - static final int OUTPUT_FLAG_HAS_TERMS = 0x2; - - /** Extension of terms file */ - static final String TERMS_EXTENSION = "tim"; - final static String TERMS_CODEC_NAME = "BLOCK_TREE_TERMS_DICT"; - - /** Initial terms format. 
*/ - public static final int VERSION_START = 0; - - /** Append-only */ - public static final int VERSION_APPEND_ONLY = 1; - - /** Meta data as array */ - public static final int VERSION_META_ARRAY = 2; - - /** checksums */ - public static final int VERSION_CHECKSUM = 3; - - /** min/max term */ - public static final int VERSION_MIN_MAX_TERMS = 4; - - /** Current terms format. */ - public static final int VERSION_CURRENT = VERSION_MIN_MAX_TERMS; - - /** Extension of terms index file */ - static final String TERMS_INDEX_EXTENSION = "tip"; - final static String TERMS_INDEX_CODEC_NAME = "BLOCK_TREE_TERMS_INDEX"; - - private final IndexOutput out; + private final IndexOutput termsOut; private final IndexOutput indexOut; final int maxDoc; final int minItemsInBlock; @@ -286,8 +249,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { * sub-blocks) per block will aim to be between * minItemsPerBlock and maxItemsPerBlock, though in some * cases the blocks may be smaller than the min. */ - public BlockTreeTermsWriter( - SegmentWriteState state, + public BlockTreeTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter, int minItemsInBlock, int maxItemsInBlock) @@ -306,47 +268,34 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { throw new IllegalArgumentException("maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock); } - maxDoc = state.segmentInfo.getDocCount(); + this.maxDoc = state.segmentInfo.getDocCount(); + this.fieldInfos = state.fieldInfos; + this.minItemsInBlock = minItemsInBlock; + this.maxItemsInBlock = maxItemsInBlock; + this.postingsWriter = postingsWriter; - final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION); - out = state.directory.createOutput(termsFileName, state.context); + final String termsName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_EXTENSION); + termsOut = state.directory.createOutput(termsName, state.context); boolean success = false; IndexOutput indexOut = null; try { - fieldInfos = state.fieldInfos; - this.minItemsInBlock = minItemsInBlock; - this.maxItemsInBlock = maxItemsInBlock; - writeHeader(out); + CodecUtil.writeIndexHeader(termsOut, BlockTreeTermsReader.TERMS_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT, + state.segmentInfo.getId(), state.segmentSuffix); - //DEBUG = state.segmentName.equals("_4a"); + final String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_INDEX_EXTENSION); + indexOut = state.directory.createOutput(indexName, state.context); + CodecUtil.writeIndexHeader(indexOut, BlockTreeTermsReader.TERMS_INDEX_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT, + state.segmentInfo.getId(), state.segmentSuffix); - final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION); - indexOut = state.directory.createOutput(termsIndexFileName, state.context); - writeIndexHeader(indexOut); - - this.postingsWriter = postingsWriter; - // segment = state.segmentInfo.name; - - // System.out.println("BTW.init seg=" + state.segmentName); - - postingsWriter.init(out); // have consumer write its format/header + postingsWriter.init(termsOut, state); // have consumer write its format/header + + this.indexOut = indexOut; success = true; } finally { if (!success) { - 
IOUtils.closeWhileHandlingException(out, indexOut); + IOUtils.closeWhileHandlingException(termsOut, indexOut); } } - this.indexOut = indexOut; - } - - /** Writes the terms file header. */ - private void writeHeader(IndexOutput out) throws IOException { - CodecUtil.writeHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT); - } - - /** Writes the index file header. */ - private void writeIndexHeader(IndexOutput out) throws IOException { - CodecUtil.writeHeader(out, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT); } /** Writes the terms file trailer. */ @@ -389,7 +338,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) { assert fp < (1L << 62); - return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0); + return (fp << 2) | (hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0); } private static class PendingEntry { @@ -686,7 +635,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { assert end > start; - long startFP = out.getFilePointer(); + long startFP = termsOut.getFilePointer(); boolean hasFloorLeadLabel = isFloor && floorLeadLabel != -1; @@ -701,7 +650,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { // Last block: code |= 1; } - out.writeVInt(code); + termsOut.writeVInt(code); /* if (DEBUG) { @@ -847,18 +796,18 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { // search on lookup // Write suffixes byte[] blob to terms dict output: - out.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0)); - suffixWriter.writeTo(out); + termsOut.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 
1:0)); + suffixWriter.writeTo(termsOut); suffixWriter.reset(); // Write term stats byte[] blob - out.writeVInt((int) statsWriter.getFilePointer()); - statsWriter.writeTo(out); + termsOut.writeVInt((int) statsWriter.getFilePointer()); + statsWriter.writeTo(termsOut); statsWriter.reset(); // Write term meta data byte[] blob - out.writeVInt((int) metaWriter.getFilePointer()); - metaWriter.writeTo(out); + termsOut.writeVInt((int) metaWriter.getFilePointer()); + metaWriter.writeTo(termsOut); metaWriter.reset(); // if (DEBUG) { @@ -1013,38 +962,38 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { boolean success = false; try { - final long dirStart = out.getFilePointer(); + final long dirStart = termsOut.getFilePointer(); final long indexDirStart = indexOut.getFilePointer(); - out.writeVInt(fields.size()); + termsOut.writeVInt(fields.size()); for(FieldMetaData field : fields) { //System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms"); - out.writeVInt(field.fieldInfo.number); + termsOut.writeVInt(field.fieldInfo.number); assert field.numTerms > 0; - out.writeVLong(field.numTerms); - out.writeVInt(field.rootCode.length); - out.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length); + termsOut.writeVLong(field.numTerms); + termsOut.writeVInt(field.rootCode.length); + termsOut.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length); if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { - out.writeVLong(field.sumTotalTermFreq); + termsOut.writeVLong(field.sumTotalTermFreq); } - out.writeVLong(field.sumDocFreq); - out.writeVInt(field.docCount); - out.writeVInt(field.longsSize); + termsOut.writeVLong(field.sumDocFreq); + termsOut.writeVInt(field.docCount); + termsOut.writeVInt(field.longsSize); indexOut.writeVLong(field.indexStartFP); - writeBytesRef(out, field.minTerm); - writeBytesRef(out, field.maxTerm); + writeBytesRef(termsOut, field.minTerm); + writeBytesRef(termsOut, field.maxTerm); } - writeTrailer(out, dirStart); - CodecUtil.writeFooter(out); + writeTrailer(termsOut, dirStart); + CodecUtil.writeFooter(termsOut); writeIndexTrailer(indexOut, indexDirStart); CodecUtil.writeFooter(indexOut); success = true; } finally { if (success) { - IOUtils.close(out, indexOut, postingsWriter); + IOUtils.close(termsOut, indexOut, postingsWriter); } else { - IOUtils.closeWhileHandlingException(out, indexOut, postingsWriter); + IOUtils.closeWhileHandlingException(termsOut, indexOut, postingsWriter); } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java index 37f8873c269..67d0dfbe457 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java @@ -34,8 +34,10 @@ import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.FST; -/** BlockTree's implementation of {@link Terms}. */ -// public for CheckIndex: +/** + * BlockTree's implementation of {@link Terms}. 
+ * @lucene.internal + */ public final class FieldReader extends Terms implements Accountable { private static final long BASE_RAM_BYTES_USED = @@ -77,7 +79,7 @@ public final class FieldReader extends Terms implements Accountable { // System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor); // } - rootBlockFP = (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() >>> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; + rootBlockFP = (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() >>> BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; if (indexIn != null) { final IndexInput clone = indexIn.clone(); @@ -120,8 +122,8 @@ public final class FieldReader extends Terms implements Accountable { } /** For debugging -- used by CheckIndex too*/ - // TODO: maybe push this into Terms? - public Stats computeStats() throws IOException { + @Override + public Stats getStats() throws IOException { return new SegmentTermsEnum(this).computeBlockStats(); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java index a7a569b20b5..a7c6ac01f98 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java @@ -67,7 +67,7 @@ final class IntersectTermsEnum extends TermsEnum { this.fr = fr; runAutomaton = compiled.runAutomaton; compiledAutomaton = compiled; - in = fr.parent.in.clone(); + in = fr.parent.termsIn.clone(); stack = new IntersectTermsEnumFrame[5]; for(int idx=0;idx diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java index 35483f9d746..9aad0b77888 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java @@ -64,7 +64,7 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat { *

    * formatName is the name of the format. This name will be used * in the file formats to perform - * {@link CodecUtil#checkSegmentHeader codec header checks}. + * {@link CodecUtil#checkIndexHeader codec header checks}. *

    * segmentSuffix is the segment suffix. This suffix is added to * the result file name only if it's not the empty string. diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java index a0d694c9c78..32d9b3ff186 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java @@ -118,8 +118,8 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader { Throwable priorE = null; try { final String codecNameIdx = formatName + CODEC_SFX_IDX; - version = CodecUtil.checkSegmentHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix); - assert CodecUtil.segmentHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer(); + version = CodecUtil.checkIndexHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix); + assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer(); indexReader = new CompressingStoredFieldsIndexReader(indexStream, si); maxPointer = indexStream.readVLong(); } catch (Throwable exception) { @@ -141,11 +141,11 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader { throw new CorruptIndexException("Invalid fieldsStream maxPointer (file truncated?): maxPointer=" + maxPointer + ", length=" + fieldsStream.length(), fieldsStream); } final String codecNameDat = formatName + CODEC_SFX_DAT; - final int fieldsVersion = CodecUtil.checkSegmentHeader(fieldsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix); + final int fieldsVersion = CodecUtil.checkIndexHeader(fieldsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix); if (version != fieldsVersion) { throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + fieldsVersion, fieldsStream); } - assert CodecUtil.segmentHeaderLength(codecNameDat, segmentSuffix) == fieldsStream.getFilePointer(); + assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == fieldsStream.getFilePointer(); chunkSize = fieldsStream.readVInt(); packedIntsVersion = fieldsStream.readVInt(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java index 4277646d284..cf1d5f4db42 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java @@ -29,13 +29,9 @@ import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.SegmentReader; import org.apache.lucene.index.StorableField; -import org.apache.lucene.index.StoredDocument; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -118,10 +114,10 @@ public final class 
CompressingStoredFieldsWriter extends StoredFieldsWriter { final String codecNameIdx = formatName + CODEC_SFX_IDX; final String codecNameDat = formatName + CODEC_SFX_DAT; - CodecUtil.writeSegmentHeader(indexStream, codecNameIdx, VERSION_CURRENT, si.getId(), segmentSuffix); - CodecUtil.writeSegmentHeader(fieldsStream, codecNameDat, VERSION_CURRENT, si.getId(), segmentSuffix); - assert CodecUtil.segmentHeaderLength(codecNameDat, segmentSuffix) == fieldsStream.getFilePointer(); - assert CodecUtil.segmentHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer(); + CodecUtil.writeIndexHeader(indexStream, codecNameIdx, VERSION_CURRENT, si.getId(), segmentSuffix); + CodecUtil.writeIndexHeader(fieldsStream, codecNameDat, VERSION_CURRENT, si.getId(), segmentSuffix); + assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == fieldsStream.getFilePointer(); + assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer(); indexWriter = new CompressingStoredFieldsIndexWriter(indexStream); indexStream = null; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java index 1a3ee4d04e7..ea8aada3f03 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java @@ -46,7 +46,7 @@ public class CompressingTermVectorsFormat extends TermVectorsFormat { *

    * formatName is the name of the format. This name will be used * in the file formats to perform - * {@link CodecUtil#checkSegmentHeader codec header checks}. + * {@link CodecUtil#checkIndexHeader codec header checks}. *

    * The compressionMode parameter allows you to choose between * compression algorithms that have various compression and decompression diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java index ead0ecdead9..a545e9a1d36 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java @@ -114,8 +114,8 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem Throwable priorE = null; try { final String codecNameIdx = formatName + CODEC_SFX_IDX; - version = CodecUtil.checkSegmentHeader(input, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix); - assert CodecUtil.segmentHeaderLength(codecNameIdx, segmentSuffix) == input.getFilePointer(); + version = CodecUtil.checkIndexHeader(input, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix); + assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == input.getFilePointer(); indexReader = new CompressingStoredFieldsIndexReader(input, si); input.readVLong(); // the end of the data file } catch (Throwable exception) { @@ -133,11 +133,11 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem final String vectorsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION); vectorsStream = d.openInput(vectorsStreamFN, context); final String codecNameDat = formatName + CODEC_SFX_DAT; - int version2 = CodecUtil.checkSegmentHeader(vectorsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix); + int version2 = CodecUtil.checkIndexHeader(vectorsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix); if (version != version2) { throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + version2, vectorsStream); } - assert CodecUtil.segmentHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer(); + assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer(); long pos = vectorsStream.getFilePointer(); // NOTE: data file is too costly to verify checksum against all the bytes on open, diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java index 75439967bed..1d941cec276 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java @@ -32,11 +32,8 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.SegmentReader; import org.apache.lucene.store.BufferedChecksumIndexInput; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; @@ -231,10 +228,10 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter { final String codecNameIdx = formatName + 
CODEC_SFX_IDX; final String codecNameDat = formatName + CODEC_SFX_DAT; - CodecUtil.writeSegmentHeader(indexStream, codecNameIdx, VERSION_CURRENT, si.getId(), segmentSuffix); - CodecUtil.writeSegmentHeader(vectorsStream, codecNameDat, VERSION_CURRENT, si.getId(), segmentSuffix); - assert CodecUtil.segmentHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer(); - assert CodecUtil.segmentHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer(); + CodecUtil.writeIndexHeader(indexStream, codecNameIdx, VERSION_CURRENT, si.getId(), segmentSuffix); + CodecUtil.writeIndexHeader(vectorsStream, codecNameDat, VERSION_CURRENT, si.getId(), segmentSuffix); + assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer(); + assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer(); indexWriter = new CompressingStoredFieldsIndexWriter(indexStream); indexStream = null; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsBaseFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsBaseFormat.java deleted file mode 100644 index c6931623640..00000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsBaseFormat.java +++ /dev/null @@ -1,51 +0,0 @@ -package org.apache.lucene.codecs.lucene41; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.codecs.PostingsBaseFormat; -import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SegmentWriteState; - -/** - * Provides a {@link PostingsReaderBase} and {@link - * PostingsWriterBase}. - * - * @lucene.experimental */ - -// TODO: should these also be named / looked up via SPI? -public final class Lucene41PostingsBaseFormat extends PostingsBaseFormat { - - /** Sole constructor. 
*/ - public Lucene41PostingsBaseFormat() { - super("Lucene41"); - } - - @Override - public PostingsReaderBase postingsReaderBase(SegmentReadState state) throws IOException { - return new Lucene41PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix); - } - - @Override - public PostingsWriterBase postingsWriterBase(SegmentWriteState state) throws IOException { - return new Lucene41PostingsWriter(state); - } -} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/package.html b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/package.html deleted file mode 100644 index abea0c2767e..00000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/package.html +++ /dev/null @@ -1,25 +0,0 @@ - - - - - - - -Lucene 4.1 file format. - - diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene410/package.html b/lucene/core/src/java/org/apache/lucene/codecs/lucene410/package.html deleted file mode 100755 index cecf6b8316e..00000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene410/package.html +++ /dev/null @@ -1,25 +0,0 @@ - - - - - - - -Lucene 4.10 file format. - - diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/ForUtil.java similarity index 98% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene50/ForUtil.java index 64178590c57..97b99982e3e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/ForUtil.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.lucene41; +package org.apache.lucene.codecs.lucene50; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -27,7 +27,7 @@ import org.apache.lucene.util.packed.PackedInts.Decoder; import org.apache.lucene.util.packed.PackedInts.FormatAndBits; import org.apache.lucene.util.packed.PackedInts; -import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.BLOCK_SIZE; /** * Encode all values in normal area with fixed bit width, diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java index e290a4a77de..80417ef707e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java @@ -106,7 +106,7 @@ public class Lucene50Codec extends Codec { /** Returns the postings format that should be used for writing * new segments of field. * - * The default implementation always returns "Lucene41" + * The default implementation always returns "Lucene50" */ public PostingsFormat getPostingsFormatForField(String field) { return defaultFormat; @@ -115,7 +115,7 @@ public class Lucene50Codec extends Codec { /** Returns the docvalues format that should be used for writing * new segments of field. 
* - * The default implementation always returns "Lucene410" + * The default implementation always returns "Lucene50" */ public DocValuesFormat getDocValuesFormatForField(String field) { return defaultDVFormat; @@ -126,8 +126,8 @@ public class Lucene50Codec extends Codec { return docValuesFormat; } - private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41"); - private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene410"); + private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene50"); + private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene50"); private final NormsFormat normsFormat = new Lucene50NormsFormat(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundFormat.java index aa196d533c8..1c4443acc48 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundFormat.java @@ -46,7 +46,7 @@ import org.apache.lucene.store.IndexOutput; *

  • Compound (.cfs) --> Header, FileData FileCount, Footer
  • Compound Entry Table (.cfe) --> Header, FileCount, <FileName, DataOffset, DataLength> FileCount
- • Header --> {@link CodecUtil#writeSegmentHeader SegmentHeader}
+ • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
  • FileCount --> {@link DataOutput#writeVInt VInt}
  • DataOffset,DataLength,Checksum --> {@link DataOutput#writeLong UInt64}
  • FileName --> {@link DataOutput#writeString String}
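For readers skimming the entry-table layout above, a rough standalone sketch of the same idea follows (plain JDK collections and arrays, not Lucene's Directory/IndexInput API; all names below are illustrative): sub-files are concatenated into one data blob, and an entry table of (name, offset, length) rows is kept alongside it, so opening a sub-file is just a slice.

import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap;
import java.util.Map;

// Illustrative sketch only: concatenate sub-files into one blob and keep an
// entry table of (name -> offset, length), the same idea the .cfs/.cfe pair encodes.
class ToyCompoundFile {
  private static final class Entry {
    final int offset, length;
    Entry(int offset, int length) { this.offset = offset; this.length = length; }
  }

  private final byte[] data;
  private final Map<String, Entry> entries = new LinkedHashMap<>();

  ToyCompoundFile(Map<String, byte[]> files) {
    int total = 0;
    for (byte[] b : files.values()) total += b.length;
    data = new byte[total];
    int pos = 0;
    for (Map.Entry<String, byte[]> e : files.entrySet()) {
      entries.put(e.getKey(), new Entry(pos, e.getValue().length));   // one entry-table row
      System.arraycopy(e.getValue(), 0, data, pos, e.getValue().length);
      pos += e.getValue().length;
    }
  }

  byte[] openSubFile(String name) {                                   // slice by offset/length
    Entry e = entries.get(name);
    byte[] out = new byte[e.length];
    System.arraycopy(data, e.offset, out, 0, e.length);
    return out;
  }

  public static void main(String[] args) {
    Map<String, byte[]> files = new LinkedHashMap<>();
    files.put("_0.tim", "terms".getBytes(StandardCharsets.UTF_8));
    files.put("_0.doc", "postings".getBytes(StandardCharsets.UTF_8));
    ToyCompoundFile cfs = new ToyCompoundFile(files);
    System.out.println(new String(cfs.openSubFile("_0.doc"), StandardCharsets.UTF_8)); // postings
  }
}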
  • @@ -79,8 +79,8 @@ public final class Lucene50CompoundFormat extends CompoundFormat { try (IndexOutput data = dir.createOutput(dataFile, context); IndexOutput entries = dir.createOutput(entriesFile, context)) { - CodecUtil.writeSegmentHeader(data, DATA_CODEC, VERSION_CURRENT, si.getId(), ""); - CodecUtil.writeSegmentHeader(entries, ENTRY_CODEC, VERSION_CURRENT, si.getId(), ""); + CodecUtil.writeIndexHeader(data, DATA_CODEC, VERSION_CURRENT, si.getId(), ""); + CodecUtil.writeIndexHeader(entries, ENTRY_CODEC, VERSION_CURRENT, si.getId(), ""); // write number of files entries.writeVInt(files.size()); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundReader.java index 09696421351..0a1f48535d5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundReader.java @@ -71,7 +71,7 @@ final class Lucene50CompoundReader extends BaseDirectory { boolean success = false; handle = directory.openInput(dataFileName, context); try { - CodecUtil.checkSegmentHeader(handle, Lucene50CompoundFormat.DATA_CODEC, version, version, si.getId(), ""); + CodecUtil.checkIndexHeader(handle, Lucene50CompoundFormat.DATA_CODEC, version, version, si.getId(), ""); // NOTE: data file is too costly to verify checksum against all the bytes on open, // but for now we at least verify proper structure of the checksum footer: which looks @@ -93,7 +93,7 @@ final class Lucene50CompoundReader extends BaseDirectory { try (ChecksumIndexInput entriesStream = dir.openChecksumInput(entriesFileName, IOContext.READONCE)) { Throwable priorE = null; try { - version = CodecUtil.checkSegmentHeader(entriesStream, Lucene50CompoundFormat.ENTRY_CODEC, + version = CodecUtil.checkIndexHeader(entriesStream, Lucene50CompoundFormat.ENTRY_CODEC, Lucene50CompoundFormat.VERSION_START, Lucene50CompoundFormat.VERSION_CURRENT, segmentID, ""); final int numEntries = entriesStream.readVInt(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java new file mode 100644 index 00000000000..f0120c9e130 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java @@ -0,0 +1,586 @@ +package org.apache.lucene.codecs.lucene50; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Closeable; // javadocs +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMOutputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.MathUtil; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.PagedBytes.PagedBytesDataInput; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.packed.DirectWriter; +import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; +import org.apache.lucene.util.packed.PackedInts; + +/** writer for {@link Lucene50DocValuesFormat} */ +class Lucene50DocValuesConsumer extends DocValuesConsumer implements Closeable { + + static final int BLOCK_SIZE = 16384; + + // address terms in blocks of 16 terms + static final int INTERVAL_SHIFT = 4; + static final int INTERVAL_COUNT = 1 << INTERVAL_SHIFT; + static final int INTERVAL_MASK = INTERVAL_COUNT - 1; + + // build reverse index from every 1024th term + static final int REVERSE_INTERVAL_SHIFT = 10; + static final int REVERSE_INTERVAL_COUNT = 1 << REVERSE_INTERVAL_SHIFT; + static final int REVERSE_INTERVAL_MASK = REVERSE_INTERVAL_COUNT - 1; + + // for conversion from reverse index to block + static final int BLOCK_INTERVAL_SHIFT = REVERSE_INTERVAL_SHIFT - INTERVAL_SHIFT; + static final int BLOCK_INTERVAL_COUNT = 1 << BLOCK_INTERVAL_SHIFT; + static final int BLOCK_INTERVAL_MASK = BLOCK_INTERVAL_COUNT - 1; + + /** Compressed using packed blocks of ints. */ + public static final int DELTA_COMPRESSED = 0; + /** Compressed by computing the GCD. */ + public static final int GCD_COMPRESSED = 1; + /** Compressed by giving IDs to unique values. */ + public static final int TABLE_COMPRESSED = 2; + /** Compressed with monotonically increasing values */ + public static final int MONOTONIC_COMPRESSED = 3; + /** Compressed with constant value (uses only missing bitset) */ + public static final int CONST_COMPRESSED = 4; + + /** Uncompressed binary, written directly (fixed length). */ + public static final int BINARY_FIXED_UNCOMPRESSED = 0; + /** Uncompressed binary, written directly (variable length). */ + public static final int BINARY_VARIABLE_UNCOMPRESSED = 1; + /** Compressed binary with shared prefixes */ + public static final int BINARY_PREFIX_COMPRESSED = 2; + + /** Standard storage for sorted set values with 1 level of indirection: + * docId -> address -> ord. */ + public static final int SORTED_WITH_ADDRESSES = 0; + /** Single-valued sorted set values, encoded as sorted values, so no level + * of indirection: docId -> ord. 
*/ + public static final int SORTED_SINGLE_VALUED = 1; + + /** placeholder for missing offset that means there are no missing values */ + public static final int ALL_LIVE = -1; + /** placeholder for missing offset that means all values are missing */ + public static final int ALL_MISSING = -2; + + IndexOutput data, meta; + final int maxDoc; + + /** expert: Creates a new writer */ + public Lucene50DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { + boolean success = false; + try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.createOutput(dataName, state.context); + CodecUtil.writeIndexHeader(data, dataCodec, Lucene50DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + meta = state.directory.createOutput(metaName, state.context); + CodecUtil.writeIndexHeader(meta, metaCodec, Lucene50DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + maxDoc = state.segmentInfo.getDocCount(); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + @Override + public void addNumericField(FieldInfo field, Iterable values) throws IOException { + addNumericField(field, values, true); + } + + void addNumericField(FieldInfo field, Iterable values, boolean optimizeStorage) throws IOException { + long count = 0; + long minValue = Long.MAX_VALUE; + long maxValue = Long.MIN_VALUE; + long gcd = 0; + long missingCount = 0; + long zeroCount = 0; + // TODO: more efficient? + HashSet uniqueValues = null; + if (optimizeStorage) { + uniqueValues = new HashSet<>(); + + for (Number nv : values) { + final long v; + if (nv == null) { + v = 0; + missingCount++; + zeroCount++; + } else { + v = nv.longValue(); + if (v == 0) { + zeroCount++; + } + } + + if (gcd != 1) { + if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { + // in that case v - minValue might overflow and make the GCD computation return + // wrong results. Since these extreme values are unlikely, we just discard + // GCD computation for them + gcd = 1; + } else if (count != 0) { // minValue needs to be set first + gcd = MathUtil.gcd(gcd, v - minValue); + } + } + + minValue = Math.min(minValue, v); + maxValue = Math.max(maxValue, v); + + if (uniqueValues != null) { + if (uniqueValues.add(v)) { + if (uniqueValues.size() > 256) { + uniqueValues = null; + } + } + } + + ++count; + } + } else { + for (Number nv : values) { + long v = nv.longValue(); + minValue = Math.min(minValue, v); + maxValue = Math.max(maxValue, v); + ++count; + } + } + + final long delta = maxValue - minValue; + final int deltaBitsRequired = DirectWriter.unsignedBitsRequired(delta); + final int tableBitsRequired = uniqueValues == null + ? 
Integer.MAX_VALUE + : DirectWriter.bitsRequired(uniqueValues.size() - 1); + + final int format; + if (uniqueValues != null + && count <= Integer.MAX_VALUE + && (uniqueValues.size() == 1 + || (uniqueValues.size() == 2 && missingCount > 0 && zeroCount == missingCount))) { + // either one unique value C or two unique values: "missing" and C + format = CONST_COMPRESSED; + } else if (uniqueValues != null && tableBitsRequired < deltaBitsRequired) { + format = TABLE_COMPRESSED; + } else if (gcd != 0 && gcd != 1) { + final long gcdDelta = (maxValue - minValue) / gcd; + final long gcdBitsRequired = DirectWriter.unsignedBitsRequired(gcdDelta); + format = gcdBitsRequired < deltaBitsRequired ? GCD_COMPRESSED : DELTA_COMPRESSED; + } else { + format = DELTA_COMPRESSED; + } + meta.writeVInt(field.number); + meta.writeByte(Lucene50DocValuesFormat.NUMERIC); + meta.writeVInt(format); + if (missingCount == 0) { + meta.writeLong(ALL_LIVE); + } else if (missingCount == count) { + meta.writeLong(ALL_MISSING); + } else { + meta.writeLong(data.getFilePointer()); + writeMissingBitset(values); + } + meta.writeLong(data.getFilePointer()); + meta.writeVLong(count); + + switch (format) { + case CONST_COMPRESSED: + // write the constant (nonzero value in the n=2 case, singleton value otherwise) + meta.writeLong(minValue < 0 ? Collections.min(uniqueValues) : Collections.max(uniqueValues)); + break; + case GCD_COMPRESSED: + meta.writeLong(minValue); + meta.writeLong(gcd); + final long maxDelta = (maxValue - minValue) / gcd; + final int bits = DirectWriter.unsignedBitsRequired(maxDelta); + meta.writeVInt(bits); + final DirectWriter quotientWriter = DirectWriter.getInstance(data, count, bits); + for (Number nv : values) { + long value = nv == null ? 0 : nv.longValue(); + quotientWriter.add((value - minValue) / gcd); + } + quotientWriter.finish(); + break; + case DELTA_COMPRESSED: + final long minDelta = delta < 0 ? 0 : minValue; + meta.writeLong(minDelta); + meta.writeVInt(deltaBitsRequired); + final DirectWriter writer = DirectWriter.getInstance(data, count, deltaBitsRequired); + for (Number nv : values) { + long v = nv == null ? 0 : nv.longValue(); + writer.add(v - minDelta); + } + writer.finish(); + break; + case TABLE_COMPRESSED: + final Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]); + Arrays.sort(decode); + final HashMap encode = new HashMap<>(); + meta.writeVInt(decode.length); + for (int i = 0; i < decode.length; i++) { + meta.writeLong(decode[i]); + encode.put(decode[i], i); + } + meta.writeVInt(tableBitsRequired); + final DirectWriter ordsWriter = DirectWriter.getInstance(data, count, tableBitsRequired); + for (Number nv : values) { + ordsWriter.add(encode.get(nv == null ? 
0 : nv.longValue())); + } + ordsWriter.finish(); + break; + default: + throw new AssertionError(); + } + meta.writeLong(data.getFilePointer()); + } + + // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on, + // but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode) + void writeMissingBitset(Iterable values) throws IOException { + byte bits = 0; + int count = 0; + for (Object v : values) { + if (count == 8) { + data.writeByte(bits); + count = 0; + bits = 0; + } + if (v != null) { + bits |= 1 << (count & 7); + } + count++; + } + if (count > 0) { + data.writeByte(bits); + } + } + + @Override + public void addBinaryField(FieldInfo field, Iterable values) throws IOException { + // write the byte[] data + meta.writeVInt(field.number); + meta.writeByte(Lucene50DocValuesFormat.BINARY); + int minLength = Integer.MAX_VALUE; + int maxLength = Integer.MIN_VALUE; + final long startFP = data.getFilePointer(); + long count = 0; + long missingCount = 0; + for(BytesRef v : values) { + final int length; + if (v == null) { + length = 0; + missingCount++; + } else { + length = v.length; + } + minLength = Math.min(minLength, length); + maxLength = Math.max(maxLength, length); + if (v != null) { + data.writeBytes(v.bytes, v.offset, v.length); + } + count++; + } + meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED); + if (missingCount == 0) { + meta.writeLong(ALL_LIVE); + } else if (missingCount == count) { + meta.writeLong(ALL_MISSING); + } else { + meta.writeLong(data.getFilePointer()); + writeMissingBitset(values); + } + meta.writeVInt(minLength); + meta.writeVInt(maxLength); + meta.writeVLong(count); + meta.writeLong(startFP); + + // if minLength == maxLength, its a fixed-length byte[], we are done (the addresses are implicit) + // otherwise, we need to record the length fields... 
+ if (minLength != maxLength) { + meta.writeLong(data.getFilePointer()); + meta.writeVInt(PackedInts.VERSION_CURRENT); + meta.writeVInt(BLOCK_SIZE); + + final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE); + long addr = 0; + writer.add(addr); + for (BytesRef v : values) { + if (v != null) { + addr += v.length; + } + writer.add(addr); + } + writer.finish(); + } + } + + /** expert: writes a value dictionary for a sorted/sortedset field */ + private void addTermsDict(FieldInfo field, final Iterable values) throws IOException { + // first check if its a "fixed-length" terms dict + int minLength = Integer.MAX_VALUE; + int maxLength = Integer.MIN_VALUE; + long numValues = 0; + for (BytesRef v : values) { + minLength = Math.min(minLength, v.length); + maxLength = Math.max(maxLength, v.length); + numValues++; + } + if (minLength == maxLength) { + // no index needed: direct addressing by mult + addBinaryField(field, values); + } else if (numValues < REVERSE_INTERVAL_COUNT) { + // low cardinality: waste a few KB of ram, but can't really use fancy index etc + addBinaryField(field, values); + } else { + assert numValues > 0; // we don't have to handle the empty case + // header + meta.writeVInt(field.number); + meta.writeByte(Lucene50DocValuesFormat.BINARY); + meta.writeVInt(BINARY_PREFIX_COMPRESSED); + meta.writeLong(-1L); + // now write the bytes: sharing prefixes within a block + final long startFP = data.getFilePointer(); + // currently, we have to store the delta from expected for every 1/nth term + // we could avoid this, but its not much and less overall RAM than the previous approach! + RAMOutputStream addressBuffer = new RAMOutputStream(); + MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCK_SIZE); + // buffers up 16 terms + RAMOutputStream bytesBuffer = new RAMOutputStream(); + // buffers up block header + RAMOutputStream headerBuffer = new RAMOutputStream(); + BytesRefBuilder lastTerm = new BytesRefBuilder(); + lastTerm.grow(maxLength); + long count = 0; + int suffixDeltas[] = new int[INTERVAL_COUNT]; + for (BytesRef v : values) { + int termPosition = (int) (count & INTERVAL_MASK); + if (termPosition == 0) { + termAddresses.add(data.getFilePointer() - startFP); + // abs-encode first term + headerBuffer.writeVInt(v.length); + headerBuffer.writeBytes(v.bytes, v.offset, v.length); + lastTerm.copyBytes(v); + } else { + // prefix-code: we only share at most 255 characters, to encode the length as a single + // byte and have random access. Larger terms just get less compression. + int sharedPrefix = Math.min(255, StringHelper.bytesDifference(lastTerm.get(), v)); + bytesBuffer.writeByte((byte) sharedPrefix); + bytesBuffer.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix); + // we can encode one smaller, because terms are unique. 
+ suffixDeltas[termPosition] = v.length - sharedPrefix - 1; + } + + count++; + // flush block + if ((count & INTERVAL_MASK) == 0) { + flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas); + } + } + // flush trailing crap + int leftover = (int) (count & INTERVAL_MASK); + if (leftover > 0) { + Arrays.fill(suffixDeltas, leftover, suffixDeltas.length, 0); + flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas); + } + final long indexStartFP = data.getFilePointer(); + // write addresses of indexed terms + termAddresses.finish(); + addressBuffer.writeTo(data); + addressBuffer = null; + termAddresses = null; + meta.writeVInt(minLength); + meta.writeVInt(maxLength); + meta.writeVLong(count); + meta.writeLong(startFP); + meta.writeLong(indexStartFP); + meta.writeVInt(PackedInts.VERSION_CURRENT); + meta.writeVInt(BLOCK_SIZE); + addReverseTermIndex(field, values, maxLength); + } + } + + // writes term dictionary "block" + // first term is absolute encoded as vint length + bytes. + // lengths of subsequent N terms are encoded as either N bytes or N shorts. + // in the double-byte case, the first byte is indicated with -1. + // subsequent terms are encoded as byte suffixLength + bytes. + private void flushTermsDictBlock(RAMOutputStream headerBuffer, RAMOutputStream bytesBuffer, int suffixDeltas[]) throws IOException { + boolean twoByte = false; + for (int i = 1; i < suffixDeltas.length; i++) { + if (suffixDeltas[i] > 254) { + twoByte = true; + } + } + if (twoByte) { + headerBuffer.writeByte((byte)255); + for (int i = 1; i < suffixDeltas.length; i++) { + headerBuffer.writeShort((short) suffixDeltas[i]); + } + } else { + for (int i = 1; i < suffixDeltas.length; i++) { + headerBuffer.writeByte((byte) suffixDeltas[i]); + } + } + headerBuffer.writeTo(data); + headerBuffer.reset(); + bytesBuffer.writeTo(data); + bytesBuffer.reset(); + } + + // writes reverse term index: used for binary searching a term into a range of 64 blocks + // for every 64 blocks (1024 terms) we store a term, trimming any suffix unnecessary for comparison + // terms are written as a contiguous byte[], but never spanning 2^15 byte boundaries. 
+ private void addReverseTermIndex(FieldInfo field, final Iterable values, int maxLength) throws IOException { + long count = 0; + BytesRefBuilder priorTerm = new BytesRefBuilder(); + priorTerm.grow(maxLength); + BytesRef indexTerm = new BytesRef(); + long startFP = data.getFilePointer(); + PagedBytes pagedBytes = new PagedBytes(15); + MonotonicBlockPackedWriter addresses = new MonotonicBlockPackedWriter(data, BLOCK_SIZE); + + for (BytesRef b : values) { + int termPosition = (int) (count & REVERSE_INTERVAL_MASK); + if (termPosition == 0) { + int len = StringHelper.sortKeyLength(priorTerm.get(), b); + indexTerm.bytes = b.bytes; + indexTerm.offset = b.offset; + indexTerm.length = len; + addresses.add(pagedBytes.copyUsingLengthPrefix(indexTerm)); + } else if (termPosition == REVERSE_INTERVAL_MASK) { + priorTerm.copyBytes(b); + } + count++; + } + addresses.finish(); + long numBytes = pagedBytes.getPointer(); + pagedBytes.freeze(true); + PagedBytesDataInput in = pagedBytes.getDataInput(); + meta.writeLong(startFP); + data.writeVLong(numBytes); + data.copyBytes(in, numBytes); + } + + @Override + public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException { + meta.writeVInt(field.number); + meta.writeByte(Lucene50DocValuesFormat.SORTED); + addTermsDict(field, values); + addNumericField(field, docToOrd, false); + } + + @Override + public void addSortedNumericField(FieldInfo field, final Iterable docToValueCount, final Iterable values) throws IOException { + meta.writeVInt(field.number); + meta.writeByte(Lucene50DocValuesFormat.SORTED_NUMERIC); + if (isSingleValued(docToValueCount)) { + meta.writeVInt(SORTED_SINGLE_VALUED); + // The field is single-valued, we can encode it as NUMERIC + addNumericField(field, singletonView(docToValueCount, values, null)); + } else { + meta.writeVInt(SORTED_WITH_ADDRESSES); + // write the stream of values as a numeric field + addNumericField(field, values, true); + // write the doc -> ord count as a absolute index to the stream + addAddresses(field, docToValueCount); + } + } + + @Override + public void addSortedSetField(FieldInfo field, Iterable values, final Iterable docToOrdCount, final Iterable ords) throws IOException { + meta.writeVInt(field.number); + meta.writeByte(Lucene50DocValuesFormat.SORTED_SET); + + if (isSingleValued(docToOrdCount)) { + meta.writeVInt(SORTED_SINGLE_VALUED); + // The field is single-valued, we can encode it as SORTED + addSortedField(field, values, singletonView(docToOrdCount, ords, -1L)); + } else { + meta.writeVInt(SORTED_WITH_ADDRESSES); + + // write the ord -> byte[] as a binary field + addTermsDict(field, values); + + // write the stream of ords as a numeric field + // NOTE: we could return an iterator that delta-encodes these within a doc + addNumericField(field, ords, false); + + // write the doc -> ord count as a absolute index to the stream + addAddresses(field, docToOrdCount); + } + } + + // writes addressing information as MONOTONIC_COMPRESSED integer + private void addAddresses(FieldInfo field, Iterable values) throws IOException { + meta.writeVInt(field.number); + meta.writeByte(Lucene50DocValuesFormat.NUMERIC); + meta.writeVInt(MONOTONIC_COMPRESSED); + meta.writeLong(-1L); + meta.writeLong(data.getFilePointer()); + meta.writeVLong(maxDoc); + meta.writeVInt(PackedInts.VERSION_CURRENT); + meta.writeVInt(BLOCK_SIZE); + + final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE); + long addr = 0; + writer.add(addr); + for (Number v : values) { + addr += 
v.longValue(); + writer.add(addr); + } + writer.finish(); + meta.writeLong(data.getFilePointer()); + } + + @Override + public void close() throws IOException { + boolean success = false; + try { + if (meta != null) { + meta.writeVInt(-1); // write EOF marker + CodecUtil.writeFooter(meta); // write checksum + } + if (data != null) { + CodecUtil.writeFooter(data); // write checksum + } + success = true; + } finally { + if (success) { + IOUtils.close(data, meta); + } else { + IOUtils.closeWhileHandlingException(data, meta); + } + meta = data = null; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java similarity index 90% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesFormat.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java index d2014e497c7..c5808053260 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene410/Lucene410DocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.lucene410; +package org.apache.lucene.codecs.lucene50; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -33,7 +33,7 @@ import org.apache.lucene.util.packed.DirectWriter; import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; /** - * Lucene 4.10 DocValues format. + * Lucene 5.0 DocValues format. *

    * Encodes the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) with these strategies: *

    @@ -49,6 +49,8 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; * common denominator (GCD) is computed, and quotients are stored using Delta-compressed Numerics. *

  • Monotonic-compressed: when all numbers are monotonically increasing offsets, they are written * as blocks of bitpacked integers, encoding the deviation from the expected delta. + *
  • Const-compressed: when there is only one possible non-missing value, only the missing + * bitset is encoded. * *
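To make the numeric strategies above concrete, here is a minimal standalone sketch of the GCD-compressed idea: when all values share a common divisor of their deltas from the minimum, only small quotients need to be bit-packed. Class and variable names are illustrative; this is not the codec's own consumer code.

```java
import java.math.BigInteger;
import java.util.Arrays;

// Sketch of GCD-compressed numerics: store minValue, gcd, and per-doc quotients.
public class GcdCompressSketch {
  public static void main(String[] args) {
    long[] values = {1000, 4000, 7000, 1000, 10000};

    long min = Arrays.stream(values).min().getAsLong();
    long gcd = 0;
    for (long v : values) {
      gcd = BigInteger.valueOf(gcd).gcd(BigInteger.valueOf(v - min)).longValue();
    }

    // the quotients are what would actually be bit-packed on disk
    long[] quotients = new long[values.length];
    for (int i = 0; i < values.length; i++) {
      quotients[i] = gcd == 0 ? 0 : (values[i] - min) / gcd;
    }
    System.out.println("min=" + min + " gcd=" + gcd
        + " quotients=" + Arrays.toString(quotients)); // min=1000 gcd=3000 quotients=[0, 1, 2, 0, 3]

    // decode is just: value = min + gcd * quotient
    long[] decoded = new long[quotients.length];
    for (int i = 0; i < quotients.length; i++) {
      decoded[i] = min + gcd * quotients[i];
    }
    System.out.println("decoded=" + Arrays.toString(decoded)); // matches the original values
  }
}
```

Decoding reverses the arithmetic (min + gcd * quotient), which is the same shape as the GCD_COMPRESSED read path further down in this patch.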

    * {@link DocValuesType#BINARY BINARY}: @@ -110,7 +112,7 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; *

  • SortedNumericEntry --> EntryType,NumericEntry,NumericEntry
  • *
  • FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}
  • *
  • EntryType,CompressionType --> {@link DataOutput#writeByte Byte}
  • - *
  • Header --> {@link CodecUtil#writeHeader CodecHeader}
  • + *
  • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
  • *
  • MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --> {@link DataOutput#writeLong Int64}
  • *
  • TableSize,BitsPerValue --> {@link DataOutput#writeVInt vInt}
  • *
  • Footer --> {@link CodecUtil#writeFooter CodecFooter}
  • @@ -133,6 +135,8 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; * using blocks of delta-encoded ints. *
  • 2 --> table-compressed. When the number of unique numeric values is small and it would save space, * a lookup table of unique values is written, followed by the ordinal for each document. + *
  • 3 --> monotonic-compressed. Used to implement addressing for BINARY, SORTED_SET, SORTED_NUMERIC. + *
  • 4 --> const-compressed. Used when all non-missing values are the same. * *
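The monotonic-compressed case (code 3 above) exists mostly to store addressing data: variable-length values are concatenated, and their cumulative start offsets, which only ever increase, are what gets compressed. A rough plain-Java sketch of that addressing idea, with illustrative names and no Lucene classes:

```java
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

// Sketch of why addressing data is monotonic: concatenate variable-length byte[] values
// and keep count+1 increasing start offsets for random access into the concatenation.
public class AddressingSketch {
  public static void main(String[] args) {
    byte[][] values = {
        "a".getBytes(StandardCharsets.UTF_8),
        "lucene".getBytes(StandardCharsets.UTF_8),
        "doc".getBytes(StandardCharsets.UTF_8),
    };

    long[] addresses = new long[values.length + 1];
    int total = 0;
    for (int i = 0; i < values.length; i++) {
      addresses[i] = total;
      total += values[i].length;
    }
    addresses[values.length] = total;

    byte[] data = new byte[total];
    int pos = 0;
    for (byte[] v : values) {
      System.arraycopy(v, 0, data, pos, v.length);
      pos += v.length;
    }

    // random access to value #1 is the slice [addresses[1], addresses[2])
    int start = (int) addresses[1];
    int len = (int) (addresses[2] - addresses[1]);
    System.out.println(new String(data, start, len, StandardCharsets.UTF_8)); // lucene
    System.out.println(Arrays.toString(addresses)); // [0, 1, 7, 10]
  }
}
```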

    BinaryType indicates how Binary values will be stored: *

      @@ -145,10 +149,7 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; * Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize) * is written for the addresses. *

      MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field. - * If its -1, then there are no missing values. - *

Checksum contains the CRC32 checksum of all bytes in the .dvm file up - * until the checksum. This is used to verify integrity of the file on opening the - * index. + * If it's -1, then there are no missing values. If it's -2, all values are missing. *
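A small hedged sketch of how the MissingOffset sentinels just described could be interpreted on the read side; the -1/-2 constants follow the documented meaning, but the helper itself is illustrative rather than the producer's actual code:

```java
// Sketch of interpreting MissingOffset: -1 means every document has a value,
// -2 means none do, anything else points at a docsWithField bitset.
public class MissingOffsetSketch {
  static final long ALL_LIVE = -1;
  static final long ALL_MISSING = -2;

  /** returns true if doc has a value, given the raw docsWithField bitset bytes */
  static boolean hasValue(long missingOffset, byte[] bitsetBytes, int doc) {
    if (missingOffset == ALL_LIVE) {
      return true;               // no bitset was written at all
    } else if (missingOffset == ALL_MISSING) {
      return false;              // likewise: the field has no values
    } else {
      // otherwise a bitset was written: one bit per document
      return (bitsetBytes[doc >> 3] & (1 << (doc & 7))) != 0;
    }
  }

  public static void main(String[] args) {
    byte[] bits = { (byte) 0b00000101 }; // docs 0 and 2 have values
    System.out.println(hasValue(-1, null, 5)); // true
    System.out.println(hasValue(-2, null, 5)); // false
    System.out.println(hasValue(0, bits, 1));  // false
    System.out.println(hasValue(0, bits, 2));  // true
  }
}
```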

    • *

      The DocValues data or .dvd file.

      *

      For DocValues field, this stores the actual per-document data (the heavy-lifting)

      @@ -164,26 +165,26 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; * * @lucene.experimental */ -public final class Lucene410DocValuesFormat extends DocValuesFormat { +public final class Lucene50DocValuesFormat extends DocValuesFormat { /** Sole Constructor */ - public Lucene410DocValuesFormat() { - super("Lucene410"); + public Lucene50DocValuesFormat() { + super("Lucene50"); } @Override public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - return new Lucene410DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + return new Lucene50DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); } @Override public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { - return new Lucene410DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + return new Lucene50DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); } - static final String DATA_CODEC = "Lucene410DocValuesData"; + static final String DATA_CODEC = "Lucene50DocValuesData"; static final String DATA_EXTENSION = "dvd"; - static final String META_CODEC = "Lucene410ValuesMetadata"; + static final String META_CODEC = "Lucene50DocValuesMetadata"; static final String META_EXTENSION = "dvm"; static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java new file mode 100644 index 00000000000..efae5d4e1b6 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java @@ -0,0 +1,1153 @@ +package org.apache.lucene.codecs.lucene50; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.BINARY_FIXED_UNCOMPRESSED; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.BINARY_PREFIX_COMPRESSED; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.CONST_COMPRESSED; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.DELTA_COMPRESSED; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.GCD_COMPRESSED; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.MONOTONIC_COMPRESSED; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.SORTED_SINGLE_VALUED; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.SORTED_WITH_ADDRESSES; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.TABLE_COMPRESSED; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.INTERVAL_SHIFT; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.INTERVAL_COUNT; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.INTERVAL_MASK; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.REVERSE_INTERVAL_SHIFT; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.REVERSE_INTERVAL_MASK; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.BLOCK_INTERVAL_SHIFT; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.BLOCK_INTERVAL_MASK; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.ALL_LIVE; +import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesConsumer.ALL_MISSING; + +import java.io.Closeable; // javadocs +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.RandomAccessOrds; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.Accountables; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LongValues; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.DirectReader; +import org.apache.lucene.util.packed.MonotonicBlockPackedReader; + +/** reader for {@link Lucene50DocValuesFormat} */ 
+class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable { + private final Map numerics = new HashMap<>(); + private final Map binaries = new HashMap<>(); + private final Map sortedSets = new HashMap<>(); + private final Map sortedNumerics = new HashMap<>(); + private final Map ords = new HashMap<>(); + private final Map ordIndexes = new HashMap<>(); + private final int numFields; + private final AtomicLong ramBytesUsed; + private final IndexInput data; + private final int maxDoc; + + // memory-resident structures + private final Map addressInstances = new HashMap<>(); + private final Map ordIndexInstances = new HashMap<>(); + private final Map reverseIndexInstances = new HashMap<>(); + + private final boolean merging; + + // clone for merge: when merging we don't do any instances.put()s + Lucene50DocValuesProducer(Lucene50DocValuesProducer original) throws IOException { + assert Thread.holdsLock(original); + numerics.putAll(original.numerics); + binaries.putAll(original.binaries); + sortedSets.putAll(original.sortedSets); + sortedNumerics.putAll(original.sortedNumerics); + ords.putAll(original.ords); + ordIndexes.putAll(original.ordIndexes); + numFields = original.numFields; + ramBytesUsed = new AtomicLong(original.ramBytesUsed.get()); + data = original.data.clone(); + maxDoc = original.maxDoc; + + addressInstances.putAll(original.addressInstances); + ordIndexInstances.putAll(original.ordIndexInstances); + reverseIndexInstances.putAll(original.reverseIndexInstances); + merging = true; + } + + /** expert: instantiates a new reader */ + Lucene50DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + this.maxDoc = state.segmentInfo.getDocCount(); + merging = false; + + int version = -1; + int numFields = -1; + + // read in the entries from the metadata file. + try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) { + Throwable priorE = null; + try { + version = CodecUtil.checkIndexHeader(in, metaCodec, + Lucene50DocValuesFormat.VERSION_START, + Lucene50DocValuesFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + numFields = readFields(in, state.fieldInfos); + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(in, priorE); + } + } + + this.numFields = numFields; + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + this.data = state.directory.openInput(dataName, state.context); + boolean success = false; + try { + final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, + Lucene50DocValuesFormat.VERSION_START, + Lucene50DocValuesFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + if (version != version2) { + throw new CorruptIndexException("Format versions mismatch: meta=" + version + ", data=" + version2, data); + } + + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. 
+ CodecUtil.retrieveChecksum(data); + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this.data); + } + } + + ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass())); + } + + private void readSortedField(FieldInfo info, IndexInput meta) throws IOException { + // sorted = binary + numeric + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene50DocValuesFormat.BINARY) { + throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta); + } + BinaryEntry b = readBinaryEntry(meta); + binaries.put(info.name, b); + + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta); + } + NumericEntry n = readNumericEntry(meta); + ords.put(info.name, n); + } + + private void readSortedSetFieldWithAddresses(FieldInfo info, IndexInput meta) throws IOException { + // sortedset = binary + numeric (addresses) + ordIndex + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene50DocValuesFormat.BINARY) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + BinaryEntry b = readBinaryEntry(meta); + binaries.put(info.name, b); + + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + NumericEntry n1 = readNumericEntry(meta); + ords.put(info.name, n1); + + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + NumericEntry n2 = readNumericEntry(meta); + ordIndexes.put(info.name, n2); + } + + private int readFields(IndexInput meta, FieldInfos infos) throws IOException { + int numFields = 0; + int fieldNumber = meta.readVInt(); + while (fieldNumber != -1) { + numFields++; + FieldInfo info = infos.fieldInfo(fieldNumber); + if (info == null) { + // trickier to validate more: because we use multiple entries for "composite" types like sortedset, etc. 
+ throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); + } + byte type = meta.readByte(); + if (type == Lucene50DocValuesFormat.NUMERIC) { + numerics.put(info.name, readNumericEntry(meta)); + } else if (type == Lucene50DocValuesFormat.BINARY) { + BinaryEntry b = readBinaryEntry(meta); + binaries.put(info.name, b); + } else if (type == Lucene50DocValuesFormat.SORTED) { + readSortedField(info, meta); + } else if (type == Lucene50DocValuesFormat.SORTED_SET) { + SortedSetEntry ss = readSortedSetEntry(meta); + sortedSets.put(info.name, ss); + if (ss.format == SORTED_WITH_ADDRESSES) { + readSortedSetFieldWithAddresses(info, meta); + } else if (ss.format == SORTED_SINGLE_VALUED) { + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene50DocValuesFormat.SORTED) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + readSortedField(info, meta); + } else { + throw new AssertionError(); + } + } else if (type == Lucene50DocValuesFormat.SORTED_NUMERIC) { + SortedSetEntry ss = readSortedSetEntry(meta); + sortedNumerics.put(info.name, ss); + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + numerics.put(info.name, readNumericEntry(meta)); + if (ss.format == SORTED_WITH_ADDRESSES) { + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + NumericEntry ordIndex = readNumericEntry(meta); + ordIndexes.put(info.name, ordIndex); + } else if (ss.format != SORTED_SINGLE_VALUED) { + throw new AssertionError(); + } + } else { + throw new CorruptIndexException("invalid type: " + type, meta); + } + fieldNumber = meta.readVInt(); + } + return numFields; + } + + private NumericEntry readNumericEntry(IndexInput meta) throws IOException { + NumericEntry entry = new NumericEntry(); + entry.format = meta.readVInt(); + entry.missingOffset = meta.readLong(); + entry.offset = meta.readLong(); + entry.count = meta.readVLong(); + switch(entry.format) { + case CONST_COMPRESSED: + entry.minValue = meta.readLong(); + if (entry.count > Integer.MAX_VALUE) { + // currently just a limitation e.g. of bits interface and so on. 
+ throw new CorruptIndexException("illegal CONST_COMPRESSED count: " + entry.count, meta); + } + break; + case GCD_COMPRESSED: + entry.minValue = meta.readLong(); + entry.gcd = meta.readLong(); + entry.bitsPerValue = meta.readVInt(); + break; + case TABLE_COMPRESSED: + final int uniqueValues = meta.readVInt(); + if (uniqueValues > 256) { + throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, got=" + uniqueValues, meta); + } + entry.table = new long[uniqueValues]; + for (int i = 0; i < uniqueValues; ++i) { + entry.table[i] = meta.readLong(); + } + entry.bitsPerValue = meta.readVInt(); + break; + case DELTA_COMPRESSED: + entry.minValue = meta.readLong(); + entry.bitsPerValue = meta.readVInt(); + break; + case MONOTONIC_COMPRESSED: + entry.packedIntsVersion = meta.readVInt(); + entry.blockSize = meta.readVInt(); + break; + default: + throw new CorruptIndexException("Unknown format: " + entry.format + ", input=", meta); + } + entry.endOffset = meta.readLong(); + return entry; + } + + static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException { + BinaryEntry entry = new BinaryEntry(); + entry.format = meta.readVInt(); + entry.missingOffset = meta.readLong(); + entry.minLength = meta.readVInt(); + entry.maxLength = meta.readVInt(); + entry.count = meta.readVLong(); + entry.offset = meta.readLong(); + switch(entry.format) { + case BINARY_FIXED_UNCOMPRESSED: + break; + case BINARY_PREFIX_COMPRESSED: + entry.addressesOffset = meta.readLong(); + entry.packedIntsVersion = meta.readVInt(); + entry.blockSize = meta.readVInt(); + entry.reverseIndexOffset = meta.readLong(); + break; + case BINARY_VARIABLE_UNCOMPRESSED: + entry.addressesOffset = meta.readLong(); + entry.packedIntsVersion = meta.readVInt(); + entry.blockSize = meta.readVInt(); + break; + default: + throw new CorruptIndexException("Unknown format: " + entry.format, meta); + } + return entry; + } + + SortedSetEntry readSortedSetEntry(IndexInput meta) throws IOException { + SortedSetEntry entry = new SortedSetEntry(); + entry.format = meta.readVInt(); + if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) { + throw new CorruptIndexException("Unknown format: " + entry.format, meta); + } + return entry; + } + + @Override + public NumericDocValues getNumeric(FieldInfo field) throws IOException { + NumericEntry entry = numerics.get(field.name); + return getNumeric(entry); + } + + @Override + public long ramBytesUsed() { + return ramBytesUsed.get(); + } + + @Override + public synchronized Iterable getChildResources() { + List resources = new ArrayList<>(); + resources.addAll(Accountables.namedAccountables("addresses field", addressInstances)); + resources.addAll(Accountables.namedAccountables("ord index field", ordIndexInstances)); + resources.addAll(Accountables.namedAccountables("reverse index field", reverseIndexInstances)); + return Collections.unmodifiableList(resources); + } + + @Override + public void checkIntegrity() throws IOException { + CodecUtil.checksumEntireFile(data); + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(fields=" + numFields + ")"; + } + + LongValues getNumeric(NumericEntry entry) throws IOException { + switch (entry.format) { + case CONST_COMPRESSED: { + final long constant = entry.minValue; + final Bits live = getLiveBits(entry.missingOffset, (int)entry.count); + return new LongValues() { + @Override + public long get(long index) { + return live.get((int)index) ? 
constant : 0; + } + }; + } + case DELTA_COMPRESSED: { + RandomAccessInput slice = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset); + final long delta = entry.minValue; + final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue); + return new LongValues() { + @Override + public long get(long id) { + return delta + values.get(id); + } + }; + } + case GCD_COMPRESSED: { + RandomAccessInput slice = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset); + final long min = entry.minValue; + final long mult = entry.gcd; + final LongValues quotientReader = DirectReader.getInstance(slice, entry.bitsPerValue); + return new LongValues() { + @Override + public long get(long id) { + return min + mult * quotientReader.get(id); + } + }; + } + case TABLE_COMPRESSED: { + RandomAccessInput slice = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset); + final long table[] = entry.table; + final LongValues ords = DirectReader.getInstance(slice, entry.bitsPerValue); + return new LongValues() { + @Override + public long get(long id) { + return table[(int) ords.get(id)]; + } + }; + } + default: + throw new AssertionError(); + } + } + + @Override + public BinaryDocValues getBinary(FieldInfo field) throws IOException { + BinaryEntry bytes = binaries.get(field.name); + switch(bytes.format) { + case BINARY_FIXED_UNCOMPRESSED: + return getFixedBinary(field, bytes); + case BINARY_VARIABLE_UNCOMPRESSED: + return getVariableBinary(field, bytes); + case BINARY_PREFIX_COMPRESSED: + return getCompressedBinary(field, bytes); + default: + throw new AssertionError(); + } + } + + private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { + final IndexInput data = this.data.slice("fixed-binary", bytes.offset, bytes.count * bytes.maxLength); + + final BytesRef term = new BytesRef(bytes.maxLength); + final byte[] buffer = term.bytes; + final int length = term.length = bytes.maxLength; + + return new LongBinaryDocValues() { + @Override + public BytesRef get(long id) { + try { + data.seek(id * length); + data.readBytes(buffer, 0, buffer.length); + return term; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }; + } + + /** returns an address instance for variable-length binary values. 
*/ + private synchronized MonotonicBlockPackedReader getAddressInstance(FieldInfo field, BinaryEntry bytes) throws IOException { + MonotonicBlockPackedReader addresses = addressInstances.get(field.name); + if (addresses == null) { + data.seek(bytes.addressesOffset); + addresses = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count+1, false); + if (!merging) { + addressInstances.put(field.name, addresses); + ramBytesUsed.addAndGet(addresses.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT); + } + } + return addresses; + } + + private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { + final MonotonicBlockPackedReader addresses = getAddressInstance(field, bytes); + + final IndexInput data = this.data.slice("var-binary", bytes.offset, bytes.addressesOffset - bytes.offset); + final BytesRef term = new BytesRef(Math.max(0, bytes.maxLength)); + final byte buffer[] = term.bytes; + + return new LongBinaryDocValues() { + @Override + public BytesRef get(long id) { + long startAddress = addresses.get(id); + long endAddress = addresses.get(id+1); + int length = (int) (endAddress - startAddress); + try { + data.seek(startAddress); + data.readBytes(buffer, 0, length); + term.length = length; + return term; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }; + } + + /** returns an address instance for prefix-compressed binary values. */ + private synchronized MonotonicBlockPackedReader getIntervalInstance(FieldInfo field, BinaryEntry bytes) throws IOException { + MonotonicBlockPackedReader addresses = addressInstances.get(field.name); + if (addresses == null) { + data.seek(bytes.addressesOffset); + final long size = (bytes.count + INTERVAL_MASK) >>> INTERVAL_SHIFT; + addresses = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, size, false); + if (!merging) { + addressInstances.put(field.name, addresses); + ramBytesUsed.addAndGet(addresses.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT); + } + } + return addresses; + } + + /** returns a reverse lookup instance for prefix-compressed binary values. 
*/ + private synchronized ReverseTermsIndex getReverseIndexInstance(FieldInfo field, BinaryEntry bytes) throws IOException { + ReverseTermsIndex index = reverseIndexInstances.get(field.name); + if (index == null) { + index = new ReverseTermsIndex(); + data.seek(bytes.reverseIndexOffset); + long size = (bytes.count + REVERSE_INTERVAL_MASK) >>> REVERSE_INTERVAL_SHIFT; + index.termAddresses = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, size, false); + long dataSize = data.readVLong(); + PagedBytes pagedBytes = new PagedBytes(15); + pagedBytes.copy(data, dataSize); + index.terms = pagedBytes.freeze(true); + if (!merging) { + reverseIndexInstances.put(field.name, index); + ramBytesUsed.addAndGet(index.ramBytesUsed()); + } + } + return index; + } + + private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { + final MonotonicBlockPackedReader addresses = getIntervalInstance(field, bytes); + final ReverseTermsIndex index = getReverseIndexInstance(field, bytes); + assert addresses.size() > 0; // we don't have to handle empty case + IndexInput slice = data.slice("terms", bytes.offset, bytes.addressesOffset - bytes.offset); + return new CompressedBinaryDocValues(bytes, addresses, index, slice); + } + + @Override + public SortedDocValues getSorted(FieldInfo field) throws IOException { + final int valueCount = (int) binaries.get(field.name).count; + final BinaryDocValues binary = getBinary(field); + NumericEntry entry = ords.get(field.name); + final LongValues ordinals = getNumeric(entry); + return new SortedDocValues() { + + @Override + public int getOrd(int docID) { + return (int) ordinals.get(docID); + } + + @Override + public BytesRef lookupOrd(int ord) { + return binary.get(ord); + } + + @Override + public int getValueCount() { + return valueCount; + } + + @Override + public int lookupTerm(BytesRef key) { + if (binary instanceof CompressedBinaryDocValues) { + return (int) ((CompressedBinaryDocValues)binary).lookupTerm(key); + } else { + return super.lookupTerm(key); + } + } + + @Override + public TermsEnum termsEnum() { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues)binary).getTermsEnum(); + } else { + return super.termsEnum(); + } + } + }; + } + + /** returns an address instance for sortedset ordinal lists */ + private synchronized MonotonicBlockPackedReader getOrdIndexInstance(FieldInfo field, NumericEntry entry) throws IOException { + MonotonicBlockPackedReader instance = ordIndexInstances.get(field.name); + if (instance == null) { + data.seek(entry.offset); + instance = MonotonicBlockPackedReader.of(data, entry.packedIntsVersion, entry.blockSize, entry.count+1, false); + if (!merging) { + ordIndexInstances.put(field.name, instance); + ramBytesUsed.addAndGet(instance.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT); + } + } + return instance; + } + + @Override + public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { + SortedSetEntry ss = sortedNumerics.get(field.name); + NumericEntry numericEntry = numerics.get(field.name); + final LongValues values = getNumeric(numericEntry); + if (ss.format == SORTED_SINGLE_VALUED) { + final Bits docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc); + return DocValues.singleton(values, docsWithField); + } else if (ss.format == SORTED_WITH_ADDRESSES) { + final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name)); + + return new SortedNumericDocValues() { + 
long startOffset; + long endOffset; + + @Override + public void setDocument(int doc) { + startOffset = ordIndex.get(doc); + endOffset = ordIndex.get(doc+1L); + } + + @Override + public long valueAt(int index) { + return values.get(startOffset + index); + } + + @Override + public int count() { + return (int) (endOffset - startOffset); + } + }; + } else { + throw new AssertionError(); + } + } + + @Override + public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { + SortedSetEntry ss = sortedSets.get(field.name); + if (ss.format == SORTED_SINGLE_VALUED) { + final SortedDocValues values = getSorted(field); + return DocValues.singleton(values); + } else if (ss.format != SORTED_WITH_ADDRESSES) { + throw new AssertionError(); + } + + final long valueCount = binaries.get(field.name).count; + // we keep the byte[]s and list of ords on disk, these could be large + final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); + final LongValues ordinals = getNumeric(ords.get(field.name)); + // but the addresses to the ord stream are in RAM + final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name)); + + return new RandomAccessOrds() { + long startOffset; + long offset; + long endOffset; + + @Override + public long nextOrd() { + if (offset == endOffset) { + return NO_MORE_ORDS; + } else { + long ord = ordinals.get(offset); + offset++; + return ord; + } + } + + @Override + public void setDocument(int docID) { + startOffset = offset = ordIndex.get(docID); + endOffset = ordIndex.get(docID+1L); + } + + @Override + public BytesRef lookupOrd(long ord) { + return binary.get(ord); + } + + @Override + public long getValueCount() { + return valueCount; + } + + @Override + public long lookupTerm(BytesRef key) { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues)binary).lookupTerm(key); + } else { + return super.lookupTerm(key); + } + } + + @Override + public TermsEnum termsEnum() { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues)binary).getTermsEnum(); + } else { + return super.termsEnum(); + } + } + + @Override + public long ordAt(int index) { + return ordinals.get(startOffset + index); + } + + @Override + public int cardinality() { + return (int) (endOffset - startOffset); + } + }; + } + + private Bits getLiveBits(final long offset, final int count) throws IOException { + if (offset == ALL_MISSING) { + return new Bits.MatchNoBits(count); + } else if (offset == ALL_LIVE) { + return new Bits.MatchAllBits(count); + } else { + int length = (int) ((count + 7L) >>> 3); + final RandomAccessInput in = data.randomAccessSlice(offset, length); + return new Bits() { + @Override + public boolean get(int index) { + try { + return (in.readByte(index >> 3) & (1 << (index & 7))) != 0; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public int length() { + return count; + } + }; + } + } + + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + switch(field.getDocValuesType()) { + case SORTED_SET: + return DocValues.docsWithValue(getSortedSet(field), maxDoc); + case SORTED_NUMERIC: + return DocValues.docsWithValue(getSortedNumeric(field), maxDoc); + case SORTED: + return DocValues.docsWithValue(getSorted(field), maxDoc); + case BINARY: + BinaryEntry be = binaries.get(field.name); + return getLiveBits(be.missingOffset, maxDoc); + case NUMERIC: + NumericEntry ne = numerics.get(field.name); + return 
getLiveBits(ne.missingOffset, maxDoc); + default: + throw new AssertionError(); + } + } + + @Override + public synchronized DocValuesProducer getMergeInstance() throws IOException { + return new Lucene50DocValuesProducer(this); + } + + @Override + public void close() throws IOException { + data.close(); + } + + /** metadata entry for a numeric docvalues field */ + static class NumericEntry { + private NumericEntry() {} + /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */ + long missingOffset; + /** offset to the actual numeric values */ + public long offset; + /** end offset to the actual numeric values */ + public long endOffset; + /** bits per value used to pack the numeric values */ + public int bitsPerValue; + + int format; + /** packed ints version used to encode these numerics */ + public int packedIntsVersion; + /** count of values written */ + public long count; + /** packed ints blocksize */ + public int blockSize; + + long minValue; + long gcd; + long table[]; + } + + /** metadata entry for a binary docvalues field */ + static class BinaryEntry { + private BinaryEntry() {} + /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */ + long missingOffset; + /** offset to the actual binary values */ + long offset; + + int format; + /** count of values written */ + public long count; + int minLength; + int maxLength; + /** offset to the addressing data that maps a value to its slice of the byte[] */ + public long addressesOffset; + /** offset to the reverse index */ + public long reverseIndexOffset; + /** packed ints version used to encode addressing information */ + public int packedIntsVersion; + /** packed ints blocksize */ + public int blockSize; + } + + /** metadata entry for a sorted-set docvalues field */ + static class SortedSetEntry { + private SortedSetEntry() {} + int format; + } + + // internally we compose complex dv (sorted/sortedset) from other ones + static abstract class LongBinaryDocValues extends BinaryDocValues { + @Override + public final BytesRef get(int docID) { + return get((long)docID); + } + + abstract BytesRef get(long id); + } + + // used for reverse lookup to a small range of blocks + static class ReverseTermsIndex implements Accountable { + public MonotonicBlockPackedReader termAddresses; + public PagedBytes.Reader terms; + + @Override + public long ramBytesUsed() { + return termAddresses.ramBytesUsed() + terms.ramBytesUsed(); + } + + @Override + public Iterable getChildResources() { + List resources = new ArrayList<>(); + resources.add(Accountables.namedAccountable("term bytes", terms)); + resources.add(Accountables.namedAccountable("term addresses", termAddresses)); + return Collections.unmodifiableList(resources); + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(size=" + termAddresses.size() + ")"; + } + } + + //in the compressed case, we add a few additional operations for + //more efficient reverse lookup and enumeration + static final class CompressedBinaryDocValues extends LongBinaryDocValues { + final long numValues; + final long numIndexValues; + final int maxTermLength; + final MonotonicBlockPackedReader addresses; + final IndexInput data; + final CompressedBinaryTermsEnum termsEnum; + final PagedBytes.Reader reverseTerms; + final MonotonicBlockPackedReader reverseAddresses; + final long numReverseIndexValues; + + public CompressedBinaryDocValues(BinaryEntry bytes, MonotonicBlockPackedReader addresses, ReverseTermsIndex 
index, IndexInput data) throws IOException { + this.maxTermLength = bytes.maxLength; + this.numValues = bytes.count; + this.addresses = addresses; + this.numIndexValues = addresses.size(); + this.data = data; + this.reverseTerms = index.terms; + this.reverseAddresses = index.termAddresses; + this.numReverseIndexValues = reverseAddresses.size(); + this.termsEnum = getTermsEnum(data); + } + + @Override + public BytesRef get(long id) { + try { + termsEnum.seekExact(id); + return termsEnum.term(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + long lookupTerm(BytesRef key) { + try { + switch (termsEnum.seekCeil(key)) { + case FOUND: return termsEnum.ord(); + case NOT_FOUND: return -termsEnum.ord()-1; + default: return -numValues-1; + } + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + TermsEnum getTermsEnum() { + try { + return getTermsEnum(data.clone()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private CompressedBinaryTermsEnum getTermsEnum(IndexInput input) throws IOException { + return new CompressedBinaryTermsEnum(input); + } + + class CompressedBinaryTermsEnum extends TermsEnum { + private long currentOrd = -1; + // offset to the start of the current block + private long currentBlockStart; + private final IndexInput input; + // delta from currentBlockStart to start of each term + private final int offsets[] = new int[INTERVAL_COUNT]; + private final byte buffer[] = new byte[2*INTERVAL_COUNT-1]; + + private final BytesRef term = new BytesRef(maxTermLength); + private final BytesRef firstTerm = new BytesRef(maxTermLength); + private final BytesRef scratch = new BytesRef(); + + CompressedBinaryTermsEnum(IndexInput input) throws IOException { + this.input = input; + input.seek(0); + } + + private void readHeader() throws IOException { + firstTerm.length = input.readVInt(); + input.readBytes(firstTerm.bytes, 0, firstTerm.length); + input.readBytes(buffer, 0, INTERVAL_COUNT-1); + if (buffer[0] == -1) { + readShortAddresses(); + } else { + readByteAddresses(); + } + currentBlockStart = input.getFilePointer(); + } + + // read single byte addresses: each is delta - 2 + // (shared prefix byte and length > 0 are both implicit) + private void readByteAddresses() throws IOException { + int addr = 0; + for (int i = 1; i < offsets.length; i++) { + addr += 2 + (buffer[i-1] & 0xFF); + offsets[i] = addr; + } + } + + // read double byte addresses: each is delta - 2 + // (shared prefix byte and length > 0 are both implicit) + private void readShortAddresses() throws IOException { + input.readBytes(buffer, INTERVAL_COUNT-1, INTERVAL_COUNT); + int addr = 0; + for (int i = 1; i < offsets.length; i++) { + int x = i<<1; + addr += 2 + ((buffer[x-1] << 8) | (buffer[x] & 0xFF)); + offsets[i] = addr; + } + } + + // set term to the first term + private void readFirstTerm() throws IOException { + term.length = firstTerm.length; + System.arraycopy(firstTerm.bytes, firstTerm.offset, term.bytes, 0, term.length); + } + + // read term at offset, delta encoded from first term + private void readTerm(int offset) throws IOException { + int start = input.readByte() & 0xFF; + System.arraycopy(firstTerm.bytes, firstTerm.offset, term.bytes, 0, start); + int suffix = offsets[offset] - offsets[offset-1] - 1; + input.readBytes(term.bytes, start, suffix); + term.length = start + suffix; + } + + @Override + public BytesRef next() throws IOException { + currentOrd++; + if (currentOrd >= numValues) { + return null; + } else { + int offset = (int) 
(currentOrd & INTERVAL_MASK); + if (offset == 0) { + // switch to next block + readHeader(); + readFirstTerm(); + } else { + readTerm(offset); + } + return term; + } + } + + // binary search reverse index to find smaller + // range of blocks to search + long binarySearchIndex(BytesRef text) throws IOException { + long low = 0; + long high = numReverseIndexValues - 1; + while (low <= high) { + long mid = (low + high) >>> 1; + reverseTerms.fill(scratch, reverseAddresses.get(mid)); + int cmp = scratch.compareTo(text); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; + } + } + return high; + } + + // binary search against first term in block range + // to find term's block + long binarySearchBlock(BytesRef text, long low, long high) throws IOException { + while (low <= high) { + long mid = (low + high) >>> 1; + input.seek(addresses.get(mid)); + term.length = input.readVInt(); + input.readBytes(term.bytes, 0, term.length); + int cmp = term.compareTo(text); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; + } + } + return high; + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + // locate block: narrow to block range with index, then search blocks + final long block; + long indexPos = binarySearchIndex(text); + if (indexPos < 0) { + block = 0; + } else { + long low = indexPos << BLOCK_INTERVAL_SHIFT; + long high = Math.min(numIndexValues - 1, low + BLOCK_INTERVAL_MASK); + block = Math.max(low, binarySearchBlock(text, low, high)); + } + + // position before block, then scan to term. + input.seek(addresses.get(block)); + currentOrd = (block << INTERVAL_SHIFT) - 1; + + while (next() != null) { + int cmp = term.compareTo(text); + if (cmp == 0) { + return SeekStatus.FOUND; + } else if (cmp > 0) { + return SeekStatus.NOT_FOUND; + } + } + return SeekStatus.END; + } + + @Override + public void seekExact(long ord) throws IOException { + long block = ord >>> INTERVAL_SHIFT; + if (block != currentOrd >>> INTERVAL_SHIFT) { + // switch to different block + input.seek(addresses.get(block)); + readHeader(); + } + + currentOrd = ord; + + int offset = (int) (ord & INTERVAL_MASK); + if (offset == 0) { + readFirstTerm(); + } else { + input.seek(currentBlockStart + offsets[offset-1]); + readTerm(offset); + } + } + + @Override + public BytesRef term() throws IOException { + return term; + } + + @Override + public long ord() throws IOException { + return currentOrd; + } + + @Override + public int docFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() throws IOException { + return -1; + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java index 77295b6168a..555ce32ecd1 100755 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java @@ -46,7 +46,7 @@ import org.apache.lucene.store.IndexOutput; * 
FieldBits,DocValuesBits,DocValuesGen,Attributes> FieldsCount,Footer

      *

      Data types: *

        - *
      • Header --> {@link CodecUtil#checkSegmentHeader SegmentHeader}
      • + *
      • Header --> {@link CodecUtil#checkIndexHeader IndexHeader}
      • *
      • FieldsCount --> {@link DataOutput#writeVInt VInt}
      • *
      • FieldName --> {@link DataOutput#writeString String}
      • *
      • FieldBits, IndexOptions, DocValuesBits --> {@link DataOutput#writeByte Byte}
      • @@ -114,9 +114,9 @@ public final class Lucene50FieldInfosFormat extends FieldInfosFormat { Throwable priorE = null; FieldInfo infos[] = null; try { - CodecUtil.checkSegmentHeader(input, CODEC_NAME, - FORMAT_START, - FORMAT_CURRENT, + CodecUtil.checkIndexHeader(input, Lucene50FieldInfosFormat.CODEC_NAME, + Lucene50FieldInfosFormat.FORMAT_START, + Lucene50FieldInfosFormat.FORMAT_CURRENT, segmentInfo.getId(), segmentSuffix); final int size = input.readVInt(); //read in the size @@ -251,7 +251,7 @@ public final class Lucene50FieldInfosFormat extends FieldInfosFormat { public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException { final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, EXTENSION); try (IndexOutput output = directory.createOutput(fileName, context)) { - CodecUtil.writeSegmentHeader(output, CODEC_NAME, FORMAT_CURRENT, segmentInfo.getId(), segmentSuffix); + CodecUtil.writeIndexHeader(output, Lucene50FieldInfosFormat.CODEC_NAME, Lucene50FieldInfosFormat.FORMAT_CURRENT, segmentInfo.getId(), segmentSuffix); output.writeVInt(infos.size()); for (FieldInfo fi : infos) { fi.checkConsistency(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50LiveDocsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50LiveDocsFormat.java index 9e64c417b1f..0dd41895ee1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50LiveDocsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50LiveDocsFormat.java @@ -41,10 +41,9 @@ import org.apache.lucene.util.MutableBits; * deletions.

        *

        Although per-segment, this file is maintained exterior to compound segment * files.

        - *

        Deletions (.liv) --> SegmentHeader,Generation,Bits

        + *

        Deletions (.liv) --> IndexHeader,Generation,Bits

        *
          - *
        • SegmentHeader --> {@link CodecUtil#writeSegmentHeader SegmentHeader}
        • - *
        • Generation --> {@link DataOutput#writeLong Int64} + *
        • SegmentHeader --> {@link CodecUtil#writeIndexHeader IndexHeader}
        • *
        • Bits --> <{@link DataOutput#writeLong Int64}> LongCount
        • *
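The hunk below drops the separately stored Generation long and instead folds the live-docs generation into the index-header suffix via Long.toString(gen, Character.MAX_RADIX), so a file from a stale generation fails the header check directly. A tiny round-trip sketch of that encoding (plain JDK, illustrative only):

```java
// Sketch of encoding a deletion generation as a base-36 header suffix.
public class LiveDocsSuffixSketch {
  public static void main(String[] args) {
    long gen = 42;

    // write side: the generation becomes part of the per-file header suffix
    String suffix = Long.toString(gen, Character.MAX_RADIX); // "16" in base 36

    // read side: the expected generation is re-encoded the same way, and the
    // header check compares suffixes instead of reading a Generation field
    long roundTripped = Long.parseLong(suffix, Character.MAX_RADIX);
    System.out.println(suffix + " -> " + roundTripped); // 16 -> 42
  }
}
```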
        */ @@ -85,11 +84,8 @@ public final class Lucene50LiveDocsFormat extends LiveDocsFormat { try (ChecksumIndexInput input = dir.openChecksumInput(name, context)) { Throwable priorE = null; try { - CodecUtil.checkSegmentHeader(input, CODEC_NAME, VERSION_START, VERSION_CURRENT, info.info.getId(), ""); - long filegen = input.readLong(); - if (gen != filegen) { - throw new CorruptIndexException("file mismatch, expected generation=" + gen + ", got=" + filegen, input); - } + CodecUtil.checkIndexHeader(input, CODEC_NAME, VERSION_START, VERSION_CURRENT, + info.info.getId(), Long.toString(gen, Character.MAX_RADIX)); long data[] = new long[FixedBitSet.bits2words(length)]; for (int i = 0; i < data.length; i++) { data[i] = input.readLong(); @@ -120,8 +116,7 @@ public final class Lucene50LiveDocsFormat extends LiveDocsFormat { } long data[] = fbs.getBits(); try (IndexOutput output = dir.createOutput(name, context)) { - CodecUtil.writeSegmentHeader(output, CODEC_NAME, VERSION_CURRENT, info.info.getId(), ""); - output.writeLong(gen); + CodecUtil.writeIndexHeader(output, CODEC_NAME, VERSION_CURRENT, info.info.getId(), Long.toString(gen, Character.MAX_RADIX)); for (int i = 0; i < data.length; i++) { output.writeLong(data[i]); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java index cc170e45e11..1c5893797f1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java @@ -47,6 +47,7 @@ class Lucene50NormsConsumer extends NormsConsumer { static final byte CONST_COMPRESSED = 2; static final byte UNCOMPRESSED = 3; static final byte INDIRECT = 4; + static final byte PATCHED = 5; static final int BLOCK_SIZE = 1 << 14; // threshold for indirect encoding, computed as 1 - 1/log2(maxint) @@ -61,10 +62,10 @@ class Lucene50NormsConsumer extends NormsConsumer { try { String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); data = state.directory.createOutput(dataName, state.context); - CodecUtil.writeSegmentHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); meta = state.directory.createOutput(metaName, state.context); - CodecUtil.writeSegmentHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); success = true; } finally { if (!success) { @@ -82,6 +83,11 @@ class Lucene50NormsConsumer extends NormsConsumer { @Override public void addNormsField(FieldInfo field, Iterable values) throws IOException { + writeNormsField(field, values, 0); + } + + private void writeNormsField(FieldInfo field, Iterable values, int level) throws IOException { + assert level <= 1; // we only "recurse" once in the indirect case meta.writeVInt(field.number); long minValue = Long.MAX_VALUE; long maxValue = Long.MIN_VALUE; @@ -89,16 +95,12 @@ class Lucene50NormsConsumer extends NormsConsumer { NormMap uniqueValues = new NormMap(); int count = 0; - int missingCount = 0; for (Number nv : values) { if (nv == null) { throw new 
IllegalStateException("illegal norms data for field " + field.name + ", got null for value: " + count); } final long v = nv.longValue(); - if (v == 0) { - missingCount++; - } minValue = Math.min(minValue, v); maxValue = Math.max(maxValue, v); @@ -115,9 +117,15 @@ class Lucene50NormsConsumer extends NormsConsumer { if (uniqueValues != null && uniqueValues.size == 1) { // 0 bpv addConstant(minValue); - } else if (count > 256 && missingCount > count * INDIRECT_THRESHOLD) { - // sparse encoding - addIndirect(field, values, count, missingCount); + } else if (level == 0 && count > 256 && uniqueValues != null && uniqueValues.maxFreq() > count * INDIRECT_THRESHOLD) { + long commonValue = uniqueValues.getDecodeTable()[uniqueValues.maxOrd()]; + if (commonValue == 0) { + // if the common value is missing, don't waste RAM on a bitset, since we won't be searching those docs + addIndirect(field, values, count, uniqueValues); + } else { + // otherwise, write a sparse bitset, where 1 indicates 'uncommon value'. + addPatched(field, values, count, uniqueValues); + } } else if (uniqueValues != null) { // small number of unique values: this is the typical case: FormatAndBits compression = fastestFormatAndBits(uniqueValues.size-1); @@ -200,10 +208,65 @@ class Lucene50NormsConsumer extends NormsConsumer { writer.finish(); } - private void addIndirect(FieldInfo field, final Iterable values, int count, int missingCount) throws IOException { - meta.writeVInt(count - missingCount); + // encodes only uncommon values in a sparse bitset + // access is constant time, and the common case is predictable + // exceptions nest either to CONST (if there are only 2 values), or INDIRECT (if there are > 2 values) + private void addPatched(FieldInfo field, final Iterable values, int count, NormMap uniqueValues) throws IOException { + final long decodeTable[] = uniqueValues.getDecodeTable(); + int commonCount = uniqueValues.maxFreq(); + final long commonValue = decodeTable[uniqueValues.maxOrd()]; + + meta.writeVInt(count - commonCount); + meta.writeByte(PATCHED); + meta.writeLong(data.getFilePointer()); + + // write docs with value + writeDocsWithValue(values, commonValue); + + // write exceptions: only two cases make sense + // bpv = 1 (folded into sparse bitset already) + // bpv > 1 (add indirect exception table) + meta.writeVInt(field.number); + if (uniqueValues.size == 2) { + // special case: implicit in bitset + int otherOrd = uniqueValues.maxOrd() == 0 ? 1 : 0; + addConstant(decodeTable[otherOrd]); + } else { + // exception table + addIndirect(field, values, count, uniqueValues); + } + } + + // encodes values as sparse array: keys[] and values[] + // access is log(N) where N = keys.length (slow!) 
+ // so this is only appropriate as an exception table for patched, or when common value is 0 (wont be accessed by searching) + private void addIndirect(FieldInfo field, final Iterable values, int count, NormMap uniqueValues) throws IOException { + int commonCount = uniqueValues.maxFreq(); + final long commonValue = uniqueValues.getDecodeTable()[uniqueValues.maxOrd()]; + + meta.writeVInt(count - commonCount); meta.writeByte(INDIRECT); meta.writeLong(data.getFilePointer()); + + // write docs with value + writeDocsWithValue(values, commonValue); + + // write actual values + writeNormsField(field, new Iterable() { + @Override + public Iterator iterator() { + return new FilterIterator(values.iterator()) { + @Override + protected boolean predicateFunction(Number value) { + return value.longValue() != commonValue; + } + }; + } + }, 1); + } + + private void writeDocsWithValue(final Iterable values, long commonValue) throws IOException { + data.writeLong(commonValue); data.writeVInt(PackedInts.VERSION_CURRENT); data.writeVInt(BLOCK_SIZE); @@ -212,25 +275,12 @@ class Lucene50NormsConsumer extends NormsConsumer { int doc = 0; for (Number n : values) { long v = n.longValue(); - if (v != 0) { + if (v != commonValue) { writer.add(doc); } doc++; } writer.finish(); - - // write actual values - addNormsField(field, new Iterable() { - @Override - public Iterator iterator() { - return new FilterIterator(values.iterator()) { - @Override - protected boolean predicateFunction(Number value) { - return value.longValue() != 0; - } - }; - } - }); } @Override @@ -259,6 +309,7 @@ class Lucene50NormsConsumer extends NormsConsumer { static class NormMap { // we use short: at most we will add 257 values to this map before its rejected as too big above. final short[] singleByteRange = new short[256]; + final int[] freqs = new int[257]; final Map other = new HashMap(); int size; @@ -273,18 +324,24 @@ class Lucene50NormsConsumer extends NormsConsumer { int index = (int) (l + 128); short previous = singleByteRange[index]; if (previous < 0) { - singleByteRange[index] = (short) size; + short slot = (short) size; + singleByteRange[index] = slot; + freqs[slot]++; size++; return true; } else { + freqs[previous]++; return false; } } else { - if (!other.containsKey(l)) { + Short previous = other.get(l); + if (previous == null) { + freqs[size]++; other.put(l, (short)size); size++; return true; } else { + freqs[previous]++; return false; } } @@ -315,5 +372,35 @@ class Lucene50NormsConsumer extends NormsConsumer { } return decode; } + + // TODO: if we need more complicated frequency-driven optos, maybe add 'finish' to this api + // and sort all ords by frequency. 
we could then lower BPV and waste a value to represent 'patched', + + /** retrieves frequency table for items (indexed by ordinal) */ + public int[] getFreqs() { + return freqs; + } + + /** sugar: returns max value over getFreqs() */ + public int maxFreq() { + int max = 0; + for (int i = 0; i < size; i++) { + max = Math.max(max, freqs[i]); + } + return max; + } + + /** sugar: returns ordinal with maxFreq() */ + public int maxOrd() { + long max = 0; + int maxOrd = 0; + for (int i = 0; i < size; i++) { + if (freqs[i] > max) { + max = freqs[i]; + maxOrd = i; + } + } + return maxOrd; + } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java index 2542ab349f9..ebb62b1c4a8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java @@ -28,6 +28,7 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.SmallFloat; import org.apache.lucene.util.packed.BlockPackedWriter; +import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; import org.apache.lucene.util.packed.PackedInts; /** @@ -50,6 +51,9 @@ import org.apache.lucene.util.packed.PackedInts; *
      • Indirect: when norms are extremely sparse, missing values are omitted. * Access to an individual value is slower, but missing norm values are never accessed * by search code. + *
• Patched: when a single norm value dominates, a sparse bitset encodes docs with exceptions, + * so that access to the common value is still very fast. Outliers fall through to an exception + * handling mechanism (Indirect or Constant); see the sketch after this list. *
      *
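A minimal sketch of that patched read path, mirroring the PATCHED branch added to Lucene50NormsProducer further below. java.util.BitSet stands in for Lucene's SparseFixedBitSet and a plain long[] stands in for the nested norms instance; class and field names here are illustrative only, not the codec's API.

import java.util.BitSet;

// Sketch: patched norms = one common value + a bitset of exception docs
// whose real values come from a nested (indirect/constant) representation.
class PatchedNormsSketch {
  private final long commonValue;      // value shared by most documents
  private final BitSet exceptions;     // docs whose norm differs from commonValue
  private final long[] nestedValues;   // stand-in for the nested norms instance, indexed by docID

  PatchedNormsSketch(long commonValue, BitSet exceptions, long[] nestedValues) {
    this.commonValue = commonValue;
    this.exceptions = exceptions;
    this.nestedValues = nestedValues;
  }

  long get(int docID) {
    // common case: O(1) bitset probe, no further lookup needed
    if (!exceptions.get(docID)) {
      return commonValue;
    }
    // outlier: delegate to the nested representation
    return nestedValues[docID];
  }

  public static void main(String[] args) {
    BitSet ex = new BitSet(8);
    ex.set(3);
    long[] nested = new long[8];
    nested[3] = 42;
    PatchedNormsSketch norms = new PatchedNormsSketch(120, ex, nested);
    System.out.println(norms.get(0)); // 120 (common value)
    System.out.println(norms.get(3)); // 42  (exception)
  }
}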

      * Files: @@ -64,7 +68,7 @@ import org.apache.lucene.util.packed.PackedInts; * Norms data (.nvd)

      *

      Norms metadata (.dvm) --> Header,<Entry>NumFields,Footer

      *
        - *
      • Header --> {@link CodecUtil#writeHeader CodecHeader}
      • + *
      • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
      • *
      • Entry --> FieldNumber,Type,Offset
      • *
      • FieldNumber --> {@link DataOutput#writeVInt vInt}
      • *
      • Type --> {@link DataOutput#writeByte Byte}
      • @@ -81,20 +85,24 @@ import org.apache.lucene.util.packed.PackedInts; * a lookup table of unique values is written, followed by the ordinal for each document. *
      • 2 --> constant. When there is a single value for the entire field. *
      • 3 --> uncompressed: Values written as a simple byte[]. - *
      • 4 --> indirect. Only documents with a value are written with a sparse encoding. + *
• 4 --> indirect. Only documents with a value are written with monotonic compression. A nested + * entry for the same field will follow for the exception handler (see the sketch after this list).
      • 5 --> patched. Encoded the same as indirect. *
      *
    • *
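For the indirect case, the producer binary-searches the monotonically compressed list of documents that actually carry a value and falls back to the stored common value otherwise (see the INDIRECT branch in Lucene50NormsProducer below). A self-contained sketch under those assumptions, with a sorted int[] standing in for the MonotonicBlockPackedReader; names are illustrative.

import java.util.Arrays;

// Sketch: indirect norms store only the docs that have a non-common value.
// docsWithValue is sorted ascending; a binary search either finds the doc's
// slot or falls back to the common value.
class IndirectNormsSketch {
  private final long commonValue;
  private final int[] docsWithValue;  // stand-in for the monotonic doc-id list
  private final long[] values;        // value for docsWithValue[i]

  IndirectNormsSketch(long commonValue, int[] docsWithValue, long[] values) {
    this.commonValue = commonValue;
    this.docsWithValue = docsWithValue;
    this.values = values;
  }

  long get(int docID) {
    int slot = Arrays.binarySearch(docsWithValue, docID);
    return slot >= 0 ? values[slot] : commonValue;
  }

  public static void main(String[] args) {
    IndirectNormsSketch norms =
        new IndirectNormsSketch(0, new int[] {2, 7, 9}, new long[] {11, 13, 17});
    System.out.println(norms.get(7)); // 13
    System.out.println(norms.get(5)); // 0 (common value)
  }
}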

      The Norms data or .nvd file.

      *

      For each Norms field, this stores the actual per-document data (the heavy-lifting)

      - *

      Norms data (.nvd) --> Header,<Uncompressed | TableCompressed | DeltaCompressed>NumFields,Footer

      + *

      Norms data (.nvd) --> Header,<Uncompressed | TableCompressed | DeltaCompressed | MonotonicCompressed >NumFields,Footer

      *
        - *
      • Header --> {@link CodecUtil#writeHeader CodecHeader}
      • + *
      • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
      • *
      • Uncompressed --> {@link DataOutput#writeByte Byte}maxDoc
      • *
      • TableCompressed --> PackedIntsVersion,Table,BitPackedData
      • *
      • Table --> TableSize, {@link DataOutput#writeLong int64}TableSize
      • *
      • BitpackedData --> {@link PackedInts}
      • *
      • DeltaCompressed --> PackedIntsVersion,BlockSize,DeltaCompressedData
      • *
      • DeltaCompressedData --> {@link BlockPackedWriter BlockPackedWriter(blockSize=16k)}
      • + *
      • MonotonicCompressed --> PackedIntsVersion,BlockSize,MonotonicCompressedData
      • + *
      • MonotonicCompressedData --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedWriter(blockSize=16k)}
      • *
      • PackedIntsVersion,BlockSize,TableSize --> {@link DataOutput#writeVInt vInt}
      • *
      • Footer --> {@link CodecUtil#writeFooter CodecFooter}
      • *
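On the write side, the consumer's NormMap keeps a frequency per unique value (the getFreqs/maxFreq/maxOrd additions above), and writeDocsWithValue records only the documents whose value differs from the most frequent one. A rough, self-contained sketch of that bookkeeping using plain collections rather than the codec's actual classes; all names and numbers below are illustrative.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Sketch of the writer-side bookkeeping: count how often each norm value occurs,
// treat the most frequent one as the "common" value, and record only the doc ids
// whose value differs (those ids are what gets monotonically compressed on disk).
class NormsWriteSideSketch {
  public static void main(String[] args) {
    long[] perDocNorms = {120, 120, 64, 120, 120, 97, 120};

    // frequency table, analogous to NormMap.freqs / maxFreq() / maxOrd()
    Map<Long, Integer> freqs = new HashMap<>();
    for (long v : perDocNorms) {
      freqs.merge(v, 1, Integer::sum);
    }
    long commonValue = perDocNorms[0];
    int maxFreq = 0;
    for (Map.Entry<Long, Integer> e : freqs.entrySet()) {
      if (e.getValue() > maxFreq) {
        maxFreq = e.getValue();
        commonValue = e.getKey();
      }
    }

    // docs whose value differs from the common one; only these need explicit storage
    List<Integer> docsWithValue = new ArrayList<>();
    for (int doc = 0; doc < perDocNorms.length; doc++) {
      if (perDocNorms[doc] != commonValue) {
        docsWithValue.add(doc);
      }
    }
    System.out.println("common=" + commonValue + " exceptions=" + docsWithValue); // common=120 exceptions=[2, 5]
  }
}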
      diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java index 8c54ffdb06d..7515890ef78 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java @@ -37,6 +37,7 @@ import org.apache.lucene.util.Accountable; import org.apache.lucene.util.Accountables; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.SparseFixedBitSet; import org.apache.lucene.util.packed.BlockPackedReader; import org.apache.lucene.util.packed.MonotonicBlockPackedReader; import org.apache.lucene.util.packed.PackedInts; @@ -48,6 +49,7 @@ import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.DELTA_COMP import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.TABLE_COMPRESSED; import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.UNCOMPRESSED; import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.INDIRECT; +import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.PATCHED; /** * Reader for {@link Lucene50NormsFormat} @@ -63,6 +65,7 @@ class Lucene50NormsProducer extends NormsProducer { private final AtomicLong ramBytesUsed; private final AtomicInteger activeCount = new AtomicInteger(); + private final int maxDoc; private final boolean merging; @@ -75,11 +78,13 @@ class Lucene50NormsProducer extends NormsProducer { instancesInfo.putAll(original.instancesInfo); ramBytesUsed = new AtomicLong(original.ramBytesUsed.get()); activeCount.set(original.activeCount.get()); + maxDoc = original.maxDoc; merging = true; } Lucene50NormsProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { merging = false; + maxDoc = state.segmentInfo.getDocCount(); String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass())); int version = -1; @@ -88,7 +93,7 @@ class Lucene50NormsProducer extends NormsProducer { try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) { Throwable priorE = null; try { - version = CodecUtil.checkSegmentHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + version = CodecUtil.checkIndexHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); readFields(in, state.fieldInfos); } catch (Throwable exception) { priorE = exception; @@ -101,7 +106,7 @@ class Lucene50NormsProducer extends NormsProducer { this.data = state.directory.openInput(dataName, state.context); boolean success = false; try { - final int version2 = CodecUtil.checkSegmentHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); if (version != version2) { throw new CorruptIndexException("Format versions mismatch: meta=" + version + ",data=" + version2, data); } @@ -146,6 +151,7 @@ class Lucene50NormsProducer extends NormsProducer { case TABLE_COMPRESSED: case DELTA_COMPRESSED: break; + case PATCHED: case INDIRECT: if (meta.readVInt() != info.number) { 
throw new CorruptIndexException("indirect norms entry for field: " + info.name + " is corrupt", meta); @@ -254,6 +260,7 @@ class Lucene50NormsProducer extends NormsProducer { } case INDIRECT: { data.seek(entry.offset); + final long common = data.readLong(); int packedIntsVersion = data.readVInt(); int blockSize = data.readVInt(); final MonotonicBlockPackedReader live = MonotonicBlockPackedReader.of(data, packedIntsVersion, blockSize, entry.count, false); @@ -279,7 +286,34 @@ class Lucene50NormsProducer extends NormsProducer { return values.get(mid); } } - return 0; + return common; + } + }; + break; + } + case PATCHED: { + data.seek(entry.offset); + final long common = data.readLong(); + int packedIntsVersion = data.readVInt(); + int blockSize = data.readVInt(); + MonotonicBlockPackedReader live = MonotonicBlockPackedReader.of(data, packedIntsVersion, blockSize, entry.count, true); + final SparseFixedBitSet set = new SparseFixedBitSet(maxDoc); + for (int i = 0; i < live.size(); i++) { + int doc = (int) live.get(i); + set.set(doc); + } + LoadedNorms nestedInstance = loadNorms(entry.nested); + instance.ramBytesUsed = set.ramBytesUsed() + nestedInstance.ramBytesUsed; + instance.info = Accountables.namedAccountable("patched -> " + nestedInstance.info, instance.ramBytesUsed); + final NumericDocValues values = nestedInstance.norms; + instance.norms = new NumericDocValues() { + @Override + public long get(int docID) { + if (set.get(docID)) { + return values.get(docID); + } else { + return common; + } } }; break; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsFormat.java similarity index 87% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsFormat.java index c0861ae3a78..9fa4b1ae924 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsFormat.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.lucene41; +package org.apache.lucene.codecs.lucene50; /* @@ -20,6 +20,7 @@ package org.apache.lucene.codecs.lucene41; import java.io.IOException; +import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; @@ -33,17 +34,15 @@ import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.TermState; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.packed.PackedInts; /** - * Lucene 4.1 postings format, which encodes postings in packed integer blocks + * Lucene 5.0 postings format, which encodes postings in packed integer blocks * for fast decode. * - *

      NOTE: this format is still experimental and - * subject to change without backwards compatibility. - * *

      * Basic idea: *

        @@ -58,7 +57,7 @@ import org.apache.lucene.util.packed.PackedInts; * *
      • * Block structure: - *

        When the postings are long enough, Lucene41PostingsFormat will try to encode most integer data + *

        When the postings are long enough, Lucene50PostingsFormat will try to encode most integer data * as a packed block.

        *

Take a term with 259 documents as an example: the first 256 document ids are encoded as two packed * blocks, while the remaining 3 are encoded as one VInt block.
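A tiny sketch of the arithmetic behind that example; BLOCK_SIZE matches the documented value of 128, and the class name is illustrative only.

// Sketch: how a posting list is split into packed blocks plus a vInt tail,
// given the fixed block size of 128 documented above.
class BlockSplitSketch {
  static final int BLOCK_SIZE = 128;

  static int packedBlocks(int docFreq) {
    return docFreq / BLOCK_SIZE;           // full blocks, bit-packed
  }

  static int vIntTail(int docFreq) {
    return docFreq % BLOCK_SIZE;           // leftover docs, vInt-encoded
  }

  public static void main(String[] args) {
    int docFreq = 259;
    System.out.println(packedBlocks(docFreq)); // 2 packed blocks (256 docs)
    System.out.println(vIntTail(docFreq));     // 3 docs in the vInt block
  }
}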

        @@ -129,14 +128,14 @@ import org.apache.lucene.util.packed.PackedInts; *
      • PostingsHeader --> Header, PackedBlockSize
      • *
      • TermMetadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?, * SkipFPDelta?
      • - *
      • Header, --> {@link CodecUtil#writeHeader CodecHeader}
      • + *
      • Header, --> {@link CodecUtil#writeIndexHeader IndexHeader}
      • *
      • PackedBlockSize, SingletonDocID --> {@link DataOutput#writeVInt VInt}
      • *
      • DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta, SkipFPDelta --> {@link DataOutput#writeVLong VLong}
      • *
      • Footer --> {@link CodecUtil#writeFooter CodecFooter}
      • *
      *
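A simplified sketch of how this term metadata is consumed: decodeTerm in Lucene50PostingsReader accumulates the vLong deltas onto the previous term's file pointers, resetting them first when the entry is absolute. Field names below are illustrative and the positions/payloads case is assumed to be present.

// Sketch: term metadata stores file-pointer deltas; decoding accumulates them
// onto the previous term's pointers unless an absolute entry resets the base.
// (Simplified from Lucene50PostingsReader.decodeTerm.)
class TermMetadataSketch {
  long docStartFP, posStartFP, payStartFP;

  void decode(long[] longs, boolean absolute) {
    if (absolute) {
      docStartFP = 0;
      posStartFP = 0;
      payStartFP = 0;
    }
    docStartFP += longs[0];
    posStartFP += longs[1];
    payStartFP += longs[2];
  }

  public static void main(String[] args) {
    TermMetadataSketch state = new TermMetadataSketch();
    state.decode(new long[] {100, 40, 10}, true);   // first term of a block: absolute
    state.decode(new long[] {25, 8, 2}, false);     // next term: deltas accumulate
    System.out.println(state.docStartFP + " " + state.posStartFP + " " + state.payStartFP); // 125 48 12
  }
}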

      Notes:

      *
        - *
      • Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information + *
      • Header is a {@link CodecUtil#writeIndexHeader IndexHeader} storing the version information * for the postings.
      • *
• PackedBlockSize is the fixed block size for packed blocks. In a packed block, bit width is * determined by the largest integer. Smaller block sizes result in smaller variance among width @@ -162,7 +161,7 @@ import org.apache.lucene.util.packed.PackedInts; *
      • SkipFPDelta determines the position of this term's SkipData within the .doc * file. In particular, it is the length of the TermFreq data. * SkipDelta is only stored if DocFreq is not smaller than SkipMinimum - * (i.e. 128 in Lucene41PostingsFormat).
      • + * (i.e. 128 in Lucene50PostingsFormat). *
      • SingletonDocID is an optimization when a term only appears in one document. In this case, instead * of writing a file pointer to the .doc file (DocFPDelta), and then a VIntBlock at that location, the * single document ID is written to the term dictionary.
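A small sketch of how that optimization plays out at read time (compare the docFreq == 1 branch of refillDocs in Lucene50PostingsReader below); the concrete numbers here are made up.

// Sketch: for a term that appears in exactly one document, no .doc data is read at all;
// the single doc id comes straight from the term dictionary and the frequency is the
// term's total frequency.
class SingletonPostingSketch {
  public static void main(String[] args) {
    int docFreq = 1;
    int singletonDocID = 4242;   // stored inline in the terms dictionary
    long totalTermFreq = 3;

    int[] docBuffer = new int[1];
    int[] freqBuffer = new int[1];
    if (docFreq == 1) {
      docBuffer[0] = singletonDocID;
      freqBuffer[0] = (int) totalTermFreq;
    }
    System.out.println("doc=" + docBuffer[0] + " freq=" + freqBuffer[0]); // doc=4242 freq=3
  }
}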
      • @@ -192,7 +191,7 @@ import org.apache.lucene.util.packed.PackedInts; * *
          *
        • docFile(.doc) --> Header, <TermFreqs, SkipData?>TermCount, Footer
        • - *
        • Header --> {@link CodecUtil#writeHeader CodecHeader}
        • + *
        • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
        • *
        • TermFreqs --> <PackedBlock> PackedDocBlockNum, * VIntBlock?
        • *
• PackedBlock --> PackedDocDeltaBlock, PackedFreqBlock? @@ -243,10 +242,10 @@ import org.apache.lucene.util.packed.PackedInts; * We use this trick since the definition of a skip entry is a little different from the base interface. * In {@link MultiLevelSkipListWriter}, skip data is assumed to be saved for * skipIntervalth, 2*skipIntervalth ... posting in the list. However, - * in Lucene41PostingsFormat, the skip data is saved for skipInterval+1th, + * in Lucene50PostingsFormat, the skip data is saved for skipInterval+1th, * 2*skipInterval+1th ... posting (skipInterval==PackedBlockSize in this case). * When DocFreq is a multiple of PackedBlockSize, MultiLevelSkipListWriter will expect one - * more skip data than Lucene41SkipWriter.
        • + * more skip data than Lucene50SkipWriter. *
        • SkipDatum is the metadata of one skip entry. * For the first block (no matter packed or VInt), it is omitted.
        • *
        • DocSkip records the document number of every PackedBlockSizeth document number in @@ -276,7 +275,7 @@ import org.apache.lucene.util.packed.PackedInts; * sometimes stores part of payloads and offsets for speedup.

          *
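A deliberately simplified, in-memory model of what advance() does with this skip data: jump over whole blocks using the recorded last document of each block, then scan linearly inside the chosen block. This is not the on-disk format nor the real multi-level skip list, just the access pattern; all names and values are illustrative.

// Sketch: skip entries record the last docID of each 128-doc block, so advance(target)
// first skips whole blocks whose last doc is below the target and only then scans linearly.
class AdvanceSketch {
  static final int BLOCK_SIZE = 128;

  static int advance(int[] docs, int[] lastDocPerBlock, int target) {
    int block = 0;
    while (block < lastDocPerBlock.length && lastDocPerBlock[block] < target) {
      block++;                         // "skipping": never touches docs inside these blocks
    }
    for (int i = block * BLOCK_SIZE; i < docs.length; i++) {
      if (docs[i] >= target) {
        return docs[i];                // linear scan within the chosen block (and beyond)
      }
    }
    return Integer.MAX_VALUE;          // NO_MORE_DOCS
  }

  public static void main(String[] args) {
    int[] docs = new int[300];
    for (int i = 0; i < docs.length; i++) {
      docs[i] = i * 2;                 // even docIDs 0, 2, 4, ...
    }
    int[] lastDocPerBlock = {docs[127], docs[255]}; // skip entries for the two full blocks
    System.out.println(advance(docs, lastDocPerBlock, 300)); // 300
    System.out.println(advance(docs, lastDocPerBlock, 301)); // 302
  }
}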
            *
          • PosFile(.pos) --> Header, <TermPositions> TermCount, Footer
          • - *
          • Header --> {@link CodecUtil#writeHeader CodecHeader}
          • + *
          • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
          • *
          • TermPositions --> <PackedPosDeltaBlock> PackedPosBlockNum, * VIntBlock?
          • *
          • VIntBlock --> <PositionDelta[, PayloadLength?], PayloadData?, @@ -329,7 +328,7 @@ import org.apache.lucene.util.packed.PackedInts; * Some payloads and offsets will be separated out into .pos file, for performance reasons.

            *
              *
            • PayFile(.pay): --> Header, <TermPayloads, TermOffsets?> TermCount, Footer
            • - *
            • Header --> {@link CodecUtil#writeHeader CodecHeader}
            • + *
            • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
            • *
            • TermPayloads --> <PackedPayLengthBlock, SumPayLength, PayData> PackedPayBlockNum *
            • TermOffsets --> <PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock> PackedPayBlockNum *
            • PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --> {@link PackedInts PackedInts}
            • @@ -358,7 +357,7 @@ import org.apache.lucene.util.packed.PackedInts; * @lucene.experimental */ -public final class Lucene41PostingsFormat extends PostingsFormat { +public final class Lucene50PostingsFormat extends PostingsFormat { /** * Filename extension for document number, frequencies, and skip data. * See chapter: Frequencies and Skip Data @@ -376,6 +375,21 @@ public final class Lucene41PostingsFormat extends PostingsFormat { * See chapter: Payloads and Offsets */ public static final String PAY_EXTENSION = "pay"; + + /** + * Expert: The maximum number of skip levels. Smaller values result in + * slightly smaller indexes, but slower skipping in big posting lists. + */ + static final int MAX_SKIP_LEVELS = 10; + + final static String TERMS_CODEC = "Lucene50PostingsWriterTerms"; + final static String DOC_CODEC = "Lucene50PostingsWriterDoc"; + final static String POS_CODEC = "Lucene50PostingsWriterPos"; + final static String PAY_CODEC = "Lucene50PostingsWriterPay"; + + // Increment version to change it + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; private final int minTermBlockSize; private final int maxTermBlockSize; @@ -387,18 +401,18 @@ public final class Lucene41PostingsFormat extends PostingsFormat { // NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding public final static int BLOCK_SIZE = 128; - /** Creates {@code Lucene41PostingsFormat} with default + /** Creates {@code Lucene50PostingsFormat} with default * settings. */ - public Lucene41PostingsFormat() { + public Lucene50PostingsFormat() { this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); } - /** Creates {@code Lucene41PostingsFormat} with custom + /** Creates {@code Lucene50PostingsFormat} with custom * values for {@code minBlockSize} and {@code * maxBlockSize} passed to block terms dictionary. 
* @see BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) */ - public Lucene41PostingsFormat(int minTermBlockSize, int maxTermBlockSize) { - super("Lucene41"); + public Lucene50PostingsFormat(int minTermBlockSize, int maxTermBlockSize) { + super("Lucene50"); this.minTermBlockSize = minTermBlockSize; assert minTermBlockSize > 1; this.maxTermBlockSize = maxTermBlockSize; @@ -412,7 +426,7 @@ public final class Lucene41PostingsFormat extends PostingsFormat { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state); boolean success = false; try { @@ -431,19 +445,10 @@ public final class Lucene41PostingsFormat extends PostingsFormat { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory, - state.fieldInfos, - state.segmentInfo, - state.context, - state.segmentSuffix); + PostingsReaderBase postingsReader = new Lucene50PostingsReader(state); boolean success = false; try { - FieldsProducer ret = new BlockTreeTermsReader(state.directory, - state.fieldInfos, - state.segmentInfo, - postingsReader, - state.context, - state.segmentSuffix); + FieldsProducer ret = new BlockTreeTermsReader(postingsReader, state); success = true; return ret; } finally { @@ -452,4 +457,39 @@ public final class Lucene41PostingsFormat extends PostingsFormat { } } } + + final static class IntBlockTermState extends BlockTermState { + long docStartFP = 0; + long posStartFP = 0; + long payStartFP = 0; + long skipOffset = -1; + long lastPosBlockOffset = -1; + // docid when there is a single pulsed posting, otherwise -1 + // freq is always implicitly totalTermFreq in this case. + int singletonDocID = -1; + + @Override + public IntBlockTermState clone() { + IntBlockTermState other = new IntBlockTermState(); + other.copyFrom(this); + return other; + } + + @Override + public void copyFrom(TermState _other) { + super.copyFrom(_other); + IntBlockTermState other = (IntBlockTermState) _other; + docStartFP = other.docStartFP; + posStartFP = other.posStartFP; + payStartFP = other.payStartFP; + lastPosBlockOffset = other.lastPosBlockOffset; + skipOffset = other.skipOffset; + singletonDocID = other.singletonDocID; + } + + @Override + public String toString() { + return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID; + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsReader.java new file mode 100644 index 00000000000..4c8cbd3f07f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsReader.java @@ -0,0 +1,1328 @@ +package org.apache.lucene.codecs.lucene50; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.lucene.codecs.lucene50.ForUtil.MAX_DATA_SIZE; +import static org.apache.lucene.codecs.lucene50.ForUtil.MAX_ENCODED_SIZE; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.DOC_CODEC; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.MAX_SKIP_LEVELS; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.PAY_CODEC; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.POS_CODEC; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.TERMS_CODEC; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.VERSION_CURRENT; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.VERSION_START; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * Concrete class that reads docId(maybe frq,pos,offset,payloads) list + * with postings format. + * + * @lucene.experimental + */ +public final class Lucene50PostingsReader extends PostingsReaderBase { + + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(Lucene50PostingsReader.class); + + private final IndexInput docIn; + private final IndexInput posIn; + private final IndexInput payIn; + + private final ForUtil forUtil; + private int version; + + /** Sole constructor. */ + public Lucene50PostingsReader(SegmentReadState state) throws IOException { + boolean success = false; + IndexInput docIn = null; + IndexInput posIn = null; + IndexInput payIn = null; + + // NOTE: these data files are too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. 
+ + String docName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene50PostingsFormat.DOC_EXTENSION); + try { + docIn = state.directory.openInput(docName, state.context); + version = CodecUtil.checkIndexHeader(docIn, DOC_CODEC, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + forUtil = new ForUtil(docIn); + CodecUtil.retrieveChecksum(docIn); + + if (state.fieldInfos.hasProx()) { + String proxName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene50PostingsFormat.POS_EXTENSION); + posIn = state.directory.openInput(proxName, state.context); + CodecUtil.checkIndexHeader(posIn, POS_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.retrieveChecksum(posIn); + + if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { + String payName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene50PostingsFormat.PAY_EXTENSION); + payIn = state.directory.openInput(payName, state.context); + CodecUtil.checkIndexHeader(payIn, PAY_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.retrieveChecksum(payIn); + } + } + + this.docIn = docIn; + this.posIn = posIn; + this.payIn = payIn; + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(docIn, posIn, payIn); + } + } + } + + @Override + public void init(IndexInput termsIn, SegmentReadState state) throws IOException { + // Make sure we are talking to the matching postings writer + CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + final int indexBlockSize = termsIn.readVInt(); + if (indexBlockSize != BLOCK_SIZE) { + throw new IllegalStateException("index-time BLOCK_SIZE (" + indexBlockSize + ") != read-time BLOCK_SIZE (" + BLOCK_SIZE + ")"); + } + } + + /** + * Read values that have been written using variable-length encoding instead of bit-packing. 
+ */ + static void readVIntBlock(IndexInput docIn, int[] docBuffer, + int[] freqBuffer, int num, boolean indexHasFreq) throws IOException { + if (indexHasFreq) { + for(int i=0;i>> 1; + if ((code & 1) != 0) { + freqBuffer[i] = 1; + } else { + freqBuffer[i] = docIn.readVInt(); + } + } + } else { + for(int i=0;i= 0; + final boolean fieldHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + final boolean fieldHasPayloads = fieldInfo.hasPayloads(); + + if (absolute) { + termState.docStartFP = 0; + termState.posStartFP = 0; + termState.payStartFP = 0; + } + + termState.docStartFP += longs[0]; + if (fieldHasPositions) { + termState.posStartFP += longs[1]; + if (fieldHasOffsets || fieldHasPayloads) { + termState.payStartFP += longs[2]; + } + } + if (termState.docFreq == 1) { + termState.singletonDocID = in.readVInt(); + } else { + termState.singletonDocID = -1; + } + if (fieldHasPositions) { + if (termState.totalTermFreq > BLOCK_SIZE) { + termState.lastPosBlockOffset = in.readVLong(); + } else { + termState.lastPosBlockOffset = -1; + } + } + if (termState.docFreq > BLOCK_SIZE) { + termState.skipOffset = in.readVLong(); + } else { + termState.skipOffset = -1; + } + } + + @Override + public DocsEnum docs(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs, DocsEnum reuse, int flags) throws IOException { + BlockDocsEnum docsEnum; + if (reuse instanceof BlockDocsEnum) { + docsEnum = (BlockDocsEnum) reuse; + if (!docsEnum.canReuse(docIn, fieldInfo)) { + docsEnum = new BlockDocsEnum(fieldInfo); + } + } else { + docsEnum = new BlockDocsEnum(fieldInfo); + } + return docsEnum.reset(liveDocs, (IntBlockTermState) termState, flags); + } + + // TODO: specialize to liveDocs vs not + + @Override + public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs, + DocsAndPositionsEnum reuse, int flags) + throws IOException { + + boolean indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + boolean indexHasPayloads = fieldInfo.hasPayloads(); + + if ((!indexHasOffsets || (flags & DocsAndPositionsEnum.FLAG_OFFSETS) == 0) && + (!indexHasPayloads || (flags & DocsAndPositionsEnum.FLAG_PAYLOADS) == 0)) { + BlockDocsAndPositionsEnum docsAndPositionsEnum; + if (reuse instanceof BlockDocsAndPositionsEnum) { + docsAndPositionsEnum = (BlockDocsAndPositionsEnum) reuse; + if (!docsAndPositionsEnum.canReuse(docIn, fieldInfo)) { + docsAndPositionsEnum = new BlockDocsAndPositionsEnum(fieldInfo); + } + } else { + docsAndPositionsEnum = new BlockDocsAndPositionsEnum(fieldInfo); + } + return docsAndPositionsEnum.reset(liveDocs, (IntBlockTermState) termState); + } else { + EverythingEnum everythingEnum; + if (reuse instanceof EverythingEnum) { + everythingEnum = (EverythingEnum) reuse; + if (!everythingEnum.canReuse(docIn, fieldInfo)) { + everythingEnum = new EverythingEnum(fieldInfo); + } + } else { + everythingEnum = new EverythingEnum(fieldInfo); + } + return everythingEnum.reset(liveDocs, (IntBlockTermState) termState, flags); + } + } + + final class BlockDocsEnum extends DocsEnum { + private final byte[] encoded; + + private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE]; + private final int[] freqBuffer = new int[MAX_DATA_SIZE]; + + private int docBufferUpto; + + private Lucene50SkipReader skipper; + private boolean skipped; + + final IndexInput startDocIn; + + IndexInput docIn; + final boolean indexHasFreq; + final boolean indexHasPos; + final 
boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // sum of freqs in this posting list (or docFreq when omitted) + private int docUpto; // how many docs we've read + private int doc; // doc we last read + private int accum; // accumulator for doc deltas + private int freq; // freq we last read + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's skip data starts (after + // docTermStartFP) in the .doc file (or -1 if there is + // no skip data for this term): + private long skipOffset; + + // docID for next skip point, we won't use skipper if + // target docID is not larger than this + private int nextSkipDoc; + + private Bits liveDocs; + + private boolean needsFreq; // true if the caller actually needs frequencies + private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + public BlockDocsEnum(FieldInfo fieldInfo) throws IOException { + this.startDocIn = Lucene50PostingsReader.this.docIn; + this.docIn = null; + indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + encoded = new byte[MAX_ENCODED_SIZE]; + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == startDocIn && + indexHasFreq == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) && + indexHasPos == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) && + indexHasPayloads == fieldInfo.hasPayloads(); + } + + public DocsEnum reset(Bits liveDocs, IntBlockTermState termState, int flags) throws IOException { + this.liveDocs = liveDocs; + + docFreq = termState.docFreq; + totalTermFreq = indexHasFreq ? 
termState.totalTermFreq : docFreq; + docTermStartFP = termState.docStartFP; + skipOffset = termState.skipOffset; + singletonDocID = termState.singletonDocID; + if (docFreq > 1) { + if (docIn == null) { + // lazy init + docIn = startDocIn.clone(); + } + docIn.seek(docTermStartFP); + } + + doc = -1; + this.needsFreq = (flags & DocsEnum.FLAG_FREQS) != 0; + if (!indexHasFreq) { + Arrays.fill(freqBuffer, 1); + } + accum = 0; + docUpto = 0; + nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block + docBufferUpto = BLOCK_SIZE; + skipped = false; + return this; + } + + @Override + public int freq() throws IOException { + return freq; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + final int left = docFreq - docUpto; + assert left > 0; + + if (left >= BLOCK_SIZE) { + forUtil.readBlock(docIn, encoded, docDeltaBuffer); + + if (indexHasFreq) { + if (needsFreq) { + forUtil.readBlock(docIn, encoded, freqBuffer); + } else { + forUtil.skipBlock(docIn); // skip over freqs + } + } + } else if (docFreq == 1) { + docDeltaBuffer[0] = singletonDocID; + freqBuffer[0] = (int) totalTermFreq; + } else { + // Read vInts: + readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, indexHasFreq); + } + docBufferUpto = 0; + } + + @Override + public int nextDoc() throws IOException { + while (true) { + + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } + + accum += docDeltaBuffer[docBufferUpto]; + docUpto++; + + if (liveDocs == null || liveDocs.get(accum)) { + doc = accum; + freq = freqBuffer[docBufferUpto]; + docBufferUpto++; + return doc; + } + docBufferUpto++; + } + } + + @Override + public int advance(int target) throws IOException { + // TODO: make frq block load lazy/skippable + + // current skip docID < docIDs generated from current buffer <= next skip docID + // we don't need to skip if target is buffered already + if (docFreq > BLOCK_SIZE && target > nextSkipDoc) { + + if (skipper == null) { + // Lazy init: first time this enum has ever been used for skipping + skipper = new Lucene50SkipReader(docIn.clone(), + MAX_SKIP_LEVELS, + BLOCK_SIZE, + indexHasPos, + indexHasOffsets, + indexHasPayloads); + } + + if (!skipped) { + assert skipOffset != -1; + // This is the first time this enum has skipped + // since reset() was called; load the skip data: + skipper.init(docTermStartFP+skipOffset, docTermStartFP, 0, 0, docFreq); + skipped = true; + } + + // always plus one to fix the result, since skip position in Lucene50SkipReader + // is a little different from MultiLevelSkipListReader + final int newDocUpto = skipper.skipTo(target) + 1; + + if (newDocUpto > docUpto) { + // Skipper moved + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + docUpto = newDocUpto; + + // Force to read next block + docBufferUpto = BLOCK_SIZE; + accum = skipper.getDoc(); // actually, this is just lastSkipEntry + docIn.seek(skipper.getDocPointer()); // now point to the block we want to search + } + // next time we call advance, this is used to + // foresee whether skipper is necessary. + nextSkipDoc = skipper.getNextSkipDoc(); + } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } + + // Now scan... 
this is an inlined/pared down version + // of nextDoc(): + while (true) { + accum += docDeltaBuffer[docBufferUpto]; + docUpto++; + + if (accum >= target) { + break; + } + docBufferUpto++; + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + } + + if (liveDocs == null || liveDocs.get(accum)) { + freq = freqBuffer[docBufferUpto]; + docBufferUpto++; + return doc = accum; + } else { + docBufferUpto++; + return nextDoc(); + } + } + + @Override + public long cost() { + return docFreq; + } + } + + + final class BlockDocsAndPositionsEnum extends DocsAndPositionsEnum { + + private final byte[] encoded; + + private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE]; + private final int[] freqBuffer = new int[MAX_DATA_SIZE]; + private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE]; + + private int docBufferUpto; + private int posBufferUpto; + + private Lucene50SkipReader skipper; + private boolean skipped; + + final IndexInput startDocIn; + + IndexInput docIn; + final IndexInput posIn; + + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // number of positions in this posting list + private int docUpto; // how many docs we've read + private int doc; // doc we last read + private int accum; // accumulator for doc deltas + private int freq; // freq we last read + private int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // Lazy pos seek: if != -1 then we must seek to this FP + // before reading positions: + private long posPendingFP; + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's postings start in the .pos file: + private long posTermStartFP; + + // Where this term's payloads/offsets start in the .pay + // file: + private long payTermStartFP; + + // File pointer where the last (vInt encoded) pos delta + // block is. 
We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + // Where this term's skip data starts (after + // docTermStartFP) in the .doc file (or -1 if there is + // no skip data for this term): + private long skipOffset; + + private int nextSkipDoc; + + private Bits liveDocs; + private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + public BlockDocsAndPositionsEnum(FieldInfo fieldInfo) throws IOException { + this.startDocIn = Lucene50PostingsReader.this.docIn; + this.docIn = null; + this.posIn = Lucene50PostingsReader.this.posIn.clone(); + encoded = new byte[MAX_ENCODED_SIZE]; + indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == startDocIn && + indexHasOffsets == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) && + indexHasPayloads == fieldInfo.hasPayloads(); + } + + public DocsAndPositionsEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException { + this.liveDocs = liveDocs; + + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + posTermStartFP = termState.posStartFP; + payTermStartFP = termState.payStartFP; + skipOffset = termState.skipOffset; + totalTermFreq = termState.totalTermFreq; + singletonDocID = termState.singletonDocID; + if (docFreq > 1) { + if (docIn == null) { + // lazy init + docIn = startDocIn.clone(); + } + docIn.seek(docTermStartFP); + } + posPendingFP = posTermStartFP; + posPendingCount = 0; + if (termState.totalTermFreq < BLOCK_SIZE) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == BLOCK_SIZE) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + doc = -1; + accum = 0; + docUpto = 0; + if (docFreq > BLOCK_SIZE) { + nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block + } else { + nextSkipDoc = NO_MORE_DOCS; // not enough docs for skipping + } + docBufferUpto = BLOCK_SIZE; + skipped = false; + return this; + } + + @Override + public int freq() throws IOException { + return freq; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + final int left = docFreq - docUpto; + assert left > 0; + + if (left >= BLOCK_SIZE) { + forUtil.readBlock(docIn, encoded, docDeltaBuffer); + forUtil.readBlock(docIn, encoded, freqBuffer); + } else if (docFreq == 1) { + docDeltaBuffer[0] = singletonDocID; + freqBuffer[0] = (int) totalTermFreq; + } else { + // Read vInts: + readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, true); + } + docBufferUpto = 0; + } + + private void refillPositions() throws IOException { + if (posIn.getFilePointer() == lastPosBlockFP) { + final int count = (int) (totalTermFreq % BLOCK_SIZE); + int payloadLength = 0; + for(int i=0;i>> 1; + if (payloadLength != 0) { + posIn.seek(posIn.getFilePointer() + payloadLength); + } + } else { + posDeltaBuffer[i] = code; + } + if (indexHasOffsets) { + if ((posIn.readVInt() & 1) != 0) { + // offset length changed + posIn.readVInt(); + } + } + } + } else { + forUtil.readBlock(posIn, encoded, posDeltaBuffer); + } + } + + @Override + public int nextDoc() throws IOException { + while (true) { + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + 
refillDocs(); + } + + accum += docDeltaBuffer[docBufferUpto]; + freq = freqBuffer[docBufferUpto]; + posPendingCount += freq; + docBufferUpto++; + docUpto++; + + if (liveDocs == null || liveDocs.get(accum)) { + doc = accum; + position = 0; + return doc; + } + } + } + + @Override + public int advance(int target) throws IOException { + // TODO: make frq block load lazy/skippable + + if (target > nextSkipDoc) { + if (skipper == null) { + // Lazy init: first time this enum has ever been used for skipping + skipper = new Lucene50SkipReader(docIn.clone(), + MAX_SKIP_LEVELS, + BLOCK_SIZE, + true, + indexHasOffsets, + indexHasPayloads); + } + + if (!skipped) { + assert skipOffset != -1; + // This is the first time this enum has skipped + // since reset() was called; load the skip data: + skipper.init(docTermStartFP+skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq); + skipped = true; + } + + final int newDocUpto = skipper.skipTo(target) + 1; + + if (newDocUpto > docUpto) { + // Skipper moved + + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + docUpto = newDocUpto; + + // Force to read next block + docBufferUpto = BLOCK_SIZE; + accum = skipper.getDoc(); + docIn.seek(skipper.getDocPointer()); + posPendingFP = skipper.getPosPointer(); + posPendingCount = skipper.getPosBufferUpto(); + } + nextSkipDoc = skipper.getNextSkipDoc(); + } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } + + // Now scan... this is an inlined/pared down version + // of nextDoc(): + while (true) { + accum += docDeltaBuffer[docBufferUpto]; + freq = freqBuffer[docBufferUpto]; + posPendingCount += freq; + docBufferUpto++; + docUpto++; + + if (accum >= target) { + break; + } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + } + + if (liveDocs == null || liveDocs.get(accum)) { + position = 0; + return doc = accum; + } else { + return nextDoc(); + } + } + + // TODO: in theory we could avoid loading frq block + // when not needed, ie, use skip data to load how far to + // seek the pos pointer ... 
instead of having to load frq + // blocks only to sum up how many positions to skip + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freq; + + final int leftInBlock = BLOCK_SIZE - posBufferUpto; + if (toSkip < leftInBlock) { + posBufferUpto += toSkip; + } else { + toSkip -= leftInBlock; + while(toSkip >= BLOCK_SIZE) { + assert posIn.getFilePointer() != lastPosBlockFP; + forUtil.skipBlock(posIn); + toSkip -= BLOCK_SIZE; + } + refillPositions(); + posBufferUpto = toSkip; + } + + position = 0; + } + + @Override + public int nextPosition() throws IOException { + if (posPendingFP != -1) { + posIn.seek(posPendingFP); + posPendingFP = -1; + + // Force buffer refill: + posBufferUpto = BLOCK_SIZE; + } + + if (posPendingCount > freq) { + skipPositions(); + posPendingCount = freq; + } + + if (posBufferUpto == BLOCK_SIZE) { + refillPositions(); + posBufferUpto = 0; + } + position += posDeltaBuffer[posBufferUpto++]; + posPendingCount--; + return position; + } + + @Override + public int startOffset() { + return -1; + } + + @Override + public int endOffset() { + return -1; + } + + @Override + public BytesRef getPayload() { + return null; + } + + @Override + public long cost() { + return docFreq; + } + } + + // Also handles payloads + offsets + final class EverythingEnum extends DocsAndPositionsEnum { + + private final byte[] encoded; + + private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE]; + private final int[] freqBuffer = new int[MAX_DATA_SIZE]; + private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE]; + + private final int[] payloadLengthBuffer; + private final int[] offsetStartDeltaBuffer; + private final int[] offsetLengthBuffer; + + private byte[] payloadBytes; + private int payloadByteUpto; + private int payloadLength; + + private int lastStartOffset; + private int startOffset; + private int endOffset; + + private int docBufferUpto; + private int posBufferUpto; + + private Lucene50SkipReader skipper; + private boolean skipped; + + final IndexInput startDocIn; + + IndexInput docIn; + final IndexInput posIn; + final IndexInput payIn; + final BytesRef payload; + + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // number of positions in this posting list + private int docUpto; // how many docs we've read + private int doc; // doc we last read + private int accum; // accumulator for doc deltas + private int freq; // freq we last read + private int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // Lazy pos seek: if != -1 then we must seek to this FP + // before reading positions: + private long posPendingFP; + + // Lazy pay seek: if != -1 then we must seek to this FP + // before reading payloads/offsets: + private long payPendingFP; + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's postings start in the .pos file: + private long posTermStartFP; + + // Where this term's payloads/offsets start in the .pay + // file: + private long payTermStartFP; + + // File pointer where the last (vInt encoded) pos delta + // block is. 
We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + // Where this term's skip data starts (after + // docTermStartFP) in the .doc file (or -1 if there is + // no skip data for this term): + private long skipOffset; + + private int nextSkipDoc; + + private Bits liveDocs; + + private boolean needsOffsets; // true if we actually need offsets + private boolean needsPayloads; // true if we actually need payloads + private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + public EverythingEnum(FieldInfo fieldInfo) throws IOException { + this.startDocIn = Lucene50PostingsReader.this.docIn; + this.docIn = null; + this.posIn = Lucene50PostingsReader.this.posIn.clone(); + this.payIn = Lucene50PostingsReader.this.payIn.clone(); + encoded = new byte[MAX_ENCODED_SIZE]; + indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + if (indexHasOffsets) { + offsetStartDeltaBuffer = new int[MAX_DATA_SIZE]; + offsetLengthBuffer = new int[MAX_DATA_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + startOffset = -1; + endOffset = -1; + } + + indexHasPayloads = fieldInfo.hasPayloads(); + if (indexHasPayloads) { + payloadLengthBuffer = new int[MAX_DATA_SIZE]; + payloadBytes = new byte[128]; + payload = new BytesRef(); + } else { + payloadLengthBuffer = null; + payloadBytes = null; + payload = null; + } + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == startDocIn && + indexHasOffsets == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) && + indexHasPayloads == fieldInfo.hasPayloads(); + } + + public EverythingEnum reset(Bits liveDocs, IntBlockTermState termState, int flags) throws IOException { + this.liveDocs = liveDocs; + + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + posTermStartFP = termState.posStartFP; + payTermStartFP = termState.payStartFP; + skipOffset = termState.skipOffset; + totalTermFreq = termState.totalTermFreq; + singletonDocID = termState.singletonDocID; + if (docFreq > 1) { + if (docIn == null) { + // lazy init + docIn = startDocIn.clone(); + } + docIn.seek(docTermStartFP); + } + posPendingFP = posTermStartFP; + payPendingFP = payTermStartFP; + posPendingCount = 0; + if (termState.totalTermFreq < BLOCK_SIZE) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == BLOCK_SIZE) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + this.needsOffsets = (flags & DocsAndPositionsEnum.FLAG_OFFSETS) != 0; + this.needsPayloads = (flags & DocsAndPositionsEnum.FLAG_PAYLOADS) != 0; + + doc = -1; + accum = 0; + docUpto = 0; + if (docFreq > BLOCK_SIZE) { + nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block + } else { + nextSkipDoc = NO_MORE_DOCS; // not enough docs for skipping + } + docBufferUpto = BLOCK_SIZE; + skipped = false; + return this; + } + + @Override + public int freq() throws IOException { + return freq; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + final int left = docFreq - docUpto; + assert left > 0; + + if (left >= BLOCK_SIZE) { + forUtil.readBlock(docIn, encoded, docDeltaBuffer); + forUtil.readBlock(docIn, encoded, freqBuffer); + } else if (docFreq == 1) { + docDeltaBuffer[0] = singletonDocID; + freqBuffer[0] = (int) 
totalTermFreq; + } else { + readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, true); + } + docBufferUpto = 0; + } + + private void refillPositions() throws IOException { + if (posIn.getFilePointer() == lastPosBlockFP) { + final int count = (int) (totalTermFreq % BLOCK_SIZE); + int payloadLength = 0; + int offsetLength = 0; + payloadByteUpto = 0; + for(int i=0;i>> 1; + if (payloadLength != 0) { + if (payloadByteUpto + payloadLength > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength); + } + posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength); + payloadByteUpto += payloadLength; + } + } else { + posDeltaBuffer[i] = code; + } + + if (indexHasOffsets) { + int deltaCode = posIn.readVInt(); + if ((deltaCode & 1) != 0) { + offsetLength = posIn.readVInt(); + } + offsetStartDeltaBuffer[i] = deltaCode >>> 1; + offsetLengthBuffer[i] = offsetLength; + } + } + payloadByteUpto = 0; + } else { + forUtil.readBlock(posIn, encoded, posDeltaBuffer); + + if (indexHasPayloads) { + if (needsPayloads) { + forUtil.readBlock(payIn, encoded, payloadLengthBuffer); + int numBytes = payIn.readVInt(); + + if (numBytes > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, numBytes); + } + payIn.readBytes(payloadBytes, 0, numBytes); + } else { + // this works, because when writing a vint block we always force the first length to be written + forUtil.skipBlock(payIn); // skip over lengths + int numBytes = payIn.readVInt(); // read length of payloadBytes + payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes + } + payloadByteUpto = 0; + } + + if (indexHasOffsets) { + if (needsOffsets) { + forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer); + forUtil.readBlock(payIn, encoded, offsetLengthBuffer); + } else { + // this works, because when writing a vint block we always force the first length to be written + forUtil.skipBlock(payIn); // skip over starts + forUtil.skipBlock(payIn); // skip over lengths + } + } + } + } + + @Override + public int nextDoc() throws IOException { + while (true) { + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } + + accum += docDeltaBuffer[docBufferUpto]; + freq = freqBuffer[docBufferUpto]; + posPendingCount += freq; + docBufferUpto++; + docUpto++; + + if (liveDocs == null || liveDocs.get(accum)) { + doc = accum; + position = 0; + lastStartOffset = 0; + return doc; + } + } + } + + @Override + public int advance(int target) throws IOException { + // TODO: make frq block load lazy/skippable + + if (target > nextSkipDoc) { + if (skipper == null) { + // Lazy init: first time this enum has ever been used for skipping + skipper = new Lucene50SkipReader(docIn.clone(), + MAX_SKIP_LEVELS, + BLOCK_SIZE, + true, + indexHasOffsets, + indexHasPayloads); + } + + if (!skipped) { + assert skipOffset != -1; + // This is the first time this enum has skipped + // since reset() was called; load the skip data: + skipper.init(docTermStartFP+skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq); + skipped = true; + } + + final int newDocUpto = skipper.skipTo(target) + 1; + + if (newDocUpto > docUpto) { + // Skipper moved + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + docUpto = newDocUpto; + + // Force to read next block + docBufferUpto = BLOCK_SIZE; + accum = skipper.getDoc(); + docIn.seek(skipper.getDocPointer()); + posPendingFP = skipper.getPosPointer(); + payPendingFP = skipper.getPayPointer(); + posPendingCount = 
skipper.getPosBufferUpto(); + lastStartOffset = 0; // new document + payloadByteUpto = skipper.getPayloadByteUpto(); + } + nextSkipDoc = skipper.getNextSkipDoc(); + } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } + + // Now scan: + while (true) { + accum += docDeltaBuffer[docBufferUpto]; + freq = freqBuffer[docBufferUpto]; + posPendingCount += freq; + docBufferUpto++; + docUpto++; + + if (accum >= target) { + break; + } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + } + + if (liveDocs == null || liveDocs.get(accum)) { + position = 0; + lastStartOffset = 0; + return doc = accum; + } else { + return nextDoc(); + } + } + + // TODO: in theory we could avoid loading frq block + // when not needed, ie, use skip data to load how far to + // seek the pos pointer ... instead of having to load frq + // blocks only to sum up how many positions to skip + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freq; + // if (DEBUG) { + // System.out.println(" FPR.skipPositions: toSkip=" + toSkip); + // } + + final int leftInBlock = BLOCK_SIZE - posBufferUpto; + if (toSkip < leftInBlock) { + int end = posBufferUpto + toSkip; + while(posBufferUpto < end) { + if (indexHasPayloads) { + payloadByteUpto += payloadLengthBuffer[posBufferUpto]; + } + posBufferUpto++; + } + } else { + toSkip -= leftInBlock; + while(toSkip >= BLOCK_SIZE) { + assert posIn.getFilePointer() != lastPosBlockFP; + forUtil.skipBlock(posIn); + + if (indexHasPayloads) { + // Skip payloadLength block: + forUtil.skipBlock(payIn); + + // Skip payloadBytes block: + int numBytes = payIn.readVInt(); + payIn.seek(payIn.getFilePointer() + numBytes); + } + + if (indexHasOffsets) { + forUtil.skipBlock(payIn); + forUtil.skipBlock(payIn); + } + toSkip -= BLOCK_SIZE; + } + refillPositions(); + payloadByteUpto = 0; + posBufferUpto = 0; + while(posBufferUpto < toSkip) { + if (indexHasPayloads) { + payloadByteUpto += payloadLengthBuffer[posBufferUpto]; + } + posBufferUpto++; + } + } + + position = 0; + lastStartOffset = 0; + } + + @Override + public int nextPosition() throws IOException { + if (posPendingFP != -1) { + posIn.seek(posPendingFP); + posPendingFP = -1; + + if (payPendingFP != -1) { + payIn.seek(payPendingFP); + payPendingFP = -1; + } + + // Force buffer refill: + posBufferUpto = BLOCK_SIZE; + } + + if (posPendingCount > freq) { + skipPositions(); + posPendingCount = freq; + } + + if (posBufferUpto == BLOCK_SIZE) { + refillPositions(); + posBufferUpto = 0; + } + position += posDeltaBuffer[posBufferUpto]; + + if (indexHasPayloads) { + payloadLength = payloadLengthBuffer[posBufferUpto]; + payload.bytes = payloadBytes; + payload.offset = payloadByteUpto; + payload.length = payloadLength; + payloadByteUpto += payloadLength; + } + + if (indexHasOffsets) { + startOffset = lastStartOffset + offsetStartDeltaBuffer[posBufferUpto]; + endOffset = startOffset + offsetLengthBuffer[posBufferUpto]; + lastStartOffset = startOffset; + } + + posBufferUpto++; + posPendingCount--; + return position; + } + + @Override + public int startOffset() { + return startOffset; + } + + @Override + public int endOffset() { + return endOffset; + } + + @Override + public BytesRef getPayload() { + if (payloadLength == 0) { + return null; + } else { + return payload; + } + } + + @Override + public long cost() { + return docFreq; + } + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES_USED; + } + + @Override + public 
Iterable getChildResources() { + return Collections.emptyList(); + } + + @Override + public void checkIntegrity() throws IOException { + if (docIn != null) { + CodecUtil.checksumEntireFile(docIn); + } + if (posIn != null) { + CodecUtil.checksumEntireFile(posIn); + } + if (payIn != null) { + CodecUtil.checksumEntireFile(payIn); + } + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(positions=" + (posIn != null) + ",payloads=" + (payIn != null) +")"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsWriter.java new file mode 100644 index 00000000000..25393fe7783 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsWriter.java @@ -0,0 +1,480 @@ +package org.apache.lucene.codecs.lucene50; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.lucene.codecs.lucene50.ForUtil.MAX_DATA_SIZE; +import static org.apache.lucene.codecs.lucene50.ForUtil.MAX_ENCODED_SIZE; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.DOC_CODEC; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.MAX_SKIP_LEVELS; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.PAY_CODEC; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.POS_CODEC; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.TERMS_CODEC; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.VERSION_CURRENT; + +import java.io.IOException; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.PushPostingsWriterBase; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Concrete class that writes docId(maybe frq,pos,offset,payloads) list + * with postings format. + * + * Postings list for each term will be stored separately. + * + * @see Lucene50SkipWriter for details about skipping setting and postings layout. 
+ * @lucene.experimental + */ +public final class Lucene50PostingsWriter extends PushPostingsWriterBase { + + IndexOutput docOut; + IndexOutput posOut; + IndexOutput payOut; + + final static IntBlockTermState emptyState = new IntBlockTermState(); + IntBlockTermState lastState; + + // Holds starting file pointers for current term: + private long docStartFP; + private long posStartFP; + private long payStartFP; + + final int[] docDeltaBuffer; + final int[] freqBuffer; + private int docBufferUpto; + + final int[] posDeltaBuffer; + final int[] payloadLengthBuffer; + final int[] offsetStartDeltaBuffer; + final int[] offsetLengthBuffer; + private int posBufferUpto; + + private byte[] payloadBytes; + private int payloadByteUpto; + + private int lastBlockDocID; + private long lastBlockPosFP; + private long lastBlockPayFP; + private int lastBlockPosBufferUpto; + private int lastBlockPayloadByteUpto; + + private int lastDocID; + private int lastPosition; + private int lastStartOffset; + private int docCount; + + final byte[] encoded; + + private final ForUtil forUtil; + private final Lucene50SkipWriter skipWriter; + + /** Creates a postings writer */ + public Lucene50PostingsWriter(SegmentWriteState state) throws IOException { + final float acceptableOverheadRatio = PackedInts.COMPACT; + + String docFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene50PostingsFormat.DOC_EXTENSION); + docOut = state.directory.createOutput(docFileName, state.context); + IndexOutput posOut = null; + IndexOutput payOut = null; + boolean success = false; + try { + CodecUtil.writeIndexHeader(docOut, DOC_CODEC, VERSION_CURRENT, + state.segmentInfo.getId(), state.segmentSuffix); + forUtil = new ForUtil(acceptableOverheadRatio, docOut); + if (state.fieldInfos.hasProx()) { + posDeltaBuffer = new int[MAX_DATA_SIZE]; + String posFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene50PostingsFormat.POS_EXTENSION); + posOut = state.directory.createOutput(posFileName, state.context); + CodecUtil.writeIndexHeader(posOut, POS_CODEC, VERSION_CURRENT, + state.segmentInfo.getId(), state.segmentSuffix); + + if (state.fieldInfos.hasPayloads()) { + payloadBytes = new byte[128]; + payloadLengthBuffer = new int[MAX_DATA_SIZE]; + } else { + payloadBytes = null; + payloadLengthBuffer = null; + } + + if (state.fieldInfos.hasOffsets()) { + offsetStartDeltaBuffer = new int[MAX_DATA_SIZE]; + offsetLengthBuffer = new int[MAX_DATA_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + } + + if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { + String payFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene50PostingsFormat.PAY_EXTENSION); + payOut = state.directory.createOutput(payFileName, state.context); + CodecUtil.writeIndexHeader(payOut, PAY_CODEC, VERSION_CURRENT, + state.segmentInfo.getId(), state.segmentSuffix); + } + } else { + posDeltaBuffer = null; + payloadLengthBuffer = null; + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + payloadBytes = null; + } + this.payOut = payOut; + this.posOut = posOut; + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(docOut, posOut, payOut); + } + } + + docDeltaBuffer = new int[MAX_DATA_SIZE]; + freqBuffer = new int[MAX_DATA_SIZE]; + + // TODO: should we try skipping every 2/4 blocks...? 
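+ // One skip entry is buffered per completed block of BLOCK_SIZE doc deltas (see Lucene50SkipWriter); + // finishTerm() later flushes the buffered skip levels into the .doc file via skipWriter.writeSkip(docOut).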
+ skipWriter = new Lucene50SkipWriter(MAX_SKIP_LEVELS, + BLOCK_SIZE, + state.segmentInfo.getDocCount(), + docOut, + posOut, + payOut); + + encoded = new byte[MAX_ENCODED_SIZE]; + } + + @Override + public IntBlockTermState newTermState() { + return new IntBlockTermState(); + } + + @Override + public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException { + CodecUtil.writeIndexHeader(termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + termsOut.writeVInt(BLOCK_SIZE); + } + + @Override + public int setField(FieldInfo fieldInfo) { + super.setField(fieldInfo); + skipWriter.setField(writePositions, writeOffsets, writePayloads); + lastState = emptyState; + if (writePositions) { + if (writePayloads || writeOffsets) { + return 3; // doc + pos + pay FP + } else { + return 2; // doc + pos FP + } + } else { + return 1; // doc FP + } + } + + @Override + public void startTerm() { + docStartFP = docOut.getFilePointer(); + if (writePositions) { + posStartFP = posOut.getFilePointer(); + if (writePayloads || writeOffsets) { + payStartFP = payOut.getFilePointer(); + } + } + lastDocID = 0; + lastBlockDocID = -1; + skipWriter.resetSkip(); + } + + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + // Have collected a block of docs, and get a new doc. + // Should write skip data as well as postings list for + // current block. + if (lastBlockDocID != -1 && docBufferUpto == 0) { + skipWriter.bufferSkip(lastBlockDocID, docCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockPayloadByteUpto); + } + + final int docDelta = docID - lastDocID; + + if (docID < 0 || (docCount > 0 && docDelta <= 0)) { + throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )", docOut.toString()); + } + + docDeltaBuffer[docBufferUpto] = docDelta; + if (writeFreqs) { + freqBuffer[docBufferUpto] = termDocFreq; + } + + docBufferUpto++; + docCount++; + + if (docBufferUpto == BLOCK_SIZE) { + forUtil.writeBlock(docDeltaBuffer, encoded, docOut); + if (writeFreqs) { + forUtil.writeBlock(freqBuffer, encoded, docOut); + } + // NOTE: don't set docBufferUpto back to 0 here; + // finishDoc will do so (because it needs to see that + // the block was filled so it can save skip data) + } + + + lastDocID = docID; + lastPosition = 0; + lastStartOffset = 0; + } + + @Override + public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { + posDeltaBuffer[posBufferUpto] = position - lastPosition; + if (writePayloads) { + if (payload == null || payload.length == 0) { + // no payload + payloadLengthBuffer[posBufferUpto] = 0; + } else { + payloadLengthBuffer[posBufferUpto] = payload.length; + if (payloadByteUpto + payload.length > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length); + } + System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length); + payloadByteUpto += payload.length; + } + } + + if (writeOffsets) { + assert startOffset >= lastStartOffset; + assert endOffset >= startOffset; + offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset; + offsetLengthBuffer[posBufferUpto] = endOffset - startOffset; + lastStartOffset = startOffset; + } + + posBufferUpto++; + lastPosition = position; + if (posBufferUpto == BLOCK_SIZE) { + forUtil.writeBlock(posDeltaBuffer, encoded, posOut); + + if (writePayloads) { + forUtil.writeBlock(payloadLengthBuffer, encoded, payOut); + 
payOut.writeVInt(payloadByteUpto); + payOut.writeBytes(payloadBytes, 0, payloadByteUpto); + payloadByteUpto = 0; + } + if (writeOffsets) { + forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut); + forUtil.writeBlock(offsetLengthBuffer, encoded, payOut); + } + posBufferUpto = 0; + } + } + + @Override + public void finishDoc() throws IOException { + // Since we don't know df for current term, we had to buffer + // those skip data for each block, and when a new doc comes, + // write them to skip file. + if (docBufferUpto == BLOCK_SIZE) { + lastBlockDocID = lastDocID; + if (posOut != null) { + if (payOut != null) { + lastBlockPayFP = payOut.getFilePointer(); + } + lastBlockPosFP = posOut.getFilePointer(); + lastBlockPosBufferUpto = posBufferUpto; + lastBlockPayloadByteUpto = payloadByteUpto; + } + docBufferUpto = 0; + } + } + + /** Called when we are done adding docs to this term */ + @Override + public void finishTerm(BlockTermState _state) throws IOException { + IntBlockTermState state = (IntBlockTermState) _state; + assert state.docFreq > 0; + + // TODO: wasteful we are counting this (counting # docs + // for this term) in two places? + assert state.docFreq == docCount: state.docFreq + " vs " + docCount; + + // docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to it. + final int singletonDocID; + if (state.docFreq == 1) { + // pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq + singletonDocID = docDeltaBuffer[0]; + } else { + singletonDocID = -1; + // vInt encode the remaining doc deltas and freqs: + for(int i=0;i BLOCK_SIZE) { + // record file offset for last pos in last block + lastPosBlockOffset = posOut.getFilePointer() - posStartFP; + } else { + lastPosBlockOffset = -1; + } + if (posBufferUpto > 0) { + // TODO: should we send offsets/payloads to + // .pay...? seems wasteful (have to store extra + // vLong for low (< BLOCK_SIZE) DF terms = vast vast + // majority) + + // vInt encode the remaining positions/payloads/offsets: + int lastPayloadLength = -1; // force first payload length to be written + int lastOffsetLength = -1; // force first offset length to be written + int payloadBytesReadUpto = 0; + for(int i=0;i BLOCK_SIZE) { + skipOffset = skipWriter.writeSkip(docOut) - docStartFP; + } else { + skipOffset = -1; + } + + state.docStartFP = docStartFP; + state.posStartFP = posStartFP; + state.payStartFP = payStartFP; + state.singletonDocID = singletonDocID; + state.skipOffset = skipOffset; + state.lastPosBlockOffset = lastPosBlockOffset; + docBufferUpto = 0; + posBufferUpto = 0; + lastDocID = 0; + docCount = 0; + } + + @Override + public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { + IntBlockTermState state = (IntBlockTermState)_state; + if (absolute) { + lastState = emptyState; + } + longs[0] = state.docStartFP - lastState.docStartFP; + if (writePositions) { + longs[1] = state.posStartFP - lastState.posStartFP; + if (writePayloads || writeOffsets) { + longs[2] = state.payStartFP - lastState.payStartFP; + } + } + if (state.singletonDocID != -1) { + out.writeVInt(state.singletonDocID); + } + if (writePositions) { + if (state.lastPosBlockOffset != -1) { + out.writeVLong(state.lastPosBlockOffset); + } + } + if (state.skipOffset != -1) { + out.writeVLong(state.skipOffset); + } + lastState = state; + } + + @Override + public void close() throws IOException { + // TODO: add a finish() at least to PushBase? DV too...? 
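+ // Write a checksum footer on each open output, then close them exactly once; on failure, + // close while suppressing secondary exceptions so the original error is the one thrown.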
+ boolean success = false; + try { + if (docOut != null) { + CodecUtil.writeFooter(docOut); + } + if (posOut != null) { + CodecUtil.writeFooter(posOut); + } + if (payOut != null) { + CodecUtil.writeFooter(payOut); + } + success = true; + } finally { + if (success) { + IOUtils.close(docOut, posOut, payOut); + } else { + IOUtils.closeWhileHandlingException(docOut, posOut, payOut); + } + docOut = posOut = payOut = null; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java index e32555cd4d6..794b0d0e360 100755 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java @@ -34,7 +34,6 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.Version; /** @@ -42,20 +41,19 @@ import org.apache.lucene.util.Version; *

              * Files: *

                - *
              • .si: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Id, Footer + *
              • .si: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Footer *
              *

              * Data types: *

              *

                - *
              • Header --> {@link CodecUtil#writeHeader CodecHeader}
              • + *
              • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
              • *
              • SegSize --> {@link DataOutput#writeInt Int32}
              • *
              • SegVersion --> {@link DataOutput#writeString String}
              • *
              • Files --> {@link DataOutput#writeStringSet Set<String>}
              • *
              • Diagnostics --> {@link DataOutput#writeStringStringMap Map<String,String>}
              • *
              • IsCompoundFile --> {@link DataOutput#writeByte Int8}
              • *
              • Footer --> {@link CodecUtil#writeFooter CodecFooter}
              • - *
              • Id --> {@link DataOutput#writeString String}
              • *
              *

              * Field Descriptions: @@ -84,15 +82,16 @@ public class Lucene50SegmentInfoFormat extends SegmentInfoFormat { } @Override - public SegmentInfo read(Directory dir, String segment, IOContext context) throws IOException { + public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) throws IOException { final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene50SegmentInfoFormat.SI_EXTENSION); try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) { Throwable priorE = null; SegmentInfo si = null; try { - CodecUtil.checkHeader(input, Lucene50SegmentInfoFormat.CODEC_NAME, - Lucene50SegmentInfoFormat.VERSION_START, - Lucene50SegmentInfoFormat.VERSION_CURRENT); + CodecUtil.checkIndexHeader(input, Lucene50SegmentInfoFormat.CODEC_NAME, + Lucene50SegmentInfoFormat.VERSION_START, + Lucene50SegmentInfoFormat.VERSION_CURRENT, + segmentID, ""); final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); final int docCount = input.readInt(); @@ -103,10 +102,7 @@ public class Lucene50SegmentInfoFormat extends SegmentInfoFormat { final Map diagnostics = input.readStringStringMap(); final Set files = input.readStringSet(); - byte[] id = new byte[StringHelper.ID_LENGTH]; - input.readBytes(id, 0, id.length); - - si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, id); + si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, segmentID); si.setFiles(files); } catch (Throwable exception) { priorE = exception; @@ -124,7 +120,11 @@ public class Lucene50SegmentInfoFormat extends SegmentInfoFormat { boolean success = false; try (IndexOutput output = dir.createOutput(fileName, ioContext)) { - CodecUtil.writeHeader(output, Lucene50SegmentInfoFormat.CODEC_NAME, Lucene50SegmentInfoFormat.VERSION_CURRENT); + CodecUtil.writeIndexHeader(output, + Lucene50SegmentInfoFormat.CODEC_NAME, + Lucene50SegmentInfoFormat.VERSION_CURRENT, + si.getId(), + ""); Version version = si.getVersion(); if (version.major < 5) { throw new IllegalArgumentException("invalid major version: should be >= 5 but got: " + version.major + " segment=" + si); @@ -145,11 +145,6 @@ public class Lucene50SegmentInfoFormat extends SegmentInfoFormat { } } output.writeStringSet(files); - byte[] id = si.getId(); - if (id.length != StringHelper.ID_LENGTH) { - throw new IllegalArgumentException("invalid id, got=" + StringHelper.idToString(id)); - } - output.writeBytes(id, 0, id.length); CodecUtil.writeFooter(output); success = true; } finally { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SkipReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SkipReader.java new file mode 100644 index 00000000000..a46774a350d --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SkipReader.java @@ -0,0 +1,198 @@ +package org.apache.lucene.codecs.lucene50; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.codecs.MultiLevelSkipListReader; +import org.apache.lucene.store.IndexInput; + +/** + * Implements the skip list reader for the block postings format + * that stores positions and payloads. + * + * Although this skipper uses MultiLevelSkipListReader as an interface, + * its definition of a skip position is a little different. + * + * For example, when skipInterval = blockSize = 3, df = 2*skipInterval = 6, + * + * 0 1 2 3 4 5 + * d d d d d d (posting list) + * ^ ^ (skip point in MultiLevelSkipListWriter) + * ^ (skip point in Lucene50SkipWriter) + * + * In this case, MultiLevelSkipListReader will use the last document as a skip point, + * while Lucene50SkipReader should assume no skip point will come. + * + * If we used the interface directly in Lucene50SkipReader, it might naively try to read + * more skip data after the only skip point is loaded. + * + * To illustrate this, suppose we call skipTo(d[5]): since skip point d[3] has a smaller docId, + * and numSkipped+blockSize == df, the MultiLevelSkipListReader will assume the skip list + * isn't exhausted yet, and try to load a non-existent skip point. + * + * Therefore, we'll trim df before passing it to the interface. See trim(int). + * + */ +final class Lucene50SkipReader extends MultiLevelSkipListReader { + private final int blockSize; + + private long docPointer[]; + private long posPointer[]; + private long payPointer[]; + private int posBufferUpto[]; + private int payloadByteUpto[]; + + private long lastPosPointer; + private long lastPayPointer; + private int lastPayloadByteUpto; + private long lastDocPointer; + private int lastPosBufferUpto; + + public Lucene50SkipReader(IndexInput skipStream, int maxSkipLevels, int blockSize, boolean hasPos, boolean hasOffsets, boolean hasPayloads) { + super(skipStream, maxSkipLevels, blockSize, 8); + this.blockSize = blockSize; + docPointer = new long[maxSkipLevels]; + if (hasPos) { + posPointer = new long[maxSkipLevels]; + posBufferUpto = new int[maxSkipLevels]; + if (hasPayloads) { + payloadByteUpto = new int[maxSkipLevels]; + } else { + payloadByteUpto = null; + } + if (hasOffsets || hasPayloads) { + payPointer = new long[maxSkipLevels]; + } else { + payPointer = null; + } + } else { + posPointer = null; + } + } + + /** + * Trims the original docFreq so the skip reader reads the proper number of skip points. + * + * Since our definition in Lucene50Skip* is a little different from MultiLevelSkip*, + * this trimmed docFreq will prevent the skip reader from: + * 1. naively reading a non-existent skip point after the last block boundary + * 2. moving into the vInt block + * + */ + protected int trim(int df) { + return df % blockSize == 0?
df - 1: df; + } + + public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) { + super.init(skipPointer, trim(df)); + lastDocPointer = docBasePointer; + lastPosPointer = posBasePointer; + lastPayPointer = payBasePointer; + + Arrays.fill(docPointer, docBasePointer); + if (posPointer != null) { + Arrays.fill(posPointer, posBasePointer); + if (payPointer != null) { + Arrays.fill(payPointer, payBasePointer); + } + } else { + assert posBasePointer == 0; + } + } + + /** Returns the doc pointer of the doc to which the last call of + * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */ + public long getDocPointer() { + return lastDocPointer; + } + + public long getPosPointer() { + return lastPosPointer; + } + + public int getPosBufferUpto() { + return lastPosBufferUpto; + } + + public long getPayPointer() { + return lastPayPointer; + } + + public int getPayloadByteUpto() { + return lastPayloadByteUpto; + } + + public int getNextSkipDoc() { + return skipDoc[0]; + } + + @Override + protected void seekChild(int level) throws IOException { + super.seekChild(level); + docPointer[level] = lastDocPointer; + if (posPointer != null) { + posPointer[level] = lastPosPointer; + posBufferUpto[level] = lastPosBufferUpto; + if (payloadByteUpto != null) { + payloadByteUpto[level] = lastPayloadByteUpto; + } + if (payPointer != null) { + payPointer[level] = lastPayPointer; + } + } + } + + @Override + protected void setLastSkipData(int level) { + super.setLastSkipData(level); + lastDocPointer = docPointer[level]; + + if (posPointer != null) { + lastPosPointer = posPointer[level]; + lastPosBufferUpto = posBufferUpto[level]; + if (payPointer != null) { + lastPayPointer = payPointer[level]; + } + if (payloadByteUpto != null) { + lastPayloadByteUpto = payloadByteUpto[level]; + } + } + } + + @Override + protected int readSkipData(int level, IndexInput skipStream) throws IOException { + int delta = skipStream.readVInt(); + docPointer[level] += skipStream.readVInt(); + + if (posPointer != null) { + posPointer[level] += skipStream.readVInt(); + posBufferUpto[level] = skipStream.readVInt(); + + if (payloadByteUpto != null) { + payloadByteUpto[level] = skipStream.readVInt(); + } + + if (payPointer != null) { + payPointer[level] += skipStream.readVInt(); + } + } + return delta; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SkipWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SkipWriter.java new file mode 100644 index 00000000000..a4d04594757 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SkipWriter.java @@ -0,0 +1,169 @@ +package org.apache.lucene.codecs.lucene50; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.codecs.MultiLevelSkipListWriter; + +/** + * Writes skip lists with multiple levels, and supports skipping to positions within a block of ints. + * + * Assume that docFreq = 28, skipInterval = blockSize = 12 + * + * | block#0 | | block#1 | |vInts| + * d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list) + * ^ ^ (level 0 skip point) + * + * Note that the skip writer will ignore the first document in block#0, since + * it is useless as a skip point. Also, we'll never skip into the vInts + * block; we only record skip data at its start point (if it exists). + * + * For each skip point, we will record: + * 1. the docID at the previous position, i.e. for position 12, record docID[11], etc. + * 2. the related file pointers (position, payload), + * 3. the related counts or uptos (position, payload), + * 4. the start offset. + * + */ +final class Lucene50SkipWriter extends MultiLevelSkipListWriter { + private int[] lastSkipDoc; + private long[] lastSkipDocPointer; + private long[] lastSkipPosPointer; + private long[] lastSkipPayPointer; + private int[] lastPayloadByteUpto; + + private final IndexOutput docOut; + private final IndexOutput posOut; + private final IndexOutput payOut; + + private int curDoc; + private long curDocPointer; + private long curPosPointer; + private long curPayPointer; + private int curPosBufferUpto; + private int curPayloadByteUpto; + private boolean fieldHasPositions; + private boolean fieldHasOffsets; + private boolean fieldHasPayloads; + + public Lucene50SkipWriter(int maxSkipLevels, int blockSize, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) { + super(blockSize, 8, maxSkipLevels, docCount); + this.docOut = docOut; + this.posOut = posOut; + this.payOut = payOut; + + lastSkipDoc = new int[maxSkipLevels]; + lastSkipDocPointer = new long[maxSkipLevels]; + if (posOut != null) { + lastSkipPosPointer = new long[maxSkipLevels]; + if (payOut != null) { + lastSkipPayPointer = new long[maxSkipLevels]; + } + lastPayloadByteUpto = new int[maxSkipLevels]; + } + } + + public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) { + this.fieldHasPositions = fieldHasPositions; + this.fieldHasOffsets = fieldHasOffsets; + this.fieldHasPayloads = fieldHasPayloads; + } + + // tricky: we only skip data for blocks (terms with more than 128 docs), but re-init'ing the skipper + // is pretty slow for rare terms in large segments as we have to fill O(log #docs in segment) of junk, + // and such rare terms are the vast majority (worst case: an ID field or similar). So in resetSkip() we save + // away the previous pointers, and lazy-init only if we need to buffer skip data for the term.
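+ // Call sequence (see Lucene50PostingsWriter): startTerm() calls resetSkip(), startDoc() calls + // bufferSkip() once per completed block of BLOCK_SIZE docs, and finishTerm() calls writeSkip() + // only for terms whose docFreq exceeds BLOCK_SIZE.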
+ private boolean initialized; + long lastDocFP; + long lastPosFP; + long lastPayFP; + + @Override + public void resetSkip() { + lastDocFP = docOut.getFilePointer(); + if (fieldHasPositions) { + lastPosFP = posOut.getFilePointer(); + if (fieldHasOffsets || fieldHasPayloads) { + lastPayFP = payOut.getFilePointer(); + } + } + initialized = false; + } + + public void initSkip() { + if (!initialized) { + super.resetSkip(); + Arrays.fill(lastSkipDoc, 0); + Arrays.fill(lastSkipDocPointer, lastDocFP); + if (fieldHasPositions) { + Arrays.fill(lastSkipPosPointer, lastPosFP); + if (fieldHasPayloads) { + Arrays.fill(lastPayloadByteUpto, 0); + } + if (fieldHasOffsets || fieldHasPayloads) { + Arrays.fill(lastSkipPayPointer, lastPayFP); + } + } + initialized = true; + } + } + + /** + * Sets the values for the current skip data. + */ + public void bufferSkip(int doc, int numDocs, long posFP, long payFP, int posBufferUpto, int payloadByteUpto) throws IOException { + initSkip(); + this.curDoc = doc; + this.curDocPointer = docOut.getFilePointer(); + this.curPosPointer = posFP; + this.curPayPointer = payFP; + this.curPosBufferUpto = posBufferUpto; + this.curPayloadByteUpto = payloadByteUpto; + bufferSkip(numDocs); + } + + @Override + protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException { + int delta = curDoc - lastSkipDoc[level]; + + skipBuffer.writeVInt(delta); + lastSkipDoc[level] = curDoc; + + skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level])); + lastSkipDocPointer[level] = curDocPointer; + + if (fieldHasPositions) { + + skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level])); + lastSkipPosPointer[level] = curPosPointer; + skipBuffer.writeVInt(curPosBufferUpto); + + if (fieldHasPayloads) { + skipBuffer.writeVInt(curPayloadByteUpto); + } + + if (fieldHasOffsets || fieldHasPayloads) { + skipBuffer.writeVInt((int) (curPayPointer - lastSkipPayPointer[level])); + lastSkipPayPointer[level] = curPayPointer; + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java index 5a3c14a1fd2..de6382ebd43 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java @@ -52,7 +52,7 @@ import org.apache.lucene.util.packed.PackedInts; *

              Here is a more detailed description of the field data file format:

              *
                *
              • FieldData (.fdt) --> <Header>, PackedIntsVersion, <Chunk>ChunkCount
              • - *
              • Header --> {@link CodecUtil#writeSegmentHeader SegmentHeader}
              • + *
              • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
              • *
              • PackedIntsVersion --> {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}
              • *
• ChunkCount is not known in advance and is the number of chunks necessary to store all documents of the segment
              • *
              • Chunk --> DocBase, ChunkDocs, DocFieldCounts, DocLengths, <CompressedDocs>
              • @@ -104,7 +104,7 @@ import org.apache.lucene.util.packed.PackedInts; *

                A fields index file (extension .fdx).

                *
                  *
                • FieldsIndex (.fdx) --> <Header>, <ChunkIndex>
                • - *
                • Header --> {@link CodecUtil#writeSegmentHeader SegmentHeader}
                • + *
                • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
                • *
                • ChunkIndex: See {@link CompressingStoredFieldsIndexWriter}
                • *
                * diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java index e3323420c0e..f2d04848137 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java @@ -59,7 +59,7 @@ import org.apache.lucene.util.packed.PackedInts; *

                Here is a more detailed description of the field data file format:

                *
                  *
                • VectorData (.tvd) --> <Header>, PackedIntsVersion, ChunkSize, <Chunk>ChunkCount, Footer
                • - *
                • Header --> {@link CodecUtil#writeSegmentHeader SegmentHeader}
                • + *
                • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
                • *
                • PackedIntsVersion --> {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}
                • *
                • ChunkSize is the number of bytes of terms to accumulate before flushing, as a {@link DataOutput#writeVInt VInt}
                • *
• ChunkCount is not known in advance and is the number of chunks necessary to store all documents of the segment
                • @@ -113,7 +113,7 @@ import org.apache.lucene.util.packed.PackedInts; *

                  An index file (extension .tvx).

                  *
                    *
                  • VectorIndex (.tvx) --> <Header>, <ChunkIndex>, Footer
                  • - *
                  • Header --> {@link CodecUtil#writeSegmentHeader SegmentHeader}
                  • + *
                  • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
                  • *
                  • ChunkIndex: See {@link CompressingStoredFieldsIndexWriter}
                  • *
                  • Footer --> {@link CodecUtil#writeFooter CodecFooter}
                  • *
                  diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package.html b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package.html index 9ce0a294e2f..76777ecd35c 100755 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package.html +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package.html @@ -154,20 +154,20 @@ its title, url, or an identifier to access a database. The set of stored fields returned for each hit when searching. This is keyed by document number.
                • -{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term dictionary}. +{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term dictionary}. A dictionary containing all of the terms used in all of the indexed fields of all of the documents. The dictionary also contains the number of documents which contain the term, and pointers to the term's frequency and proximity data.
                • -{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Frequency data}. +{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Frequency data}. For each term in the dictionary, the numbers of all the documents that contain that term, and the frequency of the term in that document, unless frequencies are omitted (IndexOptions.DOCS_ONLY)
                • -{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Proximity data}. +{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Proximity data}. For each term in the dictionary, the positions that the term occurs in each document. Note that this will not exist if all fields in all documents omit position data. @@ -185,7 +185,7 @@ term frequency. To add Term Vectors to your index see the {@link org.apache.lucene.document.Field Field} constructors
                • -{@link org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat Per-document values}. +{@link org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat Per-document values}. Like stored values, these are also keyed by document number, but are generally intended to be loaded into main memory for fast access. Whereas stored values are generally intended for summary results from @@ -264,27 +264,27 @@ systems that frequently run out of file handles. The stored fields for documents -{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Dictionary} +{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Dictionary} .tim The term dictionary, stores term info -{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Index} +{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Index} .tip The index into the Term Dictionary -{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Frequencies} +{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Frequencies} .doc Contains the list of docs which contain each term along with frequency -{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Positions} +{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Positions} .pos Stores position information about where a term occurs in the index -{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Payloads} +{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Payloads} .pay Stores additional per-position metadata information such as character offsets and user payloads @@ -294,7 +294,7 @@ systems that frequently run out of file handles. Encodes length and boost factors for docs and fields -{@link org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat Per-Document Values} +{@link org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat Per-Document Values} .dvd, .dvm Encodes additional scoring factors or other per-document information. diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index f7e73f76ee4..5e9de040ba6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -32,8 +32,6 @@ import java.util.Map; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.blocktree.FieldReader; -import org.apache.lucene.codecs.blocktree.Stats; import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.search.DocIdSetIterator; @@ -278,7 +276,7 @@ public class CheckIndex implements Closeable { * tree terms dictionary (this is only set if the * {@link PostingsFormat} for this segment uses block * tree. 
*/ - public Map blockTreeStats = null; + public Map blockTreeStats = null; } /** @@ -453,7 +451,7 @@ public class CheckIndex implements Closeable { public Status checkIndex(List onlySegments) throws IOException { ensureOpen(); NumberFormat nf = NumberFormat.getInstance(Locale.ROOT); - SegmentInfos sis = new SegmentInfos(); + SegmentInfos sis = null; Status result = new Status(); result.dir = dir; String[] files = dir.listAll(); @@ -464,7 +462,7 @@ public class CheckIndex implements Closeable { try { // Do not use SegmentInfos.read(Directory) since the spooky // retrying it does is not necessary here (we hold the write lock): - sis.read(dir, lastSegmentsFile); + sis = SegmentInfos.readCommit(dir, lastSegmentsFile); } catch (Throwable t) { if (failFast) { IOUtils.reThrow(t); @@ -1289,14 +1287,12 @@ public class CheckIndex implements Closeable { // docs got deleted and then merged away): } else { - if (fieldTerms instanceof FieldReader) { - final Stats stats = ((FieldReader) fieldTerms).computeStats(); - assert stats != null; - if (status.blockTreeStats == null) { - status.blockTreeStats = new HashMap<>(); - } - status.blockTreeStats.put(field, stats); + final Object stats = fieldTerms.getStats(); + assert stats != null; + if (status.blockTreeStats == null) { + status.blockTreeStats = new HashMap<>(); } + status.blockTreeStats.put(field, stats); if (sumTotalTermFreq != 0) { final long v = fields.terms(field).getSumTotalTermFreq(); @@ -1423,7 +1419,7 @@ public class CheckIndex implements Closeable { } if (verbose && status.blockTreeStats != null && infoStream != null && status.termCount > 0) { - for(Map.Entry ent : status.blockTreeStats.entrySet()) { + for(Map.Entry ent : status.blockTreeStats.entrySet()) { infoStream.println(" field \"" + ent.getKey() + "\":"); infoStream.println(" " + ent.getValue().toString().replace("\n", "\n ")); } diff --git a/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java index fb2ae1d63fa..c636444769a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java @@ -225,8 +225,7 @@ public abstract class DirectoryReader extends BaseCompositeReader { List commits = new ArrayList<>(); - SegmentInfos latest = new SegmentInfos(); - latest.read(dir); + SegmentInfos latest = SegmentInfos.readLatestCommit(dir); final long currentGen = latest.getGeneration(); commits.add(new StandardDirectoryReader.ReaderCommit(latest, dir)); @@ -239,11 +238,11 @@ public abstract class DirectoryReader extends BaseCompositeReader { !fileName.equals(IndexFileNames.OLD_SEGMENTS_GEN) && SegmentInfos.generationFromSegmentsFileName(fileName) < currentGen) { - SegmentInfos sis = new SegmentInfos(); + SegmentInfos sis = null; try { // IOException allowed to throw there, in case // segments_N is corrupt - sis.read(dir, fileName); + sis = SegmentInfos.readCommit(dir, fileName); } catch (FileNotFoundException | NoSuchFileException fnfe) { // LUCENE-948: on NFS (and maybe others), if // you have writers switching back and forth @@ -252,7 +251,6 @@ public abstract class DirectoryReader extends BaseCompositeReader { // file segments_X exists when in fact it // doesn't. 
So, we catch this and handle it // as if the file does not exist - sis = null; } if (sis != null) diff --git a/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java index 34c58adcf8e..fc18de76d72 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java @@ -151,6 +151,11 @@ public class FilterLeafReader extends LeafReader { public boolean hasPayloads() { return in.hasPayloads(); } + + @Override + public Object getStats() throws IOException { + return in.getStats(); + } } /** Base class for filtering {@link TermsEnum} implementations. */ diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java b/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java index ff12559673d..cf1a6f43787 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java @@ -165,9 +165,9 @@ final class IndexFileDeleter implements Closeable { if (infoStream.isEnabled("IFD")) { infoStream.message("IFD", "init: load commit \"" + fileName + "\""); } - SegmentInfos sis = new SegmentInfos(); + SegmentInfos sis = null; try { - sis.read(directory, fileName); + sis = SegmentInfos.readCommit(directory, fileName); } catch (FileNotFoundException | NoSuchFileException e) { // LUCENE-948: on NFS (and maybe others), if // you have writers switching back and forth @@ -179,7 +179,6 @@ final class IndexFileDeleter implements Closeable { if (infoStream.isEnabled("IFD")) { infoStream.message("IFD", "init: hit FileNotFoundException when loading commit \"" + fileName + "\"; skipping this commit point"); } - sis = null; } catch (IOException e) { if (SegmentInfos.generationFromSegmentsFileName(fileName) <= currentGen && directory.fileLength(fileName) > 0) { throw e; @@ -187,7 +186,6 @@ final class IndexFileDeleter implements Closeable { // Most likely we are opening an index that // has an aborted "future" commit, so suppress // exc in this case - sis = null; } } if (sis != null) { @@ -215,9 +213,9 @@ final class IndexFileDeleter implements Closeable { // listing was stale (eg when index accessed via NFS // client with stale directory listing cache). So we // try now to explicitly open this commit point: - SegmentInfos sis = new SegmentInfos(); + SegmentInfos sis = null; try { - sis.read(directory, currentSegmentsFile); + sis = SegmentInfos.readCommit(directory, currentSegmentsFile); } catch (IOException e) { throw new CorruptIndexException("unable to read current segments_N file", currentSegmentsFile, e); } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index f4191ad7c32..c9c5a372152 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -773,7 +773,6 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { // If index is too old, reading the segments will throw // IndexFormatTooOldException. - segmentInfos = new SegmentInfos(); boolean initialIndexExists = true; @@ -782,13 +781,17 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { // against an index that's currently open for // searching. 
In this case we write the next // segments_N file with no segments: + SegmentInfos sis = null; try { - segmentInfos.read(directory); - segmentInfos.clear(); + sis = SegmentInfos.readLatestCommit(directory); + sis.clear(); } catch (IOException e) { // Likely this means it's a fresh directory initialIndexExists = false; + sis = new SegmentInfos(); } + + segmentInfos = sis; // Record that we have a change (zero out all // segments) pending: @@ -802,7 +805,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { // Do not use SegmentInfos.read(Directory) since the spooky // retrying it does is not necessary here (we hold the write lock): - segmentInfos.read(directory, lastSegmentsFile); + segmentInfos = SegmentInfos.readCommit(directory, lastSegmentsFile); IndexCommit commit = config.getIndexCommit(); if (commit != null) { @@ -813,8 +816,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { // points. if (commit.getDirectory() != directory) throw new IllegalArgumentException("IndexCommit's directory doesn't match my directory"); - SegmentInfos oldInfos = new SegmentInfos(); - oldInfos.read(directory, commit.getSegmentsFileName()); + SegmentInfos oldInfos = SegmentInfos.readCommit(directory, commit.getSegmentsFileName()); segmentInfos.replace(oldInfos); changed(); if (infoStream.isEnabled("IW")) { @@ -2401,8 +2403,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { if (infoStream.isEnabled("IW")) { infoStream.message("IW", "addIndexes: process directory " + dir); } - SegmentInfos sis = new SegmentInfos(); // read infos from dir - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); // read infos from dir totalDocCount += sis.totalDocCount(); for (SegmentCommitInfo info : sis) { diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java index 9cbeb499eb1..2210cbfcdc5 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java @@ -60,18 +60,20 @@ import org.apache.lucene.util.StringHelper; * Files: *
                    *
                  • segments_N: Header, Version, NameCounter, SegCount, <SegName, - * SegCodec, DelGen, DeletionCount, FieldInfosGen, DocValuesGen, + * HasSegID, SegID, SegCodec, DelGen, DeletionCount, FieldInfosGen, DocValuesGen, * UpdatesFiles>SegCount, CommitUserData, Footer *
                  *

                  * Data types: *

                  *

                    - *
                  • Header --> {@link CodecUtil#writeHeader CodecHeader}
                  • - *
                  • GenHeader, NameCounter, SegCount, DeletionCount --> + *
                  • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
                  • + *
                  • NameCounter, SegCount, DeletionCount --> * {@link DataOutput#writeInt Int32}
                  • *
                  • Generation, Version, DelGen, Checksum, FieldInfosGen, DocValuesGen --> * {@link DataOutput#writeLong Int64}
                  • + *
                  • HasSegID --> {@link DataOutput#writeByte Int8}
                  • + *
                  • SegID --> {@link DataOutput#writeByte Int8ID_LENGTH}
                  • *
                  • SegName, SegCodec --> {@link DataOutput#writeString String}
                  • *
                  • CommitUserData --> {@link DataOutput#writeStringStringMap * Map<String,String>}
                  • @@ -94,6 +96,10 @@ import org.apache.lucene.util.StringHelper; *
                  • DeletionCount records the number of deleted documents in this segment.
                  • *
                  • SegCodec is the {@link Codec#getName() name} of the Codec that encoded * this segment.
                  • + *
                  • HasSegID is nonzero if the segment has an identifier. Otherwise, when it is 0 + * the identifier is {@code null} and no SegID is written. Null only happens for Lucene + * 4.x segments referenced in commits.
                  • + *
                  • SegID is the identifier of the Codec that encoded this segment.
                  • *
                  • CommitUserData stores an optional user-supplied opaque * Map<String,String> that was passed to * {@link IndexWriter#setCommitData(java.util.Map)}.
                  • @@ -155,8 +161,8 @@ public final class SegmentInfos implements Cloneable, Iterable= VERSION_50) { + id = new byte[StringHelper.ID_LENGTH]; + input.readBytes(id, 0, id.length); + CodecUtil.checkIndexHeaderSuffix(input, Long.toString(generation, Character.MAX_RADIX)); + } + + SegmentInfos infos = new SegmentInfos(); + infos.id = id; + infos.generation = generation; + infos.lastGeneration = generation; + infos.version = input.readLong(); + infos.counter = input.readInt(); int numSegments = input.readInt(); if (numSegments < 0) { throw new CorruptIndexException("invalid segment count: " + numSegments, input); } for (int seg = 0; seg < numSegments; seg++) { String segName = input.readString(); + final byte segmentID[]; + if (format >= VERSION_50) { + byte hasID = input.readByte(); + if (hasID == 1) { + segmentID = new byte[StringHelper.ID_LENGTH]; + input.readBytes(segmentID, 0, segmentID.length); + } else if (hasID == 0) { + segmentID = null; // 4.x segment, doesn't have an ID + } else { + throw new CorruptIndexException("invalid hasID byte, got: " + hasID, input); + } + } else { + segmentID = null; + } Codec codec = Codec.forName(input.readString()); - //System.out.println("SIS.read seg=" + seg + " codec=" + codec); - SegmentInfo info = codec.segmentInfoFormat().read(directory, segName, IOContext.READ); + SegmentInfo info = codec.segmentInfoFormat().read(directory, segName, segmentID, IOContext.READ); info.setCodec(codec); long delGen = input.readLong(); int delCount = input.readInt(); @@ -358,13 +374,9 @@ public final class SegmentInfos implements Cloneable, Iterable= VERSION_50) { - id = new byte[StringHelper.ID_LENGTH]; - input.readBytes(id, 0, id.length); + infos.add(siPerCommit); } + infos.userData = input.readStringStringMap(); if (format >= VERSION_48) { CodecUtil.checkFooter(input); @@ -378,30 +390,17 @@ public final class SegmentInfos implements Cloneable, Iterable(directory) { @Override - protected Object doBody(String segmentFileName) throws IOException { - read(directory, segmentFileName); - return null; + protected SegmentInfos doBody(String segmentFileName) throws IOException { + return readCommit(directory, segmentFileName); } }.run(); } @@ -412,27 +411,38 @@ public final class SegmentInfos implements Cloneable, Iterable(directory) { @Override - protected Object doBody(String segmentFileName) throws IOException { - SegmentInfos sis = new SegmentInfos(); - sis.read(directory, segmentFileName); + protected DirectoryReader doBody(String segmentFileName) throws IOException { + SegmentInfos sis = SegmentInfos.readCommit(directory, segmentFileName); final SegmentReader[] readers = new SegmentReader[sis.size()]; for (int i = sis.size()-1; i >= 0; i--) { boolean success = false; @@ -309,11 +308,10 @@ final class StandardDirectoryReader extends DirectoryReader { } private DirectoryReader doOpenFromCommit(IndexCommit commit) throws IOException { - return (DirectoryReader) new SegmentInfos.FindSegmentsFile(directory) { + return new SegmentInfos.FindSegmentsFile(directory) { @Override - protected Object doBody(String segmentFileName) throws IOException { - final SegmentInfos infos = new SegmentInfos(); - infos.read(directory, segmentFileName); + protected DirectoryReader doBody(String segmentFileName) throws IOException { + final SegmentInfos infos = SegmentInfos.readCommit(directory, segmentFileName); return doOpenIfChanged(infos); } }.run(commit); @@ -338,8 +336,7 @@ final class StandardDirectoryReader extends DirectoryReader { // IndexWriter.prepareCommit has been 
called (but not // yet commit), then the reader will still see itself as // current: - SegmentInfos sis = new SegmentInfos(); - sis.read(directory); + SegmentInfos sis = SegmentInfos.readLatestCommit(directory); // we loaded SegmentInfos from the directory return sis.getVersion() == segmentInfos.getVersion(); diff --git a/lucene/core/src/java/org/apache/lucene/index/Terms.java b/lucene/core/src/java/org/apache/lucene/index/Terms.java index 419a6f70568..9169bd893fa 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Terms.java +++ b/lucene/core/src/java/org/apache/lucene/index/Terms.java @@ -193,4 +193,18 @@ public abstract class Terms { scratch.grow(scratch.length()); } } + + /** + * Expert: returns additional information about this Terms instance + * for debugging purposes. + */ + public Object getStats() throws IOException { + StringBuilder sb = new StringBuilder(); + sb.append("impl=" + getClass().getSimpleName()); + sb.append(",size=" + size()); + sb.append(",docCount=" + getDocCount()); + sb.append(",sumTotalTermFreq=" + getSumTotalTermFreq()); + sb.append(",sumDocFreq=" + getSumDocFreq()); + return sb.toString(); + } } diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat index 8cc6f70d0f4..c5d32072081 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat +org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 95e92675165..f7390e28cff 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat +org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat.java similarity index 95% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat.java rename to lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat.java index f83d135f839..3898cde5995 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.lucene41; +package org.apache.lucene.codecs.lucene50; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -34,7 +34,7 @@ import org.apache.lucene.util.TestUtil; * Tests BlockPostingsFormat */ public class TestBlockPostingsFormat extends BasePostingsFormatTestCase { - private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene41PostingsFormat()); + private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene50PostingsFormat()); @Override protected Codec getCodec() { @@ -57,7 +57,7 @@ public class TestBlockPostingsFormat extends BasePostingsFormatTestCase { assertEquals(1, r.leaves().size()); FieldReader field = (FieldReader) r.leaves().get(0).reader().fields().terms("field"); // We should see exactly two blocks: one root block (prefix empty string) and one block for z* terms (prefix z): - Stats stats = field.computeStats(); + Stats stats = field.getStats(); assertEquals(0, stats.floorBlockCount); assertEquals(2, stats.nonFloorBlockCount); r.close(); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat2.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat2.java similarity index 91% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat2.java rename to lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat2.java index 7f7b2cea9be..ac29c25f8d3 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat2.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat2.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.lucene41; +package org.apache.lucene.codecs.lucene50; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -45,7 +45,7 @@ public class TestBlockPostingsFormat2 extends LuceneTestCase { super.setUp(); dir = newFSDirectory(createTempDir("testDFBlockSize")); IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene41PostingsFormat())); + iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene50PostingsFormat())); iw = new RandomIndexWriter(random(), dir, iwc); iw.setDoRandomForceMerge(false); // we will ourselves } @@ -55,7 +55,7 @@ public class TestBlockPostingsFormat2 extends LuceneTestCase { iw.close(); TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene41PostingsFormat())); + iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene50PostingsFormat())); iwc.setOpenMode(OpenMode.APPEND); IndexWriter iw = new IndexWriter(dir, iwc); 
iw.forceMerge(1); @@ -82,7 +82,7 @@ public class TestBlockPostingsFormat2 extends LuceneTestCase { /** tests terms with df = blocksize */ public void testDFBlockSize() throws Exception { Document doc = newDocument(); - for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE; i++) { + for (int i = 0; i < Lucene50PostingsFormat.BLOCK_SIZE; i++) { for (Field f : doc.getFields()) { f.setStringValue(f.name() + " " + f.name() + "_2"); } @@ -93,7 +93,7 @@ public class TestBlockPostingsFormat2 extends LuceneTestCase { /** tests terms with df % blocksize = 0 */ public void testDFBlockSizeMultiple() throws Exception { Document doc = newDocument(); - for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE * 16; i++) { + for (int i = 0; i < Lucene50PostingsFormat.BLOCK_SIZE * 16; i++) { for (Field f : doc.getFields()) { f.setStringValue(f.name() + " " + f.name() + "_2"); } @@ -104,7 +104,7 @@ public class TestBlockPostingsFormat2 extends LuceneTestCase { /** tests terms with ttf = blocksize */ public void testTTFBlockSize() throws Exception { Document doc = newDocument(); - for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE/2; i++) { + for (int i = 0; i < Lucene50PostingsFormat.BLOCK_SIZE/2; i++) { for (Field f : doc.getFields()) { f.setStringValue(f.name() + " " + f.name() + " " + f.name() + "_2 " + f.name() + "_2"); } @@ -115,7 +115,7 @@ public class TestBlockPostingsFormat2 extends LuceneTestCase { /** tests terms with ttf % blocksize = 0 */ public void testTTFBlockSizeMultiple() throws Exception { Document doc = newDocument(); - for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE/2; i++) { + for (int i = 0; i < Lucene50PostingsFormat.BLOCK_SIZE/2; i++) { for (Field f : doc.getFields()) { String proto = (f.name() + " " + f.name() + " " + f.name() + " " + f.name() + " " + f.name() + "_2 " + f.name() + "_2 " + f.name() + "_2 " + f.name() + "_2"); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat3.java similarity index 98% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java rename to lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat3.java index 4f45e6cdf13..1e209138dc1 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat3.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.lucene41; +package org.apache.lucene.codecs.lucene50; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -61,7 +61,7 @@ import org.apache.lucene.util.automaton.RegExp; * Tests partial enumeration (only pulling a subset of the indexed data) */ public class TestBlockPostingsFormat3 extends LuceneTestCase { - static final int MAXDOC = Lucene41PostingsFormat.BLOCK_SIZE * 20; + static final int MAXDOC = Lucene50PostingsFormat.BLOCK_SIZE * 20; // creates 8 fields with different options and does "duels" of fields against each other public void test() throws Exception { @@ -82,7 +82,7 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase { } }; IndexWriterConfig iwc = newIndexWriterConfig(analyzer); - iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene41PostingsFormat())); + iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene50PostingsFormat())); // TODO we could actually add more fields implemented with different PFs // or, just put this test into the usual rotation? 
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); @@ -137,7 +137,7 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase { verify(dir); TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge iwc = newIndexWriterConfig(analyzer); - iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene41PostingsFormat())); + iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene50PostingsFormat())); iwc.setOpenMode(OpenMode.APPEND); IndexWriter iw2 = new IndexWriter(dir, iwc); iw2.forceMerge(1); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestForUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestForUtil.java similarity index 92% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestForUtil.java rename to lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestForUtil.java index 3831033a6ea..716b8b8e683 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestForUtil.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestForUtil.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.lucene41; +package org.apache.lucene.codecs.lucene50; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,9 +17,9 @@ package org.apache.lucene.codecs.lucene41; * limitations under the License. */ -import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE; -import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_DATA_SIZE; -import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_ENCODED_SIZE; +import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene50.ForUtil.MAX_DATA_SIZE; +import static org.apache.lucene.codecs.lucene50.ForUtil.MAX_ENCODED_SIZE; import java.io.IOException; import java.util.Arrays; @@ -89,6 +89,8 @@ public class TestForUtil extends LuceneTestCase { assertEquals(endPointer, in.getFilePointer()); in.close(); } + + d.close(); } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java new file mode 100644 index 00000000000..3468f06bdc7 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java @@ -0,0 +1,273 @@ +package org.apache.lucene.codecs.lucene50; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.asserting.AssertingCodec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.SerialMergeScheduler; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.TestUtil; + +/** + * Tests Lucene50DocValuesFormat + */ +public class TestLucene50DocValuesFormat extends BaseCompressingDocValuesFormatTestCase { + private final Codec codec = TestUtil.alwaysDocValuesFormat(new Lucene50DocValuesFormat()); + + @Override + protected Codec getCodec() { + return codec; + } + + // TODO: these big methods can easily blow up some of the other ram-hungry codecs... + // for now just keep them here, as we want to test this for this format. + + public void testSortedSetVariableLengthBigVsStoredFields() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16); + } + } + + @Nightly + public void testSortedSetVariableLengthManyVsStoredFields() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16); + } + } + + public void testSortedVariableLengthBigVsStoredFields() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestSortedVsStoredFields(atLeast(300), 1, 32766); + } + } + + @Nightly + public void testSortedVariableLengthManyVsStoredFields() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestSortedVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500); + } + } + + public void testTermsEnumFixedWidth() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 10, 10); + } + } + + public void testTermsEnumVariableWidth() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 1, 500); + } + } + + @Nightly + public void testTermsEnumRandomMany() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), 1, 500); + } + } + + // TODO: try to refactor this and some termsenum tests into the base class. + // to do this we need to fix the test class to get a DVF not a Codec so we can setup + // the postings format correctly. 
+ private void doTestTermsEnumRandom(int numDocs, int minLength, int maxLength) throws Exception { + Directory dir = newFSDirectory(createTempDir()); + IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); + conf.setMergeScheduler(new SerialMergeScheduler()); + // set to duel against a codec which has ordinals: + final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random()); + final DocValuesFormat dv = new Lucene50DocValuesFormat(); + conf.setCodec(new AssertingCodec() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return pf; + } + + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return dv; + } + }); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); + + // index some docs + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + Field idField = new StringField("id", Integer.toString(i), Field.Store.NO); + doc.add(idField); + final int length = TestUtil.nextInt(random(), minLength, maxLength); + int numValues = random().nextInt(17); + // create a random list of strings + List values = new ArrayList<>(); + for (int v = 0; v < numValues; v++) { + values.add(TestUtil.randomSimpleString(random(), minLength, length)); + } + + // add in any order to the indexed field + ArrayList unordered = new ArrayList<>(values); + Collections.shuffle(unordered, random()); + for (String v : values) { + doc.add(newStringField("indexed", v, Field.Store.NO)); + } + + // add in any order to the dv field + ArrayList unordered2 = new ArrayList<>(values); + Collections.shuffle(unordered2, random()); + for (String v : unordered2) { + doc.add(new SortedSetDocValuesField("dv", new BytesRef(v))); + } + + writer.addDocument(doc); + if (random().nextInt(31) == 0) { + writer.commit(); + } + } + + // delete some docs + int numDeletions = random().nextInt(numDocs/10); + for (int i = 0; i < numDeletions; i++) { + int id = random().nextInt(numDocs); + writer.deleteDocuments(new Term("id", Integer.toString(id))); + } + + // compare per-segment + DirectoryReader ir = writer.getReader(); + for (LeafReaderContext context : ir.leaves()) { + LeafReader r = context.reader(); + Terms terms = r.terms("indexed"); + if (terms != null) { + assertEquals(terms.size(), r.getSortedSetDocValues("dv").getValueCount()); + TermsEnum expected = terms.iterator(null); + TermsEnum actual = r.getSortedSetDocValues("dv").termsEnum(); + assertEquals(terms.size(), expected, actual); + } + } + ir.close(); + + writer.forceMerge(1); + + // now compare again after the merge + ir = writer.getReader(); + LeafReader ar = getOnlySegmentReader(ir); + Terms terms = ar.terms("indexed"); + if (terms != null) { + assertEquals(terms.size(), ar.getSortedSetDocValues("dv").getValueCount()); + TermsEnum expected = terms.iterator(null); + TermsEnum actual = ar.getSortedSetDocValues("dv").termsEnum(); + assertEquals(terms.size(), expected, actual); + } + ir.close(); + + writer.close(); + dir.close(); + } + + private void assertEquals(long numOrds, TermsEnum expected, TermsEnum actual) throws Exception { + BytesRef ref; + + // sequential next() through all terms + while ((ref = expected.next()) != null) { + assertEquals(ref, actual.next()); + assertEquals(expected.ord(), actual.ord()); + assertEquals(expected.term(), actual.term()); + } + assertNull(actual.next()); + + // sequential seekExact(ord) through all terms + for (long i = 0; i < numOrds; i++) { + expected.seekExact(i); + actual.seekExact(i); + assertEquals(expected.ord(), 
actual.ord()); + assertEquals(expected.term(), actual.term()); + } + + // sequential seekExact(BytesRef) through all terms + for (long i = 0; i < numOrds; i++) { + expected.seekExact(i); + assertTrue(actual.seekExact(expected.term())); + assertEquals(expected.ord(), actual.ord()); + assertEquals(expected.term(), actual.term()); + } + + // sequential seekCeil(BytesRef) through all terms + for (long i = 0; i < numOrds; i++) { + expected.seekExact(i); + assertEquals(SeekStatus.FOUND, actual.seekCeil(expected.term())); + assertEquals(expected.ord(), actual.ord()); + assertEquals(expected.term(), actual.term()); + } + + // random seekExact(ord) + for (long i = 0; i < numOrds; i++) { + long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1); + expected.seekExact(randomOrd); + actual.seekExact(randomOrd); + assertEquals(expected.ord(), actual.ord()); + assertEquals(expected.term(), actual.term()); + } + + // random seekExact(BytesRef) + for (long i = 0; i < numOrds; i++) { + long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1); + expected.seekExact(randomOrd); + actual.seekExact(expected.term()); + assertEquals(expected.ord(), actual.ord()); + assertEquals(expected.term(), actual.term()); + } + + // random seekCeil(BytesRef) + for (long i = 0; i < numOrds; i++) { + BytesRef target = new BytesRef(TestUtil.randomUnicodeString(random())); + SeekStatus expectedStatus = expected.seekCeil(target); + assertEquals(expectedStatus, actual.seekCeil(target)); + if (expectedStatus != SeekStatus.END) { + assertEquals(expected.ord(), actual.ord()); + assertEquals(expected.term(), actual.term()); + } + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java index 0dfc22fa496..19939d9e22e 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java @@ -17,7 +17,13 @@ package org.apache.lucene.codecs.lucene50; * limitations under the License. */ +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.NormMap; import org.apache.lucene.index.BaseNormsFormatTestCase; import org.apache.lucene.util.TestUtil; @@ -30,5 +36,92 @@ public class TestLucene50NormsFormat extends BaseNormsFormatTestCase { @Override protected Codec getCodec() { return codec; - } + } + + // NormMap is rather complicated, doing domain encoding / tracking frequencies etc. + // test it directly some here... 
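// A minimal sketch of the contract the NormMap tests below exercise: ords are
// assigned first-come-first-serve, a decode table maps each ord back to its value,
// and per-ord frequencies are tracked. This is NOT the actual (package-private)
// Lucene50NormsConsumer.NormMap, which is more elaborate; SimpleNormMap, its field
// names, and its growth strategy are illustrative assumptions only.
class SimpleNormMap {
  private final java.util.Map<Long,Integer> ords = new java.util.HashMap<>(); // value -> ord
  private long[] decode = new long[16]; // ord -> value
  private int[] freqs = new int[16];    // ord -> number of add() calls for that value
  int size;                             // number of unique values seen so far

  void add(long value) {
    Integer ord = ords.get(value);
    if (ord == null) {
      ord = size;
      ords.put(value, ord);
      if (size == decode.length) {
        decode = java.util.Arrays.copyOf(decode, 2 * size);
        freqs = java.util.Arrays.copyOf(freqs, 2 * size);
      }
      decode[size++] = value;
    }
    freqs[ord]++;
  }

  int getOrd(long value)  { return ords.get(value); }
  long[] getDecodeTable() { return java.util.Arrays.copyOf(decode, size); }
  int[] getFreqs()        { return java.util.Arrays.copyOf(freqs, size); }

  int maxFreq() {
    int max = 0;
    for (int i = 0; i < size; i++) {
      max = Math.max(max, freqs[i]);
    }
    return max;
  }
}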
+ + public void testNormMapSimple() { + NormMap map = new NormMap(); + map.add(10); + map.add(5); + map.add(4); + map.add(10); + assertEquals(3, map.size); + + // first come, first serve ord assignment + + // encode + assertEquals(0, map.getOrd(10)); + assertEquals(1, map.getOrd(5)); + assertEquals(2, map.getOrd(4)); + + // decode + long decode[] = map.getDecodeTable(); + assertEquals(10, decode[0]); + assertEquals(5, decode[1]); + assertEquals(4, decode[2]); + + // freqs + int freqs[] = map.getFreqs(); + assertEquals(2, freqs[0]); + assertEquals(1, freqs[1]); + assertEquals(1, freqs[2]); + + assertEquals(2, map.maxFreq()); + } + + public void testNormMapRandom() { + Map freqs = new HashMap<>(); + Map ords = new HashMap<>(); + + Set uniqueValuesSet = new HashSet<>(); + int numUniqValues = TestUtil.nextInt(random(), 1, 256); + for (int i = 0; i < numUniqValues; i++) { + if (random().nextBoolean()) { + uniqueValuesSet.add(TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE)); + } else { + uniqueValuesSet.add(TestUtil.nextLong(random(), Byte.MIN_VALUE, Byte.MAX_VALUE)); + } + } + + Long uniqueValues[] = uniqueValuesSet.toArray(new Long[uniqueValuesSet.size()]); + + NormMap map = new NormMap(); + int numdocs = TestUtil.nextInt(random(), 1, 100000); + for (int i = 0; i < numdocs; i++) { + long value = uniqueValues[random().nextInt(uniqueValues.length)]; + // now add to both expected and actual + map.add(value); + + Integer ord = ords.get(value); + if (ord == null) { + ord = ords.size(); + ords.put(value, ord); + freqs.put(value, 1); + } else { + freqs.put(value, freqs.get(value)+1); + } + } + + // value -> ord + assertEquals(ords.size(), map.size); + for (Map.Entry kv : ords.entrySet()) { + assertEquals(kv.getValue().intValue(), map.getOrd(kv.getKey())); + } + + // ord -> value + Map reversed = new HashMap<>(); + long table[] = map.getDecodeTable(); + for (int i = 0; i < map.size; i++) { + reversed.put(table[i], i); + } + assertEquals(ords, reversed); + + // freqs + int freqTable[] = map.getFreqs(); + for (int i = 0; i < map.size; i++) { + assertEquals(freqs.get(table[i]).longValue(), freqTable[i]); + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java index 86c7c7e879f..3b19087755a 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java @@ -22,7 +22,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.asserting.AssertingCodec; -import org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapFixedInterval; +import org.apache.lucene.codecs.blockterms.LuceneVarGapFixedInterval; import org.apache.lucene.codecs.memory.MemoryPostingsFormat; import org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat; import org.apache.lucene.document.Document; @@ -287,9 +287,9 @@ public class TestPerFieldPostingsFormat2 extends LuceneTestCase { @Override public PostingsFormat getPostingsFormatForField(String field) { if ("id".equals(field)) { - return new Lucene41VarGapFixedInterval(1); + return new LuceneVarGapFixedInterval(1); } else if ("date".equals(field)) { - return new Lucene41VarGapFixedInterval(2); + return new LuceneVarGapFixedInterval(2); } else { return super.getPostingsFormatForField(field); } 
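The index hunks that follow repeatedly replace the removed two-step read (construct a SegmentInfos, then call its instance read method) with the new static readers. A rough before/after sketch of that migration, where dir is an open Directory and segmentsFileName is a placeholder for a concrete segments_N name:

  // before: instance method, two steps
  //   SegmentInfos sis = new SegmentInfos();
  //   sis.read(dir);                    // latest commit
  //   sis.read(dir, segmentsFileName);  // a specific commit
  // after: static factory methods, one step each
  SegmentInfos latest = SegmentInfos.readLatestCommit(dir);
  SegmentInfos commit = SegmentInfos.readCommit(dir, segmentsFileName);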
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index 2c1be93915a..e4bdd17833d 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1116,8 +1116,7 @@ public class TestAddIndexes extends LuceneTestCase { w3.close(); // we should now see segments_X, // _Y.cfs,_Y.cfe, _Z.si - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals("Only one compound segment should exist", 1, sis.size()); assertTrue(sis.info(0).info.getUseCompoundFile()); dir.close(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveChecksumFooter.java b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveChecksumFooter.java index f14ca9536d9..66eb343f7cf 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveChecksumFooter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveChecksumFooter.java @@ -26,7 +26,6 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.MockDirectoryWrapper; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; @@ -36,10 +35,6 @@ import org.apache.lucene.util.TestUtil; public class TestAllFilesHaveChecksumFooter extends LuceneTestCase { public void test() throws Exception { Directory dir = newDirectory(); - if (dir instanceof MockDirectoryWrapper) { - // Else we might remove .cfe but not the corresponding .cfs, causing false exc when trying to verify headers: - ((MockDirectoryWrapper) dir).setEnableVirusScanner(false); - } IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); conf.setCodec(TestUtil.getDefaultCodec()); RandomIndexWriter riw = new RandomIndexWriter(random(), dir, conf); @@ -68,8 +63,7 @@ public class TestAllFilesHaveChecksumFooter extends LuceneTestCase { } private void checkFooters(Directory dir) throws IOException { - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); checkFooter(dir, sis.getSegmentsFileName()); for (SegmentCommitInfo si : sis) { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java index 162b74aff77..c2b515db718 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java @@ -30,22 +30,16 @@ import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.TextField; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.MockDirectoryWrapper; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; /** - * Test that a plain default puts codec headers in all files. 
+ * Test that a plain default puts codec headers in all files */ public class TestAllFilesHaveCodecHeader extends LuceneTestCase { public void test() throws Exception { Directory dir = newDirectory(); - if (dir instanceof MockDirectoryWrapper) { - // Else we might remove .cfe but not the corresponding .cfs, causing false exc when trying to verify headers: - ((MockDirectoryWrapper) dir).setEnableVirusScanner(false); - } - IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); conf.setCodec(TestUtil.getDefaultCodec()); RandomIndexWriter riw = new RandomIndexWriter(random(), dir, conf); @@ -70,10 +64,12 @@ public class TestAllFilesHaveCodecHeader extends LuceneTestCase { if (random().nextInt(7) == 0) { riw.commit(); } - // TODO: we should make a new format with a clean header... - // if (random().nextInt(20) == 0) { - // riw.deleteDocuments(new Term("id", Integer.toString(i))); - // } + if (random().nextInt(20) == 0) { + riw.deleteDocuments(new Term("id", Integer.toString(i))); + } + if (random().nextInt(15) == 0) { + riw.updateNumericDocValue(new Term("id"), "dv", Long.valueOf(i)); + } } riw.close(); checkHeaders(dir, new HashMap()); @@ -81,25 +77,25 @@ public class TestAllFilesHaveCodecHeader extends LuceneTestCase { } private void checkHeaders(Directory dir, Map namesToExtensions) throws IOException { - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); - checkHeader(dir, sis.getSegmentsFileName(), namesToExtensions); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); + checkHeader(dir, sis.getSegmentsFileName(), namesToExtensions, sis.getId()); for (SegmentCommitInfo si : sis) { + assertNotNull(si.info.getId()); for (String file : si.files()) { - checkHeader(dir, file, namesToExtensions); + checkHeader(dir, file, namesToExtensions, si.info.getId()); } if (si.info.getUseCompoundFile()) { try (Directory cfsDir = si.info.getCodec().compoundFormat().getCompoundReader(dir, si.info, newIOContext(random()))) { for (String cfsFile : cfsDir.listAll()) { - checkHeader(cfsDir, cfsFile, namesToExtensions); + checkHeader(cfsDir, cfsFile, namesToExtensions, si.info.getId()); } } } } } - private void checkHeader(Directory dir, String file, Map namesToExtensions) throws IOException { + private void checkHeader(Directory dir, String file, Map namesToExtensions, byte[] id) throws IOException { try (IndexInput in = dir.openInput(file, newIOContext(random()))) { int val = in.readInt(); assertEquals(file + " has no codec header, instead found: " + val, CodecUtil.CODEC_MAGIC, val); @@ -114,6 +110,10 @@ public class TestAllFilesHaveCodecHeader extends LuceneTestCase { if (previous != null && !previous.equals(extension)) { fail("extensions " + previous + " and " + extension + " share same codecName " + codecName); } + // read version + in.readInt(); + // read object id + CodecUtil.checkIndexHeaderID(in, id); } } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCodecUtil.java b/lucene/core/src/test/org/apache/lucene/index/TestCodecUtil.java index 4e273454313..ce1eb07e1eb 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestCodecUtil.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestCodecUtil.java @@ -199,12 +199,12 @@ public class TestCodecUtil extends LuceneTestCase { public void testSegmentHeaderLength() throws Exception { RAMFile file = new RAMFile(); IndexOutput output = new RAMOutputStream(file, true); - CodecUtil.writeSegmentHeader(output, "FooBar", 5, StringHelper.randomId(), "xyz"); + CodecUtil.writeIndexHeader(output, "FooBar", 5, 
StringHelper.randomId(), "xyz"); output.writeString("this is the data"); output.close(); IndexInput input = new RAMInputStream("file", file); - input.seek(CodecUtil.segmentHeaderLength("FooBar", "xyz")); + input.seek(CodecUtil.indexHeaderLength("FooBar", "xyz")); assertEquals("this is the data", input.readString()); input.close(); } @@ -217,7 +217,7 @@ public class TestCodecUtil extends LuceneTestCase { RAMFile file = new RAMFile(); IndexOutput output = new RAMOutputStream(file, true); try { - CodecUtil.writeSegmentHeader(output, "foobar", 5, StringHelper.randomId(), tooLong.toString()); + CodecUtil.writeIndexHeader(output, "foobar", 5, StringHelper.randomId(), tooLong.toString()); fail("didn't get expected exception"); } catch (IllegalArgumentException expected) { // expected @@ -232,13 +232,13 @@ public class TestCodecUtil extends LuceneTestCase { RAMFile file = new RAMFile(); IndexOutput output = new RAMOutputStream(file, true); byte[] id = StringHelper.randomId(); - CodecUtil.writeSegmentHeader(output, "foobar", 5, id, justLongEnough.toString()); + CodecUtil.writeIndexHeader(output, "foobar", 5, id, justLongEnough.toString()); output.close(); IndexInput input = new RAMInputStream("file", file); - CodecUtil.checkSegmentHeader(input, "foobar", 5, 5, id, justLongEnough.toString()); + CodecUtil.checkIndexHeader(input, "foobar", 5, 5, id, justLongEnough.toString()); assertEquals(input.getFilePointer(), input.length()); - assertEquals(input.getFilePointer(), CodecUtil.segmentHeaderLength("foobar", justLongEnough.toString())); + assertEquals(input.getFilePointer(), CodecUtil.indexHeaderLength("foobar", justLongEnough.toString())); input.close(); } @@ -246,7 +246,7 @@ public class TestCodecUtil extends LuceneTestCase { RAMFile file = new RAMFile(); IndexOutput output = new RAMOutputStream(file, true); try { - CodecUtil.writeSegmentHeader(output, "foobar", 5, StringHelper.randomId(), "\u1234"); + CodecUtil.writeIndexHeader(output, "foobar", 5, StringHelper.randomId(), "\u1234"); fail("didn't get expected exception"); } catch (IllegalArgumentException expected) { // expected diff --git a/lucene/core/src/test/org/apache/lucene/index/TestConsistentFieldNumbers.java b/lucene/core/src/test/org/apache/lucene/index/TestConsistentFieldNumbers.java index 06f10b7e348..dd4e3bb980f 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestConsistentFieldNumbers.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestConsistentFieldNumbers.java @@ -64,8 +64,7 @@ public class TestConsistentFieldNumbers extends LuceneTestCase { writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(2, sis.size()); FieldInfos fis1 = IndexWriter.readFieldInfos(sis.info(0)); @@ -82,8 +81,7 @@ public class TestConsistentFieldNumbers extends LuceneTestCase { writer.forceMerge(1); writer.close(); - sis = new SegmentInfos(); - sis.read(dir); + sis = SegmentInfos.readLatestCommit(dir); assertEquals(1, sis.size()); FieldInfos fis3 = IndexWriter.readFieldInfos(sis.info(0)); @@ -130,8 +128,7 @@ public class TestConsistentFieldNumbers extends LuceneTestCase { writer.addIndexes(dir2); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir1); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir1); assertEquals(2, sis.size()); FieldInfos fis1 = IndexWriter.readFieldInfos(sis.info(0)); @@ -161,8 +158,7 @@ public class TestConsistentFieldNumbers extends LuceneTestCase { d.add(new TextField("f2", "d1 second field", 
Field.Store.YES)); writer.addDocument(d); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(1, sis.size()); FieldInfos fis1 = IndexWriter.readFieldInfos(sis.info(0)); assertEquals("f1", fis1.fieldInfo(0).name); @@ -178,8 +174,7 @@ public class TestConsistentFieldNumbers extends LuceneTestCase { d.add(new StoredField("f3", new byte[] { 1, 2, 3 })); writer.addDocument(d); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(2, sis.size()); FieldInfos fis1 = IndexWriter.readFieldInfos(sis.info(0)); FieldInfos fis2 = IndexWriter.readFieldInfos(sis.info(1)); @@ -199,8 +194,7 @@ public class TestConsistentFieldNumbers extends LuceneTestCase { d.add(new StoredField("f3", new byte[] { 1, 2, 3, 4, 5 })); writer.addDocument(d); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(3, sis.size()); FieldInfos fis1 = IndexWriter.readFieldInfos(sis.info(0)); FieldInfos fis2 = IndexWriter.readFieldInfos(sis.info(1)); @@ -231,8 +225,7 @@ public class TestConsistentFieldNumbers extends LuceneTestCase { writer.forceMerge(1); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(1, sis.size()); FieldInfos fis1 = IndexWriter.readFieldInfos(sis.info(0)); assertEquals("f1", fis1.fieldInfo(0).name); @@ -269,8 +262,7 @@ public class TestConsistentFieldNumbers extends LuceneTestCase { writer.forceMerge(1); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); for (SegmentCommitInfo si : sis) { FieldInfos fis = IndexWriter.readFieldInfos(si); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java index 6bc4a94d549..0a20e5f32a1 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java @@ -285,8 +285,7 @@ public class TestDeletionPolicy extends LuceneTestCase { // if we are on a filesystem that seems to have only // 1 second resolution, allow +1 second in commit // age tolerance: - SegmentInfos sis = new SegmentInfos(); - sis.read(dir, fileName); + SegmentInfos sis = SegmentInfos.readCommit(dir, fileName); long modTime = Long.parseLong(sis.getUserData().get("commitTime")); oneSecondResolution &= (modTime % 1000) == 0; final long leeway = (long) ((SECONDS + (oneSecondResolution ? 
1.0:0.0))*1000); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDirectoryReader.java b/lucene/core/src/test/org/apache/lucene/index/TestDirectoryReader.java index 90cc11e9b93..c989ecd0254 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDirectoryReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDirectoryReader.java @@ -664,8 +664,7 @@ public void testFilesOpenClose() throws IOException { addDocumentWithFields(writer); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(d); + SegmentInfos sis = SegmentInfos.readLatestCommit(d); DirectoryReader r = DirectoryReader.open(d); IndexCommit c = r.getIndexCommit(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFilterLeafReader.java b/lucene/core/src/test/org/apache/lucene/index/TestFilterLeafReader.java index cbd9c1c5369..d12c8491636 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestFilterLeafReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestFilterLeafReader.java @@ -166,7 +166,7 @@ public class TestFilterLeafReader extends LuceneTestCase { for (Method m : superClazz.getMethods()) { final int mods = m.getModifiers(); if (Modifier.isStatic(mods) || Modifier.isAbstract(mods) || Modifier.isFinal(mods) || m.isSynthetic() - || m.getName().equals("attributes")) { + || m.getName().equals("attributes") || m.getName().equals("getStats")) { continue; } // The point of these checks is to ensure that methods that have a default diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexFileDeleter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexFileDeleter.java index ab8f3f6cbb7..9717e224612 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexFileDeleter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexFileDeleter.java @@ -85,8 +85,7 @@ public class TestIndexFileDeleter extends LuceneTestCase { writer.close(); // read in index to try to not depend on codec-specific filenames so much - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); SegmentInfo si0 = sis.info(0).info; SegmentInfo si1 = sis.info(1).info; SegmentInfo si3 = sis.info(3).info; @@ -123,10 +122,6 @@ public class TestIndexFileDeleter extends LuceneTestCase { // Create a bogus fnm file when the CFS already exists: copyFile(dir, cfsFiles0[0], "_0.fnm"); - - // Create some old segments file: - copyFile(dir, "segments_2", "segments"); - copyFile(dir, "segments_2", "segments_1"); // Create a bogus cfs file shadowing a non-cfs segment: @@ -143,8 +138,7 @@ public class TestIndexFileDeleter extends LuceneTestCase { String[] filesPre = dir.listAll(); - // Open & close a writer: it should delete the above 4 - // files and nothing more: + // Open & close a writer: it should delete the above files and nothing more: writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())) .setOpenMode(OpenMode.APPEND)); writer.close(); @@ -265,8 +259,7 @@ public class TestIndexFileDeleter extends LuceneTestCase { // empty commit new IndexWriter(dir, new IndexWriterConfig(null)).close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(1, sis.getGeneration()); // no inflation @@ -283,8 +276,7 @@ public class TestIndexFileDeleter extends LuceneTestCase { // empty commit new IndexWriter(dir, new IndexWriterConfig(null)).close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = 
SegmentInfos.readLatestCommit(dir); assertEquals(1, sis.getGeneration()); // add trash commit @@ -308,8 +300,7 @@ public class TestIndexFileDeleter extends LuceneTestCase { // empty commit new IndexWriter(dir, new IndexWriterConfig(null)).close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(0, sis.counter); // no inflation @@ -333,8 +324,7 @@ public class TestIndexFileDeleter extends LuceneTestCase { iw.addDocument(new Document()); iw.commit(); iw.close(); - sis = new SegmentInfos(); - sis.read(dir); + sis = SegmentInfos.readLatestCommit(dir); assertEquals("_4", sis.info(0).info.name); assertEquals(5, sis.counter); @@ -351,8 +341,7 @@ public class TestIndexFileDeleter extends LuceneTestCase { iw.close(); // no deletes: start at 1 - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(1, sis.info(0).getNextDelGen()); // no inflation @@ -376,8 +365,7 @@ public class TestIndexFileDeleter extends LuceneTestCase { // empty commit new IndexWriter(dir, new IndexWriterConfig(null)).close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(1, sis.getGeneration()); // add trash file @@ -400,8 +388,7 @@ public class TestIndexFileDeleter extends LuceneTestCase { iw.close(); // no deletes: start at 1 - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(1, sis.info(0).getNextDelGen()); // add trash file diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index a7b0536dc42..3581dca4599 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -575,8 +575,7 @@ public class TestIndexWriter extends LuceneTestCase { writer.addDocument(doc); writer.flush(false, true); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); // Since we flushed w/o allowing merging we should now // have 10 segments assertEquals(10, sis.size()); @@ -2768,8 +2767,7 @@ public class TestIndexWriter extends LuceneTestCase { w.addDocument(new Document()); w.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(d); + SegmentInfos sis = SegmentInfos.readLatestCommit(d); byte[] id1 = sis.getId(); assertNotNull(id1); assertEquals(StringHelper.ID_LENGTH, id1.length); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java index 1714f12e027..779fe430c7a 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java @@ -1236,8 +1236,7 @@ public class TestIndexWriterExceptions extends LuceneTestCase { assertTrue("segment generation should be > 0 but got " + gen, gen > 0); boolean corrupted = false; - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); for (SegmentCommitInfo si : sis) { assertTrue(si.info.getUseCompoundFile()); String cfsFiles[] = si.info.getCodec().compoundFormat().files(si.info); @@ -1314,8 +1313,7 @@ public class TestIndexWriterExceptions extends LuceneTestCase { w.close(); 
IndexReader reader = DirectoryReader.open(dir); assertTrue(reader.numDocs() > 0); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); for(LeafReaderContext context : reader.leaves()) { assertFalse(context.reader().getFieldInfos().hasVectors()); } @@ -1682,7 +1680,7 @@ public class TestIndexWriterExceptions extends LuceneTestCase { if (doFail && name.startsWith("segments_")) { StackTraceElement[] trace = new Exception().getStackTrace(); for (int i = 0; i < trace.length; i++) { - if ("read".equals(trace[i].getMethodName())) { + if ("readCommit".equals(trace[i].getMethodName()) || "readLatestCommit".equals(trace[i].getMethodName())) { throw new UnsupportedOperationException("expected UOE"); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterForceMerge.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterForceMerge.java index 30997fe40de..fcbc461ef08 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterForceMerge.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterForceMerge.java @@ -48,8 +48,7 @@ public class TestIndexWriterForceMerge extends LuceneTestCase { writer.addDocument(doc); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); final int segCount = sis.size(); ldmp = new LogDocMergePolicy(); @@ -59,8 +58,7 @@ public class TestIndexWriterForceMerge extends LuceneTestCase { writer.forceMerge(3); writer.close(); - sis = new SegmentInfos(); - sis.read(dir); + sis = SegmentInfos.readLatestCommit(dir); final int optSegCount = sis.size(); if (segCount < 3) @@ -93,16 +91,14 @@ public class TestIndexWriterForceMerge extends LuceneTestCase { writer.waitForMerges(); writer.commit(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); final int segCount = sis.size(); writer.forceMerge(7); writer.commit(); writer.waitForMerges(); - sis = new SegmentInfos(); - sis.read(dir); + sis = SegmentInfos.readLatestCommit(dir); final int optSegCount = sis.size(); if (segCount < 7) @@ -226,8 +222,7 @@ public class TestIndexWriterForceMerge extends LuceneTestCase { assertTrue(reader.leaves().size() > 1); reader.close(); - SegmentInfos infos = new SegmentInfos(); - infos.read(dir); + SegmentInfos infos = SegmentInfos.readLatestCommit(dir); assertEquals(2, infos.size()); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterThreadsToSegments.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterThreadsToSegments.java index 644893fafe7..c7078129b53 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterThreadsToSegments.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterThreadsToSegments.java @@ -34,8 +34,10 @@ import org.apache.lucene.document.TextField; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.TestUtil; public class TestIndexWriterThreadsToSegments extends LuceneTestCase { @@ -331,7 +333,8 @@ public class TestIndexWriterThreadsToSegments extends LuceneTestCase { String segName = IndexFileNames.parseSegmentName(fileName); if (segSeen.contains(segName) == false) { 
segSeen.add(segName); - SegmentInfo si = TestUtil.getDefaultCodec().segmentInfoFormat().read(dir, segName, IOContext.DEFAULT); + byte id[] = readSegmentInfoID(dir, fileName); + SegmentInfo si = TestUtil.getDefaultCodec().segmentInfoFormat().read(dir, segName, id, IOContext.DEFAULT); si.setCodec(codec); SegmentCommitInfo sci = new SegmentCommitInfo(si, 0, -1, -1, -1); SegmentReader sr = new SegmentReader(sci, IOContext.DEFAULT); @@ -349,4 +352,17 @@ public class TestIndexWriterThreadsToSegments extends LuceneTestCase { w.close(); dir.close(); } + + // TODO: remove this hack and fix this test to be better? + // the whole thing relies on default codec too... + byte[] readSegmentInfoID(Directory dir, String file) throws IOException { + try (IndexInput in = dir.openInput(file, IOContext.DEFAULT)) { + in.readInt(); // magic + in.readString(); // codec name + in.readInt(); // version + byte id[] = new byte[StringHelper.ID_LENGTH]; + in.readBytes(id, 0, id.length); + return id; + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java b/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java index 73849d513fa..86985d877a9 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java @@ -143,8 +143,7 @@ public class TestRollingUpdates extends LuceneTestCase { docs.close(); // LUCENE-4455: - SegmentInfos infos = new SegmentInfos(); - infos.read(dir); + SegmentInfos infos = SegmentInfos.readLatestCommit(dir); long totalBytes = 0; for(SegmentCommitInfo sipc : infos) { totalBytes += sipc.sizeInBytes(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSizeBoundedForceMerge.java b/lucene/core/src/test/org/apache/lucene/index/TestSizeBoundedForceMerge.java index 48ab0780c9e..b31c77bedac 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSizeBoundedForceMerge.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSizeBoundedForceMerge.java @@ -66,8 +66,7 @@ public class TestSizeBoundedForceMerge extends LuceneTestCase { } writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); double min = sis.info(0).sizeInBytes(); conf = newWriterConfig(); @@ -80,8 +79,7 @@ public class TestSizeBoundedForceMerge extends LuceneTestCase { writer.close(); // Should only be 3 segments in the index, because one of them exceeds the size limit - sis = new SegmentInfos(); - sis.read(dir); + sis = SegmentInfos.readLatestCommit(dir); assertEquals(3, sis.size()); } @@ -113,8 +111,7 @@ public class TestSizeBoundedForceMerge extends LuceneTestCase { writer.close(); // Should only be 3 segments in the index, because one of them exceeds the size limit - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(3, sis.size()); } @@ -140,8 +137,7 @@ public class TestSizeBoundedForceMerge extends LuceneTestCase { writer.forceMerge(1); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(2, sis.size()); } @@ -167,8 +163,7 @@ public class TestSizeBoundedForceMerge extends LuceneTestCase { writer.forceMerge(1); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(2, sis.size()); } @@ -194,8 +189,7 @@ public class TestSizeBoundedForceMerge extends 
LuceneTestCase { writer.forceMerge(1); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(1, sis.size()); } @@ -220,8 +214,7 @@ public class TestSizeBoundedForceMerge extends LuceneTestCase { writer.forceMerge(1); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(3, sis.size()); } @@ -247,8 +240,7 @@ public class TestSizeBoundedForceMerge extends LuceneTestCase { writer.forceMerge(1); writer.close(); - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(4, sis.size()); } @@ -280,8 +272,7 @@ public class TestSizeBoundedForceMerge extends LuceneTestCase { // Should only be 4 segments in the index, because of the merge factor and // max merge docs settings. - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(4, sis.size()); } @@ -309,8 +300,7 @@ public class TestSizeBoundedForceMerge extends LuceneTestCase { writer.close(); // Verify that the last segment does not have deletions. - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(3, sis.size()); assertFalse(sis.info(2).hasDeletions()); } @@ -335,8 +325,7 @@ public class TestSizeBoundedForceMerge extends LuceneTestCase { writer.close(); // Verify that the last segment does not have deletions. - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(1, sis.size()); } @@ -363,8 +352,7 @@ public class TestSizeBoundedForceMerge extends LuceneTestCase { writer.close(); // Verify that the last segment does not have deletions. - SegmentInfos sis = new SegmentInfos(); - sis.read(dir); + SegmentInfos sis = SegmentInfos.readLatestCommit(dir); assertEquals(1, sis.size()); assertTrue(sis.info(0).hasDeletions()); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java index 2ed62888e77..fc234751397 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java @@ -131,8 +131,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter { /** Reads the commit data from a Directory. 
*/ private static Map readCommitData(Directory dir) throws IOException { - SegmentInfos infos = new SegmentInfos(); - infos.read(dir); + SegmentInfos infos = SegmentInfos.readLatestCommit(dir); return infos.getUserData(); } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java index 762e5ebd4b0..dae94ad85b8 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java @@ -324,8 +324,7 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase { } private long getEpoch(Directory taxoDir) throws IOException { - SegmentInfos infos = new SegmentInfos(); - infos.read(taxoDir); + SegmentInfos infos = SegmentInfos.readLatestCommit(taxoDir); return Long.parseLong(infos.getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH)); } diff --git a/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java b/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java index 947f67c96ef..49028920348 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java +++ b/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java @@ -90,8 +90,7 @@ public class IndexSplitter { public IndexSplitter(Path dir) throws IOException { this.dir = dir; fsDir = FSDirectory.open(dir); - infos = new SegmentInfos(); - infos.read(fsDir); + infos = SegmentInfos.readLatestCommit(fsDir); } public void listSegments() throws IOException { diff --git a/lucene/misc/src/test/org/apache/lucene/index/TestIndexSplitter.java b/lucene/misc/src/test/org/apache/lucene/index/TestIndexSplitter.java index b7653701c32..bf58e537018 100644 --- a/lucene/misc/src/test/org/apache/lucene/index/TestIndexSplitter.java +++ b/lucene/misc/src/test/org/apache/lucene/index/TestIndexSplitter.java @@ -78,8 +78,7 @@ public class TestIndexSplitter extends LuceneTestCase { Path destDir2 = createTempDir(LuceneTestCase.getTestClass().getSimpleName()); IndexSplitter.main(new String[] {dir.toAbsolutePath().toString(), destDir2.toAbsolutePath().toString(), splitSegName}); Directory fsDirDest2 = newFSDirectory(destDir2); - SegmentInfos sis = new SegmentInfos(); - sis.read(fsDirDest2); + SegmentInfos sis = SegmentInfos.readLatestCommit(fsDirDest2); assertEquals(1, sis.size()); r = DirectoryReader.open(fsDirDest2); assertEquals(50, r.maxDoc()); diff --git a/lucene/misc/src/test/org/apache/lucene/uninverting/TestDocTermOrds.java b/lucene/misc/src/test/org/apache/lucene/uninverting/TestDocTermOrds.java index f4c6059d23a..74bec6295a2 100644 --- a/lucene/misc/src/test/org/apache/lucene/uninverting/TestDocTermOrds.java +++ b/lucene/misc/src/test/org/apache/lucene/uninverting/TestDocTermOrds.java @@ -27,7 +27,6 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; @@ -129,7 +128,7 @@ public class TestDocTermOrds extends LuceneTestCase { // Sometimes swap in codec that impls ord(): if (random().nextInt(10) == 7) { // Make sure terms index has ords: - Codec codec = TestUtil.alwaysPostingsFormat(PostingsFormat.forName("Lucene41WithOrds")); + Codec codec = 
TestUtil.alwaysPostingsFormat(TestUtil.getPostingsFormatWithOrds(random())); conf.setCodec(codec); } @@ -226,7 +225,7 @@ public class TestDocTermOrds extends LuceneTestCase { // Sometimes swap in codec that impls ord(): if (random().nextInt(10) == 7) { - Codec codec = TestUtil.alwaysPostingsFormat(PostingsFormat.forName("Lucene41WithOrds")); + Codec codec = TestUtil.alwaysPostingsFormat(TestUtil.getPostingsFormatWithOrds(random())); conf.setCodec(codec); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsFormat.java index e425d1945e7..2523c7fc036 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsFormat.java @@ -81,7 +81,7 @@ public class IDVersionPostingsFormat extends PostingsFormat { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new IDVersionPostingsWriter(state); + PostingsWriterBase postingsWriter = new IDVersionPostingsWriter(state.liveDocs); boolean success = false; try { FieldsConsumer ret = new VersionBlockTreeTermsWriter(state, @@ -99,7 +99,7 @@ public class IDVersionPostingsFormat extends PostingsFormat { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new IDVersionPostingsReader(state); + PostingsReaderBase postingsReader = new IDVersionPostingsReader(); boolean success = false; try { FieldsProducer ret = new VersionBlockTreeTermsReader(postingsReader, state); diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java index 8f0d0f7de22..e3d2ae0414e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java @@ -33,16 +33,11 @@ import org.apache.lucene.util.Accountable; import org.apache.lucene.util.Bits; final class IDVersionPostingsReader extends PostingsReaderBase { - final SegmentReadState state; - - public IDVersionPostingsReader(SegmentReadState state) { - this.state = state; - } @Override - public void init(IndexInput termsIn) throws IOException { + public void init(IndexInput termsIn, SegmentReadState state) throws IOException { // Make sure we are talking to the matching postings writer - CodecUtil.checkSegmentHeader(termsIn, + CodecUtil.checkIndexHeader(termsIn, IDVersionPostingsWriter.TERMS_CODEC, IDVersionPostingsWriter.VERSION_START, IDVersionPostingsWriter.VERSION_CURRENT, diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java index fd7d69bf532..6d7e4d3d7c5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java @@ -26,6 +26,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; final class 
IDVersionPostingsWriter extends PushPostingsWriterBase { @@ -43,10 +44,10 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase { private int lastPosition; private long lastVersion; - private final SegmentWriteState state; + private final Bits liveDocs; - public IDVersionPostingsWriter(SegmentWriteState state) { - this.state = state; + public IDVersionPostingsWriter(Bits liveDocs) { + this.liveDocs = liveDocs; } @Override @@ -55,8 +56,8 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase { } @Override - public void init(IndexOutput termsOut) throws IOException { - CodecUtil.writeSegmentHeader(termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException { + CodecUtil.writeIndexHeader(termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); } @Override @@ -82,7 +83,7 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase { @Override public void startDoc(int docID, int termDocFreq) throws IOException { // TODO: LUCENE-5693: we don't need this check if we fix IW to not send deleted docs to us on flush: - if (state.liveDocs != null && state.liveDocs.get(docID) == false) { + if (liveDocs != null && liveDocs.get(docID) == false) { return; } if (lastDocID != -1) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java index 49205461e59..36d2ba9012f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnum.java @@ -631,7 +631,7 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum { int cmp = 0; - // TOOD: we should write our vLong backwards (MSB + // TODO: we should write our vLong backwards (MSB // first) to get better sharing from the FST // First compare up to valid seek frames: @@ -645,7 +645,7 @@ public final class IDVersionSegmentTermsEnum extends TermsEnum { } arc = arcs[1+targetUpto]; assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); - // TOOD: we could save the outputs in local + // TODO: we could save the outputs in local // byte[][] instead of making new objs ever // seek; but, often the FST doesn't have any // shared bytes (but this could change if we diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java index f407bf6466b..93937cebd27 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java @@ -72,7 +72,7 @@ public final class VersionBlockTreeTermsReader extends FieldsProducer { IndexInput indexIn = null; try { - int termsVersion = CodecUtil.checkSegmentHeader(in, VersionBlockTreeTermsWriter.TERMS_CODEC_NAME, + int termsVersion = CodecUtil.checkIndexHeader(in, VersionBlockTreeTermsWriter.TERMS_CODEC_NAME, VersionBlockTreeTermsWriter.VERSION_START, VersionBlockTreeTermsWriter.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); @@ -81,7 +81,7 @@ public final class 
VersionBlockTreeTermsReader extends FieldsProducer { state.segmentSuffix, VersionBlockTreeTermsWriter.TERMS_INDEX_EXTENSION); indexIn = state.directory.openInput(indexFile, state.context); - int indexVersion = CodecUtil.checkSegmentHeader(indexIn, VersionBlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME, + int indexVersion = CodecUtil.checkIndexHeader(indexIn, VersionBlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME, VersionBlockTreeTermsWriter.VERSION_START, VersionBlockTreeTermsWriter.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); @@ -94,7 +94,7 @@ public final class VersionBlockTreeTermsReader extends FieldsProducer { CodecUtil.checksumEntireFile(indexIn); // Have PostingsReader init itself - postingsReader.init(in); + postingsReader.init(in, state); // NOTE: data file is too costly to verify checksum against all the bytes on open, // but for now we at least verify proper structure of the checksum footer: which looks diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java index 85cea559e8d..a055396cdb7 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java @@ -118,7 +118,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { /** Extension of terms file */ static final String TERMS_EXTENSION = "tiv"; - final static String TERMS_CODEC_NAME = "VERSION_BLOCK_TREE_TERMS_DICT"; + final static String TERMS_CODEC_NAME = "VersionBlockTreeTermsDict"; /** Initial terms format. */ public static final int VERSION_START = 1; @@ -128,7 +128,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { /** Extension of terms index file */ static final String TERMS_INDEX_EXTENSION = "tipv"; - final static String TERMS_INDEX_CODEC_NAME = "VERSION_BLOCK_TREE_TERMS_INDEX"; + final static String TERMS_INDEX_CODEC_NAME = "VersionBlockTreeTermsIndex"; private final IndexOutput out; private final IndexOutput indexOut; @@ -199,20 +199,20 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { fieldInfos = state.fieldInfos; this.minItemsInBlock = minItemsInBlock; this.maxItemsInBlock = maxItemsInBlock; - CodecUtil.writeSegmentHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); //DEBUG = state.segmentName.equals("_4a"); final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION); indexOut = state.directory.createOutput(termsIndexFileName, state.context); - CodecUtil.writeSegmentHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); this.postingsWriter = postingsWriter; // segment = state.segmentInfo.name; // System.out.println("BTW.init seg=" + state.segmentName); - postingsWriter.init(out); // have consumer write its format/header + postingsWriter.init(out, state); // have consumer write its format/header success = true; } finally { if (!success) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41ords/Lucene41WithOrds.java 
b/lucene/test-framework/src/java/org/apache/lucene/codecs/blockterms/LuceneFixedGap.java similarity index 81% rename from lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41ords/Lucene41WithOrds.java rename to lucene/test-framework/src/java/org/apache/lucene/codecs/blockterms/LuceneFixedGap.java index 57f8eaa57ea..c77ba23ff85 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41ords/Lucene41WithOrds.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/blockterms/LuceneFixedGap.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.lucene41ords; +package org.apache.lucene.codecs.blockterms; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -30,34 +30,34 @@ import org.apache.lucene.codecs.blockterms.FixedGapTermsIndexReader; import org.apache.lucene.codecs.blockterms.FixedGapTermsIndexWriter; import org.apache.lucene.codecs.blockterms.TermsIndexReaderBase; import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; // javadocs -import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat; // javadocs +import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -// TODO: we could make separate base class that can wrapp -// any PostingsBaseFormat and make it ord-able... +// TODO: we could make separate base class that can wrap +// any PostingsFormat and make it ord-able... /** - * Customized version of {@link Lucene41PostingsFormat} that uses + * Customized version of {@link Lucene50PostingsFormat} that uses * {@link FixedGapTermsIndexWriter}. */ -public final class Lucene41WithOrds extends PostingsFormat { +public final class LuceneFixedGap extends PostingsFormat { final int termIndexInterval; - public Lucene41WithOrds() { + public LuceneFixedGap() { this(FixedGapTermsIndexWriter.DEFAULT_TERM_INDEX_INTERVAL); } - public Lucene41WithOrds(int termIndexInterval) { - super("Lucene41WithOrds"); + public LuceneFixedGap(int termIndexInterval) { + super("LuceneFixedGap"); this.termIndexInterval = termIndexInterval; } @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase docs = new Lucene41PostingsWriter(state); + PostingsWriterBase docs = new Lucene50PostingsWriter(state); // TODO: should we make the terms index more easily // pluggable? 
Ie so that this codec would record which @@ -94,7 +94,7 @@ public final class Lucene41WithOrds extends PostingsFormat { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postings = new Lucene41PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix); + PostingsReaderBase postings = new Lucene50PostingsReader(state); TermsIndexReaderBase indexReader; boolean success = false; diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41vargap/Lucene41VarGapDocFreqInterval.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/blockterms/LuceneVarGapDocFreqInterval.java similarity index 81% rename from lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41vargap/Lucene41VarGapDocFreqInterval.java rename to lucene/test-framework/src/java/org/apache/lucene/codecs/blockterms/LuceneVarGapDocFreqInterval.java index 4a320c8cb63..a2675262a41 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41vargap/Lucene41VarGapDocFreqInterval.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/blockterms/LuceneVarGapDocFreqInterval.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.lucene41vargap; +package org.apache.lucene.codecs.blockterms; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -31,37 +31,37 @@ import org.apache.lucene.codecs.blockterms.TermsIndexReaderBase; import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase; import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexReader; import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; // javadocs -import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat; // javadocs +import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -// TODO: we could make separate base class that can wrapp -// any PostingsBaseFormat and make it ord-able... +// TODO: we could make separate base class that can wrap +// any PostingsFormat and make it ord-able... /** - * Customized version of {@link Lucene41PostingsFormat} that uses + * Customized version of {@link Lucene50PostingsFormat} that uses * {@link VariableGapTermsIndexWriter} with a fixed interval, but * forcing high docfreq terms to be indexed terms. 
*/ -public final class Lucene41VarGapDocFreqInterval extends PostingsFormat { +public final class LuceneVarGapDocFreqInterval extends PostingsFormat { final int termIndexInterval; final int docFreqThreshold; - public Lucene41VarGapDocFreqInterval() { + public LuceneVarGapDocFreqInterval() { this(1000000, FixedGapTermsIndexWriter.DEFAULT_TERM_INDEX_INTERVAL); } - public Lucene41VarGapDocFreqInterval(int docFreqThreshold, int termIndexInterval) { - super("Lucene41VarGapFixedInterval"); + public LuceneVarGapDocFreqInterval(int docFreqThreshold, int termIndexInterval) { + super("LuceneVarGapDocFreqInterval"); this.termIndexInterval = termIndexInterval; this.docFreqThreshold = docFreqThreshold; } @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase docs = new Lucene41PostingsWriter(state); + PostingsWriterBase docs = new Lucene50PostingsWriter(state); // TODO: should we make the terms index more easily // pluggable? Ie so that this codec would record which @@ -98,7 +98,7 @@ public final class Lucene41VarGapDocFreqInterval extends PostingsFormat { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postings = new Lucene41PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix); + PostingsReaderBase postings = new Lucene50PostingsReader(state); TermsIndexReaderBase indexReader; boolean success = false; diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41vargap/Lucene41VarGapFixedInterval.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/blockterms/LuceneVarGapFixedInterval.java similarity index 81% rename from lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41vargap/Lucene41VarGapFixedInterval.java rename to lucene/test-framework/src/java/org/apache/lucene/codecs/blockterms/LuceneVarGapFixedInterval.java index c797ee3a8b8..d664fa9474f 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41vargap/Lucene41VarGapFixedInterval.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/blockterms/LuceneVarGapFixedInterval.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.lucene41vargap; +package org.apache.lucene.codecs.blockterms; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -31,34 +31,34 @@ import org.apache.lucene.codecs.blockterms.TermsIndexReaderBase; import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase; import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexReader; import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; // javadocs -import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat; // javadocs +import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -// TODO: we could make separate base class that can wrapp -// any PostingsBaseFormat and make it ord-able... +// TODO: we could make separate base class that can wrap +// any PostingsFormat and make it ord-able... 
/** - * Customized version of {@link Lucene41PostingsFormat} that uses + * Customized version of {@link Lucene50PostingsFormat} that uses * {@link VariableGapTermsIndexWriter} with a fixed interval. */ -public final class Lucene41VarGapFixedInterval extends PostingsFormat { +public final class LuceneVarGapFixedInterval extends PostingsFormat { final int termIndexInterval; - public Lucene41VarGapFixedInterval() { + public LuceneVarGapFixedInterval() { this(FixedGapTermsIndexWriter.DEFAULT_TERM_INDEX_INTERVAL); } - public Lucene41VarGapFixedInterval(int termIndexInterval) { - super("Lucene41VarGapFixedInterval"); + public LuceneVarGapFixedInterval(int termIndexInterval) { + super("LuceneVarGapFixedInterval"); this.termIndexInterval = termIndexInterval; } @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase docs = new Lucene41PostingsWriter(state); + PostingsWriterBase docs = new Lucene50PostingsWriter(state); // TODO: should we make the terms index more easily // pluggable? Ie so that this codec would record which @@ -95,7 +95,7 @@ public final class Lucene41VarGapFixedInterval extends PostingsFormat { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postings = new Lucene41PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix); + PostingsReaderBase postings = new Lucene50PostingsReader(state); TermsIndexReaderBase indexReader; boolean success = false; diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41vargap/package.html b/lucene/test-framework/src/java/org/apache/lucene/codecs/blockterms/package.html similarity index 96% rename from lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41vargap/package.html rename to lucene/test-framework/src/java/org/apache/lucene/codecs/blockterms/package.html index 606bb064d70..f6c674a9817 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41vargap/package.html +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/blockterms/package.html @@ -20,6 +20,6 @@ -Codecs for testing that support {@link org.apache.lucene.codecs.blockterms.VariableGapTermsIndexReader} +Codecs for testing that support {@link org.apache.lucene.codecs.blockterms} diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/CrankySegmentInfoFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/CrankySegmentInfoFormat.java index 936fae44c95..c12f65113a4 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/CrankySegmentInfoFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/CrankySegmentInfoFormat.java @@ -35,8 +35,8 @@ class CrankySegmentInfoFormat extends SegmentInfoFormat { } @Override - public SegmentInfo read(Directory directory, String segmentName, IOContext context) throws IOException { - return delegate.read(directory, segmentName, context); + public SegmentInfo read(Directory directory, String segmentName, byte[] segmentID, IOContext context) throws IOException { + return delegate.read(directory, segmentName, segmentID, context); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41ords/package.html b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41ords/package.html deleted file mode 100644 index 6cb3c42cd28..00000000000 --- 
a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene41ords/package.html +++ /dev/null @@ -1,25 +0,0 @@ - - - - - - - -Codec for testing that supports {@link org.apache.lucene.index.TermsEnum#ord()} - - diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java index b9f8310f4e2..cc8f628dada 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java @@ -38,8 +38,8 @@ import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader; import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter; import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsReader; import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsWriter; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter; import org.apache.lucene.codecs.memory.FSTOrdTermsReader; import org.apache.lucene.codecs.memory.FSTOrdTermsWriter; import org.apache.lucene.codecs.memory.FSTTermsReader; @@ -92,7 +92,9 @@ public final class MockRandomPostingsFormat extends PostingsFormat { } // we pull this before the seed intentionally: because its not consumed at runtime - // (the skipInterval is written into postings header) + // (the skipInterval is written into postings header). + // NOTE: Currently not passed to postings writer. + // before, it was being passed in wrongly as acceptableOverhead! 
int skipInterval = TestUtil.nextInt(seedRandom, minSkipInterval, 10); if (LuceneTestCase.VERBOSE) { @@ -117,7 +119,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat { random.nextInt(); // consume a random for buffersize - PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state, skipInterval); + PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state); final FieldsConsumer fields; final int t1 = random.nextInt(5); @@ -280,7 +282,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat { System.out.println("MockRandomCodec: readBufferSize=" + readBufferSize); } - PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix); + PostingsReaderBase postingsReader = new Lucene50PostingsReader(state); final FieldsProducer fields; final int t1 = random.nextInt(5); @@ -312,12 +314,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat { boolean success = false; try { - fields = new BlockTreeTermsReader(state.directory, - state.fieldInfos, - state.segmentInfo, - postingsReader, - state.context, - state.segmentSuffix); + fields = new BlockTreeTermsReader(postingsReader, state); success = true; } finally { if (!success) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseCompoundFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseCompoundFormatTestCase.java index 53d1bce3b03..fd8c94faa9b 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseCompoundFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseCompoundFormatTestCase.java @@ -197,8 +197,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest } } riw.close(); - SegmentInfos infos = new SegmentInfos(); - infos.read(dir); + SegmentInfos infos = SegmentInfos.readLatestCommit(dir); for (SegmentCommitInfo si : infos) { if (si.info.getUseCompoundFile()) { try (Directory cfsDir = si.info.getCodec().compoundFormat().getCompoundReader(dir, si.info, newIOContext(random()))) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index faf057c5bae..b3d9d54903e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -541,6 +541,34 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes ireader.close(); directory.close(); } + + public void testBytesMergeAwayAllValues() throws IOException { + Directory directory = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); + iwconfig.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); + + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.NO)); + iwriter.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.NO)); + doc.add(new BinaryDocValuesField("field", new BytesRef("hi"))); + iwriter.addDocument(doc); + iwriter.commit(); + iwriter.deleteDocuments(new Term("id", "1")); + iwriter.forceMerge(1); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + BinaryDocValues dv = 
getOnlySegmentReader(ireader).getBinaryDocValues("field"); + assertEquals(new BytesRef(), dv.get(0)); + + ireader.close(); + directory.close(); + } public void testSortedBytes() throws IOException { Analyzer analyzer = new MockAnalyzer(random()); @@ -2749,6 +2777,34 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes directory.close(); } + public void testNumberMergeAwayAllValues() throws IOException { + Directory directory = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); + iwconfig.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); + + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.NO)); + iwriter.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.NO)); + doc.add(new NumericDocValuesField("field", 5)); + iwriter.addDocument(doc); + iwriter.commit(); + iwriter.deleteDocuments(new Term("id", "1")); + iwriter.forceMerge(1); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + NumericDocValues dv = getOnlySegmentReader(ireader).getNumericDocValues("field"); + assertEquals(0, dv.get(0)); + + ireader.close(); + directory.close(); + } + public void testTwoSortedNumber() throws IOException { assumeTrue("Codec does not support SORTED_NUMERIC", codecSupportsSortedNumeric()); Directory directory = newDirectory(); @@ -2772,6 +2828,29 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes directory.close(); } + public void testTwoSortedNumberSameValue() throws IOException { + assumeTrue("Codec does not support SORTED_NUMERIC", codecSupportsSortedNumeric()); + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory); + Document doc = new Document(); + doc.add(new SortedNumericDocValuesField("dv", 11)); + doc.add(new SortedNumericDocValuesField("dv", 11)); + writer.addDocument(doc); + writer.close(); + + // Now search the index: + IndexReader reader = DirectoryReader.open(directory); + assert reader.leaves().size() == 1; + SortedNumericDocValues dv = reader.leaves().get(0).reader().getSortedNumericDocValues("dv"); + dv.setDocument(0); + assertEquals(2, dv.count()); + assertEquals(11, dv.valueAt(0)); + assertEquals(11, dv.valueAt(1)); + + reader.close(); + directory.close(); + } + public void testTwoSortedNumberOneMissing() throws IOException { assumeTrue("Codec does not support SORTED_NUMERIC", codecSupportsSortedNumeric()); Directory directory = newDirectory(); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java index 9eaeeef7f67..d3ff8895700 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java @@ -153,6 +153,35 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas } } + public void testOutliers() throws Exception { + int iterations = atLeast(1); + final Random r = random(); + for (int i = 0; i < iterations; i++) { + final long commonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE); + doTestNormsVersusStoredFields(new LongProducer() { + @Override + long next() { + return r.nextInt(100) == 0 ? 
TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE) : commonValue; + } + }); + } + } + + public void testOutliers2() throws Exception { + int iterations = atLeast(1); + final Random r = random(); + for (int i = 0; i < iterations; i++) { + final long commonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE); + final long uncommonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE); + doTestNormsVersusStoredFields(new LongProducer() { + @Override + long next() { + return r.nextInt(100) == 0 ? uncommonValue : commonValue; + } + }); + } + } + private void doTestNormsVersusStoredFields(LongProducer longs) throws Exception { int numDocs = atLeast(500); long norms[] = new long[numDocs]; diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java index 4a8736f8f1a..2d3500f2e62 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java @@ -47,11 +47,12 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT public void testFiles() throws Exception { Directory dir = newDirectory(); Codec codec = getCodec(); + byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo(dir, getVersions()[0], "_123", 1, false, codec, - Collections.emptyMap(), StringHelper.randomId()); + Collections.emptyMap(), id); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); - SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", IOContext.DEFAULT); + SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); assertEquals(info.files(), info2.files()); dir.close(); } @@ -60,8 +61,9 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT public void testAddsSelfToFiles() throws Exception { Directory dir = newDirectory(); Codec codec = getCodec(); + byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo(dir, getVersions()[0], "_123", 1, false, codec, - Collections.emptyMap(), StringHelper.randomId()); + Collections.emptyMap(), id); Set originalFiles = Collections.singleton("_123.a"); info.setFiles(originalFiles); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); @@ -70,7 +72,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT assertTrue(modifiedFiles.containsAll(originalFiles)); assertTrue("did you forget to add yourself to files()", modifiedFiles.size() > originalFiles.size()); - SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", IOContext.DEFAULT); + SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); assertEquals(info.files(), info2.files()); dir.close(); } @@ -79,14 +81,15 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT public void testDiagnostics() throws Exception { Directory dir = newDirectory(); Codec codec = getCodec(); + byte id[] = StringHelper.randomId(); Map diagnostics = new HashMap<>(); diagnostics.put("key1", "value1"); diagnostics.put("key2", "value2"); SegmentInfo info = new SegmentInfo(dir, getVersions()[0], "_123", 1, false, codec, - diagnostics, StringHelper.randomId()); + diagnostics, id); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); - SegmentInfo info2 = 
codec.segmentInfoFormat().read(dir, "_123", IOContext.DEFAULT); + SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); assertEquals(diagnostics, info2.getDiagnostics()); dir.close(); } @@ -100,7 +103,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT Collections.emptyMap(), id); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); - SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", IOContext.DEFAULT); + SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); assertIDEquals(id, info2.getId()); dir.close(); } @@ -110,11 +113,12 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT Codec codec = getCodec(); for (Version v : getVersions()) { Directory dir = newDirectory(); + byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo(dir, v, "_123", 1, false, codec, - Collections.emptyMap(), StringHelper.randomId()); + Collections.emptyMap(), id); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); - SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", IOContext.DEFAULT); + SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); assertEquals(info2.getVersion(), v); dir.close(); } @@ -152,7 +156,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT SegmentInfo info = new SegmentInfo(dir, version, name, docCount, isCompoundFile, codec, diagnostics, id); info.setFiles(files); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); - SegmentInfo info2 = codec.segmentInfoFormat().read(dir, name, IOContext.DEFAULT); + SegmentInfo info2 = codec.segmentInfoFormat().read(dir, name, id, IOContext.DEFAULT); assertEquals(info, info2); dir.close(); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java index 17ed3cdfccd..095e39b37b7 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java @@ -32,11 +32,11 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.asserting.AssertingCodec; import org.apache.lucene.codecs.asserting.AssertingDocValuesFormat; import org.apache.lucene.codecs.asserting.AssertingPostingsFormat; -import org.apache.lucene.codecs.blocktreeords.Ords41PostingsFormat; +import org.apache.lucene.codecs.blockterms.LuceneFixedGap; +import org.apache.lucene.codecs.blockterms.LuceneVarGapDocFreqInterval; +import org.apache.lucene.codecs.blockterms.LuceneVarGapFixedInterval; +import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat; import org.apache.lucene.codecs.bloom.TestBloomFilteredLucenePostings; -import org.apache.lucene.codecs.lucene41ords.Lucene41WithOrds; -import org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapDocFreqInterval; -import org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapFixedInterval; import org.apache.lucene.codecs.memory.DirectPostingsFormat; import org.apache.lucene.codecs.memory.FSTOrdPostingsFormat; import org.apache.lucene.codecs.memory.FSTPostingsFormat; @@ -127,15 +127,15 @@ public class RandomCodec extends AssertingCodec { new FSTOrdPostingsFormat(), new DirectPostingsFormat(LuceneTestCase.rarely(random) ? 1 : (LuceneTestCase.rarely(random) ? 
Integer.MAX_VALUE : maxItemsPerBlock), LuceneTestCase.rarely(random) ? 1 : (LuceneTestCase.rarely(random) ? Integer.MAX_VALUE : lowFreqCutoff)), - //TODO as a PostingsFormat which wraps others, we should allow TestBloomFilteredLucene41Postings to be constructed + //TODO as a PostingsFormat which wraps others, we should allow TestBloomFilteredLucenePostings to be constructed //with a choice of concrete PostingsFormats. Maybe useful to have a generic means of marking and dealing //with such "wrapper" classes? new TestBloomFilteredLucenePostings(), new MockRandomPostingsFormat(random), - new Ords41PostingsFormat(minItemsPerBlock, maxItemsPerBlock), - new Lucene41WithOrds(TestUtil.nextInt(random, 1, 1000)), - new Lucene41VarGapFixedInterval(TestUtil.nextInt(random, 1, 1000)), - new Lucene41VarGapDocFreqInterval(TestUtil.nextInt(random, 1, 100), TestUtil.nextInt(random, 1, 1000)), + new BlockTreeOrdsPostingsFormat(minItemsPerBlock, maxItemsPerBlock), + new LuceneFixedGap(TestUtil.nextInt(random, 1, 1000)), + new LuceneVarGapFixedInterval(TestUtil.nextInt(random, 1, 1000)), + new LuceneVarGapDocFreqInterval(TestUtil.nextInt(random, 1, 100), TestUtil.nextInt(random, 1, 1000)), new SimpleTextPostingsFormat(), new AssertingPostingsFormat(), new MemoryPostingsFormat(true, random.nextFloat()), diff --git a/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java index 0100a15aece..cb5ac08580d 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java @@ -800,11 +800,12 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper { if (LuceneTestCase.VERBOSE) { System.out.println("MDW: Unreferenced check: Ignoring segments file: " + file + " that we could not delete."); } - SegmentInfos sis = new SegmentInfos(); + SegmentInfos sis; try { - sis.read(in, file); + sis = SegmentInfos.readCommit(in, file); } catch (IOException ioe) { // OK: likely some of the .si files were deleted + sis = new SegmentInfos(); } try { diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java index ef75a4848cf..9cf4ec3a1f4 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java @@ -45,9 +45,11 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.asserting.AssertingCodec; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; -import org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat; +import org.apache.lucene.codecs.blockterms.LuceneFixedGap; +import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat; import org.apache.lucene.codecs.lucene50.Lucene50Codec; +import org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat; +import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; import org.apache.lucene.document.BinaryDocValuesField; @@ -746,7 +748,7 @@ public final class TestUtil { * Returns the actual default postings format (e.g. LuceneMNPostingsFormat for this version of Lucene. 
*/ public static PostingsFormat getDefaultPostingsFormat() { - return new Lucene41PostingsFormat(); + return new Lucene50PostingsFormat(); } /** @@ -754,14 +756,25 @@ public final class TestUtil { * @lucene.internal this may disappear at any time */ public static PostingsFormat getDefaultPostingsFormat(int minItemsPerBlock, int maxItemsPerBlock) { - return new Lucene41PostingsFormat(minItemsPerBlock, maxItemsPerBlock); + return new Lucene50PostingsFormat(minItemsPerBlock, maxItemsPerBlock); + } + + /** Returns a random postings format that supports term ordinals */ + public static PostingsFormat getPostingsFormatWithOrds(Random r) { + switch (r.nextInt(2)) { + case 0: return new LuceneFixedGap(); + case 1: return new BlockTreeOrdsPostingsFormat(); + // TODO: these don't actually support ords! + //case 2: return new FSTOrdPostingsFormat(); + default: throw new AssertionError(); + } } /** * Returns the actual default docvalues format (e.g. LuceneMNDocValuesFormat for this version of Lucene. */ public static DocValuesFormat getDefaultDocValuesFormat() { - return new Lucene410DocValuesFormat(); + return new Lucene50DocValuesFormat(); } // TODO: generalize all 'test-checks-for-crazy-codecs' to diff --git a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 8ab55585d88..f3f360d8f24 100644 --- a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -15,8 +15,8 @@ org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat org.apache.lucene.codecs.ramonly.RAMOnlyPostingsFormat -org.apache.lucene.codecs.lucene41ords.Lucene41WithOrds -org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapFixedInterval -org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapDocFreqInterval +org.apache.lucene.codecs.blockterms.LuceneFixedGap +org.apache.lucene.codecs.blockterms.LuceneVarGapFixedInterval +org.apache.lucene.codecs.blockterms.LuceneVarGapDocFreqInterval org.apache.lucene.codecs.bloom.TestBloomFilteredLucenePostings org.apache.lucene.codecs.asserting.AssertingPostingsFormat diff --git a/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml b/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml index 439472e6fe9..ed8f222e621 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml @@ -19,9 +19,9 @@ - + - + diff --git a/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java b/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java index 15254517527..e04a17f041c 100644 --- a/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java +++ b/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java @@ -22,6 +22,7 @@ import java.util.Map; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; +import org.apache.lucene.util.TestUtil; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.schema.SchemaField; import org.junit.BeforeClass; @@ -43,9 +44,9 @@ public class TestCodecSupport extends SolrTestCaseJ4 { assertEquals("SimpleText", format.getPostingsFormatForField(schemaField.getName()).getName()); schemaField = fields.get("string_standard_f"); - assertEquals("Lucene41", 
format.getPostingsFormatForField(schemaField.getName()).getName()); + assertEquals(TestUtil.getDefaultPostingsFormat().getName(), format.getPostingsFormatForField(schemaField.getName()).getName()); schemaField = fields.get("string_f"); - assertEquals("Lucene41", format.getPostingsFormatForField(schemaField.getName()).getName()); + assertEquals(TestUtil.getDefaultPostingsFormat().getName(), format.getPostingsFormatForField(schemaField.getName()).getName()); } public void testDocValuesFormats() { @@ -53,12 +54,12 @@ public class TestCodecSupport extends SolrTestCaseJ4 { Map fields = h.getCore().getLatestSchema().getFields(); SchemaField schemaField = fields.get("string_disk_f"); PerFieldDocValuesFormat format = (PerFieldDocValuesFormat) codec.docValuesFormat(); - assertEquals("Lucene410", format.getDocValuesFormatForField(schemaField.getName()).getName()); + assertEquals(TestUtil.getDefaultDocValuesFormat().getName(), format.getDocValuesFormatForField(schemaField.getName()).getName()); schemaField = fields.get("string_memory_f"); assertEquals("Memory", format.getDocValuesFormatForField(schemaField.getName()).getName()); schemaField = fields.get("string_f"); - assertEquals("Lucene410", + assertEquals(TestUtil.getDefaultDocValuesFormat().getName(), format.getDocValuesFormatForField(schemaField.getName()).getName()); } @@ -70,16 +71,16 @@ public class TestCodecSupport extends SolrTestCaseJ4 { assertEquals("SimpleText", format.getPostingsFormatForField("bar_simple").getName()); assertEquals("Direct", format.getPostingsFormatForField("foo_direct").getName()); assertEquals("Direct", format.getPostingsFormatForField("bar_direct").getName()); - assertEquals("Lucene41", format.getPostingsFormatForField("foo_standard").getName()); - assertEquals("Lucene41", format.getPostingsFormatForField("bar_standard").getName()); + assertEquals(TestUtil.getDefaultPostingsFormat().getName(), format.getPostingsFormatForField("foo_standard").getName()); + assertEquals(TestUtil.getDefaultPostingsFormat().getName(), format.getPostingsFormatForField("bar_standard").getName()); } public void testDynamicFieldsDocValuesFormats() { Codec codec = h.getCore().getCodec(); PerFieldDocValuesFormat format = (PerFieldDocValuesFormat) codec.docValuesFormat(); - assertEquals("Lucene410", format.getDocValuesFormatForField("foo_disk").getName()); - assertEquals("Lucene410", format.getDocValuesFormatForField("bar_disk").getName()); + assertEquals(TestUtil.getDefaultDocValuesFormat().getName(), format.getDocValuesFormatForField("foo_disk").getName()); + assertEquals(TestUtil.getDefaultDocValuesFormat().getName(), format.getDocValuesFormatForField("bar_disk").getName()); assertEquals("Memory", format.getDocValuesFormatForField("foo_memory").getName()); assertEquals("Memory", format.getDocValuesFormatForField("bar_memory").getName()); }
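Illustrative note (not part of the patch): the hunks in BaseCompoundFormatTestCase and MockDirectoryWrapper above replace the old instance-method pattern (new SegmentInfos() followed by infos.read(dir)) with the new static readers. A minimal sketch of that migration, assuming an already-built index in some Directory; the class and method names here are hypothetical:

import java.io.IOException;

import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.store.Directory;

class SegmentInfosReadSketch {
  // Old pattern removed by this patch:
  //   SegmentInfos sis = new SegmentInfos();
  //   sis.read(dir);
  static void printSegments(Directory dir) throws IOException {
    // Load the latest segments_N commit via the new static method.
    SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
    for (SegmentCommitInfo si : infos) {
      System.out.println(si.info.name + " cfs=" + si.info.getUseCompoundFile());
    }
    // A specific commit point can still be loaded by its segments file name, e.g.:
    // SegmentInfos commit = SegmentInfos.readCommit(dir, "segments_2");
  }
}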
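Likewise, a sketch of how test code is expected to select codecs after the renames above: ord-capable postings formats come from the new TestUtil.getPostingsFormatWithOrds helper instead of hard-coding "Lucene41WithOrds", and default format names are obtained from TestUtil rather than asserted as "Lucene41"/"Lucene410". The test class and method names below are hypothetical:

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

public class OrdsCodecSelectionSketch extends LuceneTestCase {
  public void testOrdsCodecSelection() throws Exception {
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    // Sometimes swap in a codec whose postings format implements TermsEnum.ord():
    if (random().nextInt(10) == 7) {
      Codec codec = TestUtil.alwaysPostingsFormat(TestUtil.getPostingsFormatWithOrds(random()));
      conf.setCodec(codec);
    }
    // Version-agnostic default format names (e.g. "Lucene50" on this branch):
    System.out.println("default postings:  " + TestUtil.getDefaultPostingsFormat().getName());
    System.out.println("default docvalues: " + TestUtil.getDefaultDocValuesFormat().getName());
  }
}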