LUCENE-5969: finish porting rest of codec to 5.0 features

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1633991 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-10-24 02:43:41 +00:00
commit ff50c35fc2
181 changed files with 11429 additions and 1391 deletions

View File

@ -100,6 +100,7 @@
<or>
<containsregexp expression="@author\b" casesensitive="yes"/>
<containsregexp expression="\bno(n|)commit\b" casesensitive="no"/>
<containsregexp expression="\bTOOD:" casesensitive="yes"/>
<containsregexp expression="\t" casesensitive="no"/>
<containsregexp expression="\$(?:LastChanged)?Date\b" casesensitive="yes"/>
<containsregexp expression="\$(?:(?:LastChanged)?Revision|Rev)\b" casesensitive="yes"/>
@ -112,7 +113,7 @@
</fileset>
<map from="${validate.currDir}${file.separator}" to="* "/>
</pathconvert>
<fail if="validate.patternsFound">The following files contain @author tags, tabs, svn keywords or nocommits:${line.separator}${validate.patternsFound}</fail>
<fail if="validate.patternsFound">The following files contain @author tags, tabs, TOODs, svn keywords or nocommits:${line.separator}${validate.patternsFound}</fail>
</target>
<target name="rat-sources" description="Runs rat across all sources and tests">

View File

@ -75,7 +75,7 @@ New Features
* LUCENE-5969: Lucene 5.0 has a new index format with mismatched file detection,
improved exception handling, and indirect norms encoding for sparse fields.
(Mike McCandless, Robert Muir)
(Mike McCandless, Ryan Ernst, Robert Muir)
API Changes
@ -148,7 +148,10 @@ API Changes
* LUCENE-5969: Add Codec.compoundFormat, which handles the encoding of compound
files. Add getMergeInstance() to codec producer APIs, which can be overridden
to return an instance optimized for merging instead of searching.
to return an instance optimized for merging instead of searching. Add
Terms.getStats() which can return additional codec-specific statistics about a field.
Change instance method SegmentInfos.read() to two static methods: SegmentInfos.readCommit()
and SegmentInfos.readLatestCommit().
(Mike McCandless, Robert Muir)
* LUCENE-5992: Remove FieldInfos from SegmentInfosWriter.write API. (Robert Muir, Mike McCandless)
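A minimal sketch of the new static SegmentInfos entry points described in the LUCENE-5969 entry above, assuming an already-open Directory named dir; the segments_N file name is illustrative only:

    import org.apache.lucene.index.SegmentInfos;
    import org.apache.lucene.store.Directory;

    // Read the most recent commit point in the directory:
    SegmentInfos latest = SegmentInfos.readLatestCommit(dir);

    // Read a specific commit point by its segments_N file name:
    SegmentInfos commit = SegmentInfos.readCommit(dir, "segments_2");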

View File

@ -0,0 +1,358 @@
package org.apache.lucene.codecs.blocktree;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.Outputs;
/** A block-based terms index and dictionary that assigns
* terms to variable length blocks according to how they
* share prefixes. The terms index is a prefix trie
* whose leaves are term blocks. The advantage of this
* approach is that seekExact is often able to
* determine a term cannot exist without doing any IO, and
* intersection with Automata is very fast. Note that this
* terms dictionary has its own fixed terms index (ie, it
* does not support a pluggable terms index
* implementation).
*
* <p><b>NOTE</b>: this terms dictionary supports
* min/maxItemsPerBlock during indexing to control how
* much memory the terms index uses.</p>
*
* <p>The data structure used by this implementation is very
* similar to a burst trie
* (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499),
* but with added logic to break up too-large blocks of all
* terms sharing a given prefix into smaller ones.</p>
*
* <p>Use {@link org.apache.lucene.index.CheckIndex} with the <code>-verbose</code>
* option to see summary statistics on the blocks in the
* dictionary.
*
* @lucene.experimental
* @deprecated Only for 4.x backcompat
*/
@Deprecated
public final class Lucene40BlockTreeTermsReader extends FieldsProducer {
/** Extension of terms file */
static final String TERMS_EXTENSION = "tim";
final static String TERMS_CODEC_NAME = "BLOCK_TREE_TERMS_DICT";
/** Initial terms format. */
public static final int VERSION_START = 0;
/** Append-only */
public static final int VERSION_APPEND_ONLY = 1;
/** Meta data as array */
public static final int VERSION_META_ARRAY = 2;
/** checksums */
public static final int VERSION_CHECKSUM = 3;
/** min/max term */
public static final int VERSION_MIN_MAX_TERMS = 4;
/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_MIN_MAX_TERMS;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
final static String TERMS_INDEX_CODEC_NAME = "BLOCK_TREE_TERMS_INDEX";
static final int OUTPUT_FLAGS_NUM_BITS = 2;
static final int OUTPUT_FLAGS_MASK = 0x3;
static final int OUTPUT_FLAG_IS_FLOOR = 0x1;
static final int OUTPUT_FLAG_HAS_TERMS = 0x2;
static final Outputs<BytesRef> FST_OUTPUTS = ByteSequenceOutputs.getSingleton();
static final BytesRef NO_OUTPUT = FST_OUTPUTS.getNoOutput();
// Open input to the main terms dict file (_X.tim)
final IndexInput in;
//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
// Reads the terms dict entries, to gather state to
// produce DocsEnum on demand
final PostingsReaderBase postingsReader;
private final TreeMap<String,Lucene40FieldReader> fields = new TreeMap<>();
/** File offset where the directory starts in the terms file. */
private long dirOffset;
/** File offset where the directory starts in the index file. */
private long indexDirOffset;
final String segment;
private final int version;
/** Sole constructor. */
public Lucene40BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state)
throws IOException {
this.postingsReader = postingsReader;
this.segment = state.segmentInfo.name;
String termsFileName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);
in = state.directory.openInput(termsFileName, state.context);
boolean success = false;
IndexInput indexIn = null;
try {
version = readHeader(in);
String indexFileName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_INDEX_EXTENSION);
indexIn = state.directory.openInput(indexFileName, state.context);
int indexVersion = readIndexHeader(indexIn);
if (indexVersion != version) {
throw new CorruptIndexException("mixmatched version files: " + in + "=" + version + "," + indexIn + "=" + indexVersion, indexIn);
}
// verify
if (version >= VERSION_CHECKSUM) {
CodecUtil.checksumEntireFile(indexIn);
}
// Have PostingsReader init itself
postingsReader.init(in, state);
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
if (version >= VERSION_CHECKSUM) {
CodecUtil.retrieveChecksum(in);
}
// Read per-field details
seekDir(in, dirOffset);
seekDir(indexIn, indexDirOffset);
final int numFields = in.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, in);
}
for(int i=0;i<numFields;i++) {
final int field = in.readVInt();
final long numTerms = in.readVLong();
if (numTerms <= 0) {
throw new CorruptIndexException("Illegal numTerms for field number: " + field, in);
}
final int numBytes = in.readVInt();
if (numBytes < 0) {
throw new CorruptIndexException("invalid rootCode for field number: " + field + ", numBytes=" + numBytes, in);
}
final BytesRef rootCode = new BytesRef(new byte[numBytes]);
in.readBytes(rootCode.bytes, 0, numBytes);
rootCode.length = numBytes;
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
throw new CorruptIndexException("invalid field number: " + field, in);
}
final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
final long sumDocFreq = in.readVLong();
final int docCount = in.readVInt();
final int longsSize = version >= VERSION_META_ARRAY ? in.readVInt() : 0;
if (longsSize < 0) {
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, in);
}
BytesRef minTerm, maxTerm;
if (version >= VERSION_MIN_MAX_TERMS) {
minTerm = readBytesRef(in);
maxTerm = readBytesRef(in);
} else {
minTerm = maxTerm = null;
}
if (docCount < 0 || docCount > state.segmentInfo.getDocCount()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.getDocCount(), in);
}
if (sumDocFreq < docCount) { // #postings must be >= #docs with field
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, in);
}
if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, in);
}
final long indexStartFP = indexIn.readVLong();
Lucene40FieldReader previous = fields.put(fieldInfo.name,
new Lucene40FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
indexStartFP, longsSize, indexIn, minTerm, maxTerm));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in);
}
}
indexIn.close();
success = true;
} finally {
if (!success) {
// this.close() will close in:
IOUtils.closeWhileHandlingException(indexIn, this);
}
}
}
private static BytesRef readBytesRef(IndexInput in) throws IOException {
BytesRef bytes = new BytesRef();
bytes.length = in.readVInt();
bytes.bytes = new byte[bytes.length];
in.readBytes(bytes.bytes, 0, bytes.length);
return bytes;
}
/** Reads terms file header. */
private int readHeader(IndexInput input) throws IOException {
int version = CodecUtil.checkHeader(input, TERMS_CODEC_NAME,
VERSION_START,
VERSION_CURRENT);
if (version < VERSION_APPEND_ONLY) {
dirOffset = input.readLong();
}
return version;
}
/** Reads index file header. */
private int readIndexHeader(IndexInput input) throws IOException {
int version = CodecUtil.checkHeader(input, TERMS_INDEX_CODEC_NAME,
VERSION_START,
VERSION_CURRENT);
if (version < VERSION_APPEND_ONLY) {
indexDirOffset = input.readLong();
}
return version;
}
/** Seek {@code input} to the directory offset. */
private void seekDir(IndexInput input, long dirOffset)
throws IOException {
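// On-disk location of the directory offset depends on the format version:
//  - VERSION_CHECKSUM and newer: an 8-byte long written immediately before the
//    CodecUtil footer (magic + algorithm id + checksum).
//  - VERSION_APPEND_ONLY (pre-checksum): the last 8 bytes of the file.
//  - older versions: dirOffset was already read from the file header.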
if (version >= VERSION_CHECKSUM) {
input.seek(input.length() - CodecUtil.footerLength() - 8);
dirOffset = input.readLong();
} else if (version >= VERSION_APPEND_ONLY) {
input.seek(input.length() - 8);
dirOffset = input.readLong();
}
input.seek(dirOffset);
}
// for debugging
// private static String toHex(int v) {
// return "0x" + Integer.toHexString(v);
// }
@Override
public void close() throws IOException {
try {
IOUtils.close(in, postingsReader);
} finally {
// Clear so refs to the terms index are GCable even if
// app hangs onto us:
fields.clear();
}
}
@Override
public Iterator<String> iterator() {
return Collections.unmodifiableSet(fields.keySet()).iterator();
}
@Override
public Terms terms(String field) throws IOException {
assert field != null;
return fields.get(field);
}
@Override
public int size() {
return fields.size();
}
// for debugging
String brToString(BytesRef b) {
if (b == null) {
return "null";
} else {
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
// If BytesRef isn't actually UTF8, or it's eg a
// prefix of UTF8 that ends mid-unicode-char, we
// fallback to hex:
return b.toString();
}
}
}
@Override
public long ramBytesUsed() {
long sizeInBytes = postingsReader.ramBytesUsed();
for(Lucene40FieldReader reader : fields.values()) {
sizeInBytes += reader.ramBytesUsed();
}
return sizeInBytes;
}
@Override
public Iterable<? extends Accountable> getChildResources() {
List<Accountable> resources = new ArrayList<>();
resources.addAll(Accountables.namedAccountables("field", fields));
resources.add(Accountables.namedAccountable("delegate", postingsReader));
return Collections.unmodifiableList(resources);
}
@Override
public void checkIntegrity() throws IOException {
if (version >= VERSION_CHECKSUM) {
// term dictionary
CodecUtil.checksumEntireFile(in);
// postings
postingsReader.checkIntegrity();
}
}
@Override
public String toString() {
return getClass().getSimpleName() + "(fields=" + fields.size() + ",delegate=" + postingsReader + ")";
}
}
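A hedged sketch of how a postings format would typically wire up this reader from its fieldsProducer(), mirroring the close-on-failure pattern used above. The helper class and method names are illustrative, and the PostingsReaderBase is assumed to be already open for the same segment:

    import java.io.IOException;
    import org.apache.lucene.codecs.FieldsProducer;
    import org.apache.lucene.codecs.PostingsReaderBase;
    import org.apache.lucene.codecs.blocktree.Lucene40BlockTreeTermsReader;
    import org.apache.lucene.index.SegmentReadState;
    import org.apache.lucene.util.IOUtils;

    final class TermsDictWiring { // illustrative helper, not part of this commit
      static FieldsProducer openTerms(PostingsReaderBase postings, SegmentReadState state) throws IOException {
        boolean success = false;
        try {
          // The terms reader takes ownership of the postings reader and closes it in close():
          FieldsProducer terms = new Lucene40BlockTreeTermsReader(postings, state);
          success = true;
          return terms;
        } finally {
          if (!success) {
            // Construction failed: close the delegate ourselves.
            IOUtils.closeWhileHandlingException(postings);
          }
        }
      }
    }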

View File

@ -0,0 +1,202 @@
package org.apache.lucene.codecs.blocktree;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collections;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
/**
* BlockTree's implementation of {@link Terms}.
* @deprecated Only for 4.x backcompat
*/
@Deprecated
final class Lucene40FieldReader extends Terms implements Accountable {
private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(Lucene40FieldReader.class)
+ 3 * RamUsageEstimator.shallowSizeOfInstance(BytesRef.class);
final long numTerms;
final FieldInfo fieldInfo;
final long sumTotalTermFreq;
final long sumDocFreq;
final int docCount;
final long indexStartFP;
final long rootBlockFP;
final BytesRef rootCode;
final BytesRef minTerm;
final BytesRef maxTerm;
final int longsSize;
final Lucene40BlockTreeTermsReader parent;
final FST<BytesRef> index;
//private boolean DEBUG;
Lucene40FieldReader(Lucene40BlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount,
long indexStartFP, int longsSize, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
//DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id");
this.parent = parent;
this.numTerms = numTerms;
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.indexStartFP = indexStartFP;
this.rootCode = rootCode;
this.longsSize = longsSize;
this.minTerm = minTerm;
this.maxTerm = maxTerm;
// if (DEBUG) {
// System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor);
// }
rootBlockFP = (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() >>> Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
if (indexIn != null) {
final IndexInput clone = indexIn.clone();
//System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
clone.seek(indexStartFP);
index = new FST<>(clone, ByteSequenceOutputs.getSingleton());
/*
if (false) {
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
Util.toDot(index, w, false, false);
System.out.println("FST INDEX: SAVED to " + dotFileName);
w.close();
}
*/
} else {
index = null;
}
}
@Override
public BytesRef getMin() throws IOException {
if (minTerm == null) {
// Older index that didn't store min/maxTerm
return super.getMin();
} else {
return minTerm;
}
}
@Override
public BytesRef getMax() throws IOException {
if (maxTerm == null) {
// Older index that didn't store min/maxTerm
return super.getMax();
} else {
return maxTerm;
}
}
/** For debugging -- used by CheckIndex too */
@Override
public Lucene40Stats getStats() throws IOException {
return new Lucene40SegmentTermsEnum(this).computeBlockStats();
}
@Override
public boolean hasFreqs() {
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
}
@Override
public boolean hasOffsets() {
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
public boolean hasPositions() {
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
@Override
public boolean hasPayloads() {
return fieldInfo.hasPayloads();
}
@Override
public TermsEnum iterator(TermsEnum reuse) throws IOException {
return new Lucene40SegmentTermsEnum(this);
}
@Override
public long size() {
return numTerms;
}
@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}
@Override
public long getSumDocFreq() {
return sumDocFreq;
}
@Override
public int getDocCount() {
return docCount;
}
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
return new Lucene40IntersectTermsEnum(this, compiled, startTerm);
}
@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED + ((index!=null)? index.ramBytesUsed() : 0);
}
@Override
public Iterable<? extends Accountable> getChildResources() {
if (index == null) {
return Collections.emptyList();
} else {
return Collections.singleton(Accountables.namedAccountable("term index", index));
}
}
@Override
public String toString() {
return "BlockTreeTerms(terms=" + numTerms + ",postings=" + sumDocFreq + ",positions=" + sumTotalTermFreq + ",docs=" + docCount + ")";
}
}
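Lucene40FieldReader.intersect() above only accepts a NORMAL CompiledAutomaton and otherwise refers callers to CompiledAutomaton.getTermsEnum, which picks the right enum for the automaton type. A hedged usage sketch, assuming terms is an already-obtained org.apache.lucene.index.Terms for some field and the regular expression is illustrative:

    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.automaton.CompiledAutomaton;
    import org.apache.lucene.util.automaton.RegExp;

    CompiledAutomaton compiled = new CompiledAutomaton(new RegExp("err.*").toAutomaton());
    TermsEnum te = compiled.getTermsEnum(terms); // delegates to Terms.intersect for NORMAL automata
    BytesRef term;
    while ((term = te.next()) != null) {
      System.out.println(term.utf8ToString() + " docFreq=" + te.docFreq());
    }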

View File

@ -0,0 +1,490 @@
package org.apache.lucene.codecs.blocktree;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RunAutomaton;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
// NOTE: cannot seek!
/**
* @deprecated Only for 4.x backcompat
*/
@Deprecated
final class Lucene40IntersectTermsEnum extends TermsEnum {
final IndexInput in;
final static Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
private Lucene40IntersectTermsEnumFrame[] stack;
@SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc<BytesRef>[] arcs = new FST.Arc[5];
final RunAutomaton runAutomaton;
final CompiledAutomaton compiledAutomaton;
private Lucene40IntersectTermsEnumFrame currentFrame;
private final BytesRef term = new BytesRef();
private final FST.BytesReader fstReader;
final Lucene40FieldReader fr;
private BytesRef savedStartTerm;
// TODO: in some cases we can filter by length? eg
// regexp foo*bar must be at least length 6 bytes
public Lucene40IntersectTermsEnum(Lucene40FieldReader fr, CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
// if (DEBUG) {
// System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef));
// }
this.fr = fr;
runAutomaton = compiled.runAutomaton;
compiledAutomaton = compiled;
in = fr.parent.in.clone();
stack = new Lucene40IntersectTermsEnumFrame[5];
for(int idx=0;idx<stack.length;idx++) {
stack[idx] = new Lucene40IntersectTermsEnumFrame(this, idx);
}
for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
arcs[arcIdx] = new FST.Arc<>();
}
if (fr.index == null) {
fstReader = null;
} else {
fstReader = fr.index.getBytesReader();
}
// TODO: if the automaton is "smallish" we really
// should use the terms index to seek at least to
// the initial term and likely to subsequent terms
// (or, maybe just fallback to ATE for such cases).
// Else the seek cost of loading the frames will be
// too costly.
final FST.Arc<BytesRef> arc = fr.index.getFirstArc(arcs[0]);
// Empty string prefix must have an output in the index!
assert arc.isFinal();
// Special pushFrame since it's the first one:
final Lucene40IntersectTermsEnumFrame f = stack[0];
f.fp = f.fpOrig = fr.rootBlockFP;
f.prefix = 0;
f.setState(runAutomaton.getInitialState());
f.arc = arc;
f.outputPrefix = arc.output;
f.load(fr.rootCode);
// for assert:
assert setSavedStartTerm(startTerm);
currentFrame = f;
if (startTerm != null) {
seekToStartTerm(startTerm);
}
}
// only for assert:
private boolean setSavedStartTerm(BytesRef startTerm) {
savedStartTerm = startTerm == null ? null : BytesRef.deepCopyOf(startTerm);
return true;
}
@Override
public TermState termState() throws IOException {
currentFrame.decodeMetaData();
return currentFrame.termState.clone();
}
private Lucene40IntersectTermsEnumFrame getFrame(int ord) throws IOException {
if (ord >= stack.length) {
final Lucene40IntersectTermsEnumFrame[] next = new Lucene40IntersectTermsEnumFrame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(stack, 0, next, 0, stack.length);
for(int stackOrd=stack.length;stackOrd<next.length;stackOrd++) {
next[stackOrd] = new Lucene40IntersectTermsEnumFrame(this, stackOrd);
}
stack = next;
}
assert stack[ord].ord == ord;
return stack[ord];
}
private FST.Arc<BytesRef> getArc(int ord) {
if (ord >= arcs.length) {
@SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<BytesRef>[] next =
new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(arcs, 0, next, 0, arcs.length);
for(int arcOrd=arcs.length;arcOrd<next.length;arcOrd++) {
next[arcOrd] = new FST.Arc<>();
}
arcs = next;
}
return arcs[ord];
}
private Lucene40IntersectTermsEnumFrame pushFrame(int state) throws IOException {
final Lucene40IntersectTermsEnumFrame f = getFrame(currentFrame == null ? 0 : 1+currentFrame.ord);
f.fp = f.fpOrig = currentFrame.lastSubFP;
f.prefix = currentFrame.prefix + currentFrame.suffix;
// if (DEBUG) System.out.println(" pushFrame state=" + state + " prefix=" + f.prefix);
f.setState(state);
// Walk the arc through the index -- we only
// "bother" with this so we can get the floor data
// from the index and skip floor blocks when
// possible:
FST.Arc<BytesRef> arc = currentFrame.arc;
int idx = currentFrame.prefix;
assert currentFrame.suffix > 0;
BytesRef output = currentFrame.outputPrefix;
while (idx < f.prefix) {
final int target = term.bytes[idx] & 0xff;
// TODO: we could be more efficient for the next()
// case by using current arc as starting point,
// passed to findTargetArc
arc = fr.index.findTargetArc(target, arc, getArc(1+idx), fstReader);
assert arc != null;
output = fstOutputs.add(output, arc.output);
idx++;
}
f.arc = arc;
f.outputPrefix = output;
assert arc.isFinal();
f.load(fstOutputs.add(output, arc.nextFinalOutput));
return f;
}
@Override
public BytesRef term() {
return term;
}
@Override
public int docFreq() throws IOException {
//if (DEBUG) System.out.println("BTIR.docFreq");
currentFrame.decodeMetaData();
//if (DEBUG) System.out.println(" return " + currentFrame.termState.docFreq);
return currentFrame.termState.docFreq;
}
@Override
public long totalTermFreq() throws IOException {
currentFrame.decodeMetaData();
return currentFrame.termState.totalTermFreq;
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse, int flags) throws IOException {
currentFrame.decodeMetaData();
return fr.parent.postingsReader.docs(fr.fieldInfo, currentFrame.termState, skipDocs, reuse, flags);
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
if (fr.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
// Positions were not indexed:
return null;
}
currentFrame.decodeMetaData();
return fr.parent.postingsReader.docsAndPositions(fr.fieldInfo, currentFrame.termState, skipDocs, reuse, flags);
}
private int getState() {
int state = currentFrame.state;
for(int idx=0;idx<currentFrame.suffix;idx++) {
state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
assert state != -1;
}
return state;
}
// NOTE: specialized to only doing the first-time
// seek, but we could generalize it to allow
// arbitrary seekExact/Ceil. Note that this is a
// seekFloor!
private void seekToStartTerm(BytesRef target) throws IOException {
//if (DEBUG) System.out.println("seek to startTerm=" + target.utf8ToString());
assert currentFrame.ord == 0;
if (term.length < target.length) {
term.bytes = ArrayUtil.grow(term.bytes, target.length);
}
FST.Arc<BytesRef> arc = arcs[0];
assert arc == currentFrame.arc;
for(int idx=0;idx<=target.length;idx++) {
while (true) {
final int savePos = currentFrame.suffixesReader.getPosition();
final int saveStartBytePos = currentFrame.startBytePos;
final int saveSuffix = currentFrame.suffix;
final long saveLastSubFP = currentFrame.lastSubFP;
final int saveTermBlockOrd = currentFrame.termState.termBlockOrd;
final boolean isSubBlock = currentFrame.next();
//if (DEBUG) System.out.println(" cycle ent=" + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") prefix=" + currentFrame.prefix + " suffix=" + currentFrame.suffix + " isBlock=" + isSubBlock + " firstLabel=" + (currentFrame.suffix == 0 ? "" : (currentFrame.suffixBytes[currentFrame.startBytePos])&0xff));
term.length = currentFrame.prefix + currentFrame.suffix;
if (term.bytes.length < term.length) {
term.bytes = ArrayUtil.grow(term.bytes, term.length);
}
System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
if (isSubBlock && StringHelper.startsWith(target, term)) {
// Recurse
//if (DEBUG) System.out.println(" recurse!");
currentFrame = pushFrame(getState());
break;
} else {
final int cmp = term.compareTo(target);
if (cmp < 0) {
if (currentFrame.nextEnt == currentFrame.entCount) {
if (!currentFrame.isLastInFloor) {
//if (DEBUG) System.out.println(" load floorBlock");
currentFrame.loadNextFloorBlock();
continue;
} else {
//if (DEBUG) System.out.println(" return term=" + brToString(term));
return;
}
}
continue;
} else if (cmp == 0) {
//if (DEBUG) System.out.println(" return term=" + brToString(term));
return;
} else {
// Fallback to prior entry: the semantics of
// this method is that the first call to
// next() will return the term after the
// requested term
currentFrame.nextEnt--;
currentFrame.lastSubFP = saveLastSubFP;
currentFrame.startBytePos = saveStartBytePos;
currentFrame.suffix = saveSuffix;
currentFrame.suffixesReader.setPosition(savePos);
currentFrame.termState.termBlockOrd = saveTermBlockOrd;
System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
term.length = currentFrame.prefix + currentFrame.suffix;
// If the last entry was a block we don't
// need to bother recursing and pushing to
// the last term under it because the first
// next() will simply skip the frame anyway
return;
}
}
}
}
assert false;
}
@Override
public BytesRef next() throws IOException {
// if (DEBUG) {
// System.out.println("\nintEnum.next seg=" + segment);
// System.out.println(" frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
// }
nextTerm:
while(true) {
// Pop finished frames
while (currentFrame.nextEnt == currentFrame.entCount) {
if (!currentFrame.isLastInFloor) {
//if (DEBUG) System.out.println(" next-floor-block");
currentFrame.loadNextFloorBlock();
//if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
} else {
//if (DEBUG) System.out.println(" pop frame");
if (currentFrame.ord == 0) {
return null;
}
final long lastFP = currentFrame.fpOrig;
currentFrame = stack[currentFrame.ord-1];
assert currentFrame.lastSubFP == lastFP;
//if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
}
}
final boolean isSubBlock = currentFrame.next();
// if (DEBUG) {
// final BytesRef suffixRef = new BytesRef();
// suffixRef.bytes = currentFrame.suffixBytes;
// suffixRef.offset = currentFrame.startBytePos;
// suffixRef.length = currentFrame.suffix;
// System.out.println(" " + (isSubBlock ? "sub-block" : "term") + " " + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") suffix=" + brToString(suffixRef));
// }
if (currentFrame.suffix != 0) {
final int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff;
while (label > currentFrame.curTransitionMax) {
if (currentFrame.transitionIndex >= currentFrame.transitionCount-1) {
// Stop processing this frame -- no further
// matches are possible because we've moved
// beyond what the max transition will allow
//if (DEBUG) System.out.println(" break: trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]));
// sneaky! forces a pop above
currentFrame.isLastInFloor = true;
currentFrame.nextEnt = currentFrame.entCount;
continue nextTerm;
}
currentFrame.transitionIndex++;
compiledAutomaton.automaton.getNextTransition(currentFrame.transition);
currentFrame.curTransitionMax = currentFrame.transition.max;
//if (DEBUG) System.out.println(" next trans=" + currentFrame.transitions[currentFrame.transitionIndex]);
}
}
// First test the common suffix, if set:
if (compiledAutomaton.commonSuffixRef != null && !isSubBlock) {
final int termLen = currentFrame.prefix + currentFrame.suffix;
if (termLen < compiledAutomaton.commonSuffixRef.length) {
// No match
// if (DEBUG) {
// System.out.println(" skip: common suffix length");
// }
continue nextTerm;
}
final byte[] suffixBytes = currentFrame.suffixBytes;
final byte[] commonSuffixBytes = compiledAutomaton.commonSuffixRef.bytes;
final int lenInPrefix = compiledAutomaton.commonSuffixRef.length - currentFrame.suffix;
assert compiledAutomaton.commonSuffixRef.offset == 0;
int suffixBytesPos;
int commonSuffixBytesPos = 0;
if (lenInPrefix > 0) {
// A prefix of the common suffix overlaps with
// the suffix of the block prefix so we first
// test whether the prefix part matches:
final byte[] termBytes = term.bytes;
int termBytesPos = currentFrame.prefix - lenInPrefix;
assert termBytesPos >= 0;
final int termBytesPosEnd = currentFrame.prefix;
while (termBytesPos < termBytesPosEnd) {
if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
// if (DEBUG) {
// System.out.println(" skip: common suffix mismatch (in prefix)");
// }
continue nextTerm;
}
}
suffixBytesPos = currentFrame.startBytePos;
} else {
suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - compiledAutomaton.commonSuffixRef.length;
}
// Test overlapping suffix part:
final int commonSuffixBytesPosEnd = compiledAutomaton.commonSuffixRef.length;
while (commonSuffixBytesPos < commonSuffixBytesPosEnd) {
if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
// if (DEBUG) {
// System.out.println(" skip: common suffix mismatch");
// }
continue nextTerm;
}
}
}
// TODO: maybe we should do the same linear test
// that AutomatonTermsEnum does, so that if we
// reach a part of the automaton where .* is
// "temporarily" accepted, we just blindly .next()
// until the limit
// See if the term prefix matches the automaton:
int state = currentFrame.state;
for (int idx=0;idx<currentFrame.suffix;idx++) {
state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
if (state == -1) {
// No match
//System.out.println(" no s=" + state);
continue nextTerm;
} else {
//System.out.println(" c s=" + state);
}
}
if (isSubBlock) {
// Match! Recurse:
//if (DEBUG) System.out.println(" sub-block match to state=" + state + "; recurse fp=" + currentFrame.lastSubFP);
copyTerm();
currentFrame = pushFrame(state);
//if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
} else if (runAutomaton.isAccept(state)) {
copyTerm();
//if (DEBUG) System.out.println(" term match to state=" + state + "; return term=" + brToString(term));
assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0: "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString();
return term;
} else {
//System.out.println(" no s=" + state);
}
}
}
private void copyTerm() {
//System.out.println(" copyTerm cur.prefix=" + currentFrame.prefix + " cur.suffix=" + currentFrame.suffix + " first=" + (char) currentFrame.suffixBytes[currentFrame.startBytePos]);
final int len = currentFrame.prefix + currentFrame.suffix;
if (term.bytes.length < len) {
term.bytes = ArrayUtil.grow(term.bytes, len);
}
System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
term.length = len;
}
@Override
public boolean seekExact(BytesRef text) {
throw new UnsupportedOperationException();
}
@Override
public void seekExact(long ord) {
throw new UnsupportedOperationException();
}
@Override
public long ord() {
throw new UnsupportedOperationException();
}
@Override
public SeekStatus seekCeil(BytesRef text) {
throw new UnsupportedOperationException();
}
}

View File

@ -0,0 +1,302 @@
package org.apache.lucene.codecs.blocktree;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.FST;
/**
* @deprecated Only for 4.x backcompat
*/
// TODO: can we share this with the frame in STE?
@Deprecated
final class Lucene40IntersectTermsEnumFrame {
final int ord;
long fp;
long fpOrig;
long fpEnd;
long lastSubFP;
// State in automaton
int state;
int metaDataUpto;
byte[] suffixBytes = new byte[128];
final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();
byte[] statBytes = new byte[64];
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
byte[] floorData = new byte[32];
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
// Length of prefix shared by all terms in this block
int prefix;
// Number of entries (term or sub-block) in this block
int entCount;
// Which term we will next read
int nextEnt;
// True if this block is either not a floor block,
// or, it's the last sub-block of a floor block
boolean isLastInFloor;
// True if all entries are terms
boolean isLeafBlock;
int numFollowFloorBlocks;
int nextFloorLabel;
Transition transition = new Transition();
int curTransitionMax;
int transitionIndex;
int transitionCount;
FST.Arc<BytesRef> arc;
final BlockTermState termState;
// metadata buffer, holding monotonic values
public long[] longs;
// metadata buffer, holding general values
public byte[] bytes;
ByteArrayDataInput bytesReader;
// Cumulative output so far
BytesRef outputPrefix;
int startBytePos;
int suffix;
private final Lucene40IntersectTermsEnum ite;
public Lucene40IntersectTermsEnumFrame(Lucene40IntersectTermsEnum ite, int ord) throws IOException {
this.ite = ite;
this.ord = ord;
this.termState = ite.fr.parent.postingsReader.newTermState();
this.termState.totalTermFreq = -1;
this.longs = new long[ite.fr.longsSize];
}
void loadNextFloorBlock() throws IOException {
assert numFollowFloorBlocks > 0;
//if (DEBUG) System.out.println("  loadNextFloorBlock trans=" + transitions[transitionIndex]);
do {
fp = fpOrig + (floorDataReader.readVLong() >>> 1);
numFollowFloorBlocks--;
// if (DEBUG) System.out.println(" skip floor block2! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[transitionIndex].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
if (numFollowFloorBlocks != 0) {
nextFloorLabel = floorDataReader.readByte() & 0xff;
} else {
nextFloorLabel = 256;
}
// if (DEBUG) System.out.println(" nextFloorLabel=" + (char) nextFloorLabel);
} while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
load(null);
}
public void setState(int state) {
this.state = state;
transitionIndex = 0;
transitionCount = ite.compiledAutomaton.automaton.getNumTransitions(state);
if (transitionCount != 0) {
ite.compiledAutomaton.automaton.initTransition(state, transition);
ite.compiledAutomaton.automaton.getNextTransition(transition);
curTransitionMax = transition.max;
} else {
curTransitionMax = -1;
}
}
void load(BytesRef frameIndexData) throws IOException {
// if (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state));
if (frameIndexData != null && transitionCount != 0) {
// Floor frame
if (floorData.length < frameIndexData.length) {
this.floorData = new byte[ArrayUtil.oversize(frameIndexData.length, 1)];
}
System.arraycopy(frameIndexData.bytes, frameIndexData.offset, floorData, 0, frameIndexData.length);
floorDataReader.reset(floorData, 0, frameIndexData.length);
// Skip first long -- has redundant fp, hasTerms
// flag, isFloor flag
final long code = floorDataReader.readVLong();
if ((code & Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff;
// if (DEBUG) System.out.println(" numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel);
// If current state is accept, we must process
// first block in case it has empty suffix:
if (!ite.runAutomaton.isAccept(state)) {
// Maybe skip floor blocks:
assert transitionIndex == 0: "transitionIndex=" + transitionIndex;
while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min) {
fp = fpOrig + (floorDataReader.readVLong() >>> 1);
numFollowFloorBlocks--;
// if (DEBUG) System.out.println(" skip floor block! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
if (numFollowFloorBlocks != 0) {
nextFloorLabel = floorDataReader.readByte() & 0xff;
} else {
nextFloorLabel = 256;
}
}
}
}
}
ite.in.seek(fp);
int code = ite.in.readVInt();
entCount = code >>> 1;
assert entCount > 0;
isLastInFloor = (code & 1) != 0;
// term suffixes:
code = ite.in.readVInt();
isLeafBlock = (code & 1) != 0;
int numBytes = code >>> 1;
// if (DEBUG) System.out.println(" entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes);
if (suffixBytes.length < numBytes) {
suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ite.in.readBytes(suffixBytes, 0, numBytes);
suffixesReader.reset(suffixBytes, 0, numBytes);
// stats
numBytes = ite.in.readVInt();
if (statBytes.length < numBytes) {
statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ite.in.readBytes(statBytes, 0, numBytes);
statsReader.reset(statBytes, 0, numBytes);
metaDataUpto = 0;
termState.termBlockOrd = 0;
nextEnt = 0;
// metadata
numBytes = ite.in.readVInt();
if (bytes == null) {
bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
bytesReader = new ByteArrayDataInput();
} else if (bytes.length < numBytes) {
bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ite.in.readBytes(bytes, 0, numBytes);
bytesReader.reset(bytes, 0, numBytes);
if (!isLastInFloor) {
// Sub-blocks of a single floor block are always
// written one after another -- tail recurse:
fpEnd = ite.in.getFilePointer();
}
}
// TODO: maybe add scanToLabel; should give perf boost
public boolean next() {
return isLeafBlock ? nextLeaf() : nextNonLeaf();
}
// Decodes next entry; returns true if it's a sub-block
public boolean nextLeaf() {
//if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
suffix = suffixesReader.readVInt();
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
return false;
}
public boolean nextNonLeaf() {
//if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
final int code = suffixesReader.readVInt();
suffix = code >>> 1;
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
if ((code & 1) == 0) {
// A normal term
termState.termBlockOrd++;
return false;
} else {
// A sub-block; make sub-FP absolute:
lastSubFP = fp - suffixesReader.readVLong();
return true;
}
}
public int getTermBlockOrd() {
return isLeafBlock ? nextEnt : termState.termBlockOrd;
}
public void decodeMetaData() throws IOException {
// lazily catch up on metadata decode:
final int limit = getTermBlockOrd();
boolean absolute = metaDataUpto == 0;
assert limit > 0;
// TODO: better API would be "jump straight to term=N"???
while (metaDataUpto < limit) {
// TODO: we could make "tiers" of metadata, ie,
// decode docFreq/totalTF but don't decode postings
// metadata; this way caller could get
// docFreq/totalTF w/o paying decode cost for
// postings
// TODO: if docFreq were bulk decoded we could
// just skipN here:
// stats
termState.docFreq = statsReader.readVInt();
//if (DEBUG) System.out.println(" dF=" + state.docFreq);
if (ite.fr.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
//if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
}
// metadata
for (int i = 0; i < ite.fr.longsSize; i++) {
longs[i] = bytesReader.readVLong();
}
ite.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ite.fr.fieldInfo, termState, absolute);
metaDataUpto++;
absolute = false;
}
termState.termBlockOrd = metaDataUpto;
}
}

View File

@ -0,0 +1,732 @@
package org.apache.lucene.codecs.blocktree;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.fst.FST;
/**
* @deprecated Only for 4.x backcompat
*/
@Deprecated
final class Lucene40SegmentTermsEnumFrame {
// Our index in stack[]:
final int ord;
boolean hasTerms;
boolean hasTermsOrig;
boolean isFloor;
FST.Arc<BytesRef> arc;
// File pointer where this block was loaded from
long fp;
long fpOrig;
long fpEnd;
byte[] suffixBytes = new byte[128];
final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();
byte[] statBytes = new byte[64];
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
byte[] floorData = new byte[32];
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
// Length of prefix shared by all terms in this block
int prefix;
// Number of entries (term or sub-block) in this block
int entCount;
// Which term we will next read, or -1 if the block
// isn't loaded yet
int nextEnt;
// True if this block is either not a floor block,
// or, it's the last sub-block of a floor block
boolean isLastInFloor;
// True if all entries are terms
boolean isLeafBlock;
long lastSubFP;
int nextFloorLabel;
int numFollowFloorBlocks;
// Next term to decode metaData; we decode metaData
// lazily so that scanning to find the matching term is
// fast and only if you find a match and app wants the
// stats or docs/positions enums, will we decode the
// metaData
int metaDataUpto;
final BlockTermState state;
// metadata buffer, holding monotonic values
public long[] longs;
// metadata buffer, holding general values
public byte[] bytes;
ByteArrayDataInput bytesReader;
private final Lucene40SegmentTermsEnum ste;
public Lucene40SegmentTermsEnumFrame(Lucene40SegmentTermsEnum ste, int ord) throws IOException {
this.ste = ste;
this.ord = ord;
this.state = ste.fr.parent.postingsReader.newTermState();
this.state.totalTermFreq = -1;
this.longs = new long[ste.fr.longsSize];
}
public void setFloorData(ByteArrayDataInput in, BytesRef source) {
final int numBytes = source.length - (in.getPosition() - source.offset);
if (numBytes > floorData.length) {
floorData = new byte[ArrayUtil.oversize(numBytes, 1)];
}
System.arraycopy(source.bytes, source.offset+in.getPosition(), floorData, 0, numBytes);
floorDataReader.reset(floorData, 0, numBytes);
numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff;
//if (DEBUG) {
//System.out.println(" setFloorData fpOrig=" + fpOrig + " bytes=" + new BytesRef(source.bytes, source.offset + in.getPosition(), numBytes) + " numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + toHex(nextFloorLabel));
//}
}
public int getTermBlockOrd() {
return isLeafBlock ? nextEnt : state.termBlockOrd;
}
void loadNextFloorBlock() throws IOException {
//if (DEBUG) {
//System.out.println(" loadNextFloorBlock fp=" + fp + " fpEnd=" + fpEnd);
//}
assert arc == null || isFloor: "arc=" + arc + " isFloor=" + isFloor;
fp = fpEnd;
nextEnt = -1;
loadBlock();
}
/* Does initial decode of next block of terms; this
doesn't actually decode the docFreq, totalTermFreq,
postings details (frq/prx offset, etc.) metadata;
it just loads them as byte[] blobs which are then
decoded on-demand if the metadata is ever requested
for any term in this block. This enables terms-only
intensive consumes (eg certain MTQs, respelling) to
not pay the price of decoding metadata they won't
use. */
void loadBlock() throws IOException {
// Clone the IndexInput lazily, so that consumers
// that just pull a TermsEnum to
// seekExact(TermState) don't pay this cost:
ste.initIndexInput();
if (nextEnt != -1) {
// Already loaded
return;
}
//System.out.println("blc=" + blockLoadCount);
ste.in.seek(fp);
int code = ste.in.readVInt();
entCount = code >>> 1;
assert entCount > 0;
isLastInFloor = (code & 1) != 0;
assert arc == null || (isLastInFloor || isFloor): "fp=" + fp + " arc=" + arc + " isFloor=" + isFloor + " isLastInFloor=" + isLastInFloor;
// TODO: if suffixes were stored in random-access
// array structure, then we could do binary search
// instead of linear scan to find target term; eg
// we could have simple array of offsets
// term suffixes:
code = ste.in.readVInt();
isLeafBlock = (code & 1) != 0;
int numBytes = code >>> 1;
if (suffixBytes.length < numBytes) {
suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ste.in.readBytes(suffixBytes, 0, numBytes);
suffixesReader.reset(suffixBytes, 0, numBytes);
/*if (DEBUG) {
if (arc == null) {
System.out.println(" loadBlock (next) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock);
} else {
System.out.println(" loadBlock (seek) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock);
}
}*/
// stats
numBytes = ste.in.readVInt();
if (statBytes.length < numBytes) {
statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ste.in.readBytes(statBytes, 0, numBytes);
statsReader.reset(statBytes, 0, numBytes);
metaDataUpto = 0;
state.termBlockOrd = 0;
nextEnt = 0;
lastSubFP = -1;
// TODO: we could skip this if !hasTerms; but
// that's rare so won't help much
// metadata
numBytes = ste.in.readVInt();
if (bytes == null) {
bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
bytesReader = new ByteArrayDataInput();
} else if (bytes.length < numBytes) {
bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ste.in.readBytes(bytes, 0, numBytes);
bytesReader.reset(bytes, 0, numBytes);
// Sub-blocks of a single floor block are always
// written one after another -- tail recurse:
fpEnd = ste.in.getFilePointer();
// if (DEBUG) {
// System.out.println(" fpEnd=" + fpEnd);
// }
}
void rewind() {
// Force reload:
fp = fpOrig;
nextEnt = -1;
hasTerms = hasTermsOrig;
if (isFloor) {
floorDataReader.rewind();
numFollowFloorBlocks = floorDataReader.readVInt();
assert numFollowFloorBlocks > 0;
nextFloorLabel = floorDataReader.readByte() & 0xff;
}
/*
//System.out.println("rewind");
// Keeps the block loaded, but rewinds its state:
if (nextEnt > 0 || fp != fpOrig) {
if (DEBUG) {
System.out.println(" rewind frame ord=" + ord + " fpOrig=" + fpOrig + " fp=" + fp + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " nextEnt=" + nextEnt + " prefixLen=" + prefix);
}
if (fp != fpOrig) {
fp = fpOrig;
nextEnt = -1;
} else {
nextEnt = 0;
}
hasTerms = hasTermsOrig;
if (isFloor) {
floorDataReader.rewind();
numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff;
}
assert suffixBytes != null;
suffixesReader.rewind();
assert statBytes != null;
statsReader.rewind();
metaDataUpto = 0;
state.termBlockOrd = 0;
// TODO: skip this if !hasTerms? Then postings
// impl wouldn't have to write useless 0 byte
postingsReader.resetTermsBlock(fieldInfo, state);
lastSubFP = -1;
} else if (DEBUG) {
System.out.println(" skip rewind fp=" + fp + " fpOrig=" + fpOrig + " nextEnt=" + nextEnt + " ord=" + ord);
}
*/
}
public boolean next() {
return isLeafBlock ? nextLeaf() : nextNonLeaf();
}
// Decodes next entry; returns true if it's a sub-block
public boolean nextLeaf() {
//if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
suffix = suffixesReader.readVInt();
startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix);
ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
// A normal term
ste.termExists = true;
return false;
}
public boolean nextNonLeaf() {
//if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
final int code = suffixesReader.readVInt();
suffix = code >>> 1;
startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix);
ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
if ((code & 1) == 0) {
// A normal term
ste.termExists = true;
subCode = 0;
state.termBlockOrd++;
return false;
} else {
// A sub-block; make sub-FP absolute:
ste.termExists = false;
subCode = suffixesReader.readVLong();
lastSubFP = fp - subCode;
//if (DEBUG) {
//System.out.println(" lastSubFP=" + lastSubFP);
//}
return true;
}
}
// TODO: make this array'd so we can do bin search?
// likely not worth it? need to measure how many
// floor blocks we "typically" get
public void scanToFloorFrame(BytesRef target) {
if (!isFloor || target.length <= prefix) {
// if (DEBUG) {
// System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + target.length + " vs prefix=" + prefix);
// }
return;
}
final int targetLabel = target.bytes[target.offset + prefix] & 0xFF;
// if (DEBUG) {
// System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + toHex(targetLabel) + " vs nextFloorLabel=" + toHex(nextFloorLabel) + " numFollowFloorBlocks=" + numFollowFloorBlocks);
// }
if (targetLabel < nextFloorLabel) {
// if (DEBUG) {
// System.out.println(" already on correct block");
// }
return;
}
assert numFollowFloorBlocks != 0;
long newFP = fpOrig;
while (true) {
final long code = floorDataReader.readVLong();
newFP = fpOrig + (code >>> 1);
hasTerms = (code & 1) != 0;
// if (DEBUG) {
// System.out.println(" label=" + toHex(nextFloorLabel) + " fp=" + newFP + " hasTerms?=" + hasTerms + " numFollowFloor=" + numFollowFloorBlocks);
// }
isLastInFloor = numFollowFloorBlocks == 1;
numFollowFloorBlocks--;
if (isLastInFloor) {
nextFloorLabel = 256;
// if (DEBUG) {
// System.out.println(" stop! last block nextFloorLabel=" + toHex(nextFloorLabel));
// }
break;
} else {
nextFloorLabel = floorDataReader.readByte() & 0xff;
if (targetLabel < nextFloorLabel) {
// if (DEBUG) {
// System.out.println(" stop! nextFloorLabel=" + toHex(nextFloorLabel));
// }
break;
}
}
}
if (newFP != fp) {
// Force re-load of the block:
// if (DEBUG) {
// System.out.println(" force switch to fp=" + newFP + " oldFP=" + fp);
// }
nextEnt = -1;
fp = newFP;
} else {
// if (DEBUG) {
// System.out.println(" stay on same fp=" + newFP);
// }
}
}
public void decodeMetaData() throws IOException {
//if (DEBUG) System.out.println("\nBTTR.decodeMetadata seg=" + segment + " mdUpto=" + metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd);
// lazily catch up on metadata decode:
final int limit = getTermBlockOrd();
boolean absolute = metaDataUpto == 0;
assert limit > 0;
// TODO: better API would be "jump straight to term=N"???
while (metaDataUpto < limit) {
// TODO: we could make "tiers" of metadata, ie,
// decode docFreq/totalTF but don't decode postings
// metadata; this way caller could get
// docFreq/totalTF w/o paying decode cost for
// postings
// TODO: if docFreq were bulk decoded we could
// just skipN here:
// stats
state.docFreq = statsReader.readVInt();
//if (DEBUG) System.out.println(" dF=" + state.docFreq);
if (ste.fr.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
state.totalTermFreq = state.docFreq + statsReader.readVLong();
//if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
}
// metadata
for (int i = 0; i < ste.fr.longsSize; i++) {
longs[i] = bytesReader.readVLong();
}
ste.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ste.fr.fieldInfo, state, absolute);
metaDataUpto++;
absolute = false;
}
state.termBlockOrd = metaDataUpto;
}
// Used only by assert
private boolean prefixMatches(BytesRef target) {
for(int bytePos=0;bytePos<prefix;bytePos++) {
if (target.bytes[target.offset + bytePos] != ste.term.byteAt(bytePos)) {
return false;
}
}
return true;
}
// Scans to sub-block that has this target fp; only
// called by next(); NOTE: does not set
// startBytePos/suffix as a side effect
public void scanToSubBlock(long subFP) {
assert !isLeafBlock;
//if (DEBUG) System.out.println(" scanToSubBlock fp=" + fp + " subFP=" + subFP + " entCount=" + entCount + " lastSubFP=" + lastSubFP);
//assert nextEnt == 0;
if (lastSubFP == subFP) {
//if (DEBUG) System.out.println(" already positioned");
return;
}
assert subFP < fp : "fp=" + fp + " subFP=" + subFP;
final long targetSubCode = fp - subFP;
//if (DEBUG) System.out.println(" targetSubCode=" + targetSubCode);
while(true) {
assert nextEnt < entCount;
nextEnt++;
final int code = suffixesReader.readVInt();
suffixesReader.skipBytes(isLeafBlock ? code : code >>> 1);
//if (DEBUG) System.out.println(" " + nextEnt + " (of " + entCount + ") ent isSubBlock=" + ((code&1)==1));
if ((code & 1) != 0) {
final long subCode = suffixesReader.readVLong();
//if (DEBUG) System.out.println(" subCode=" + subCode);
if (targetSubCode == subCode) {
//if (DEBUG) System.out.println(" match!");
lastSubFP = subFP;
return;
}
} else {
state.termBlockOrd++;
}
}
}
// NOTE: sets startBytePos/suffix as a side effect
public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOException {
return isLeafBlock ? scanToTermLeaf(target, exactOnly) : scanToTermNonLeaf(target, exactOnly);
}
private int startBytePos;
private int suffix;
private long subCode;
// Target's prefix matches this block's prefix; we
// scan the entries to check whether the suffix matches.
public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOException {
// if (DEBUG) System.out.println(" scanToTermLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + brToString(term));
assert nextEnt != -1;
ste.termExists = true;
subCode = 0;
if (nextEnt == entCount) {
if (exactOnly) {
fillTerm();
}
return SeekStatus.END;
}
assert prefixMatches(target);
// Loop over each entry (term or sub-block) in this block:
//nextTerm: while(nextEnt < entCount) {
nextTerm: while (true) {
nextEnt++;
suffix = suffixesReader.readVInt();
// if (DEBUG) {
// BytesRef suffixBytesRef = new BytesRef();
// suffixBytesRef.bytes = suffixBytes;
// suffixBytesRef.offset = suffixesReader.getPosition();
// suffixBytesRef.length = suffix;
// System.out.println(" cycle: term " + (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef));
// }
final int termLen = prefix + suffix;
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen);
int targetPos = target.offset + prefix;
// Loop over bytes in the suffix, comparing to
// the target
int bytePos = startBytePos;
while(true) {
final int cmp;
final boolean stop;
if (targetPos < targetLimit) {
cmp = (suffixBytes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF);
stop = false;
} else {
assert targetPos == targetLimit;
cmp = termLen - target.length;
stop = true;
}
if (cmp < 0) {
// Current entry is still before the target;
// keep scanning
if (nextEnt == entCount) {
if (exactOnly) {
fillTerm();
}
// We are done scanning this block
break nextTerm;
} else {
continue nextTerm;
}
} else if (cmp > 0) {
// Done! Current entry is after target --
// return NOT_FOUND:
fillTerm();
//if (DEBUG) System.out.println(" not found");
return SeekStatus.NOT_FOUND;
} else if (stop) {
// Exact match!
// This cannot be a sub-block because we
// would have followed the index to this
// sub-block from the start:
assert ste.termExists;
fillTerm();
//if (DEBUG) System.out.println(" found!");
return SeekStatus.FOUND;
}
}
}
// It is possible (and OK) that the terms index pointed us
// at this block, but we scanned the entire block and
// did not find the term to position to. This happens
// when the target is after the last term in the block
// (but before the next term in the index). E.g. the
// target could be foozzz, and the terms index pointed us
// to the foo* block, but the last term in this block
// was fooz (and, e.g., the first term in the next block
// will be fop).
//if (DEBUG) System.out.println(" block end");
if (exactOnly) {
fillTerm();
}
// TODO: not consistent that in the
// not-exact case we don't next() into the next
// frame here
return SeekStatus.END;
}
// Target's prefix matches this block's prefix; we
// scan the entries to check whether the suffix matches.
public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException {
//if (DEBUG) System.out.println(" scanToTermNonLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + brToString(term));
assert nextEnt != -1;
if (nextEnt == entCount) {
if (exactOnly) {
fillTerm();
ste.termExists = subCode == 0;
}
return SeekStatus.END;
}
assert prefixMatches(target);
// Loop over each entry (term or sub-block) in this block:
//nextTerm: while(nextEnt < entCount) {
nextTerm: while (true) {
nextEnt++;
final int code = suffixesReader.readVInt();
suffix = code >>> 1;
// if (DEBUG) {
// BytesRef suffixBytesRef = new BytesRef();
// suffixBytesRef.bytes = suffixBytes;
// suffixBytesRef.offset = suffixesReader.getPosition();
// suffixBytesRef.length = suffix;
// System.out.println(" cycle: " + ((code&1)==1 ? "sub-block" : "term") + " " + (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef));
// }
ste.termExists = (code & 1) == 0;
final int termLen = prefix + suffix;
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
if (ste.termExists) {
state.termBlockOrd++;
subCode = 0;
} else {
subCode = suffixesReader.readVLong();
lastSubFP = fp - subCode;
}
final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen);
int targetPos = target.offset + prefix;
// Loop over bytes in the suffix, comparing to
// the target
int bytePos = startBytePos;
while(true) {
final int cmp;
final boolean stop;
if (targetPos < targetLimit) {
cmp = (suffixBytes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF);
stop = false;
} else {
assert targetPos == targetLimit;
cmp = termLen - target.length;
stop = true;
}
if (cmp < 0) {
// Current entry is still before the target;
// keep scanning
if (nextEnt == entCount) {
if (exactOnly) {
fillTerm();
//termExists = true;
}
// We are done scanning this block
break nextTerm;
} else {
continue nextTerm;
}
} else if (cmp > 0) {
// Done! Current entry is after target --
// return NOT_FOUND:
fillTerm();
if (!exactOnly && !ste.termExists) {
// We are on a sub-block, and caller wants
// us to position to the next term after
// the target, so we must recurse into the
// sub-frame(s):
ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen);
ste.currentFrame.loadBlock();
while (ste.currentFrame.next()) {
ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length());
ste.currentFrame.loadBlock();
}
}
//if (DEBUG) System.out.println(" not found");
return SeekStatus.NOT_FOUND;
} else if (stop) {
// Exact match!
// This cannot be a sub-block because we
// would have followed the index to this
// sub-block from the start:
assert ste.termExists;
fillTerm();
//if (DEBUG) System.out.println(" found!");
return SeekStatus.FOUND;
}
}
}
// It is possible (and OK) that the terms index pointed us
// at this block, but we scanned the entire block and
// did not find the term to position to. This happens
// when the target is after the last term in the block
// (but before the next term in the index). E.g. the
// target could be foozzz, and the terms index pointed us
// to the foo* block, but the last term in this block
// was fooz (and, e.g., the first term in the next block
// will be fop).
//if (DEBUG) System.out.println(" block end");
if (exactOnly) {
fillTerm();
}
// TODO: not consistent that in the
// not-exact case we don't next() into the next
// frame here
return SeekStatus.END;
}
private void fillTerm() {
final int termLength = prefix + suffix;
ste.term.setLength(termLength);
ste.term.grow(termLength);
System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefix, suffix);
}
}

View File

@ -0,0 +1,201 @@
package org.apache.lucene.codecs.blocktree;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.Locale;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/**
* BlockTree statistics for a single field
* returned by {@link Lucene40FieldReader#getStats()}.
* @deprecated Only for 4.x backcompat
*/
@Deprecated
final class Lucene40Stats {
/** How many nodes in the index FST. */
public long indexNodeCount;
/** How many arcs in the index FST. */
public long indexArcCount;
/** Byte size of the index. */
public long indexNumBytes;
/** Total number of terms in the field. */
public long totalTermCount;
/** Total number of bytes (sum of term lengths) across all terms in the field. */
public long totalTermBytes;
/** The number of normal (non-floor) blocks in the terms file. */
public int nonFloorBlockCount;
/** The number of floor blocks (meta-blocks larger than the
* allowed {@code maxItemsPerBlock}) in the terms file. */
public int floorBlockCount;
/** The number of sub-blocks within the floor blocks. */
public int floorSubBlockCount;
/** The number of "internal" blocks (that have both
* terms and sub-blocks). */
public int mixedBlockCount;
/** The number of "leaf" blocks (blocks that have only
* terms). */
public int termsOnlyBlockCount;
/** The number of "internal" blocks that do not contain
* terms (have only sub-blocks). */
public int subBlocksOnlyBlockCount;
/** Total number of blocks. */
public int totalBlockCount;
/** Number of blocks at each prefix depth. */
public int[] blockCountByPrefixLen = new int[10];
private int startBlockCount;
private int endBlockCount;
/** Total number of bytes used to store term suffixes. */
public long totalBlockSuffixBytes;
/** Total number of bytes used to store term stats (not
* including what the {@link PostingsReaderBase}
* stores). */
public long totalBlockStatsBytes;
/** Total bytes stored by the {@link PostingsReaderBase},
* plus the other few vInts stored in the frame. */
public long totalBlockOtherBytes;
/** Segment name. */
public final String segment;
/** Field name. */
public final String field;
Lucene40Stats(String segment, String field) {
this.segment = segment;
this.field = field;
}
void startBlock(Lucene40SegmentTermsEnumFrame frame, boolean isFloor) {
totalBlockCount++;
if (isFloor) {
if (frame.fp == frame.fpOrig) {
floorBlockCount++;
}
floorSubBlockCount++;
} else {
nonFloorBlockCount++;
}
if (blockCountByPrefixLen.length <= frame.prefix) {
blockCountByPrefixLen = ArrayUtil.grow(blockCountByPrefixLen, 1+frame.prefix);
}
blockCountByPrefixLen[frame.prefix]++;
startBlockCount++;
totalBlockSuffixBytes += frame.suffixesReader.length();
totalBlockStatsBytes += frame.statsReader.length();
}
void endBlock(Lucene40SegmentTermsEnumFrame frame) {
final int termCount = frame.isLeafBlock ? frame.entCount : frame.state.termBlockOrd;
final int subBlockCount = frame.entCount - termCount;
totalTermCount += termCount;
if (termCount != 0 && subBlockCount != 0) {
mixedBlockCount++;
} else if (termCount != 0) {
termsOnlyBlockCount++;
} else if (subBlockCount != 0) {
subBlocksOnlyBlockCount++;
} else {
throw new IllegalStateException();
}
endBlockCount++;
final long otherBytes = frame.fpEnd - frame.fp - frame.suffixesReader.length() - frame.statsReader.length();
assert otherBytes > 0 : "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd;
totalBlockOtherBytes += otherBytes;
}
void term(BytesRef term) {
totalTermBytes += term.length;
}
void finish() {
assert startBlockCount == endBlockCount: "startBlockCount=" + startBlockCount + " endBlockCount=" + endBlockCount;
assert totalBlockCount == floorSubBlockCount + nonFloorBlockCount: "floorSubBlockCount=" + floorSubBlockCount + " nonFloorBlockCount=" + nonFloorBlockCount + " totalBlockCount=" + totalBlockCount;
assert totalBlockCount == mixedBlockCount + termsOnlyBlockCount + subBlocksOnlyBlockCount: "totalBlockCount=" + totalBlockCount + " mixedBlockCount=" + mixedBlockCount + " subBlocksOnlyBlockCount=" + subBlocksOnlyBlockCount + " termsOnlyBlockCount=" + termsOnlyBlockCount;
}
@Override
public String toString() {
final ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
PrintStream out;
try {
out = new PrintStream(bos, false, IOUtils.UTF_8);
} catch (UnsupportedEncodingException bogus) {
throw new RuntimeException(bogus);
}
out.println(" index FST:");
out.println(" " + indexNodeCount + " nodes");
out.println(" " + indexArcCount + " arcs");
out.println(" " + indexNumBytes + " bytes");
out.println(" terms:");
out.println(" " + totalTermCount + " terms");
out.println(" " + totalTermBytes + " bytes" + (totalTermCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalTermBytes)/totalTermCount) + " bytes/term)" : ""));
out.println(" blocks:");
out.println(" " + totalBlockCount + " blocks");
out.println(" " + termsOnlyBlockCount + " terms-only blocks");
out.println(" " + subBlocksOnlyBlockCount + " sub-block-only blocks");
out.println(" " + mixedBlockCount + " mixed blocks");
out.println(" " + floorBlockCount + " floor blocks");
out.println(" " + (totalBlockCount-floorSubBlockCount) + " non-floor blocks");
out.println(" " + floorSubBlockCount + " floor sub-blocks");
out.println(" " + totalBlockSuffixBytes + " term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockSuffixBytes)/totalBlockCount) + " suffix-bytes/block)" : ""));
out.println(" " + totalBlockStatsBytes + " term stats bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : ""));
out.println(" " + totalBlockOtherBytes + " other bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockOtherBytes)/totalBlockCount) + " other-bytes/block)" : ""));
if (totalBlockCount != 0) {
out.println(" by prefix length:");
int total = 0;
for(int prefix=0;prefix<blockCountByPrefixLen.length;prefix++) {
final int blockCount = blockCountByPrefixLen[prefix];
total += blockCount;
if (blockCount != 0) {
out.println(" " + String.format(Locale.ROOT, "%2d", prefix) + ": " + blockCount);
}
}
assert totalBlockCount == total;
}
try {
return bos.toString(IOUtils.UTF_8);
} catch (UnsupportedEncodingException bogus) {
throw new RuntimeException(bogus);
}
}
}

View File

@ -0,0 +1,26 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
BlockTree terms dictionary from Lucene 4.0-4.10
</body>
</html>

View File

@ -1,48 +0,0 @@
package org.apache.lucene.codecs.lucene40;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.PostingsBaseFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
/**
* PostingsReaderBase for 4.0 segments
* @deprecated Only for reading old 4.0 segments */
@Deprecated
final class Lucene40PostingsBaseFormat extends PostingsBaseFormat {
/** Sole constructor. */
Lucene40PostingsBaseFormat() {
super("Lucene40");
}
@Override
public PostingsReaderBase postingsReaderBase(SegmentReadState state) throws IOException {
return new Lucene40PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
}
@Override
public PostingsWriterBase postingsWriterBase(SegmentWriteState state) throws IOException {
throw new UnsupportedOperationException("this codec can only be used for reading");
}
}

View File

@ -23,7 +23,7 @@ import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader;
import org.apache.lucene.codecs.blocktree.Lucene40BlockTreeTermsReader;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
@ -51,13 +51,7 @@ public class Lucene40PostingsFormat extends PostingsFormat {
boolean success = false;
try {
FieldsProducer ret = new BlockTreeTermsReader(
state.directory,
state.fieldInfos,
state.segmentInfo,
postings,
state.context,
state.segmentSuffix);
FieldsProducer ret = new Lucene40BlockTreeTermsReader(postings, state);
success = true;
return ret;
} finally {

View File

@ -31,6 +31,7 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
@ -104,7 +105,7 @@ final class Lucene40PostingsReader extends PostingsReaderBase {
}
@Override
public void init(IndexInput termsIn) throws IOException {
public void init(IndexInput termsIn, SegmentReadState state) throws IOException {
// Make sure we are talking to the matching past writer
CodecUtil.checkHeader(termsIn, TERMS_CODEC, VERSION_START, VERSION_CURRENT);

View File

@ -45,7 +45,7 @@ public class Lucene40SegmentInfoFormat extends SegmentInfoFormat {
}
@Override
public final SegmentInfo read(Directory dir, String segment, IOContext context) throws IOException {
public final SegmentInfo read(Directory dir, String segment, byte segmentID[], IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene40SegmentInfoFormat.SI_EXTENSION);
final IndexInput input = dir.openInput(fileName, context);
boolean success = false;

View File

@ -0,0 +1,247 @@
package org.apache.lucene.codecs.lucene41;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.packed.PackedInts.Decoder;
import org.apache.lucene.util.packed.PackedInts.FormatAndBits;
import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE;
/**
* Lucene 4.1 postings format.
* @deprecated only for reading old 4.x segments
*/
@Deprecated
final class ForUtil {
/**
* Special number of bits per value used whenever all values to encode are equal.
*/
private static final int ALL_VALUES_EQUAL = 0;
/**
* Upper limit of the number of bytes that might be required to store
* <code>BLOCK_SIZE</code> encoded values.
*/
static final int MAX_ENCODED_SIZE = BLOCK_SIZE * 4;
/**
* Upper limit of the number of values that might be decoded in a single call to
* {@link #readBlock(IndexInput, byte[], int[])}. Although values after
* <code>BLOCK_SIZE</code> are garbage, it is necessary to allocate value buffers
* whose size is >= MAX_DATA_SIZE to avoid {@link ArrayIndexOutOfBoundsException}s.
*/
static final int MAX_DATA_SIZE;
static {
int maxDataSize = 0;
for(int version=PackedInts.VERSION_START;version<=PackedInts.VERSION_CURRENT;version++) {
for (PackedInts.Format format : PackedInts.Format.values()) {
for (int bpv = 1; bpv <= 32; ++bpv) {
if (!format.isSupported(bpv)) {
continue;
}
final PackedInts.Decoder decoder = PackedInts.getDecoder(format, version, bpv);
final int iterations = computeIterations(decoder);
maxDataSize = Math.max(maxDataSize, iterations * decoder.byteValueCount());
}
}
}
MAX_DATA_SIZE = maxDataSize;
}
/**
* Compute the number of iterations required to decode <code>BLOCK_SIZE</code>
* values with the provided {@link Decoder}.
*/
private static int computeIterations(PackedInts.Decoder decoder) {
return (int) Math.ceil((float) BLOCK_SIZE / decoder.byteValueCount());
}
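// For example, with BLOCK_SIZE = 128 a decoder whose byteValueCount() is 32 needs
// ceil(128 / 32) = 4 iterations, so a single readBlock call may fill up to
// 4 * 32 = 128 slots; MAX_DATA_SIZE above is simply the maximum of this product over
// every supported packed-ints format, version and bits-per-value.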
/**
* Compute the number of bytes required to encode a block of values that require
* <code>bitsPerValue</code> bits per value with format <code>format</code>.
*/
private static int encodedSize(PackedInts.Format format, int packedIntsVersion, int bitsPerValue) {
final long byteCount = format.byteCount(packedIntsVersion, BLOCK_SIZE, bitsPerValue);
assert byteCount >= 0 && byteCount <= Integer.MAX_VALUE : byteCount;
return (int) byteCount;
}
private final int[] encodedSizes;
private final PackedInts.Encoder[] encoders;
private final PackedInts.Decoder[] decoders;
private final int[] iterations;
/**
* Create a new {@link ForUtil} instance and save state into <code>out</code>.
*/
ForUtil(float acceptableOverheadRatio, DataOutput out) throws IOException {
out.writeVInt(PackedInts.VERSION_CURRENT);
encodedSizes = new int[33];
encoders = new PackedInts.Encoder[33];
decoders = new PackedInts.Decoder[33];
iterations = new int[33];
for (int bpv = 1; bpv <= 32; ++bpv) {
final FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(
BLOCK_SIZE, bpv, acceptableOverheadRatio);
assert formatAndBits.format.isSupported(formatAndBits.bitsPerValue);
assert formatAndBits.bitsPerValue <= 32;
encodedSizes[bpv] = encodedSize(formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue);
encoders[bpv] = PackedInts.getEncoder(
formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue);
decoders[bpv] = PackedInts.getDecoder(
formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue);
iterations[bpv] = computeIterations(decoders[bpv]);
out.writeVInt(formatAndBits.format.getId() << 5 | (formatAndBits.bitsPerValue - 1));
}
}
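// The saved state is therefore one vInt (PackedInts.VERSION_CURRENT) followed by 32
// vInts, one per bits-per-value from 1 to 32, each packing
// (format.getId() << 5) | (bitsPerValue - 1); the restoring constructor below reads
// back exactly this layout.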
/**
* Restore a {@link ForUtil} from a {@link DataInput}.
*/
ForUtil(DataInput in) throws IOException {
int packedIntsVersion = in.readVInt();
PackedInts.checkVersion(packedIntsVersion);
encodedSizes = new int[33];
encoders = new PackedInts.Encoder[33];
decoders = new PackedInts.Decoder[33];
iterations = new int[33];
for (int bpv = 1; bpv <= 32; ++bpv) {
final int code = in.readVInt();
final int formatId = code >>> 5;
final int bitsPerValue = (code & 31) + 1;
final PackedInts.Format format = PackedInts.Format.byId(formatId);
assert format.isSupported(bitsPerValue);
encodedSizes[bpv] = encodedSize(format, packedIntsVersion, bitsPerValue);
encoders[bpv] = PackedInts.getEncoder(
format, packedIntsVersion, bitsPerValue);
decoders[bpv] = PackedInts.getDecoder(
format, packedIntsVersion, bitsPerValue);
iterations[bpv] = computeIterations(decoders[bpv]);
}
}
/**
* Write a block of data (<code>For</code> format).
*
* @param data the data to write
* @param encoded a buffer to use to encode data
* @param out the destination output
* @throws IOException If there is a low-level I/O error
*/
void writeBlock(int[] data, byte[] encoded, IndexOutput out) throws IOException {
if (isAllEqual(data)) {
out.writeByte((byte) ALL_VALUES_EQUAL);
out.writeVInt(data[0]);
return;
}
final int numBits = bitsRequired(data);
assert numBits > 0 && numBits <= 32 : numBits;
final PackedInts.Encoder encoder = encoders[numBits];
final int iters = iterations[numBits];
assert iters * encoder.byteValueCount() >= BLOCK_SIZE;
final int encodedSize = encodedSizes[numBits];
assert iters * encoder.byteBlockCount() >= encodedSize;
out.writeByte((byte) numBits);
encoder.encode(data, 0, encoded, 0, iters);
out.writeBytes(encoded, encodedSize);
}
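// Resulting per-block layout: either a single 0 byte followed by a vInt when all
// values are equal, or a one-byte bit width (1..32) followed by
// encodedSizes[numBits] packed bytes. readBlock and skipBlock below rely on this
// framing.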
/**
* Read the next block of data (<code>For</code> format).
*
* @param in the input to use to read data
* @param encoded a buffer that can be used to store encoded data
* @param decoded where to write decoded data
* @throws IOException If there is a low-level I/O error
*/
void readBlock(IndexInput in, byte[] encoded, int[] decoded) throws IOException {
final int numBits = in.readByte();
assert numBits <= 32 : numBits;
if (numBits == ALL_VALUES_EQUAL) {
final int value = in.readVInt();
Arrays.fill(decoded, 0, BLOCK_SIZE, value);
return;
}
final int encodedSize = encodedSizes[numBits];
in.readBytes(encoded, 0, encodedSize);
final PackedInts.Decoder decoder = decoders[numBits];
final int iters = iterations[numBits];
assert iters * decoder.byteValueCount() >= BLOCK_SIZE;
decoder.decode(encoded, 0, decoded, 0, iters);
}
/**
* Skip the next block of data.
*
* @param in the input where to read data
* @throws IOException If there is a low-level I/O error
*/
void skipBlock(IndexInput in) throws IOException {
final int numBits = in.readByte();
if (numBits == ALL_VALUES_EQUAL) {
in.readVInt();
return;
}
assert numBits > 0 && numBits <= 32 : numBits;
final int encodedSize = encodedSizes[numBits];
in.seek(in.getFilePointer() + encodedSize);
}
private static boolean isAllEqual(final int[] data) {
final int v = data[0];
for (int i = 1; i < BLOCK_SIZE; ++i) {
if (data[i] != v) {
return false;
}
}
return true;
}
/**
* Compute the number of bits required to serialize any of the longs in
* <code>data</code>.
*/
private static int bitsRequired(final int[] data) {
long or = 0;
for (int i = 0; i < BLOCK_SIZE; ++i) {
assert data[i] >= 0;
or |= data[i];
}
return PackedInts.bitsRequired(or);
}
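// For example, if the largest value in the block is 5 (binary 101), the OR of all
// values has the same top bit, so PackedInts.bitsRequired returns 3 bits per value.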
}

View File

@ -0,0 +1,62 @@
package org.apache.lucene.codecs.lucene41;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.index.TermState;
/**
* term state for Lucene 4.1 postings format
* @deprecated only for reading old 4.x segments
*/
@Deprecated
final class IntBlockTermState extends BlockTermState {
long docStartFP = 0;
long posStartFP = 0;
long payStartFP = 0;
long skipOffset = -1;
long lastPosBlockOffset = -1;
// docid when there is a single pulsed posting, otherwise -1
// freq is always implicitly totalTermFreq in this case.
int singletonDocID = -1;
@Override
public IntBlockTermState clone() {
IntBlockTermState other = new IntBlockTermState();
other.copyFrom(this);
return other;
}
@Override
public void copyFrom(TermState _other) {
super.copyFrom(_other);
IntBlockTermState other = (IntBlockTermState) _other;
docStartFP = other.docStartFP;
posStartFP = other.posStartFP;
payStartFP = other.payStartFP;
lastPosBlockOffset = other.lastPosBlockOffset;
skipOffset = other.skipOffset;
singletonDocID = other.singletonDocID;
}
@Override
public String toString() {
return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID;
}
}

View File

@ -0,0 +1,113 @@
package org.apache.lucene.codecs.lucene41;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.blocktree.Lucene40BlockTreeTermsReader;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/**
* Lucene 4.1 postings format.
* @deprecated only for reading old 4.x segments
*/
@Deprecated
public class Lucene41PostingsFormat extends PostingsFormat {
/**
* Filename extension for document number, frequencies, and skip data.
* See chapter: <a href="#Frequencies">Frequencies and Skip Data</a>
*/
public static final String DOC_EXTENSION = "doc";
/**
* Filename extension for positions.
* See chapter: <a href="#Positions">Positions</a>
*/
public static final String POS_EXTENSION = "pos";
/**
* Filename extension for payloads and offsets.
* See chapter: <a href="#Payloads">Payloads and Offsets</a>
*/
public static final String PAY_EXTENSION = "pay";
/**
* Expert: The maximum number of skip levels. Smaller values result in
* slightly smaller indexes, but slower skipping in big posting lists.
*/
static final int maxSkipLevels = 10;
final static String TERMS_CODEC = "Lucene41PostingsWriterTerms";
final static String DOC_CODEC = "Lucene41PostingsWriterDoc";
final static String POS_CODEC = "Lucene41PostingsWriterPos";
final static String PAY_CODEC = "Lucene41PostingsWriterPay";
// Increment version to change it
final static int VERSION_START = 0;
final static int VERSION_META_ARRAY = 1;
final static int VERSION_CHECKSUM = 2;
final static int VERSION_CURRENT = VERSION_CHECKSUM;
/**
* Fixed packed block size, number of integers encoded in
* a single packed block.
*/
// NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding
public final static int BLOCK_SIZE = 128;
/** Creates {@code Lucene41PostingsFormat} with default
* settings. */
public Lucene41PostingsFormat() {
super("Lucene41");
}
@Override
public String toString() {
return getName() + "(blocksize=" + BLOCK_SIZE + ")";
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
throw new UnsupportedOperationException("this codec can only be used for reading");
}
@Override
public final FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory,
state.fieldInfos,
state.segmentInfo,
state.context,
state.segmentSuffix);
boolean success = false;
try {
FieldsProducer ret = new Lucene40BlockTreeTermsReader(postingsReader, state);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsReader);
}
}
}
}

View File

@ -20,7 +20,6 @@ package org.apache.lucene.codecs.lucene41;
import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_DATA_SIZE;
import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_ENCODED_SIZE;
import static org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter.IntBlockTermState;
import java.io.IOException;
import java.util.Arrays;
@ -32,6 +31,7 @@ import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
@ -48,12 +48,10 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Concrete class that reads docId(maybe frq,pos,offset,payloads) list
* with postings format.
*
* @see Lucene41SkipReader for details
* @lucene.experimental
* Lucene 4.1 postings format.
* @deprecated only for reading old 4.x segments
*/
@Deprecated
public final class Lucene41PostingsReader extends PostingsReaderBase {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(Lucene41PostingsReader.class);
@ -77,12 +75,12 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
docIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.DOC_EXTENSION),
ioContext);
version = CodecUtil.checkHeader(docIn,
Lucene41PostingsWriter.DOC_CODEC,
Lucene41PostingsWriter.VERSION_START,
Lucene41PostingsWriter.VERSION_CURRENT);
Lucene41PostingsFormat.DOC_CODEC,
Lucene41PostingsFormat.VERSION_START,
Lucene41PostingsFormat.VERSION_CURRENT);
forUtil = new ForUtil(docIn);
if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) {
if (version >= Lucene41PostingsFormat.VERSION_CHECKSUM) {
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
@ -93,9 +91,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
if (fieldInfos.hasProx()) {
posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.POS_EXTENSION),
ioContext);
CodecUtil.checkHeader(posIn, Lucene41PostingsWriter.POS_CODEC, version, version);
CodecUtil.checkHeader(posIn, Lucene41PostingsFormat.POS_CODEC, version, version);
if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) {
if (version >= Lucene41PostingsFormat.VERSION_CHECKSUM) {
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
@ -106,9 +104,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
if (fieldInfos.hasPayloads() || fieldInfos.hasOffsets()) {
payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.PAY_EXTENSION),
ioContext);
CodecUtil.checkHeader(payIn, Lucene41PostingsWriter.PAY_CODEC, version, version);
CodecUtil.checkHeader(payIn, Lucene41PostingsFormat.PAY_CODEC, version, version);
if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) {
if (version >= Lucene41PostingsFormat.VERSION_CHECKSUM) {
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
@ -130,12 +128,12 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
}
@Override
public void init(IndexInput termsIn) throws IOException {
public void init(IndexInput termsIn, SegmentReadState state) throws IOException {
// Make sure we are talking to the matching postings writer
CodecUtil.checkHeader(termsIn,
Lucene41PostingsWriter.TERMS_CODEC,
Lucene41PostingsWriter.VERSION_START,
Lucene41PostingsWriter.VERSION_CURRENT);
Lucene41PostingsFormat.TERMS_CODEC,
Lucene41PostingsFormat.VERSION_START,
Lucene41PostingsFormat.VERSION_CURRENT);
final int indexBlockSize = termsIn.readVInt();
if (indexBlockSize != BLOCK_SIZE) {
throw new IllegalStateException("index-time BLOCK_SIZE (" + indexBlockSize + ") != read-time BLOCK_SIZE (" + BLOCK_SIZE + ")");
@ -187,7 +185,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
termState.posStartFP = 0;
termState.payStartFP = 0;
}
if (version < Lucene41PostingsWriter.VERSION_META_ARRAY) { // backward compatibility
if (version < Lucene41PostingsFormat.VERSION_META_ARRAY) { // backward compatibility
_decodeTerm(in, fieldInfo, termState);
return;
}
@ -488,7 +486,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
if (skipper == null) {
// Lazy init: first time this enum has ever been used for skipping
skipper = new Lucene41SkipReader(docIn.clone(),
Lucene41PostingsWriter.maxSkipLevels,
Lucene41PostingsFormat.maxSkipLevels,
BLOCK_SIZE,
indexHasPos,
indexHasOffsets,
@ -821,7 +819,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
// System.out.println(" create skipper");
// }
skipper = new Lucene41SkipReader(docIn.clone(),
Lucene41PostingsWriter.maxSkipLevels,
Lucene41PostingsFormat.maxSkipLevels,
BLOCK_SIZE,
true,
indexHasOffsets,
@ -1347,7 +1345,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
// System.out.println(" create skipper");
// }
skipper = new Lucene41SkipReader(docIn.clone(),
Lucene41PostingsWriter.maxSkipLevels,
Lucene41PostingsFormat.maxSkipLevels,
BLOCK_SIZE,
true,
indexHasOffsets,
@ -1590,7 +1588,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
@Override
public void checkIntegrity() throws IOException {
if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) {
if (version >= Lucene41PostingsFormat.VERSION_CHECKSUM) {
if (docIn != null) {
CodecUtil.checksumEntireFile(docIn);
}

View File

@ -24,32 +24,10 @@ import org.apache.lucene.codecs.MultiLevelSkipListReader;
import org.apache.lucene.store.IndexInput;
/**
* Implements the skip list reader for block postings format
* that stores positions and payloads.
*
* Although this skipper uses MultiLevelSkipListReader as an interface,
* its definition of skip position will be a little different.
*
* For example, when skipInterval = blockSize = 3, df = 2*skipInterval = 6,
*
* 0 1 2 3 4 5
* d d d d d d (posting list)
* ^ ^ (skip point in MultiLeveSkipWriter)
* ^ (skip point in Lucene41SkipWriter)
*
* In this case, MultiLevelSkipListReader will use the last document as a skip point,
* while Lucene41SkipReader should assume no skip point will come.
*
* If we use the interface directly in Lucene41SkipReader, it may naively try to read
* more skip data after the only skip point is loaded.
*
* To illustrate this, suppose we call skipTo(d[5]): since skip point d[3] has a smaller docId,
* and numSkipped + blockSize == df, the MultiLevelSkipListReader will assume the skip list
* isn't exhausted yet, and try to load a non-existent skip point.
*
* Therefore, we'll trim df before passing it to the interface; see trim(int).
*
* Lucene 4.1 skiplist format.
* @deprecated only for reading old 4.x segments
*/
@Deprecated
final class Lucene41SkipReader extends MultiLevelSkipListReader {
// private boolean DEBUG = Lucene41PostingsReader.DEBUG;
private final int blockSize;

View File

@ -74,12 +74,12 @@ public class Lucene410Codec extends Codec {
}
@Override
public final StoredFieldsFormat storedFieldsFormat() {
public StoredFieldsFormat storedFieldsFormat() {
return fieldsFormat;
}
@Override
public final TermVectorsFormat termVectorsFormat() {
public TermVectorsFormat termVectorsFormat() {
return vectorsFormat;
}
@ -94,7 +94,7 @@ public class Lucene410Codec extends Codec {
}
@Override
public final SegmentInfoFormat segmentInfoFormat() {
public SegmentInfoFormat segmentInfoFormat() {
return segmentInfosFormat;
}
@ -127,7 +127,7 @@ public class Lucene410Codec extends Codec {
}
@Override
public final DocValuesFormat docValuesFormat() {
public DocValuesFormat docValuesFormat() {
return docValuesFormat;
}
@ -142,7 +142,7 @@ public class Lucene410Codec extends Codec {
};
@Override
public final NormsFormat normsFormat() {
public NormsFormat normsFormat() {
return normsFormat;
}
}

View File

@ -28,6 +28,7 @@ import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.FieldInfo.DocValuesType;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
@ -41,7 +42,11 @@ import org.apache.lucene.util.packed.DirectWriter;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
/** writer for {@link Lucene410DocValuesFormat} */
/**
* writer for 4.10 docvalues format
* @deprecated only for old 4.x segments
*/
@Deprecated
class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable {
static final int BLOCK_SIZE = 16384;
@ -108,6 +113,7 @@ class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable
@Override
public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
checkCanWrite(field);
addNumericField(field, values, true);
}
@ -262,6 +268,7 @@ class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable
@Override
public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
checkCanWrite(field);
// write the byte[] data
meta.writeVInt(field.number);
meta.writeByte(Lucene410DocValuesFormat.BINARY);
@ -466,6 +473,7 @@ class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable
@Override
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
checkCanWrite(field);
meta.writeVInt(field.number);
meta.writeByte(Lucene410DocValuesFormat.SORTED);
addTermsDict(field, values);
@ -474,6 +482,7 @@ class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable
@Override
public void addSortedNumericField(FieldInfo field, final Iterable<Number> docToValueCount, final Iterable<Number> values) throws IOException {
checkCanWrite(field);
meta.writeVInt(field.number);
meta.writeByte(Lucene410DocValuesFormat.SORTED_NUMERIC);
if (isSingleValued(docToValueCount)) {
@ -491,6 +500,7 @@ class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable
@Override
public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, final Iterable<Number> docToOrdCount, final Iterable<Number> ords) throws IOException {
checkCanWrite(field);
meta.writeVInt(field.number);
meta.writeByte(Lucene410DocValuesFormat.SORTED_SET);
@ -556,4 +566,14 @@ class Lucene410DocValuesConsumer extends DocValuesConsumer implements Closeable
meta = data = null;
}
}
void checkCanWrite(FieldInfo field) {
if ((field.getDocValuesType() == DocValuesType.NUMERIC ||
field.getDocValuesType() == DocValuesType.BINARY) &&
field.getDocValuesGen() != -1) {
// ok
} else {
throw new UnsupportedOperationException("this codec can only be used for reading");
}
}
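// In other words, this deprecated 4.10 consumer only keeps its write path open for
// docvalues updates (a NUMERIC or BINARY field whose docValuesGen is not -1); every
// other write attempt fails with UnsupportedOperationException.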
}

View File

@ -0,0 +1,61 @@
package org.apache.lucene.codecs.lucene410;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
/**
* 4.10 docvalues format
* @deprecated only for old 4.x segments
*/
@Deprecated
public class Lucene410DocValuesFormat extends DocValuesFormat {
/** Sole constructor. */
public Lucene410DocValuesFormat() {
super("Lucene410");
}
@Override
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return new Lucene410DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
}
@Override
public final DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
return new Lucene410DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
}
static final String DATA_CODEC = "Lucene410DocValuesData";
static final String DATA_EXTENSION = "dvd";
static final String META_CODEC = "Lucene410ValuesMetadata";
static final String META_EXTENSION = "dvm";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
static final byte NUMERIC = 0;
static final byte BINARY = 1;
static final byte SORTED = 2;
static final byte SORTED_SET = 3;
static final byte SORTED_NUMERIC = 4;
}

View File

@ -74,7 +74,11 @@ import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.DirectReader;
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
/** reader for {@link Lucene410DocValuesFormat} */
/**
* reader for 4.10 docvalues format
* @deprecated only for old 4.x segments
*/
@Deprecated
class Lucene410DocValuesProducer extends DocValuesProducer implements Closeable {
private final Map<String,NumericEntry> numerics = new HashMap<>();
private final Map<String,BinaryEntry> binaries = new HashMap<>();

View File

@ -44,7 +44,7 @@ public class Lucene46SegmentInfoFormat extends SegmentInfoFormat {
}
@Override
public SegmentInfo read(Directory dir, String segment, IOContext context) throws IOException {
public SegmentInfo read(Directory dir, String segment, byte segmentID[], IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene46SegmentInfoFormat.SI_EXTENSION);
try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) {
int codecVersion = CodecUtil.checkHeader(input, Lucene46SegmentInfoFormat.CODEC_NAME,

View File

@ -16,3 +16,5 @@
org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat
org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat
org.apache.lucene.codecs.lucene49.Lucene49DocValuesFormat
org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat

View File

@ -14,3 +14,4 @@
# limitations under the License.
org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat
org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat

View File

@ -0,0 +1,67 @@
package org.apache.lucene.codecs.blocktree;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.blocktree.Lucene40FieldReader;
import org.apache.lucene.codecs.blocktree.Lucene40Stats;
import org.apache.lucene.codecs.lucene41.Lucene41RWCodec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.BasePostingsFormatTestCase;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
/**
* Tests the Lucene 4.0 BlockTree terms format
*/
public class TestLucene40BlockFormat extends BasePostingsFormatTestCase {
private final Codec codec = new Lucene41RWCodec();
@Override
protected Codec getCodec() {
return codec;
}
/** Make sure the final sub-block(s) are not skipped. */
public void testFinalBlock() throws Exception {
Directory d = newDirectory();
IndexWriter w = new IndexWriter(d, new IndexWriterConfig(new MockAnalyzer(random())));
for(int i=0;i<25;i++) {
Document doc = new Document();
doc.add(newStringField("field", Character.toString((char) (97+i)), Field.Store.NO));
doc.add(newStringField("field", "z" + Character.toString((char) (97+i)), Field.Store.NO));
w.addDocument(doc);
}
w.forceMerge(1);
DirectoryReader r = DirectoryReader.open(w, true);
assertEquals(1, r.leaves().size());
Lucene40FieldReader field = (Lucene40FieldReader) r.leaves().get(0).reader().fields().terms("field");
// We should see exactly two blocks: one root block (prefix empty string) and one block for z* terms (prefix z):
Lucene40Stats stats = field.getStats();
assertEquals(0, stats.floorBlockCount);
assertEquals(2, stats.nonFloorBlockCount);
r.close();
w.close();
d.close();
}
}

View File

@ -129,7 +129,7 @@ final class Lucene40PostingsWriter extends PushPostingsWriterBase {
}
@Override
public void init(IndexOutput termsOut) throws IOException {
public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException {
CodecUtil.writeHeader(termsOut, Lucene40PostingsReader.TERMS_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
termsOut.writeInt(skipInterval); // write skipInterval
termsOut.writeInt(maxSkipLevels); // write maxSkipLevels

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
import org.apache.lucene.codecs.blocktree.Lucene40BlockTreeTermsWriter;
import org.apache.lucene.index.SegmentWriteState;
/**
@ -46,7 +46,7 @@ public final class Lucene40RWPostingsFormat extends Lucene40PostingsFormat {
// Or... you must make a new Codec for this?
boolean success = false;
try {
FieldsConsumer ret = new BlockTreeTermsWriter(state, docs, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
FieldsConsumer ret = new Lucene40BlockTreeTermsWriter(state, docs, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
success = true;
return ret;
} finally {

View File

@ -26,7 +26,6 @@ import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
@ -38,35 +37,13 @@ import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_DATA_SIZE;
import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_ENCODED_SIZE;
import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE;
/**
* Concrete class that writes docId(maybe frq,pos,offset,payloads) list
* with postings format.
*
* Postings list for each term will be stored separately.
*
* @see Lucene41SkipWriter for details about skipping setting and postings layout.
* @lucene.experimental
* Writes 4.1 postings for testing
* @deprecated for test purposes only
*/
@Deprecated
public final class Lucene41PostingsWriter extends PushPostingsWriterBase {
/**
* Expert: The maximum number of skip levels. Smaller values result in
* slightly smaller indexes, but slower skipping in big posting lists.
*/
static final int maxSkipLevels = 10;
final static String TERMS_CODEC = "Lucene41PostingsWriterTerms";
final static String DOC_CODEC = "Lucene41PostingsWriterDoc";
final static String POS_CODEC = "Lucene41PostingsWriterPos";
final static String PAY_CODEC = "Lucene41PostingsWriterPay";
// Increment version to change it
final static int VERSION_START = 0;
final static int VERSION_META_ARRAY = 1;
final static int VERSION_CHECKSUM = 2;
final static int VERSION_CURRENT = VERSION_CHECKSUM;
IndexOutput docOut;
IndexOutput posOut;
IndexOutput payOut;
@ -119,13 +96,13 @@ public final class Lucene41PostingsWriter extends PushPostingsWriterBase {
IndexOutput payOut = null;
boolean success = false;
try {
CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT);
CodecUtil.writeHeader(docOut, Lucene41PostingsFormat.DOC_CODEC, Lucene41PostingsFormat.VERSION_CURRENT);
forUtil = new ForUtil(acceptableOverheadRatio, docOut);
if (state.fieldInfos.hasProx()) {
posDeltaBuffer = new int[MAX_DATA_SIZE];
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene41PostingsFormat.POS_EXTENSION),
state.context);
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
CodecUtil.writeHeader(posOut, Lucene41PostingsFormat.POS_CODEC, Lucene41PostingsFormat.VERSION_CURRENT);
if (state.fieldInfos.hasPayloads()) {
payloadBytes = new byte[128];
@ -146,7 +123,7 @@ public final class Lucene41PostingsWriter extends PushPostingsWriterBase {
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene41PostingsFormat.PAY_EXTENSION),
state.context);
CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
CodecUtil.writeHeader(payOut, Lucene41PostingsFormat.PAY_CODEC, Lucene41PostingsFormat.VERSION_CURRENT);
}
} else {
posDeltaBuffer = null;
@ -168,7 +145,7 @@ public final class Lucene41PostingsWriter extends PushPostingsWriterBase {
freqBuffer = new int[MAX_DATA_SIZE];
// TODO: should we try skipping every 2/4 blocks...?
skipWriter = new Lucene41SkipWriter(maxSkipLevels,
skipWriter = new Lucene41SkipWriter(Lucene41PostingsFormat.maxSkipLevels,
BLOCK_SIZE,
state.segmentInfo.getDocCount(),
docOut,
@ -183,50 +160,14 @@ public final class Lucene41PostingsWriter extends PushPostingsWriterBase {
this(state, PackedInts.COMPACT);
}
final static class IntBlockTermState extends BlockTermState {
long docStartFP = 0;
long posStartFP = 0;
long payStartFP = 0;
long skipOffset = -1;
long lastPosBlockOffset = -1;
// docid when there is a single pulsed posting, otherwise -1
// freq is always implicitly totalTermFreq in this case.
int singletonDocID = -1;
@Override
public IntBlockTermState clone() {
IntBlockTermState other = new IntBlockTermState();
other.copyFrom(this);
return other;
}
@Override
public void copyFrom(TermState _other) {
super.copyFrom(_other);
IntBlockTermState other = (IntBlockTermState) _other;
docStartFP = other.docStartFP;
posStartFP = other.posStartFP;
payStartFP = other.payStartFP;
lastPosBlockOffset = other.lastPosBlockOffset;
skipOffset = other.skipOffset;
singletonDocID = other.singletonDocID;
}
@Override
public String toString() {
return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID;
}
}
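// Illustration of the singleton case documented above (hypothetical numbers): for a term that
// appears in exactly one document, say doc 42, nothing is written to the .doc postings file;
// singletonDocID = 42 is kept inline in the term metadata and the frequency is implied by
// totalTermFreq, so the reader never has to touch the doc file for that term.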
@Override
public IntBlockTermState newTermState() {
return new IntBlockTermState();
}
@Override
public void init(IndexOutput termsOut) throws IOException {
CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException {
CodecUtil.writeHeader(termsOut, Lucene41PostingsFormat.TERMS_CODEC, Lucene41PostingsFormat.VERSION_CURRENT);
termsOut.writeVInt(BLOCK_SIZE);
}

View File

@ -3,6 +3,7 @@ package org.apache.lucene.codecs.lucene41;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
@ -40,6 +41,12 @@ public final class Lucene41RWCodec extends Lucene41Codec {
private final DocValuesFormat docValues = new Lucene40RWDocValuesFormat();
private final NormsFormat norms = new Lucene40RWNormsFormat();
private final TermVectorsFormat vectors = new Lucene40RWTermVectorsFormat();
private final PostingsFormat postings = new Lucene41RWPostingsFormat();
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return postings;
}
@Override
public FieldInfosFormat fieldInfosFormat() {

View File

@ -0,0 +1,56 @@
package org.apache.lucene.codecs.lucene41;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blocktree.Lucene40BlockTreeTermsWriter;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/**
* Read-write version of 4.1 postings format for testing
* @deprecated for test purposes only
*/
@Deprecated
public class Lucene41RWPostingsFormat extends Lucene41PostingsFormat {
static final int MIN_BLOCK_SIZE = 25;
static final int MAX_BLOCK_SIZE = 48;
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret = new Lucene40BlockTreeTermsWriter(state,
postingsWriter,
MIN_BLOCK_SIZE,
MAX_BLOCK_SIZE);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
}

View File

@ -24,25 +24,10 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
/**
* Write skip lists with multiple levels, and support skip within block ints.
*
* Assume that docFreq = 28, skipInterval = blockSize = 12
*
* | block#0 | | block#1 | |vInts|
* d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list)
* ^ ^ (level 0 skip point)
*
 * Note that skipWriter will ignore the first document in block#0, since
 * it is useless as a skip point. Also, we'll never skip into the vInts
 * block; we only record skip data at its start point (if it exists).
*
* For each skip point, we will record:
* 1. docID in former position, i.e. for position 12, record docID[11], etc.
* 2. its related file points(position, payload),
* 3. related numbers or uptos(position, payload).
* 4. start offset.
*
* Writes 4.1 skiplists for testing
* @deprecated for test purposes only
*/
@Deprecated
final class Lucene41SkipWriter extends MultiLevelSkipListWriter {
// private boolean DEBUG = Lucene41PostingsReader.DEBUG;
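// Worked example of the layout described in the (pre-5.0) javadoc above, assuming
// skipInterval = blockSize = 12 and docFreq = 28: docs 1..12 fill block#0, docs 13..24 fill
// block#1, and the remaining 4 docs are written as vInts. Level-0 skip points fall at the block
// boundaries, positions 12 and 24, each recording the previous docID (docID[11], docID[23]) plus
// the matching doc/pos/pay file pointers; nothing is recorded inside the vInt tail.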

View File

@ -0,0 +1,94 @@
package org.apache.lucene.codecs.lucene41;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_DATA_SIZE;
import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_ENCODED_SIZE;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.packed.PackedInts;
import com.carrotsearch.randomizedtesting.generators.RandomInts;
public class TestLucene41ForUtil extends LuceneTestCase {
public void testEncodeDecode() throws IOException {
final int iterations = RandomInts.randomIntBetween(random(), 1, 1000);
final float acceptableOverheadRatio = random().nextFloat();
final int[] values = new int[(iterations - 1) * BLOCK_SIZE + ForUtil.MAX_DATA_SIZE];
for (int i = 0; i < iterations; ++i) {
final int bpv = random().nextInt(32);
if (bpv == 0) {
final int value = RandomInts.randomIntBetween(random(), 0, Integer.MAX_VALUE);
for (int j = 0; j < BLOCK_SIZE; ++j) {
values[i * BLOCK_SIZE + j] = value;
}
} else {
for (int j = 0; j < BLOCK_SIZE; ++j) {
values[i * BLOCK_SIZE + j] = RandomInts.randomIntBetween(random(),
0, (int) PackedInts.maxValue(bpv));
}
}
}
final Directory d = new RAMDirectory();
final long endPointer;
{
// encode
IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
final ForUtil forUtil = new ForUtil(acceptableOverheadRatio, out);
for (int i = 0; i < iterations; ++i) {
forUtil.writeBlock(
Arrays.copyOfRange(values, i * BLOCK_SIZE, values.length),
new byte[MAX_ENCODED_SIZE], out);
}
endPointer = out.getFilePointer();
out.close();
}
{
// decode
IndexInput in = d.openInput("test.bin", IOContext.READONCE);
final ForUtil forUtil = new ForUtil(in);
for (int i = 0; i < iterations; ++i) {
if (random().nextBoolean()) {
forUtil.skipBlock(in);
continue;
}
final int[] restored = new int[MAX_DATA_SIZE];
forUtil.readBlock(in, new byte[MAX_ENCODED_SIZE], restored);
assertArrayEquals(Arrays.copyOfRange(values, i * BLOCK_SIZE, (i + 1) * BLOCK_SIZE),
Arrays.copyOf(restored, BLOCK_SIZE));
}
assertEquals(endPointer, in.getFilePointer());
in.close();
}
}
}
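// Note on the bpv == 0 branch above (an assumption about ForUtil's encoding, for illustration):
// a block whose values are all identical is not bit-packed; ForUtil writes a marker byte plus the
// single value as a vInt, so the test exercises that all-values-equal path explicitly alongside
// the regular packed blocks.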

View File

@ -0,0 +1,132 @@
package org.apache.lucene.codecs.lucene41;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/**
* Tests special cases of BlockPostingsFormat
*/
public class TestLucene41PostingsFormat2 extends LuceneTestCase {
Directory dir;
RandomIndexWriter iw;
@Override
public void setUp() throws Exception {
super.setUp();
dir = newFSDirectory(createTempDir("testDFBlockSize"));
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
iwc.setCodec(new Lucene41RWCodec());
iw = new RandomIndexWriter(random(), dir, iwc);
iw.setDoRandomForceMerge(false); // we will force merge ourselves
}
@Override
public void tearDown() throws Exception {
iw.close();
TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
iwc.setCodec(new Lucene41RWCodec());
iwc.setOpenMode(OpenMode.APPEND);
IndexWriter iw = new IndexWriter(dir, iwc);
iw.forceMerge(1);
iw.close();
dir.close(); // just force a checkindex for now
super.tearDown();
}
private Document newDocument() {
Document doc = new Document();
for (IndexOptions option : FieldInfo.IndexOptions.values()) {
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
// turn on tvs for a cross-check, since we rely upon checkindex in this test (for now)
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorPayloads(true);
ft.setIndexOptions(option);
doc.add(new Field(option.toString(), "", ft));
}
return doc;
}
/** tests terms with df = blocksize */
public void testDFBlockSize() throws Exception {
Document doc = newDocument();
for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE; i++) {
for (Field f : doc.getFields()) {
f.setStringValue(f.name() + " " + f.name() + "_2");
}
iw.addDocument(doc);
}
}
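// Boundary exercised here (illustrative): Lucene41PostingsFormat.BLOCK_SIZE is 128, so after 128
// documents each term's postings fill exactly one packed block with no vInt remainder, which is
// the edge case the block postings writer must handle correctly.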
/** tests terms with df % blocksize = 0 */
public void testDFBlockSizeMultiple() throws Exception {
Document doc = newDocument();
for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE * 16; i++) {
for (Field f : doc.getFields()) {
f.setStringValue(f.name() + " " + f.name() + "_2");
}
iw.addDocument(doc);
}
}
/** tests terms with ttf = blocksize */
public void testTTFBlockSize() throws Exception {
Document doc = newDocument();
for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE/2; i++) {
for (Field f : doc.getFields()) {
f.setStringValue(f.name() + " " + f.name() + " " + f.name() + "_2 " + f.name() + "_2");
}
iw.addDocument(doc);
}
}
/** tests terms with ttf % blocksize = 0 */
public void testTTFBlockSizeMultiple() throws Exception {
Document doc = newDocument();
for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE/2; i++) {
for (Field f : doc.getFields()) {
String proto = (f.name() + " " + f.name() + " " + f.name() + " " + f.name() + " "
+ f.name() + "_2 " + f.name() + "_2 " + f.name() + "_2 " + f.name() + "_2");
StringBuilder val = new StringBuilder();
for (int j = 0; j < 16; j++) {
val.append(proto);
val.append(" ");
}
f.setStringValue(val.toString());
}
iw.addDocument(doc);
}
}
}

View File

@ -0,0 +1,521 @@
package org.apache.lucene.codecs.lucene41;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockFixedLengthPayloadFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.MockVariableLengthPayloadFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.English;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;
/**
* Tests partial enumeration (only pulling a subset of the indexed data)
*/
public class TestLucene41PostingsFormat3 extends LuceneTestCase {
static final int MAXDOC = Lucene41PostingsFormat.BLOCK_SIZE * 20;
// creates 8 fields with different options and does "duels" of fields against each other
public void test() throws Exception {
Directory dir = newDirectory();
Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer();
if (fieldName.contains("payloadsFixed")) {
TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1);
return new TokenStreamComponents(tokenizer, filter);
} else if (fieldName.contains("payloadsVariable")) {
TokenFilter filter = new MockVariableLengthPayloadFilter(new Random(0), tokenizer);
return new TokenStreamComponents(tokenizer, filter);
} else {
return new TokenStreamComponents(tokenizer);
}
}
};
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setCodec(new Lucene41RWCodec());
// TODO we could actually add more fields implemented with different PFs
// or, just put this test into the usual rotation?
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
Document doc = new Document();
FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
// turn this on for a cross-check
docsOnlyType.setStoreTermVectors(true);
docsOnlyType.setIndexOptions(IndexOptions.DOCS_ONLY);
FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
// turn this on for a cross-check
docsAndFreqsType.setStoreTermVectors(true);
docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
// turn these on for a cross-check
positionsType.setStoreTermVectors(true);
positionsType.setStoreTermVectorPositions(true);
positionsType.setStoreTermVectorOffsets(true);
positionsType.setStoreTermVectorPayloads(true);
FieldType offsetsType = new FieldType(positionsType);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field field1 = new Field("field1docs", "", docsOnlyType);
Field field2 = new Field("field2freqs", "", docsAndFreqsType);
Field field3 = new Field("field3positions", "", positionsType);
Field field4 = new Field("field4offsets", "", offsetsType);
Field field5 = new Field("field5payloadsFixed", "", positionsType);
Field field6 = new Field("field6payloadsVariable", "", positionsType);
Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
doc.add(field1);
doc.add(field2);
doc.add(field3);
doc.add(field4);
doc.add(field5);
doc.add(field6);
doc.add(field7);
doc.add(field8);
for (int i = 0; i < MAXDOC; i++) {
String stringValue = Integer.toString(i) + " verycommon " + English.intToEnglish(i).replace('-', ' ') + " " + TestUtil.randomSimpleString(random());
field1.setStringValue(stringValue);
field2.setStringValue(stringValue);
field3.setStringValue(stringValue);
field4.setStringValue(stringValue);
field5.setStringValue(stringValue);
field6.setStringValue(stringValue);
field7.setStringValue(stringValue);
field8.setStringValue(stringValue);
iw.addDocument(doc);
}
iw.close();
verify(dir);
TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge
iwc = newIndexWriterConfig(analyzer);
iwc.setCodec(new Lucene41RWCodec());
iwc.setOpenMode(OpenMode.APPEND);
IndexWriter iw2 = new IndexWriter(dir, iwc);
iw2.forceMerge(1);
iw2.close();
verify(dir);
dir.close();
}
private void verify(Directory dir) throws Exception {
DirectoryReader ir = DirectoryReader.open(dir);
for (LeafReaderContext leaf : ir.leaves()) {
LeafReader leafReader = leaf.reader();
assertTerms(leafReader.terms("field1docs"), leafReader.terms("field2freqs"), true);
assertTerms(leafReader.terms("field3positions"), leafReader.terms("field4offsets"), true);
assertTerms(leafReader.terms("field4offsets"), leafReader.terms("field5payloadsFixed"), true);
assertTerms(leafReader.terms("field5payloadsFixed"), leafReader.terms("field6payloadsVariable"), true);
assertTerms(leafReader.terms("field6payloadsVariable"), leafReader.terms("field7payloadsFixedOffsets"), true);
assertTerms(leafReader.terms("field7payloadsFixedOffsets"), leafReader.terms("field8payloadsVariableOffsets"), true);
}
ir.close();
}
// following code is almost an exact dup of code from TestDuelingCodecs: sorry!
public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception {
if (leftTerms == null || rightTerms == null) {
assertNull(leftTerms);
assertNull(rightTerms);
return;
}
assertTermsStatistics(leftTerms, rightTerms);
// NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different
TermsEnum leftTermsEnum = leftTerms.iterator(null);
TermsEnum rightTermsEnum = rightTerms.iterator(null);
assertTermsEnum(leftTermsEnum, rightTermsEnum, true);
assertTermsSeeking(leftTerms, rightTerms);
if (deep) {
int numIntersections = atLeast(3);
for (int i = 0; i < numIntersections; i++) {
String re = AutomatonTestUtil.randomRegexp(random());
CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton());
if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
// TODO: test start term too
TermsEnum leftIntersection = leftTerms.intersect(automaton, null);
TermsEnum rightIntersection = rightTerms.intersect(automaton, null);
assertTermsEnum(leftIntersection, rightIntersection, rarely());
}
}
}
}
private void assertTermsSeeking(Terms leftTerms, Terms rightTerms) throws Exception {
TermsEnum leftEnum = null;
TermsEnum rightEnum = null;
// just an upper bound
int numTests = atLeast(20);
Random random = random();
// collect this number of terms from the left side
HashSet<BytesRef> tests = new HashSet<>();
int numPasses = 0;
while (numPasses < 10 && tests.size() < numTests) {
leftEnum = leftTerms.iterator(leftEnum);
BytesRef term = null;
while ((term = leftEnum.next()) != null) {
int code = random.nextInt(10);
if (code == 0) {
// the term
tests.add(BytesRef.deepCopyOf(term));
} else if (code == 1) {
// truncated subsequence of term
term = BytesRef.deepCopyOf(term);
if (term.length > 0) {
// truncate it
term.length = random.nextInt(term.length);
}
} else if (code == 2) {
// term, but ensure a non-zero offset
byte newbytes[] = new byte[term.length+5];
System.arraycopy(term.bytes, term.offset, newbytes, 5, term.length);
tests.add(new BytesRef(newbytes, 5, term.length));
}
}
numPasses++;
}
ArrayList<BytesRef> shuffledTests = new ArrayList<>(tests);
Collections.shuffle(shuffledTests, random);
for (BytesRef b : shuffledTests) {
leftEnum = leftTerms.iterator(leftEnum);
rightEnum = rightTerms.iterator(rightEnum);
assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b));
assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b));
SeekStatus leftStatus;
SeekStatus rightStatus;
leftStatus = leftEnum.seekCeil(b);
rightStatus = rightEnum.seekCeil(b);
assertEquals(leftStatus, rightStatus);
if (leftStatus != SeekStatus.END) {
assertEquals(leftEnum.term(), rightEnum.term());
}
leftStatus = leftEnum.seekCeil(b);
rightStatus = rightEnum.seekCeil(b);
assertEquals(leftStatus, rightStatus);
if (leftStatus != SeekStatus.END) {
assertEquals(leftEnum.term(), rightEnum.term());
}
}
}
/**
* checks collection-level statistics on Terms
*/
public void assertTermsStatistics(Terms leftTerms, Terms rightTerms) throws Exception {
if (leftTerms.getDocCount() != -1 && rightTerms.getDocCount() != -1) {
assertEquals(leftTerms.getDocCount(), rightTerms.getDocCount());
}
if (leftTerms.getSumDocFreq() != -1 && rightTerms.getSumDocFreq() != -1) {
assertEquals(leftTerms.getSumDocFreq(), rightTerms.getSumDocFreq());
}
if (leftTerms.getSumTotalTermFreq() != -1 && rightTerms.getSumTotalTermFreq() != -1) {
assertEquals(leftTerms.getSumTotalTermFreq(), rightTerms.getSumTotalTermFreq());
}
if (leftTerms.size() != -1 && rightTerms.size() != -1) {
assertEquals(leftTerms.size(), rightTerms.size());
}
}
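// The -1 guards above reflect the Terms API contract: each statistic returns -1 when the codec or
// field cannot provide it (e.g. sumTotalTermFreq when frequencies are omitted), so the duel only
// compares values that both sides actually expose.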
/**
* checks the terms enum sequentially
 * if deep is false, it does a 'shallow' test that doesn't go down to the DocsEnums
*/
public void assertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, boolean deep) throws Exception {
BytesRef term;
Bits randomBits = new RandomBits(MAXDOC, random().nextDouble(), random());
DocsAndPositionsEnum leftPositions = null;
DocsAndPositionsEnum rightPositions = null;
DocsEnum leftDocs = null;
DocsEnum rightDocs = null;
while ((term = leftTermsEnum.next()) != null) {
assertEquals(term, rightTermsEnum.next());
assertTermStats(leftTermsEnum, rightTermsEnum);
if (deep) {
// with payloads + off
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions),
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions));
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions),
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions));
assertPositionsSkipping(leftTermsEnum.docFreq(),
leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions),
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions));
assertPositionsSkipping(leftTermsEnum.docFreq(),
leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions),
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions));
// with payloads only
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS),
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS));
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS),
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS));
assertPositionsSkipping(leftTermsEnum.docFreq(),
leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS),
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS));
assertPositionsSkipping(leftTermsEnum.docFreq(),
leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS),
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS));
// with offsets only
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS),
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS));
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS),
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS));
assertPositionsSkipping(leftTermsEnum.docFreq(),
leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS),
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS));
assertPositionsSkipping(leftTermsEnum.docFreq(),
leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS),
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS));
// with positions only
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsEnum.FLAG_NONE),
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, DocsEnum.FLAG_NONE));
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsEnum.FLAG_NONE),
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsEnum.FLAG_NONE));
assertPositionsSkipping(leftTermsEnum.docFreq(),
leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsEnum.FLAG_NONE),
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, DocsEnum.FLAG_NONE));
assertPositionsSkipping(leftTermsEnum.docFreq(),
leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsEnum.FLAG_NONE),
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsEnum.FLAG_NONE));
// with freqs:
assertDocsEnum(leftDocs = leftTermsEnum.docs(null, leftDocs),
rightDocs = rightTermsEnum.docs(null, rightDocs));
assertDocsEnum(leftDocs = leftTermsEnum.docs(randomBits, leftDocs),
rightDocs = rightTermsEnum.docs(randomBits, rightDocs));
// w/o freqs:
assertDocsEnum(leftDocs = leftTermsEnum.docs(null, leftDocs, DocsEnum.FLAG_NONE),
rightDocs = rightTermsEnum.docs(null, rightDocs, DocsEnum.FLAG_NONE));
assertDocsEnum(leftDocs = leftTermsEnum.docs(randomBits, leftDocs, DocsEnum.FLAG_NONE),
rightDocs = rightTermsEnum.docs(randomBits, rightDocs, DocsEnum.FLAG_NONE));
// with freqs:
assertDocsSkipping(leftTermsEnum.docFreq(),
leftDocs = leftTermsEnum.docs(null, leftDocs),
rightDocs = rightTermsEnum.docs(null, rightDocs));
assertDocsSkipping(leftTermsEnum.docFreq(),
leftDocs = leftTermsEnum.docs(randomBits, leftDocs),
rightDocs = rightTermsEnum.docs(randomBits, rightDocs));
// w/o freqs:
assertDocsSkipping(leftTermsEnum.docFreq(),
leftDocs = leftTermsEnum.docs(null, leftDocs, DocsEnum.FLAG_NONE),
rightDocs = rightTermsEnum.docs(null, rightDocs, DocsEnum.FLAG_NONE));
assertDocsSkipping(leftTermsEnum.docFreq(),
leftDocs = leftTermsEnum.docs(randomBits, leftDocs, DocsEnum.FLAG_NONE),
rightDocs = rightTermsEnum.docs(randomBits, rightDocs, DocsEnum.FLAG_NONE));
}
}
assertNull(rightTermsEnum.next());
}
/**
* checks term-level statistics
*/
public void assertTermStats(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum) throws Exception {
assertEquals(leftTermsEnum.docFreq(), rightTermsEnum.docFreq());
if (leftTermsEnum.totalTermFreq() != -1 && rightTermsEnum.totalTermFreq() != -1) {
assertEquals(leftTermsEnum.totalTermFreq(), rightTermsEnum.totalTermFreq());
}
}
/**
* checks docs + freqs + positions + payloads, sequentially
*/
public void assertDocsAndPositionsEnum(DocsAndPositionsEnum leftDocs, DocsAndPositionsEnum rightDocs) throws Exception {
if (leftDocs == null || rightDocs == null) {
assertNull(leftDocs);
assertNull(rightDocs);
return;
}
assertEquals(-1, leftDocs.docID());
assertEquals(-1, rightDocs.docID());
int docid;
while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
assertEquals(docid, rightDocs.nextDoc());
int freq = leftDocs.freq();
assertEquals(freq, rightDocs.freq());
for (int i = 0; i < freq; i++) {
assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
// we don't assert offsets/payloads, they are allowed to be different
}
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
}
/**
* checks docs + freqs, sequentially
*/
public void assertDocsEnum(DocsEnum leftDocs, DocsEnum rightDocs) throws Exception {
if (leftDocs == null) {
assertNull(rightDocs);
return;
}
assertEquals(-1, leftDocs.docID());
assertEquals(-1, rightDocs.docID());
int docid;
while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
assertEquals(docid, rightDocs.nextDoc());
// we don't assert freqs, they are allowed to be different
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
}
/**
* checks advancing docs
*/
public void assertDocsSkipping(int docFreq, DocsEnum leftDocs, DocsEnum rightDocs) throws Exception {
if (leftDocs == null) {
assertNull(rightDocs);
return;
}
int docid = -1;
int averageGap = MAXDOC / (1+docFreq);
int skipInterval = 16;
while (true) {
if (random().nextBoolean()) {
// nextDoc()
docid = leftDocs.nextDoc();
assertEquals(docid, rightDocs.nextDoc());
} else {
// advance()
int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap));
docid = leftDocs.advance(skip);
assertEquals(docid, rightDocs.advance(skip));
}
if (docid == DocIdSetIterator.NO_MORE_DOCS) {
return;
}
// we don't assert freqs, they are allowed to be different
}
}
/**
* checks advancing docs + positions
*/
public void assertPositionsSkipping(int docFreq, DocsAndPositionsEnum leftDocs, DocsAndPositionsEnum rightDocs) throws Exception {
if (leftDocs == null || rightDocs == null) {
assertNull(leftDocs);
assertNull(rightDocs);
return;
}
int docid = -1;
int averageGap = MAXDOC / (1+docFreq);
int skipInterval = 16;
while (true) {
if (random().nextBoolean()) {
// nextDoc()
docid = leftDocs.nextDoc();
assertEquals(docid, rightDocs.nextDoc());
} else {
// advance()
int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap));
docid = leftDocs.advance(skip);
assertEquals(docid, rightDocs.advance(skip));
}
if (docid == DocIdSetIterator.NO_MORE_DOCS) {
return;
}
int freq = leftDocs.freq();
assertEquals(freq, rightDocs.freq());
for (int i = 0; i < freq; i++) {
assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
// we don't compare the payloads; it's allowed that one is empty, etc.
}
}
}
private static class RandomBits implements Bits {
FixedBitSet bits;
RandomBits(int maxDoc, double pctLive, Random random) {
bits = new FixedBitSet(maxDoc);
for (int i = 0; i < maxDoc; i++) {
if (random.nextDouble() <= pctLive) {
bits.set(i);
}
}
}
@Override
public boolean get(int index) {
return bits.get(index);
}
@Override
public int length() {
return bits.length();
}
}
}

View File

@ -0,0 +1,80 @@
package org.apache.lucene.codecs.lucene410;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41RWPostingsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41RWStoredFieldsFormat;
import org.apache.lucene.codecs.lucene42.Lucene42RWTermVectorsFormat;
import org.apache.lucene.codecs.lucene46.Lucene46RWSegmentInfoFormat;
import org.apache.lucene.codecs.lucene49.Lucene49RWNormsFormat;
/**
* Read-Write version of 4.10 codec for testing
* @deprecated for test purposes only
*/
@Deprecated
public final class Lucene410RWCodec extends Lucene410Codec {
private final PostingsFormat postings = new Lucene41RWPostingsFormat();
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return postings;
}
private static final DocValuesFormat docValues = new Lucene410RWDocValuesFormat();
@Override
public DocValuesFormat getDocValuesFormatForField(String field) {
return docValues;
}
private static final NormsFormat norms = new Lucene49RWNormsFormat();
@Override
public NormsFormat normsFormat() {
return norms;
}
private static final SegmentInfoFormat segmentInfos = new Lucene46RWSegmentInfoFormat();
@Override
public SegmentInfoFormat segmentInfoFormat() {
return segmentInfos;
}
private static final StoredFieldsFormat storedFields = new Lucene41RWStoredFieldsFormat();
@Override
public StoredFieldsFormat storedFieldsFormat() {
return storedFields;
}
private final TermVectorsFormat vectorsFormat = new Lucene42RWTermVectorsFormat();
@Override
public TermVectorsFormat termVectorsFormat() {
return vectorsFormat;
}
}

View File

@ -0,0 +1,42 @@
package org.apache.lucene.codecs.lucene410;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
/**
* Read-Write version of 4.10 docvalues format for testing
* @deprecated for test purposes only
*/
class Lucene410RWDocValuesFormat extends Lucene410DocValuesFormat {
@Override
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return new Lucene410DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION) {
@Override
void checkCanWrite(FieldInfo field) {
// allow writing all fields
}
};
}
}

View File

@ -26,8 +26,6 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.asserting.AssertingCodec;
import org.apache.lucene.codecs.blocktreeords.Ords41PostingsFormat;
import org.apache.lucene.codecs.lucene41ords.Lucene41WithOrds;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedSetDocValuesField;
@ -51,7 +49,7 @@ import org.apache.lucene.util.TestUtil;
* Tests Lucene410DocValuesFormat
*/
public class TestLucene410DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
private final Codec codec = TestUtil.alwaysDocValuesFormat(new Lucene410DocValuesFormat());
private final Codec codec = new Lucene410RWCodec();
@Override
protected Codec getCodec() {
@ -121,18 +119,8 @@ public class TestLucene410DocValuesFormat extends BaseCompressingDocValuesFormat
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
conf.setMergeScheduler(new SerialMergeScheduler());
// set to duel against a codec which has ordinals:
final PostingsFormat pf;
switch (random().nextInt(2)) {
case 0: pf = new Lucene41WithOrds();
break;
case 1: pf = new Ords41PostingsFormat();
break;
// TODO: these don't actually support ords!
//case 2: pf = new FSTOrdPostingsFormat();
// break;
default: throw new AssertionError();
}
final DocValuesFormat dv = new Lucene410DocValuesFormat();
final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
final DocValuesFormat dv = new Lucene410RWDocValuesFormat();
conf.setCodec(new AssertingCodec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {

View File

@ -20,10 +20,12 @@ package org.apache.lucene.codecs.lucene42;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40RWSegmentInfoFormat;
import org.apache.lucene.codecs.lucene41.Lucene41RWPostingsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41RWStoredFieldsFormat;
/**
@ -37,6 +39,13 @@ public final class Lucene42RWCodec extends Lucene42Codec {
private static final NormsFormat norms = new Lucene42RWNormsFormat();
private static final StoredFieldsFormat storedFields = new Lucene41RWStoredFieldsFormat();
private static final FieldInfosFormat fieldInfosFormat = new Lucene42RWFieldInfosFormat();
private final PostingsFormat postings = new Lucene41RWPostingsFormat();
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return postings;
}
@Override
public DocValuesFormat getDocValuesFormatForField(String field) {

View File

@ -20,10 +20,12 @@ package org.apache.lucene.codecs.lucene45;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40RWSegmentInfoFormat;
import org.apache.lucene.codecs.lucene41.Lucene41RWPostingsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41RWStoredFieldsFormat;
import org.apache.lucene.codecs.lucene42.Lucene42RWFieldInfosFormat;
import org.apache.lucene.codecs.lucene42.Lucene42RWNormsFormat;
@ -31,10 +33,18 @@ import org.apache.lucene.codecs.lucene42.Lucene42RWTermVectorsFormat;
/**
* Read-write version of {@link Lucene45Codec} for testing.
* @deprecated for test purposes only
*/
@SuppressWarnings("deprecation")
@Deprecated
public final class Lucene45RWCodec extends Lucene45Codec {
private final PostingsFormat postings = new Lucene41RWPostingsFormat();
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return postings;
}
private static final FieldInfosFormat fieldInfosFormat = new Lucene42RWFieldInfosFormat();
@Override

View File

@ -19,9 +19,11 @@ package org.apache.lucene.codecs.lucene46;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41RWPostingsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41RWStoredFieldsFormat;
import org.apache.lucene.codecs.lucene42.Lucene42RWNormsFormat;
import org.apache.lucene.codecs.lucene42.Lucene42RWTermVectorsFormat;
@ -34,6 +36,13 @@ import org.apache.lucene.codecs.lucene45.Lucene45RWDocValuesFormat;
@Deprecated
public final class Lucene46RWCodec extends Lucene46Codec {
private final PostingsFormat postings = new Lucene41RWPostingsFormat();
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return postings;
}
private static final DocValuesFormat docValues = new Lucene45RWDocValuesFormat();
@Override

View File

@ -19,9 +19,11 @@ package org.apache.lucene.codecs.lucene49;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41RWPostingsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41RWStoredFieldsFormat;
import org.apache.lucene.codecs.lucene42.Lucene42RWTermVectorsFormat;
import org.apache.lucene.codecs.lucene46.Lucene46RWSegmentInfoFormat;
@ -33,6 +35,13 @@ import org.apache.lucene.codecs.lucene46.Lucene46RWSegmentInfoFormat;
@Deprecated
public final class Lucene49RWCodec extends Lucene49Codec {
private final PostingsFormat postings = new Lucene41RWPostingsFormat();
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return postings;
}
private static final DocValuesFormat docValues = new Lucene49RWDocValuesFormat();
@Override

View File

@ -1075,8 +1075,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
}
private int checkAllSegmentsUpgraded(Directory dir) throws IOException {
final SegmentInfos infos = new SegmentInfos();
infos.read(dir);
final SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
if (VERBOSE) {
System.out.println("checkAllSegmentsUpgraded: " + infos);
}
@ -1087,8 +1086,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
}
private int getNumberOfSegments(Directory dir) throws IOException {
final SegmentInfos infos = new SegmentInfos();
infos.read(dir);
final SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
return infos.size();
}
@ -1306,7 +1304,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
writer.forceMerge(1);
writer.commit();
writer.rollback();
new SegmentInfos().read(dir);
SegmentInfos.readLatestCommit(dir);
dir.close();
}
}

View File

@ -32,7 +32,10 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
/**
 * Tests performing doc-values updates against Lucene versions
 * that did not support them.
*/
public class TestDocValuesUpdatesOnOldSegments extends LuceneTestCase {
static long getValue(BinaryDocValues bdv, int idx) {
@ -60,56 +63,62 @@ public class TestDocValuesUpdatesOnOldSegments extends LuceneTestCase {
public void testBinaryUpdates() throws Exception {
Codec[] oldCodecs = new Codec[] { new Lucene40RWCodec(), new Lucene41RWCodec(), new Lucene42RWCodec(), new Lucene45RWCodec() };
Directory dir = newDirectory();
// create a segment with an old Codec
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
conf.setCodec(oldCodecs[random().nextInt(oldCodecs.length)]);
IndexWriter writer = new IndexWriter(dir, conf);
Document doc = new Document();
doc.add(new StringField("id", "doc", Store.NO));
doc.add(new BinaryDocValuesField("f", toBytes(5L)));
writer.addDocument(doc);
writer.close();
conf = newIndexWriterConfig(new MockAnalyzer(random()));
writer = new IndexWriter(dir, conf);
writer.updateBinaryDocValue(new Term("id", "doc"), "f", toBytes(4L));
try {
for (Codec codec : oldCodecs) {
Directory dir = newDirectory();
// create a segment with an old Codec
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
conf.setCodec(codec);
IndexWriter writer = new IndexWriter(dir, conf);
Document doc = new Document();
doc.add(new StringField("id", "doc", Store.NO));
doc.add(new BinaryDocValuesField("f", toBytes(5L)));
writer.addDocument(doc);
writer.close();
fail("should not have succeeded to update a segment written with an old Codec");
} catch (UnsupportedOperationException e) {
writer.rollback();
conf = newIndexWriterConfig(new MockAnalyzer(random()));
writer = new IndexWriter(dir, conf);
writer.updateBinaryDocValue(new Term("id", "doc"), "f", toBytes(4L));
try {
writer.close();
fail("should not have succeeded to update a segment written with an old Codec");
} catch (UnsupportedOperationException e) {
writer.rollback();
}
dir.close();
}
dir.close();
}
public void testNumericUpdates() throws Exception {
Codec[] oldCodecs = new Codec[] { new Lucene40RWCodec(), new Lucene41RWCodec(), new Lucene42RWCodec(), new Lucene45RWCodec() };
Directory dir = newDirectory();
// create a segment with an old Codec
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
conf.setCodec(oldCodecs[random().nextInt(oldCodecs.length)]);
IndexWriter writer = new IndexWriter(dir, conf);
Document doc = new Document();
doc.add(new StringField("id", "doc", Store.NO));
doc.add(new NumericDocValuesField("f", 5));
writer.addDocument(doc);
writer.close();
conf = newIndexWriterConfig(new MockAnalyzer(random()));
writer = new IndexWriter(dir, conf);
writer.updateNumericDocValue(new Term("id", "doc"), "f", 4L);
try {
for (Codec codec : oldCodecs) {
Directory dir = newDirectory();
// create a segment with an old Codec
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
conf.setCodec(codec);
IndexWriter writer = new IndexWriter(dir, conf);
Document doc = new Document();
doc.add(new StringField("id", "doc", Store.NO));
doc.add(new NumericDocValuesField("f", 5));
writer.addDocument(doc);
writer.close();
fail("should not have succeeded to update a segment written with an old Codec");
} catch (UnsupportedOperationException e) {
writer.rollback();
conf = newIndexWriterConfig(new MockAnalyzer(random()));
writer = new IndexWriter(dir, conf);
writer.updateNumericDocValue(new Term("id", "doc"), "f", 4L);
try {
writer.close();
fail("should not have succeeded to update a segment written with an old Codec");
} catch (UnsupportedOperationException e) {
writer.rollback();
}
dir.close();
}
dir.close();
}
}

View File

@ -833,8 +833,7 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
ir.close();
// Make sure we have 3 segments:
SegmentInfos infos = new SegmentInfos();
infos.read(benchmark.getRunData().getDirectory());
SegmentInfos infos = SegmentInfos.readLatestCommit(benchmark.getRunData().getDirectory());
assertEquals(3, infos.size());
}

View File

@ -50,8 +50,7 @@ public class CommitIndexTaskTest extends BenchmarkTestCase {
CommitIndexTask task = new CommitIndexTask(runData);
task.setParams("params");
task.doLogic();
SegmentInfos infos = new SegmentInfos();
infos.read(runData.getDirectory());
SegmentInfos infos = SegmentInfos.readLatestCommit(runData.getDirectory());
assertEquals("params", infos.getUserData().get(OpenReaderTask.USER_DATA));
new CloseIndexTask(runData).doLogic();
}

View File

@ -113,13 +113,13 @@ public class BlockTermsReader extends FieldsProducer {
boolean success = false;
try {
CodecUtil.checkSegmentHeader(in, BlockTermsWriter.CODEC_NAME,
CodecUtil.checkIndexHeader(in, BlockTermsWriter.CODEC_NAME,
BlockTermsWriter.VERSION_START,
BlockTermsWriter.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
// Have PostingsReader init itself
postingsReader.init(in);
postingsReader.init(in, state);
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
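// Sketch of the trade-off noted above (assumes Lucene's CodecUtil API; not part of this change):
//   CodecUtil.checksumEntireFile(in);  // reads the whole file to verify its checksum -- affordable for the small index file
//   CodecUtil.retrieveChecksum(in);    // seeks to the footer and validates only its structure/magic
// so the large terms data file gets the cheap footer check at open time, and full verification is
// typically deferred to checkIndex or merging.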

View File

@ -110,14 +110,14 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable {
boolean success = false;
try {
fieldInfos = state.fieldInfos;
CodecUtil.writeSegmentHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
currentField = null;
this.postingsWriter = postingsWriter;
// segment = state.segmentName;
//System.out.println("BTW.init seg=" + state.segmentName);
postingsWriter.init(out); // have consumer write its format/header
postingsWriter.init(out, state); // have consumer write its format/header
success = true;
} finally {
if (!success) {

View File

@ -74,7 +74,7 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
try {
CodecUtil.checkSegmentHeader(in, FixedGapTermsIndexWriter.CODEC_NAME,
CodecUtil.checkIndexHeader(in, FixedGapTermsIndexWriter.CODEC_NAME,
FixedGapTermsIndexWriter.VERSION_CURRENT,
FixedGapTermsIndexWriter.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);

View File

@ -72,7 +72,7 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
out = state.directory.createOutput(indexFileName, state.context);
boolean success = false;
try {
CodecUtil.writeSegmentHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
out.writeVInt(termIndexInterval);
out.writeVInt(PackedInts.VERSION_CURRENT);
out.writeVInt(BLOCKSIZE);

View File

@ -54,7 +54,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
try {
CodecUtil.checkSegmentHeader(in, VariableGapTermsIndexWriter.CODEC_NAME,
CodecUtil.checkIndexHeader(in, VariableGapTermsIndexWriter.CODEC_NAME,
VariableGapTermsIndexWriter.VERSION_START,
VariableGapTermsIndexWriter.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);

View File

@ -182,7 +182,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
try {
fieldInfos = state.fieldInfos;
this.policy = policy;
CodecUtil.writeSegmentHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
success = true;
} finally {
if (!success) {

View File

@ -24,14 +24,14 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene41PostingsWriter}. */
public class Ords41PostingsFormat extends PostingsFormat {
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene50PostingsWriter}. */
public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
private final int minTermBlockSize;
private final int maxTermBlockSize;
@ -45,7 +45,7 @@ public class Ords41PostingsFormat extends PostingsFormat {
/** Creates {@code Lucene41PostingsFormat} with default
* settings. */
public Ords41PostingsFormat() {
public BlockTreeOrdsPostingsFormat() {
this(OrdsBlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, OrdsBlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
@ -53,8 +53,8 @@ public class Ords41PostingsFormat extends PostingsFormat {
* values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
* @see OrdsBlockTreeTermsWriter#OrdsBlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) */
public Ords41PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("OrdsLucene41");
public BlockTreeOrdsPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("BlockTreeOrds");
this.minTermBlockSize = minTermBlockSize;
assert minTermBlockSize > 1;
this.maxTermBlockSize = maxTermBlockSize;
@ -68,7 +68,7 @@ public class Ords41PostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state);
boolean success = false;
try {
@ -87,11 +87,7 @@ public class Ords41PostingsFormat extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory,
state.fieldInfos,
state.segmentInfo,
state.context,
state.segmentSuffix);
PostingsReaderBase postingsReader = new Lucene50PostingsReader(state);
boolean success = false;
try {
FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state);
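The rename above also changes the format's SPI name from "OrdsLucene41" to "BlockTreeOrds". As a minimal sketch (not part of this commit; the helper class is hypothetical), the renamed format can be obtained either by direct construction or by SPI lookup:

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;

final class OrdsFormatLookupSketch {
  // Direct construction with the default min/max block sizes.
  static PostingsFormat byConstruction() {
    return new BlockTreeOrdsPostingsFormat();
  }

  // Lookup through the service registry, using the new name written by super("BlockTreeOrds").
  static PostingsFormat bySpiName() {
    return PostingsFormat.forName("BlockTreeOrds");
  }
}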


@ -73,7 +73,7 @@ public final class OrdsBlockTreeTermsReader extends FieldsProducer {
IndexInput indexIn = null;
try {
int version = CodecUtil.checkSegmentHeader(in, OrdsBlockTreeTermsWriter.TERMS_CODEC_NAME,
int version = CodecUtil.checkIndexHeader(in, OrdsBlockTreeTermsWriter.TERMS_CODEC_NAME,
OrdsBlockTreeTermsWriter.VERSION_START,
OrdsBlockTreeTermsWriter.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
@ -82,7 +82,7 @@ public final class OrdsBlockTreeTermsReader extends FieldsProducer {
state.segmentSuffix,
OrdsBlockTreeTermsWriter.TERMS_INDEX_EXTENSION);
indexIn = state.directory.openInput(indexFile, state.context);
int indexVersion = CodecUtil.checkSegmentHeader(indexIn, OrdsBlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME,
int indexVersion = CodecUtil.checkIndexHeader(indexIn, OrdsBlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME,
OrdsBlockTreeTermsWriter.VERSION_START,
OrdsBlockTreeTermsWriter.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
@ -94,7 +94,7 @@ public final class OrdsBlockTreeTermsReader extends FieldsProducer {
CodecUtil.checksumEntireFile(indexIn);
// Have PostingsReader init itself
postingsReader.init(in);
postingsReader.init(in, state);
// NOTE: data file is too costly to verify checksum against all the bytes on open,


@ -203,18 +203,18 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
fieldInfos = state.fieldInfos;
this.minItemsInBlock = minItemsInBlock;
this.maxItemsInBlock = maxItemsInBlock;
CodecUtil.writeSegmentHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
indexOut = state.directory.createOutput(termsIndexFileName, state.context);
CodecUtil.writeSegmentHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
this.postingsWriter = postingsWriter;
// segment = state.segmentInfo.name;
// System.out.println("BTW.init seg=" + state.segmentName);
postingsWriter.init(out); // have consumer write its format/header
postingsWriter.init(out, state); // have consumer write its format/header
success = true;
} finally {
if (!success) {


@ -541,7 +541,7 @@ public final class OrdsSegmentTermsEnum extends TermsEnum {
int cmp = 0;
// TOOD: we should write our vLong backwards (MSB
// TODO: we should write our vLong backwards (MSB
// first) to get better sharing from the FST
// First compare up to valid seek frames:
@ -555,7 +555,7 @@ public final class OrdsSegmentTermsEnum extends TermsEnum {
}
arc = arcs[1+targetUpto];
assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
// TOOD: we could save the outputs in local
// TODO: we could save the outputs in local
// byte[][] instead of making new objs ever
// seek; but, often the FST doesn't have any
// shared bytes (but this could change if we


@ -72,7 +72,7 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
* NumFilteredFields, Filter<sup>NumFilteredFields</sup>, Footer</li>
* <li>Filter --&gt; FieldNumber, FuzzySet</li>
* <li>FuzzySet --&gt;See {@link FuzzySet#serialize(DataOutput)}</li>
* <li>Header --&gt; {@link CodecUtil#writeSegmentHeader SegmentHeader}</li>
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
* <li>DelegatePostingsFormatName --&gt; {@link DataOutput#writeString(String)
* String} The name of a ServiceProvider registered {@link PostingsFormat}</li>
* <li>NumFilteredFields --&gt; {@link DataOutput#writeInt Uint32}</li>
@ -166,7 +166,7 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
boolean success = false;
try {
bloomIn = state.directory.openChecksumInput(bloomFileName, state.context);
CodecUtil.checkSegmentHeader(bloomIn, BLOOM_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.checkIndexHeader(bloomIn, BLOOM_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
// // Load the hash function used in the BloomFilter
// hashFunction = HashFunction.forName(bloomIn.readString());
// Load the delegate postings format
@ -502,7 +502,7 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
IndexOutput bloomOutput = null;
try {
bloomOutput = state.directory.createOutput(bloomFileName, state.context);
CodecUtil.writeSegmentHeader(bloomOutput, BLOOM_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(bloomOutput, BLOOM_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
// remember the name of the postings format we will delegate to
bloomOutput.writeString(delegatePostingsFormat.getName());
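As a brief, hedged sketch (not part of this commit, and assuming the existing delegate-wrapping constructor), this format is typically layered over another postings format, which after this change is resolved by the "Lucene50" SPI name:

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;

final class BloomWrapSketch {
  // Hypothetical helper: wrap the default postings format with bloom filtering.
  static PostingsFormat bloomedDefault() {
    return new BloomFilteringPostingsFormat(PostingsFormat.forName("Lucene50"));
  }
}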


@ -52,10 +52,10 @@ class DirectDocValuesConsumer extends DocValuesConsumer {
try {
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
data = state.directory.createOutput(dataName, state.context);
CodecUtil.writeSegmentHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
meta = state.directory.createOutput(metaName, state.context);
CodecUtil.writeSegmentHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
success = true;
} finally {
if (!success) {


@ -122,7 +122,7 @@ class DirectDocValuesProducer extends DocValuesProducer {
ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass()));
boolean success = false;
try {
version = CodecUtil.checkSegmentHeader(in, metaCodec, VERSION_START, VERSION_CURRENT,
version = CodecUtil.checkIndexHeader(in, metaCodec, VERSION_START, VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
numEntries = readFields(in, state.fieldInfos);
@ -140,7 +140,7 @@ class DirectDocValuesProducer extends DocValuesProducer {
this.data = state.directory.openInput(dataName, state.context);
success = false;
try {
final int version2 = CodecUtil.checkSegmentHeader(data, dataCodec, VERSION_START, VERSION_CURRENT,
final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, VERSION_START, VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
if (version != version2) {
throw new CorruptIndexException("Format versions mismatch: meta=" + version + ", data=" + version2, data);


@ -26,7 +26,7 @@ import java.util.TreeMap;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; // javadocs
import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat; // javadocs
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
@ -54,7 +54,7 @@ import org.apache.lucene.util.automaton.Transition;
// - build depth-N prefix hash?
// - or: longer dense skip lists than just next byte?
/** Wraps {@link Lucene41PostingsFormat} format for on-disk
/** Wraps {@link Lucene50PostingsFormat} format for on-disk
* storage, but then at read time loads and stores all
* terms & postings directly in RAM as byte[], int[].
*
@ -102,12 +102,12 @@ public final class DirectPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return PostingsFormat.forName("Lucene41").fieldsConsumer(state);
return PostingsFormat.forName("Lucene50").fieldsConsumer(state);
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
FieldsProducer postings = PostingsFormat.forName("Lucene41").fieldsProducer(state);
FieldsProducer postings = PostingsFormat.forName("Lucene50").fieldsProducer(state);
if (state.context.context != IOContext.Context.MERGE) {
FieldsProducer loadedPostings;
try {


@ -25,19 +25,19 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/**
* FSTOrd term dict + Lucene41PBF
* FSTOrd term dict + Lucene50PBF
*/
public final class FSTOrdPostingsFormat extends PostingsFormat {
public FSTOrdPostingsFormat() {
super("FSTOrd41");
super("FSTOrd50");
}
@Override
@ -47,7 +47,7 @@ public final class FSTOrdPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state);
boolean success = false;
try {
@ -63,11 +63,7 @@ public final class FSTOrdPostingsFormat extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory,
state.fieldInfos,
state.segmentInfo,
state.context,
state.segmentSuffix);
PostingsReaderBase postingsReader = new Lucene50PostingsReader(state);
boolean success = false;
try {
FieldsProducer ret = new FSTOrdTermsReader(state, postingsReader);


@ -88,11 +88,11 @@ public class FSTOrdTermsReader extends FieldsProducer {
try {
indexIn = state.directory.openChecksumInput(termsIndexFileName, state.context);
blockIn = state.directory.openInput(termsBlockFileName, state.context);
int version = CodecUtil.checkSegmentHeader(indexIn, FSTOrdTermsWriter.TERMS_INDEX_CODEC_NAME,
int version = CodecUtil.checkIndexHeader(indexIn, FSTOrdTermsWriter.TERMS_INDEX_CODEC_NAME,
FSTOrdTermsWriter.VERSION_START,
FSTOrdTermsWriter.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
int version2 = CodecUtil.checkSegmentHeader(blockIn, FSTOrdTermsWriter.TERMS_CODEC_NAME,
int version2 = CodecUtil.checkIndexHeader(blockIn, FSTOrdTermsWriter.TERMS_CODEC_NAME,
FSTOrdTermsWriter.VERSION_START,
FSTOrdTermsWriter.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
@ -103,7 +103,7 @@ public class FSTOrdTermsReader extends FieldsProducer {
CodecUtil.checksumEntireFile(blockIn);
this.postingsReader.init(blockIn);
this.postingsReader.init(blockIn, state);
seekDir(blockIn);
final FieldInfos fieldInfos = state.fieldInfos;


@ -75,7 +75,7 @@ import org.apache.lucene.util.fst.Util;
* <ul>
* <li>TermIndex(.tix) --&gt; Header, TermFST<sup>NumFields</sup>, Footer</li>
* <li>TermFST --&gt; {@link FST FST&lt;long&gt;}</li>
* <li>Header --&gt; {@link CodecUtil#writeSegmentHeader SegmentHeader}</li>
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
*
@ -113,7 +113,7 @@ import org.apache.lucene.util.fst.Util;
* <li>StatsBlock --&gt; &lt; DocFreq[Same?], (TotalTermFreq-DocFreq) ? &gt; <sup>NumTerms</sup>
* <li>MetaLongsBlock --&gt; &lt; LongDelta<sup>LongsSize</sup>, BytesSize &gt; <sup>NumTerms</sup>
* <li>MetaBytesBlock --&gt; Byte <sup>MetaBytesBlockLength</sup>
* <li>Header --&gt; {@link CodecUtil#writeSegmentHeader CodecHeader}</li>
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
* <li>DirOffset --&gt; {@link DataOutput#writeLong Uint64}</li>
* <li>NumFields, FieldNumber, DocCount, DocFreq, LongsSize,
* FieldNumber, DocCount --&gt; {@link DataOutput#writeVInt VInt}</li>
@ -174,11 +174,11 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
try {
this.indexOut = state.directory.createOutput(termsIndexFileName, state.context);
this.blockOut = state.directory.createOutput(termsBlockFileName, state.context);
CodecUtil.writeSegmentHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT,
CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeSegmentHeader(blockOut, TERMS_CODEC_NAME, VERSION_CURRENT,
CodecUtil.writeIndexHeader(blockOut, TERMS_CODEC_NAME, VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
this.postingsWriter.init(blockOut);
this.postingsWriter.init(blockOut, state);
success = true;
} finally {
if (!success) {


@ -25,19 +25,19 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/**
* FST term dict + Lucene41PBF
* FST term dict + Lucene50PBF
*/
public final class FSTPostingsFormat extends PostingsFormat {
public FSTPostingsFormat() {
super("FST41");
super("FST50");
}
@Override
@ -47,7 +47,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state);
boolean success = false;
try {
@ -63,11 +63,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory,
state.fieldInfos,
state.segmentInfo,
state.context,
state.segmentSuffix);
PostingsReaderBase postingsReader = new Lucene50PostingsReader(state);
boolean success = false;
try {
FieldsProducer ret = new FSTTermsReader(state, postingsReader);


@ -81,12 +81,12 @@ public class FSTTermsReader extends FieldsProducer {
boolean success = false;
try {
CodecUtil.checkSegmentHeader(in, FSTTermsWriter.TERMS_CODEC_NAME,
CodecUtil.checkIndexHeader(in, FSTTermsWriter.TERMS_CODEC_NAME,
FSTTermsWriter.TERMS_VERSION_START,
FSTTermsWriter.TERMS_VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.checksumEntireFile(in);
this.postingsReader.init(in);
this.postingsReader.init(in, state);
seekDir(in);
final FieldInfos fieldInfos = state.fieldInfos;


@ -90,7 +90,7 @@ import org.apache.lucene.util.fst.Util;
* <li>TermFST --&gt; {@link FST FST&lt;TermData&gt;}</li>
* <li>TermData --&gt; Flag, BytesSize?, LongDelta<sup>LongsSize</sup>?, Byte<sup>BytesSize</sup>?,
* &lt; DocFreq[Same?], (TotalTermFreq-DocFreq) &gt; ? </li>
* <li>Header --&gt; {@link CodecUtil#writeSegmentHeader SegmentHeader}</li>
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
* <li>DirOffset --&gt; {@link DataOutput#writeLong Uint64}</li>
* <li>DocFreq, LongsSize, BytesSize, NumFields,
* FieldNumber, DocCount --&gt; {@link DataOutput#writeVInt VInt}</li>
@ -142,10 +142,10 @@ public class FSTTermsWriter extends FieldsConsumer {
boolean success = false;
try {
CodecUtil.writeSegmentHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT,
CodecUtil.writeIndexHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
this.postingsWriter.init(out);
this.postingsWriter.init(out, state);
success = true;
} finally {
if (!success) {


@ -74,10 +74,10 @@ class MemoryDocValuesConsumer extends DocValuesConsumer {
try {
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
data = state.directory.createOutput(dataName, state.context);
CodecUtil.writeSegmentHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
meta = state.directory.createOutput(metaName, state.context);
CodecUtil.writeSegmentHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
success = true;
} finally {
if (!success) {


@ -146,7 +146,7 @@ class MemoryDocValuesProducer extends DocValuesProducer {
ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context);
boolean success = false;
try {
version = CodecUtil.checkSegmentHeader(in, metaCodec, VERSION_START, VERSION_CURRENT,
version = CodecUtil.checkIndexHeader(in, metaCodec, VERSION_START, VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
numEntries = readFields(in, state.fieldInfos);
CodecUtil.checkFooter(in);
@ -164,7 +164,7 @@ class MemoryDocValuesProducer extends DocValuesProducer {
this.data = state.directory.openInput(dataName, state.context);
success = false;
try {
final int version2 = CodecUtil.checkSegmentHeader(data, dataCodec, VERSION_START, VERSION_CURRENT,
final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, VERSION_START, VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
if (version != version2) {
throw new CorruptIndexException("Format versions mismatch: meta=" + version + ", data=" + version2, data);


@ -288,7 +288,7 @@ public final class MemoryPostingsFormat extends PostingsFormat {
out = state.directory.createOutput(fileName, state.context);
boolean success = false;
try {
CodecUtil.writeSegmentHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
success = true;
} finally {
if (!success) {
@ -981,7 +981,7 @@ public final class MemoryPostingsFormat extends PostingsFormat {
try (ChecksumIndexInput in = state.directory.openChecksumInput(fileName, IOContext.READONCE)) {
Throwable priorE = null;
try {
CodecUtil.checkSegmentHeader(in, CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.checkIndexHeader(in, CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
while(true) {
final int termCount = in.readVInt();
if (termCount == 0) {


@ -60,7 +60,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
public static final String SI_EXTENSION = "si";
@Override
public SegmentInfo read(Directory directory, String segmentName, IOContext context) throws IOException {
public SegmentInfo read(Directory directory, String segmentName, byte[] segmentID, IOContext context) throws IOException {
BytesRefBuilder scratch = new BytesRefBuilder();
String segFileName = IndexFileNames.segmentFileName(segmentName, "", SimpleTextSegmentInfoFormat.SI_EXTENSION);
ChecksumIndexInput input = directory.openChecksumInput(segFileName, context);
@ -114,6 +114,11 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SI_ID);
final byte[] id = Arrays.copyOfRange(scratch.bytes(), SI_ID.length, scratch.length());
if (!Arrays.equals(segmentID, id)) {
throw new CorruptIndexException("file mismatch, expected: " + StringHelper.idToString(segmentID)
+ ", got: " + StringHelper.idToString(id), input);
}
SimpleTextUtil.checkFooter(input);


@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.lucene.codecs.blocktreeords.Ords41PostingsFormat
org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat
org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat
org.apache.lucene.codecs.memory.DirectPostingsFormat
org.apache.lucene.codecs.memory.FSTOrdPostingsFormat


@ -18,7 +18,6 @@ package org.apache.lucene.codecs.blockterms;
*/
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene41ords.Lucene41WithOrds;
import org.apache.lucene.index.BasePostingsFormatTestCase;
import org.apache.lucene.util.TestUtil;
@ -26,7 +25,7 @@ import org.apache.lucene.util.TestUtil;
* Basic tests of a PF using FixedGap terms dictionary
*/
public class TestFixedGapPostingsFormat extends BasePostingsFormatTestCase {
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene41WithOrds(TestUtil.nextInt(random(), 1, 1000)));
private final Codec codec = TestUtil.alwaysPostingsFormat(new LuceneFixedGap(TestUtil.nextInt(random(), 1, 1000)));
@Override
protected Codec getCodec() {


@ -18,7 +18,7 @@ package org.apache.lucene.codecs.blockterms;
*/
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapFixedInterval;
import org.apache.lucene.codecs.blockterms.LuceneVarGapFixedInterval;
import org.apache.lucene.index.BasePostingsFormatTestCase;
import org.apache.lucene.util.TestUtil;
@ -26,7 +26,7 @@ import org.apache.lucene.util.TestUtil;
* Basic tests of a PF using VariableGap terms dictionary (fixed interval)
*/
public class TestVarGapDocFreqIntervalPostingsFormat extends BasePostingsFormatTestCase {
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene41VarGapFixedInterval(TestUtil.nextInt(random(), 1, 1000)));
private final Codec codec = TestUtil.alwaysPostingsFormat(new LuceneVarGapFixedInterval(TestUtil.nextInt(random(), 1, 1000)));
@Override
protected Codec getCodec() {


@ -18,7 +18,7 @@ package org.apache.lucene.codecs.blockterms;
*/
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapDocFreqInterval;
import org.apache.lucene.codecs.blockterms.LuceneVarGapDocFreqInterval;
import org.apache.lucene.index.BasePostingsFormatTestCase;
import org.apache.lucene.util.TestUtil;
@ -26,7 +26,7 @@ import org.apache.lucene.util.TestUtil;
* Basic tests of a PF using VariableGap terms dictionary (fixed interval, docFreq threshold)
*/
public class TestVarGapFixedIntervalPostingsFormat extends BasePostingsFormatTestCase {
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene41VarGapDocFreqInterval(TestUtil.nextInt(random(), 1, 100), TestUtil.nextInt(random(), 1, 1000)));
private final Codec codec = TestUtil.alwaysPostingsFormat(new LuceneVarGapDocFreqInterval(TestUtil.nextInt(random(), 1, 100), TestUtil.nextInt(random(), 1, 1000)));
@Override
protected Codec getCodec() {


@ -39,7 +39,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;
public class TestOrdsBlockTree extends BasePostingsFormatTestCase {
private final Codec codec = TestUtil.alwaysPostingsFormat(new Ords41PostingsFormat());
private final Codec codec = TestUtil.alwaysPostingsFormat(new BlockTreeOrdsPostingsFormat());
@Override
protected Codec getCodec() {


@ -94,46 +94,48 @@ public final class CodecUtil {
}
/**
* Writes a codec header for a per-segment, which records both a string to
* identify the file, a version number, and the unique ID of the segment.
* This header can be parsed and validated with
* {@link #checkSegmentHeader(DataInput, String, int, int, byte[], String) checkSegmentHeader()}.
* Writes a codec header for an index file, which records both a string to
* identify the format of the file, a version number, and data to identify
* the file instance (ID and auxiliary suffix such as generation).
* <p>
* CodecSegmentHeader --&gt; CodecHeader,SegmentID,SegmentSuffix
* This header can be parsed and validated with
* {@link #checkIndexHeader(DataInput, String, int, int, byte[], String) checkIndexHeader()}.
* <p>
* IndexHeader --&gt; CodecHeader,ObjectID,ObjectSuffix
* <ul>
* <li>CodecHeader --&gt; {@link #writeHeader}
* <li>SegmentID --&gt; {@link DataOutput#writeByte byte}<sup>16</sup>
* <li>SegmentSuffix --&gt; SuffixLength,SuffixBytes
* <li>ObjectID --&gt; {@link DataOutput#writeByte byte}<sup>16</sup>
* <li>ObjectSuffix --&gt; SuffixLength,SuffixBytes
* <li>SuffixLength --&gt; {@link DataOutput#writeByte byte}
* <li>SuffixBytes --&gt; {@link DataOutput#writeByte byte}<sup>SuffixLength</sup>
* </ul>
* <p>
* Note that the length of a segment header depends only upon the
* Note that the length of an index header depends only upon the
* name of the codec and suffix, so this length can be computed at any time
* with {@link #segmentHeaderLength(String,String)}.
* with {@link #indexHeaderLength(String,String)}.
*
* @param out Output stream
* @param codec String to identify this file. It should be simple ASCII,
* @param codec String to identify the format of this file. It should be simple ASCII,
* less than 128 characters in length.
* @param segmentID Unique identifier for the segment
* @param segmentSuffix auxiliary suffix for the file. It should be simple ASCII,
* @param id Unique identifier for this particular file instance.
* @param suffix auxiliary suffix information for the file. It should be simple ASCII,
* less than 256 characters in length.
* @param version Version number
* @throws IOException If there is an I/O error writing to the underlying medium.
* @throws IllegalArgumentException If the codec name is not simple ASCII, or
* is more than 127 characters in length, or if segmentID is invalid,
* or if the segmentSuffix is not simple ASCII, or more than 255 characters
* is more than 127 characters in length, or if id is invalid,
* or if the suffix is not simple ASCII, or more than 255 characters
* in length.
*/
public static void writeSegmentHeader(DataOutput out, String codec, int version, byte[] segmentID, String segmentSuffix) throws IOException {
if (segmentID.length != StringHelper.ID_LENGTH) {
throw new IllegalArgumentException("Invalid id: " + StringHelper.idToString(segmentID));
public static void writeIndexHeader(DataOutput out, String codec, int version, byte[] id, String suffix) throws IOException {
if (id.length != StringHelper.ID_LENGTH) {
throw new IllegalArgumentException("Invalid id: " + StringHelper.idToString(id));
}
writeHeader(out, codec, version);
out.writeBytes(segmentID, 0, segmentID.length);
BytesRef suffixBytes = new BytesRef(segmentSuffix);
if (suffixBytes.length != segmentSuffix.length() || suffixBytes.length >= 256) {
throw new IllegalArgumentException("codec must be simple ASCII, less than 256 characters in length [got " + segmentSuffix + "]");
out.writeBytes(id, 0, id.length);
BytesRef suffixBytes = new BytesRef(suffix);
if (suffixBytes.length != suffix.length() || suffixBytes.length >= 256) {
throw new IllegalArgumentException("codec must be simple ASCII, less than 256 characters in length [got " + suffix + "]");
}
out.writeByte((byte)suffixBytes.length);
out.writeBytes(suffixBytes.bytes, suffixBytes.offset, suffixBytes.length);
@ -151,14 +153,14 @@ public final class CodecUtil {
}
/**
* Computes the length of a segment header.
* Computes the length of an index header.
*
* @param codec Codec name.
* @return length of the entire segment header.
* @see #writeSegmentHeader(DataOutput, String, int, byte[], String)
* @return length of the entire index header.
* @see #writeIndexHeader(DataOutput, String, int, byte[], String)
*/
public static int segmentHeaderLength(String codec, String segmentSuffix) {
return headerLength(codec) + StringHelper.ID_LENGTH + 1 + segmentSuffix.length();
public static int indexHeaderLength(String codec, String suffix) {
return headerLength(codec) + StringHelper.ID_LENGTH + 1 + suffix.length();
}
/**
@ -220,11 +222,11 @@ public final class CodecUtil {
/**
* Reads and validates a header previously written with
* {@link #writeSegmentHeader(DataOutput, String, int, byte[], String)}.
* {@link #writeIndexHeader(DataOutput, String, int, byte[], String)}.
* <p>
* When reading a file, supply the expected <code>codec</code>,
* expected version range (<code>minVersion to maxVersion</code>),
* and segment ID.
* and object ID and suffix.
*
* @param in Input stream, positioned at the point where the
* header was previously written. Typically this is located
@ -232,41 +234,53 @@ public final class CodecUtil {
* @param codec The expected codec name.
* @param minVersion The minimum supported expected version number.
* @param maxVersion The maximum supported expected version number.
* @param segmentID The expected segment this file belongs to.
* @param segmentSuffix The expected auxiliary segment suffix for this file.
* @param expectedID The expected object identifier for this file.
* @param expectedSuffix The expected auxiliary suffix for this file.
* @return The actual version found, when a valid header is found
* that matches <code>codec</code>, with an actual version
* where <code>minVersion <= actual <= maxVersion</code>,
* and matching <code>segmentID</code>
* and matching <code>expectedID</code> and <code>expectedSuffix</code>
* Otherwise an exception is thrown.
* @throws CorruptIndexException If the first four bytes are not
* {@link #CODEC_MAGIC}, or if the actual codec found is
* not <code>codec</code>, or if the <code>segmentID</code>
* or <code>segmentSuffix</code> do not match.
* not <code>codec</code>, or if the <code>expectedID</code>
* or <code>expectedSuffix</code> do not match.
* @throws IndexFormatTooOldException If the actual version is less
* than <code>minVersion</code>.
* @throws IndexFormatTooNewException If the actual version is greater
* than <code>maxVersion</code>.
* @throws IOException If there is an I/O error reading from the underlying medium.
* @see #writeSegmentHeader(DataOutput, String, int, byte[],String)
* @see #writeIndexHeader(DataOutput, String, int, byte[],String)
*/
public static int checkSegmentHeader(DataInput in, String codec, int minVersion, int maxVersion, byte[] segmentID, String segmentSuffix) throws IOException {
public static int checkIndexHeader(DataInput in, String codec, int minVersion, int maxVersion, byte[] expectedID, String expectedSuffix) throws IOException {
int version = checkHeader(in, codec, minVersion, maxVersion);
checkIndexHeaderID(in, expectedID);
checkIndexHeaderSuffix(in, expectedSuffix);
return version;
}
/** Expert: just reads and verifies the object ID of an index header */
public static byte[] checkIndexHeaderID(DataInput in, byte[] expectedID) throws IOException {
byte id[] = new byte[StringHelper.ID_LENGTH];
in.readBytes(id, 0, id.length);
if (!Arrays.equals(id, segmentID)) {
throw new CorruptIndexException("file mismatch, expected segment id=" + StringHelper.idToString(segmentID)
+ ", got=" + StringHelper.idToString(id), in);
if (!Arrays.equals(id, expectedID)) {
throw new CorruptIndexException("file mismatch, expected id=" + StringHelper.idToString(expectedID)
+ ", got=" + StringHelper.idToString(id), in);
}
return id;
}
/** Expert: just reads and verifies the suffix of an index header */
public static String checkIndexHeaderSuffix(DataInput in, String expectedSuffix) throws IOException {
int suffixLength = in.readByte() & 0xFF;
byte suffixBytes[] = new byte[suffixLength];
in.readBytes(suffixBytes, 0, suffixBytes.length);
String suffix = new String(suffixBytes, 0, suffixBytes.length, StandardCharsets.UTF_8);
if (!suffix.equals(segmentSuffix)) {
throw new CorruptIndexException("file mismatch, expected segment suffix=" + segmentSuffix
+ ", got=" + suffix, in);
if (!suffix.equals(expectedSuffix)) {
throw new CorruptIndexException("file mismatch, expected suffix=" + expectedSuffix
+ ", got=" + suffix, in);
}
return version;
return suffix;
}
/**
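To make the renamed API concrete, the following is a minimal sketch (not part of this commit) of a codec component writing the new index header on create and validating it again on open. The codec name, version constants and file name are hypothetical placeholders; only the CodecUtil calls shown above are assumed:

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexOutput;

final class IndexHeaderSketch {
  static final String CODEC_NAME = "ExampleFormat"; // hypothetical
  static final int VERSION_START = 0;               // hypothetical
  static final int VERSION_CURRENT = 0;             // hypothetical

  static void write(SegmentWriteState state, String fileName) throws IOException {
    try (IndexOutput out = state.directory.createOutput(fileName, state.context)) {
      // id + suffix identify the exact file instance, not just the owning segment
      CodecUtil.writeIndexHeader(out, CODEC_NAME, VERSION_CURRENT,
                                 state.segmentInfo.getId(), state.segmentSuffix);
      // ... file payload goes here ...
      CodecUtil.writeFooter(out);
    }
  }

  static void read(SegmentReadState state, String fileName) throws IOException {
    try (ChecksumIndexInput in = state.directory.openChecksumInput(fileName, state.context)) {
      // throws CorruptIndexException if codec, id or suffix do not match
      CodecUtil.checkIndexHeader(in, CODEC_NAME, VERSION_START, VERSION_CURRENT,
                                 state.segmentInfo.getId(), state.segmentSuffix);
      // ... file payload goes here ...
      CodecUtil.checkFooter(in);
    }
  }
}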


@ -1,55 +0,0 @@
package org.apache.lucene.codecs;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SegmentReadState;
/**
* Provides a {@link PostingsReaderBase} and {@link
* PostingsWriterBase}.
*
* @lucene.experimental */
// TODO: find a better name; this defines the API that the
// terms dict impls use to talk to a postings impl.
// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer
// can we clean this up and do this some other way?
// refactor some of these classes and use covariant return?
public abstract class PostingsBaseFormat {
/** Unique name that's used to retrieve this codec when
* reading the index */
public final String name;
/** Sole constructor. */
protected PostingsBaseFormat(String name) {
this.name = name;
}
/** Creates the {@link PostingsReaderBase} for this
* format. */
public abstract PostingsReaderBase postingsReaderBase(SegmentReadState state) throws IOException;
/** Creates the {@link PostingsWriterBase} for this
* format. */
public abstract PostingsWriterBase postingsWriterBase(SegmentWriteState state) throws IOException;
}


@ -23,6 +23,7 @@ import java.io.IOException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
@ -53,7 +54,7 @@ public abstract class PostingsReaderBase implements Closeable, Accountable {
/** Performs any initialization, such as reading and
* verifying the header from the provided terms
* dictionary {@link IndexInput}. */
public abstract void init(IndexInput termsIn) throws IOException;
public abstract void init(IndexInput termsIn, SegmentReadState state) throws IOException;
/** Return a newly created empty TermState */
public abstract BlockTermState newTermState() throws IOException;
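A small caller-side sketch (not part of this commit) of what the new signature enables: the terms dictionary validates its own index header and then passes both the terms input and the read state down, so the postings reader can verify its portion against the same segment id and suffix. The codec name and version bounds are hypothetical:

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.IndexInput;

final class PostingsReaderInitSketch {
  static void openTermsDict(IndexInput termsIn, SegmentReadState state,
                            PostingsReaderBase postingsReader) throws IOException {
    CodecUtil.checkIndexHeader(termsIn, "ExampleTermsDict", 0, 0,
                               state.segmentInfo.getId(), state.segmentSuffix);
    // Have the postings reader read/verify its own header and parameters.
    postingsReader.init(termsIn, state);
  }
}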


@ -24,6 +24,7 @@ import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
import org.apache.lucene.index.DocsAndPositionsEnum; // javadocs
import org.apache.lucene.index.DocsEnum; // javadocs
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
@ -50,7 +51,7 @@ public abstract class PostingsWriterBase implements Closeable {
/** Called once after startup, before any terms have been
* added. Implementations typically write a header to
* the provided {@code termsOut}. */
public abstract void init(IndexOutput termsOut) throws IOException;
public abstract void init(IndexOutput termsOut, SegmentWriteState state) throws IOException;
/** Write all postings for one term; use the provided
* {@link TermsEnum} to pull a {@link DocsEnum} or {@link


@ -24,8 +24,6 @@ import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
@ -72,11 +70,6 @@ public abstract class PushPostingsWriterBase extends PostingsWriterBase {
protected PushPostingsWriterBase() {
}
/** Called once after startup, before any terms have been
* added. Implementations typically write a header to
* the provided {@code termsOut}. */
public abstract void init(IndexOutput termsOut) throws IOException;
/** Return a newly created empty TermState */
public abstract BlockTermState newTermState() throws IOException;
@ -90,26 +83,11 @@ public abstract class PushPostingsWriterBase extends PostingsWriterBase {
* and will holds metadata from PBF when returned */
public abstract void finishTerm(BlockTermState state) throws IOException;
/**
* Encode metadata as long[] and byte[]. {@code absolute} controls whether
* current term is delta encoded according to latest term.
* Usually elements in {@code longs} are file pointers, so each one always
* increases when a new term is consumed. {@code out} is used to write generic
* bytes, which are not monotonic.
*
* NOTE: sometimes long[] might contain "don't care" values that are unused, e.g.
* the pointer to postings list may not be defined for some terms but is defined
* for others, if it is designed to inline some postings data in term dictionary.
* In this case, the postings writer should always use the last value, so that each
* element in metadata long[] remains monotonic.
*/
public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
/**
* Sets the current field for writing, and returns the
* fixed length of long[] metadata (which is fixed per
* field), called when the writing switches to another field. */
// TODO: better name?
@Override
public int setField(FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
indexOptions = fieldInfo.getIndexOptions();


@ -41,10 +41,11 @@ public abstract class SegmentInfoFormat {
* Read {@link SegmentInfo} data from a directory.
* @param directory directory to read from
* @param segmentName name of the segment to read
* @param segmentID expected identifier for the segment
* @return infos instance to be populated with data
* @throws IOException If an I/O error occurs
*/
public abstract SegmentInfo read(Directory directory, String segmentName, IOContext context) throws IOException;
public abstract SegmentInfo read(Directory directory, String segmentName, byte segmentID[], IOContext context) throws IOException;
/**
* Write {@link SegmentInfo} data.
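As a hedged sketch (not part of this commit), an implementation of the new read signature can verify the file against the expected segment ID via checkIndexHeader; the codec name, extension and versions below are hypothetical placeholders, and the payload decoding is elided:

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;

abstract class SegmentInfoReadSketch extends SegmentInfoFormat {
  static final String CODEC_NAME = "ExampleSegmentInfo"; // hypothetical
  static final String SI_EXTENSION = "xsi";              // hypothetical

  @Override
  public SegmentInfo read(Directory dir, String segmentName, byte[] segmentID, IOContext context) throws IOException {
    String fileName = IndexFileNames.segmentFileName(segmentName, "", SI_EXTENSION);
    try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) {
      // Rejects the file if its embedded ID differs from the expected segmentID.
      CodecUtil.checkIndexHeader(input, CODEC_NAME, 0, 0, segmentID, "");
      // ... decode the SegmentInfo payload here ...
      CodecUtil.checkFooter(input);
      return null; // placeholder: a real implementation returns the decoded SegmentInfo
    }
  }
}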


@ -30,17 +30,16 @@ import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.Outputs;
/** A block-based terms index and dictionary that assigns
* terms to variable length blocks according to how they
@ -74,8 +73,31 @@ import org.apache.lucene.util.IOUtils;
public final class BlockTreeTermsReader extends FieldsProducer {
static final Outputs<BytesRef> FST_OUTPUTS = ByteSequenceOutputs.getSingleton();
static final BytesRef NO_OUTPUT = FST_OUTPUTS.getNoOutput();
static final int OUTPUT_FLAGS_NUM_BITS = 2;
static final int OUTPUT_FLAGS_MASK = 0x3;
static final int OUTPUT_FLAG_IS_FLOOR = 0x1;
static final int OUTPUT_FLAG_HAS_TERMS = 0x2;
/** Extension of terms file */
static final String TERMS_EXTENSION = "tim";
final static String TERMS_CODEC_NAME = "BlockTreeTermsDict";
/** Initial terms format. */
public static final int VERSION_START = 0;
/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_START;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
final static String TERMS_INDEX_CODEC_NAME = "BlockTreeTermsIndex";
// Open input to the main terms dict file (_X.tib)
final IndexInput in;
final IndexInput termsIn;
//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
@ -96,105 +118,86 @@ public final class BlockTreeTermsReader extends FieldsProducer {
private final int version;
/** Sole constructor. */
public BlockTreeTermsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo info,
PostingsReaderBase postingsReader, IOContext ioContext,
String segmentSuffix)
throws IOException {
this.postingsReader = postingsReader;
this.segment = info.name;
in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, BlockTreeTermsWriter.TERMS_EXTENSION),
ioContext);
public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state) throws IOException {
boolean success = false;
IndexInput indexIn = null;
this.postingsReader = postingsReader;
this.segment = state.segmentInfo.name;
String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);
try {
version = readHeader(in);
indexIn = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, BlockTreeTermsWriter.TERMS_INDEX_EXTENSION),
ioContext);
int indexVersion = readIndexHeader(indexIn);
if (indexVersion != version) {
throw new CorruptIndexException("mixmatched version files: " + in + "=" + version + "," + indexIn + "=" + indexVersion, indexIn);
}
termsIn = state.directory.openInput(termsName, state.context);
version = CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
// verify
if (version >= BlockTreeTermsWriter.VERSION_CHECKSUM) {
CodecUtil.checksumEntireFile(indexIn);
}
String indexName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_INDEX_EXTENSION);
indexIn = state.directory.openInput(indexName, state.context);
CodecUtil.checkIndexHeader(indexIn, TERMS_INDEX_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.checksumEntireFile(indexIn);
// Have PostingsReader init itself
postingsReader.init(in);
postingsReader.init(termsIn, state);
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
if (version >= BlockTreeTermsWriter.VERSION_CHECKSUM) {
CodecUtil.retrieveChecksum(in);
}
CodecUtil.retrieveChecksum(termsIn);
// Read per-field details
seekDir(in, dirOffset);
seekDir(termsIn, dirOffset);
seekDir(indexIn, indexDirOffset);
final int numFields = in.readVInt();
final int numFields = termsIn.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, in);
throw new CorruptIndexException("invalid numFields: " + numFields, termsIn);
}
for(int i=0;i<numFields;i++) {
final int field = in.readVInt();
final long numTerms = in.readVLong();
for (int i = 0; i < numFields; ++i) {
final int field = termsIn.readVInt();
final long numTerms = termsIn.readVLong();
if (numTerms <= 0) {
throw new CorruptIndexException("Illegal numTerms for field number: " + field, in);
throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsIn);
}
final int numBytes = in.readVInt();
final int numBytes = termsIn.readVInt();
if (numBytes < 0) {
throw new CorruptIndexException("invalid rootCode for field number: " + field + ", numBytes=" + numBytes, in);
throw new CorruptIndexException("invalid rootCode for field number: " + field + ", numBytes=" + numBytes, termsIn);
}
final BytesRef rootCode = new BytesRef(new byte[numBytes]);
in.readBytes(rootCode.bytes, 0, numBytes);
termsIn.readBytes(rootCode.bytes, 0, numBytes);
rootCode.length = numBytes;
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
throw new CorruptIndexException("invalid field number: " + field, in);
throw new CorruptIndexException("invalid field number: " + field, termsIn);
}
final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
final long sumDocFreq = in.readVLong();
final int docCount = in.readVInt();
final int longsSize = version >= BlockTreeTermsWriter.VERSION_META_ARRAY ? in.readVInt() : 0;
final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : termsIn.readVLong();
final long sumDocFreq = termsIn.readVLong();
final int docCount = termsIn.readVInt();
final int longsSize = termsIn.readVInt();
if (longsSize < 0) {
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, in);
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn);
}
BytesRef minTerm, maxTerm;
if (version >= BlockTreeTermsWriter.VERSION_MIN_MAX_TERMS) {
minTerm = readBytesRef(in);
maxTerm = readBytesRef(in);
} else {
minTerm = maxTerm = null;
}
if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount(), in);
BytesRef minTerm = readBytesRef(termsIn);
BytesRef maxTerm = readBytesRef(termsIn);
if (docCount < 0 || docCount > state.segmentInfo.getDocCount()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.getDocCount(), termsIn);
}
if (sumDocFreq < docCount) { // #postings must be >= #docs with field
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, in);
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsIn);
}
if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, in);
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsIn);
}
final long indexStartFP = indexIn.readVLong();
FieldReader previous = fields.put(fieldInfo.name,
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
indexStartFP, longsSize, indexIn, minTerm, maxTerm));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in);
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
}
}
indexIn.close();
success = true;
} finally {
if (!success) {
@ -212,38 +215,11 @@ public final class BlockTreeTermsReader extends FieldsProducer {
return bytes;
}
/** Reads terms file header. */
private int readHeader(IndexInput input) throws IOException {
int version = CodecUtil.checkHeader(input, BlockTreeTermsWriter.TERMS_CODEC_NAME,
BlockTreeTermsWriter.VERSION_START,
BlockTreeTermsWriter.VERSION_CURRENT);
if (version < BlockTreeTermsWriter.VERSION_APPEND_ONLY) {
dirOffset = input.readLong();
}
return version;
}
/** Reads index file header. */
private int readIndexHeader(IndexInput input) throws IOException {
int version = CodecUtil.checkHeader(input, BlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME,
BlockTreeTermsWriter.VERSION_START,
BlockTreeTermsWriter.VERSION_CURRENT);
if (version < BlockTreeTermsWriter.VERSION_APPEND_ONLY) {
indexDirOffset = input.readLong();
}
return version;
}
/** Seek {@code input} to the directory offset. */
private void seekDir(IndexInput input, long dirOffset)
throws IOException {
if (version >= BlockTreeTermsWriter.VERSION_CHECKSUM) {
input.seek(input.length() - CodecUtil.footerLength() - 8);
dirOffset = input.readLong();
} else if (version >= BlockTreeTermsWriter.VERSION_APPEND_ONLY) {
input.seek(input.length() - 8);
dirOffset = input.readLong();
}
input.seek(input.length() - CodecUtil.footerLength() - 8);
dirOffset = input.readLong();
input.seek(dirOffset);
}
@ -255,7 +231,7 @@ public final class BlockTreeTermsReader extends FieldsProducer {
@Override
public void close() throws IOException {
try {
IOUtils.close(in, postingsReader);
IOUtils.close(termsIn, postingsReader);
} finally {
// Clear so refs to terms index is GCable even if
// app hangs onto us:
@ -313,14 +289,12 @@ public final class BlockTreeTermsReader extends FieldsProducer {
}
@Override
public void checkIntegrity() throws IOException {
if (version >= BlockTreeTermsWriter.VERSION_CHECKSUM) {
// term dictionary
CodecUtil.checksumEntireFile(in);
public void checkIntegrity() throws IOException {
// term dictionary
CodecUtil.checksumEntireFile(termsIn);
// postings
postingsReader.checkIntegrity();
}
// postings
postingsReader.checkIntegrity();
}
@Override


@ -41,14 +41,12 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.PackedInts;
@ -192,10 +190,6 @@ import org.apache.lucene.util.packed.PackedInts;
*/
public final class BlockTreeTermsWriter extends FieldsConsumer {
static final Outputs<BytesRef> FST_OUTPUTS = ByteSequenceOutputs.getSingleton();
static final BytesRef NO_OUTPUT = FST_OUTPUTS.getNoOutput();
/** Suggested default value for the {@code
* minItemsInBlock} parameter to {@link
* #BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */
@ -209,38 +203,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// public final static boolean DEBUG = false;
//private final static boolean SAVE_DOT_FILES = false;
static final int OUTPUT_FLAGS_NUM_BITS = 2;
static final int OUTPUT_FLAGS_MASK = 0x3;
static final int OUTPUT_FLAG_IS_FLOOR = 0x1;
static final int OUTPUT_FLAG_HAS_TERMS = 0x2;
/** Extension of terms file */
static final String TERMS_EXTENSION = "tim";
final static String TERMS_CODEC_NAME = "BLOCK_TREE_TERMS_DICT";
/** Initial terms format. */
public static final int VERSION_START = 0;
/** Append-only */
public static final int VERSION_APPEND_ONLY = 1;
/** Meta data as array */
public static final int VERSION_META_ARRAY = 2;
/** checksums */
public static final int VERSION_CHECKSUM = 3;
/** min/max term */
public static final int VERSION_MIN_MAX_TERMS = 4;
/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_MIN_MAX_TERMS;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
final static String TERMS_INDEX_CODEC_NAME = "BLOCK_TREE_TERMS_INDEX";
private final IndexOutput out;
private final IndexOutput termsOut;
private final IndexOutput indexOut;
final int maxDoc;
final int minItemsInBlock;
@ -286,8 +249,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
* sub-blocks) per block will aim to be between
* minItemsPerBlock and maxItemsPerBlock, though in some
* cases the blocks may be smaller than the min. */
public BlockTreeTermsWriter(
SegmentWriteState state,
public BlockTreeTermsWriter(SegmentWriteState state,
PostingsWriterBase postingsWriter,
int minItemsInBlock,
int maxItemsInBlock)
@ -306,47 +268,34 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
throw new IllegalArgumentException("maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock);
}
maxDoc = state.segmentInfo.getDocCount();
this.maxDoc = state.segmentInfo.getDocCount();
this.fieldInfos = state.fieldInfos;
this.minItemsInBlock = minItemsInBlock;
this.maxItemsInBlock = maxItemsInBlock;
this.postingsWriter = postingsWriter;
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
out = state.directory.createOutput(termsFileName, state.context);
final String termsName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_EXTENSION);
termsOut = state.directory.createOutput(termsName, state.context);
boolean success = false;
IndexOutput indexOut = null;
try {
fieldInfos = state.fieldInfos;
this.minItemsInBlock = minItemsInBlock;
this.maxItemsInBlock = maxItemsInBlock;
writeHeader(out);
CodecUtil.writeIndexHeader(termsOut, BlockTreeTermsReader.TERMS_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
//DEBUG = state.segmentName.equals("_4a");
final String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_INDEX_EXTENSION);
indexOut = state.directory.createOutput(indexName, state.context);
CodecUtil.writeIndexHeader(indexOut, BlockTreeTermsReader.TERMS_INDEX_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
indexOut = state.directory.createOutput(termsIndexFileName, state.context);
writeIndexHeader(indexOut);
this.postingsWriter = postingsWriter;
// segment = state.segmentInfo.name;
// System.out.println("BTW.init seg=" + state.segmentName);
postingsWriter.init(out); // have consumer write its format/header
postingsWriter.init(termsOut, state); // have consumer write its format/header
this.indexOut = indexOut;
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(out, indexOut);
IOUtils.closeWhileHandlingException(termsOut, indexOut);
}
}
this.indexOut = indexOut;
}
/** Writes the terms file header. */
private void writeHeader(IndexOutput out) throws IOException {
CodecUtil.writeHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT);
}
/** Writes the index file header. */
private void writeIndexHeader(IndexOutput out) throws IOException {
CodecUtil.writeHeader(out, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT);
}
/** Writes the terms file trailer. */
@ -389,7 +338,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) {
assert fp < (1L << 62);
return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0);
return (fp << 2) | (hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0);
}
private static class PendingEntry {
@ -686,7 +635,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
assert end > start;
long startFP = out.getFilePointer();
long startFP = termsOut.getFilePointer();
boolean hasFloorLeadLabel = isFloor && floorLeadLabel != -1;
@ -701,7 +650,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// Last block:
code |= 1;
}
out.writeVInt(code);
termsOut.writeVInt(code);
/*
if (DEBUG) {
@ -847,18 +796,18 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// search on lookup
// Write suffixes byte[] blob to terms dict output:
out.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0));
suffixWriter.writeTo(out);
termsOut.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0));
suffixWriter.writeTo(termsOut);
suffixWriter.reset();
// Write term stats byte[] blob
out.writeVInt((int) statsWriter.getFilePointer());
statsWriter.writeTo(out);
termsOut.writeVInt((int) statsWriter.getFilePointer());
statsWriter.writeTo(termsOut);
statsWriter.reset();
// Write term meta data byte[] blob
out.writeVInt((int) metaWriter.getFilePointer());
metaWriter.writeTo(out);
termsOut.writeVInt((int) metaWriter.getFilePointer());
metaWriter.writeTo(termsOut);
metaWriter.reset();
// if (DEBUG) {
@ -1013,38 +962,38 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
boolean success = false;
try {
final long dirStart = out.getFilePointer();
final long dirStart = termsOut.getFilePointer();
final long indexDirStart = indexOut.getFilePointer();
out.writeVInt(fields.size());
termsOut.writeVInt(fields.size());
for(FieldMetaData field : fields) {
//System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms");
out.writeVInt(field.fieldInfo.number);
termsOut.writeVInt(field.fieldInfo.number);
assert field.numTerms > 0;
out.writeVLong(field.numTerms);
out.writeVInt(field.rootCode.length);
out.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length);
termsOut.writeVLong(field.numTerms);
termsOut.writeVInt(field.rootCode.length);
termsOut.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length);
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
out.writeVLong(field.sumTotalTermFreq);
termsOut.writeVLong(field.sumTotalTermFreq);
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
out.writeVInt(field.longsSize);
termsOut.writeVLong(field.sumDocFreq);
termsOut.writeVInt(field.docCount);
termsOut.writeVInt(field.longsSize);
indexOut.writeVLong(field.indexStartFP);
writeBytesRef(out, field.minTerm);
writeBytesRef(out, field.maxTerm);
writeBytesRef(termsOut, field.minTerm);
writeBytesRef(termsOut, field.maxTerm);
}
writeTrailer(out, dirStart);
CodecUtil.writeFooter(out);
writeTrailer(termsOut, dirStart);
CodecUtil.writeFooter(termsOut);
writeIndexTrailer(indexOut, indexDirStart);
CodecUtil.writeFooter(indexOut);
success = true;
} finally {
if (success) {
IOUtils.close(out, indexOut, postingsWriter);
IOUtils.close(termsOut, indexOut, postingsWriter);
} else {
IOUtils.closeWhileHandlingException(out, indexOut, postingsWriter);
IOUtils.closeWhileHandlingException(termsOut, indexOut, postingsWriter);
}
}
}

View File

@ -34,8 +34,10 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
/** BlockTree's implementation of {@link Terms}. */
// public for CheckIndex:
/**
* BlockTree's implementation of {@link Terms}.
* @lucene.internal
*/
public final class FieldReader extends Terms implements Accountable {
private static final long BASE_RAM_BYTES_USED =
@ -77,7 +79,7 @@ public final class FieldReader extends Terms implements Accountable {
// System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor);
// }
rootBlockFP = (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() >>> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;
rootBlockFP = (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() >>> BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
if (indexIn != null) {
final IndexInput clone = indexIn.clone();
@ -120,8 +122,8 @@ public final class FieldReader extends Terms implements Accountable {
}
/** For debugging -- used by CheckIndex too*/
// TODO: maybe push this into Terms?
public Stats computeStats() throws IOException {
@Override
public Stats getStats() throws IOException {
return new SegmentTermsEnum(this).computeBlockStats();
}
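With computeStats() now exposed as an @Override of getStats() on the Terms API, the block-tree statistics stay reachable through the public FieldReader, which is how CheckIndex uses them. A minimal sketch, assuming a LeafReader over a segment whose postings are block-tree based; the containing class and helper name are illustrative only:

  import java.io.IOException;
  import org.apache.lucene.codecs.blocktree.FieldReader;
  import org.apache.lucene.codecs.blocktree.Stats;
  import org.apache.lucene.index.LeafReader;
  import org.apache.lucene.index.Terms;

  final class BlockTreeStatsDump {                       // illustrative helper, not part of this commit
    static void dump(LeafReader reader, String field) throws IOException {
      Terms terms = reader.terms(field);
      if (terms instanceof FieldReader) {                // only the block-tree Terms impl exposes Stats
        Stats stats = ((FieldReader) terms).getStats();  // walks every block via computeBlockStats()
        System.out.println(stats);                       // Stats renders a human-readable summary
      }
    }
  }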

View File

@ -67,7 +67,7 @@ final class IntersectTermsEnum extends TermsEnum {
this.fr = fr;
runAutomaton = compiled.runAutomaton;
compiledAutomaton = compiled;
in = fr.parent.in.clone();
in = fr.parent.termsIn.clone();
stack = new IntersectTermsEnumFrame[5];
for(int idx=0;idx<stack.length;idx++) {
stack[idx] = new IntersectTermsEnumFrame(this, idx);

View File

@ -145,7 +145,7 @@ final class IntersectTermsEnumFrame {
// Skip first long -- has redundant fp, hasTerms
// flag, isFloor flag
final long code = floorDataReader.readVLong();
if ((code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0) {
if ((code & BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff;
// if (DEBUG) System.out.println(" numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel);

View File

@ -113,7 +113,7 @@ final class SegmentTermsEnum extends TermsEnum {
// Not private to avoid synthetic access$NNN methods
void initIndexInput() {
if (this.in == null) {
this.in = fr.parent.in.clone();
this.in = fr.parent.termsIn.clone();
}
}
@ -238,11 +238,11 @@ final class SegmentTermsEnum extends TermsEnum {
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length) throws IOException {
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
final long code = scratchReader.readVLong();
final long fpSeek = code >>> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;
final long fpSeek = code >>> BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
final SegmentTermsEnumFrame f = getFrame(1+currentFrame.ord);
f.hasTerms = (code & BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
f.hasTerms = (code & BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
f.hasTermsOrig = f.hasTerms;
f.isFloor = (code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;
f.isFloor = (code & BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0;
if (f.isFloor) {
f.setFloorData(scratchReader, frameData);
}
@ -370,8 +370,8 @@ final class SegmentTermsEnum extends TermsEnum {
}
arc = arcs[1+targetUpto];
assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
if (arc.output != BlockTreeTermsWriter.NO_OUTPUT) {
output = BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
if (arc.output != BlockTreeTermsReader.NO_OUTPUT) {
output = BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output);
}
if (arc.isFinal()) {
lastFrame = stack[1+lastFrame.ord];
@ -461,7 +461,7 @@ final class SegmentTermsEnum extends TermsEnum {
//term.length = 0;
targetUpto = 0;
currentFrame = pushFrame(arc, BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
currentFrame = pushFrame(arc, BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
}
// if (DEBUG) {
@ -517,8 +517,8 @@ final class SegmentTermsEnum extends TermsEnum {
term.setByteAt(targetUpto, (byte) targetLabel);
// Aggregate output as we go:
assert arc.output != null;
if (arc.output != BlockTreeTermsWriter.NO_OUTPUT) {
output = BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
if (arc.output != BlockTreeTermsReader.NO_OUTPUT) {
output = BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output);
}
// if (DEBUG) {
@ -528,7 +528,7 @@ final class SegmentTermsEnum extends TermsEnum {
if (arc.isFinal()) {
//if (DEBUG) System.out.println(" arc is final!");
currentFrame = pushFrame(arc, BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
currentFrame = pushFrame(arc, BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
//if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
}
}
@ -612,7 +612,7 @@ final class SegmentTermsEnum extends TermsEnum {
int cmp = 0;
// TOOD: we should write our vLong backwards (MSB
// TODO: we should write our vLong backwards (MSB
// first) to get better sharing from the FST
// First compare up to valid seek frames:
@ -626,13 +626,13 @@ final class SegmentTermsEnum extends TermsEnum {
}
arc = arcs[1+targetUpto];
assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
// TOOD: we could save the outputs in local
// TODO: we could save the outputs in local
// byte[][] instead of making new objs ever
// seek; but, often the FST doesn't have any
// shared bytes (but this could change if we
// reverse vLong byte order)
if (arc.output != BlockTreeTermsWriter.NO_OUTPUT) {
output = BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
if (arc.output != BlockTreeTermsReader.NO_OUTPUT) {
output = BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output);
}
if (arc.isFinal()) {
lastFrame = stack[1+lastFrame.ord];
@ -717,7 +717,7 @@ final class SegmentTermsEnum extends TermsEnum {
//term.length = 0;
targetUpto = 0;
currentFrame = pushFrame(arc, BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
currentFrame = pushFrame(arc, BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
}
//if (DEBUG) {
@ -773,8 +773,8 @@ final class SegmentTermsEnum extends TermsEnum {
arc = nextArc;
// Aggregate output as we go:
assert arc.output != null;
if (arc.output != BlockTreeTermsWriter.NO_OUTPUT) {
output = BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
if (arc.output != BlockTreeTermsReader.NO_OUTPUT) {
output = BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output);
}
//if (DEBUG) {
@ -784,7 +784,7 @@ final class SegmentTermsEnum extends TermsEnum {
if (arc.isFinal()) {
//if (DEBUG) System.out.println(" arc is final!");
currentFrame = pushFrame(arc, BlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
currentFrame = pushFrame(arc, BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
//if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
}
}
@ -831,9 +831,9 @@ final class SegmentTermsEnum extends TermsEnum {
assert f != null;
final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefix);
if (f.nextEnt == -1) {
out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<< BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
} else {
out.println(" frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + " nextEnt=" + f.nextEnt + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
out.println(" frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + " nextEnt=" + f.nextEnt + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<< BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR:0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
}
if (fr.index != null) {
assert !isSeekFrame || f.arc != null: "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
@ -848,7 +848,7 @@ final class SegmentTermsEnum extends TermsEnum {
} else if (isSeekFrame && !f.isFloor) {
final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length);
final long codeOrig = reader.readVLong();
final long code = (f.fp << BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) | (f.isFloor ? BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0);
final long code = (f.fp << BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS:0) | (f.isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR:0);
if (codeOrig != code) {
out.println(" broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code);
throw new RuntimeException("seek state is broken");

View File

@ -22,14 +22,15 @@ import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.Locale;
import org.apache.lucene.codecs.PostingsBaseFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/**
* BlockTree statistics for a single field
* returned by {@link FieldReader#computeStats()}.
* returned by {@link FieldReader#getStats()}.
* @lucene.internal
*/
public class Stats {
/** How many nodes in the index FST. */
@ -81,11 +82,11 @@ public class Stats {
public long totalBlockSuffixBytes;
/** Total number of bytes used to store term stats (not
* including what the {@link PostingsBaseFormat}
* including what the {@link PostingsReaderBase}
* stores. */
public long totalBlockStatsBytes;
/** Total bytes stored by the {@link PostingsBaseFormat},
/** Total bytes stored by the {@link PostingsReaderBase},
* plus the other few vInts stored in the frame. */
public long totalBlockOtherBytes;

View File

@ -27,7 +27,7 @@ This terms dictionary organizes all terms into blocks according to
shared prefix, such that each block has enough terms, and then stores
the prefix trie in memory as an FST as the index structure. It allows
you to plug in your own {@link
org.apache.lucene.codecs.PostingsBaseFormat} to implement the
org.apache.lucene.codecs.PostingsWriterBase} to implement the
postings.
</p>
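To make that plug-in point concrete, here is a hedged sketch of the write side of a custom postings format wired to this terms dictionary. MyPostingsWriter, the "MyPostings" name, and the 25/48 block sizes are illustrative rather than part of this commit, and the read side is stubbed out:

  import java.io.IOException;
  import org.apache.lucene.codecs.FieldsConsumer;
  import org.apache.lucene.codecs.FieldsProducer;
  import org.apache.lucene.codecs.PostingsFormat;
  import org.apache.lucene.codecs.PostingsWriterBase;
  import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
  import org.apache.lucene.index.SegmentReadState;
  import org.apache.lucene.index.SegmentWriteState;
  import org.apache.lucene.util.IOUtils;

  public final class MyPostingsFormat extends PostingsFormat {
    public MyPostingsFormat() {
      super("MyPostings");                                              // hypothetical SPI name
    }

    @Override
    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
      PostingsWriterBase postingsWriter = new MyPostingsWriter(state);  // hypothetical PostingsWriterBase impl
      boolean success = false;
      try {
        // min/max items per block control how terms are grouped into blocks
        FieldsConsumer ret = new BlockTreeTermsWriter(state, postingsWriter, 25, 48);
        success = true;
        return ret;                                                     // the terms writer now owns, and will close, postingsWriter
      } finally {
        if (!success) {
          IOUtils.closeWhileHandlingException(postingsWriter);
        }
      }
    }

    @Override
    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
      throw new UnsupportedOperationException("read side omitted from this sketch");
    }
  }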

View File

@ -64,7 +64,7 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
* <p>
* <code>formatName</code> is the name of the format. This name will be used
* in the file formats to perform
* {@link CodecUtil#checkSegmentHeader codec header checks}.
* {@link CodecUtil#checkIndexHeader codec header checks}.
* <p>
* <code>segmentSuffix</code> is the segment suffix. This suffix is added to
* the result file name only if it's not the empty string.

View File

@ -118,8 +118,8 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
Throwable priorE = null;
try {
final String codecNameIdx = formatName + CODEC_SFX_IDX;
version = CodecUtil.checkSegmentHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
assert CodecUtil.segmentHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer();
version = CodecUtil.checkIndexHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer();
indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);
maxPointer = indexStream.readVLong();
} catch (Throwable exception) {
@ -141,11 +141,11 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
throw new CorruptIndexException("Invalid fieldsStream maxPointer (file truncated?): maxPointer=" + maxPointer + ", length=" + fieldsStream.length(), fieldsStream);
}
final String codecNameDat = formatName + CODEC_SFX_DAT;
final int fieldsVersion = CodecUtil.checkSegmentHeader(fieldsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
final int fieldsVersion = CodecUtil.checkIndexHeader(fieldsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
if (version != fieldsVersion) {
throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + fieldsVersion, fieldsStream);
}
assert CodecUtil.segmentHeaderLength(codecNameDat, segmentSuffix) == fieldsStream.getFilePointer();
assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == fieldsStream.getFilePointer();
chunkSize = fieldsStream.readVInt();
packedIntsVersion = fieldsStream.readVInt();

View File

@ -29,13 +29,9 @@ import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -118,10 +114,10 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
final String codecNameIdx = formatName + CODEC_SFX_IDX;
final String codecNameDat = formatName + CODEC_SFX_DAT;
CodecUtil.writeSegmentHeader(indexStream, codecNameIdx, VERSION_CURRENT, si.getId(), segmentSuffix);
CodecUtil.writeSegmentHeader(fieldsStream, codecNameDat, VERSION_CURRENT, si.getId(), segmentSuffix);
assert CodecUtil.segmentHeaderLength(codecNameDat, segmentSuffix) == fieldsStream.getFilePointer();
assert CodecUtil.segmentHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer();
CodecUtil.writeIndexHeader(indexStream, codecNameIdx, VERSION_CURRENT, si.getId(), segmentSuffix);
CodecUtil.writeIndexHeader(fieldsStream, codecNameDat, VERSION_CURRENT, si.getId(), segmentSuffix);
assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == fieldsStream.getFilePointer();
assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer();
indexWriter = new CompressingStoredFieldsIndexWriter(indexStream);
indexStream = null;

View File

@ -46,7 +46,7 @@ public class CompressingTermVectorsFormat extends TermVectorsFormat {
* <p>
* <code>formatName</code> is the name of the format. This name will be used
* in the file formats to perform
* {@link CodecUtil#checkSegmentHeader codec header checks}.
* {@link CodecUtil#checkIndexHeader codec header checks}.
* <p>
* The <code>compressionMode</code> parameter allows you to choose between
* compression algorithms that have various compression and decompression

View File

@ -114,8 +114,8 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem
Throwable priorE = null;
try {
final String codecNameIdx = formatName + CODEC_SFX_IDX;
version = CodecUtil.checkSegmentHeader(input, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
assert CodecUtil.segmentHeaderLength(codecNameIdx, segmentSuffix) == input.getFilePointer();
version = CodecUtil.checkIndexHeader(input, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == input.getFilePointer();
indexReader = new CompressingStoredFieldsIndexReader(input, si);
input.readVLong(); // the end of the data file
} catch (Throwable exception) {
@ -133,11 +133,11 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem
final String vectorsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
vectorsStream = d.openInput(vectorsStreamFN, context);
final String codecNameDat = formatName + CODEC_SFX_DAT;
int version2 = CodecUtil.checkSegmentHeader(vectorsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
int version2 = CodecUtil.checkIndexHeader(vectorsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
if (version != version2) {
throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + version2, vectorsStream);
}
assert CodecUtil.segmentHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer();
assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer();
long pos = vectorsStream.getFilePointer();
// NOTE: data file is too costly to verify checksum against all the bytes on open,

View File

@ -32,11 +32,8 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.store.BufferedChecksumIndexInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
@ -231,10 +228,10 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
final String codecNameIdx = formatName + CODEC_SFX_IDX;
final String codecNameDat = formatName + CODEC_SFX_DAT;
CodecUtil.writeSegmentHeader(indexStream, codecNameIdx, VERSION_CURRENT, si.getId(), segmentSuffix);
CodecUtil.writeSegmentHeader(vectorsStream, codecNameDat, VERSION_CURRENT, si.getId(), segmentSuffix);
assert CodecUtil.segmentHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer();
assert CodecUtil.segmentHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer();
CodecUtil.writeIndexHeader(indexStream, codecNameIdx, VERSION_CURRENT, si.getId(), segmentSuffix);
CodecUtil.writeIndexHeader(vectorsStream, codecNameDat, VERSION_CURRENT, si.getId(), segmentSuffix);
assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer();
assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer();
indexWriter = new CompressingStoredFieldsIndexWriter(indexStream);
indexStream = null;

View File

@ -1,51 +0,0 @@
package org.apache.lucene.codecs.lucene41;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.PostingsBaseFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
/**
* Provides a {@link PostingsReaderBase} and {@link
* PostingsWriterBase}.
*
* @lucene.experimental */
// TODO: should these also be named / looked up via SPI?
public final class Lucene41PostingsBaseFormat extends PostingsBaseFormat {
/** Sole constructor. */
public Lucene41PostingsBaseFormat() {
super("Lucene41");
}
@Override
public PostingsReaderBase postingsReaderBase(SegmentReadState state) throws IOException {
return new Lucene41PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
}
@Override
public PostingsWriterBase postingsWriterBase(SegmentWriteState state) throws IOException {
return new Lucene41PostingsWriter(state);
}
}

View File

@ -1,25 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Lucene 4.1 file format.
</body>
</html>

View File

@ -1,25 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Lucene 4.10 file format.
</body>
</html>
Some files were not shown because too many files have changed in this diff.