Remove dead codec code.

Adrien Grand 2018-02-20 18:57:20 +01:00
parent 4bb705bad8
commit ca22f17662
11 changed files with 100 additions and 1000 deletions

View File: SimpleTextBKDWriter.java

@@ -26,7 +26,6 @@ import java.util.function.IntFunction;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.MutablePointValues;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.store.ChecksumIndexInput;
@@ -44,7 +43,6 @@ import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.MSBRadixSorter;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.bkd.BKDWriter;
import org.apache.lucene.util.bkd.HeapPointWriter;
@@ -309,124 +307,6 @@ final class SimpleTextBKDWriter implements Closeable {
return pointCount;
}
private static class MergeReader {
final SimpleTextBKDReader bkd;
final SimpleTextBKDReader.IntersectState state;
final MergeState.DocMap docMap;
/** Current doc ID */
public int docID;
/** Which doc in this block we are up to */
private int docBlockUpto;
/** How many docs in the current block */
private int docsInBlock;
/** Which leaf block we are up to */
private int blockID;
private final byte[] packedValues;
public MergeReader(SimpleTextBKDReader bkd, MergeState.DocMap docMap) throws IOException {
this.bkd = bkd;
state = new SimpleTextBKDReader.IntersectState(bkd.in.clone(),
bkd.numDims,
bkd.packedBytesLength,
bkd.maxPointsInLeafNode,
null);
this.docMap = docMap;
long minFP = Long.MAX_VALUE;
//System.out.println("MR.init " + this + " bkdreader=" + bkd + " leafBlockFPs.length=" + bkd.leafBlockFPs.length);
for(long fp : bkd.leafBlockFPs) {
minFP = Math.min(minFP, fp);
//System.out.println(" leaf fp=" + fp);
}
state.in.seek(minFP);
this.packedValues = new byte[bkd.maxPointsInLeafNode * bkd.packedBytesLength];
}
public boolean next() throws IOException {
//System.out.println("MR.next this=" + this);
while (true) {
if (docBlockUpto == docsInBlock) {
if (blockID == bkd.leafBlockFPs.length) {
//System.out.println(" done!");
return false;
}
//System.out.println(" new block @ fp=" + state.in.getFilePointer());
docsInBlock = bkd.readDocIDs(state.in, state.in.getFilePointer(), state.scratchDocIDs);
assert docsInBlock > 0;
docBlockUpto = 0;
bkd.visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, docsInBlock, new IntersectVisitor() {
int i = 0;
@Override
public void visit(int docID) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public void visit(int docID, byte[] packedValue) throws IOException {
assert docID == state.scratchDocIDs[i];
System.arraycopy(packedValue, 0, packedValues, i * bkd.packedBytesLength, bkd.packedBytesLength);
i++;
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
throw new UnsupportedOperationException();
}
});
blockID++;
}
final int index = docBlockUpto++;
int oldDocID = state.scratchDocIDs[index];
int mappedDocID;
if (docMap == null) {
mappedDocID = oldDocID;
} else {
mappedDocID = docMap.get(oldDocID);
}
if (mappedDocID != -1) {
// Not deleted!
docID = mappedDocID;
System.arraycopy(packedValues, index * bkd.packedBytesLength, state.scratchPackedValue, 0, bkd.packedBytesLength);
return true;
}
}
}
}
private static class BKDMergeQueue extends PriorityQueue<MergeReader> {
private final int bytesPerDim;
public BKDMergeQueue(int bytesPerDim, int maxSize) {
super(maxSize);
this.bytesPerDim = bytesPerDim;
}
@Override
public boolean lessThan(MergeReader a, MergeReader b) {
assert a != b;
int cmp = StringHelper.compare(bytesPerDim, a.state.scratchPackedValue, 0, b.state.scratchPackedValue, 0);
if (cmp < 0) {
return true;
} else if (cmp > 0) {
return false;
}
// Tie break by sorting smaller docIDs earlier:
return a.docID < b.docID;
}
}
/** Write a field from a {@link MutablePointValues}. This way of writing
* points is faster than regular writes with {@link BKDWriter#add} since
* there is opportunity for reordering points before writing them to
@@ -527,50 +407,6 @@ final class SimpleTextBKDWriter implements Closeable {
return oneDimWriter.finish();
}
// TODO: remove this opto: SimpleText is supposed to be simple!
/** More efficient bulk-add for incoming {@link SimpleTextBKDReader}s. This does a merge sort of the already
* sorted values and currently only works when numDims==1. This returns -1 if all documents containing
* dimensional values were deleted. */
public long merge(IndexOutput out, List<MergeState.DocMap> docMaps, List<SimpleTextBKDReader> readers) throws IOException {
assert docMaps == null || readers.size() == docMaps.size();
BKDMergeQueue queue = new BKDMergeQueue(bytesPerDim, readers.size());
for(int i=0;i<readers.size();i++) {
SimpleTextBKDReader bkd = readers.get(i);
MergeState.DocMap docMap;
if (docMaps == null) {
docMap = null;
} else {
docMap = docMaps.get(i);
}
MergeReader reader = new MergeReader(bkd, docMap);
if (reader.next()) {
queue.add(reader);
}
}
OneDimensionBKDWriter oneDimWriter = new OneDimensionBKDWriter(out);
while (queue.size() != 0) {
MergeReader reader = queue.top();
// System.out.println("iter reader=" + reader);
// NOTE: doesn't work with subclasses (e.g. SimpleText!)
oneDimWriter.add(reader.state.scratchPackedValue, reader.docID);
if (reader.next()) {
queue.updateTop();
} else {
// This segment was exhausted
queue.pop();
}
}
return oneDimWriter.finish();
}
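For reference, the removed merge() above is a textbook k-way merge: every reader is already sorted, so a priority queue keyed on each reader's current head streams the union in order. Below is a minimal, self-contained sketch of that pattern using java.util.PriorityQueue instead of Lucene's; all names in it are hypothetical and not part of this commit.
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;
final class KWayMergeSketch {
  // One entry per input: its current head value plus the iterator it came from.
  private static final class Head {
    int value;
    final Iterator<Integer> source;
    Head(int value, Iterator<Integer> source) { this.value = value; this.source = source; }
  }
  /** Merges already-sorted inputs into one sorted list, mirroring BKDMergeQueue's role above. */
  static List<Integer> merge(List<Iterator<Integer>> inputs) {
    PriorityQueue<Head> queue = new PriorityQueue<>((a, b) -> Integer.compare(a.value, b.value));
    for (Iterator<Integer> it : inputs) {
      if (it.hasNext()) {
        queue.add(new Head(it.next(), it)); // seed the queue, like MergeReader.next() before queue.add
      }
    }
    List<Integer> out = new ArrayList<>();
    while (!queue.isEmpty()) {
      Head top = queue.poll(); // smallest current value across all inputs
      out.add(top.value);
      if (top.source.hasNext()) {
        top.value = top.source.next();
        queue.add(top); // re-insert; Lucene's PriorityQueue avoids this churn with updateTop()
      }
      // else: this input is exhausted, the equivalent of queue.pop() in the removed merge()
    }
    return out;
  }
}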
private class OneDimensionBKDWriter {
final IndexOutput out;

View File: SimpleTextFieldsReader.java

@@ -514,16 +514,6 @@ class SimpleTextFieldsReader extends FieldsProducer {
}
}
static class TermData {
public long docsStart;
public int docFreq;
public TermData(long docsStart, int docFreq) {
this.docsStart = docsStart;
this.docFreq = docFreq;
}
}
private static final long TERMS_BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(SimpleTextTerms.class)
+ RamUsageEstimator.shallowSizeOfInstance(BytesRef.class)

View File: BitSetPostingsEnum.java

@@ -1,95 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blocktree;
import java.io.IOException;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet; // javadocs
/** Takes a {@link FixedBitSet} and creates a DOCS {@link PostingsEnum} from it. */
class BitSetPostingsEnum extends PostingsEnum {
private final BitSet bits;
private DocIdSetIterator in;
BitSetPostingsEnum(BitSet bits) {
this.bits = bits;
reset();
}
@Override
public int freq() throws IOException {
return 1;
}
@Override
public int docID() {
if (in == null) {
return -1;
} else {
return in.docID();
}
}
@Override
public int nextDoc() throws IOException {
if (in == null) {
in = new BitSetIterator(bits, 0);
}
return in.nextDoc();
}
@Override
public int advance(int target) throws IOException {
return in.advance(target);
}
@Override
public long cost() {
return in.cost();
}
void reset() {
in = null;
}
@Override
public BytesRef getPayload() {
return null;
}
@Override
public int nextPosition() {
throw new UnsupportedOperationException();
}
@Override
public int startOffset() {
throw new UnsupportedOperationException();
}
@Override
public int endOffset() {
throw new UnsupportedOperationException();
}
}
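The deleted class above is just a PostingsEnum facade over BitSetIterator. A small sketch (not from this commit) of the underlying iteration it exposed:
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;
final class BitSetIterationSketch {
  public static void main(String[] args) throws Exception {
    FixedBitSet bits = new FixedBitSet(16); // 16 docs, all unset
    bits.set(3);
    bits.set(7);
    bits.set(12);
    // cost is an estimate of the number of set bits; 3 here
    DocIdSetIterator it = new BitSetIterator(bits, 3);
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      System.out.println("doc=" + doc); // prints 3, 7, 12; one DOCS-only hit per set bit, freq() == 1
    }
  }
}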

View File: BitSetTermsEnum.java

@@ -1,92 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blocktree;
import java.io.IOException;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;
/** Silly stub class, used only when writing an auto-prefix
* term in order to expose DocsEnum over a FixedBitSet. We
* pass this to {@link PostingsWriterBase#writeTerm} so
* that it can pull .docs() multiple times for the
* current term. */
class BitSetTermsEnum extends TermsEnum {
private final BitSetPostingsEnum postingsEnum;
public BitSetTermsEnum(BitSet docs) {
postingsEnum = new BitSetPostingsEnum(docs);
}
@Override
public SeekStatus seekCeil(BytesRef text) {
throw new UnsupportedOperationException();
}
@Override
public void seekExact(long ord) {
throw new UnsupportedOperationException();
}
@Override
public BytesRef term() {
throw new UnsupportedOperationException();
}
@Override
public BytesRef next() {
throw new UnsupportedOperationException();
}
@Override
public long ord() {
throw new UnsupportedOperationException();
}
@Override
public int docFreq() {
throw new UnsupportedOperationException();
}
@Override
public long totalTermFreq() {
throw new UnsupportedOperationException();
}
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) {
if (flags != PostingsEnum.NONE) {
// We only work with DOCS_ONLY fields
return null;
}
postingsEnum.reset();
return postingsEnum;
}
@Override
public ImpactsEnum impacts(SimScorer scorer, int flags) throws IOException {
throw new UnsupportedOperationException();
}
}
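What the stub actually buys is cheap repeated pulls of postings for the same term, per the javadoc above. A hypothetical usage sketch follows; since the class is package-private it only compiles from within org.apache.lucene.codecs.blocktree:
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.FixedBitSet;
final class BitSetTermsEnumSketch {
  static void demo() throws Exception {
    FixedBitSet docs = new FixedBitSet(8);
    docs.set(2);
    docs.set(5);
    BitSetTermsEnum termsEnum = new BitSetTermsEnum(docs);
    // First pull: a DOCS-only iterator over the set bits.
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.NONE);
    while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      // consume docs 2 and 5
    }
    // Second pull for the same term: reset() inside postings() makes this valid,
    // which is the one thing PostingsWriterBase#writeTerm needed from this stub.
    postings = termsEnum.postings(null, PostingsEnum.NONE);
    while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      // consume docs 2 and 5 again
    }
  }
}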

View File: Lucene62SegmentInfoFormat.java

@@ -1,268 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene62;
import java.io.IOException;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexWriter; // javadocs
import org.apache.lucene.index.SegmentInfo; // javadocs
import org.apache.lucene.index.SegmentInfos; // javadocs
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSelector;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Version;
/**
* Lucene 6.2 Segment info format.
* <p>
* Files:
* <ul>
* <li><tt>.si</tt>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Attributes, IndexSort, Footer
* </ul>
* Data types:
* <ul>
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
* <li>SegSize --&gt; {@link DataOutput#writeInt Int32}</li>
* <li>SegVersion --&gt; {@link DataOutput#writeString String}</li>
* <li>Files --&gt; {@link DataOutput#writeSetOfStrings Set&lt;String&gt;}</li>
* <li>Diagnostics,Attributes --&gt; {@link DataOutput#writeMapOfStrings Map&lt;String,String&gt;}</li>
* <li>IsCompoundFile --&gt; {@link DataOutput#writeByte Int8}</li>
* <li>IndexSort --&gt; {@link DataOutput#writeVInt Int32} count, followed by {@code count} SortField</li>
* <li>SortField --&gt; {@link DataOutput#writeString String} field name, followed by {@link DataOutput#writeVInt Int32} sort type ID,
* followed by {@link DataOutput#writeByte Int8} indicating reversed sort, followed by a type-specific encoding of the optional missing value
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
* Field Descriptions:
* <ul>
* <li>SegVersion is the code version that created the segment.</li>
* <li>SegSize is the number of documents contained in the segment index.</li>
* <li>IsCompoundFile records whether the segment is written as a compound file or
* not. If this is -1, the segment is not a compound file. If it is 1, the segment
* is a compound file.</li>
* <li>The Diagnostics Map is privately written by {@link IndexWriter}, as a debugging aid,
* for each segment it creates. It includes metadata like the current Lucene
* version, OS, Java version, why the segment was created (merge, flush,
* addIndexes), etc.</li>
* <li>Files is a list of files referred to by this segment.</li>
* </ul>
*
* @see SegmentInfos
* @lucene.experimental
*/
public class Lucene62SegmentInfoFormat extends SegmentInfoFormat {
/** Sole constructor. */
public Lucene62SegmentInfoFormat() {
}
@Override
public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene62SegmentInfoFormat.SI_EXTENSION);
try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) {
Throwable priorE = null;
SegmentInfo si = null;
try {
int format = CodecUtil.checkIndexHeader(input, Lucene62SegmentInfoFormat.CODEC_NAME,
Lucene62SegmentInfoFormat.VERSION_START,
Lucene62SegmentInfoFormat.VERSION_CURRENT,
segmentID, "");
final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt());
final int docCount = input.readInt();
if (docCount < 0) {
throw new CorruptIndexException("invalid docCount: " + docCount, input);
}
final boolean isCompoundFile = input.readByte() == SegmentInfo.YES;
final Map<String,String> diagnostics = input.readMapOfStrings();
final Set<String> files = input.readSetOfStrings();
final Map<String,String> attributes = input.readMapOfStrings();
int numSortFields = input.readVInt();
Sort indexSort;
if (numSortFields > 0) {
SortField[] sortFields = new SortField[numSortFields];
for(int i=0;i<numSortFields;i++) {
String fieldName = input.readString();
int sortTypeID = input.readVInt();
SortField.Type sortType;
SortedSetSelector.Type sortedSetSelector = null;
SortedNumericSelector.Type sortedNumericSelector = null;
switch(sortTypeID) {
case 0:
sortType = SortField.Type.STRING;
break;
case 1:
sortType = SortField.Type.LONG;
break;
case 2:
sortType = SortField.Type.INT;
break;
case 3:
sortType = SortField.Type.DOUBLE;
break;
case 4:
sortType = SortField.Type.FLOAT;
break;
case 5:
sortType = SortField.Type.STRING;
byte selector = input.readByte();
if (selector == 0) {
sortedSetSelector = SortedSetSelector.Type.MIN;
} else if (selector == 1) {
sortedSetSelector = SortedSetSelector.Type.MAX;
} else if (selector == 2) {
sortedSetSelector = SortedSetSelector.Type.MIDDLE_MIN;
} else if (selector == 3) {
sortedSetSelector = SortedSetSelector.Type.MIDDLE_MAX;
} else {
throw new CorruptIndexException("invalid index SortedSetSelector ID: " + selector, input);
}
break;
case 6:
byte type = input.readByte();
if (type == 0) {
sortType = SortField.Type.LONG;
} else if (type == 1) {
sortType = SortField.Type.INT;
} else if (type == 2) {
sortType = SortField.Type.DOUBLE;
} else if (type == 3) {
sortType = SortField.Type.FLOAT;
} else {
throw new CorruptIndexException("invalid index SortedNumericSortField type ID: " + type, input);
}
byte numericSelector = input.readByte();
if (numericSelector == 0) {
sortedNumericSelector = SortedNumericSelector.Type.MIN;
} else if (numericSelector == 1) {
sortedNumericSelector = SortedNumericSelector.Type.MAX;
} else {
throw new CorruptIndexException("invalid index SortedNumericSelector ID: " + numericSelector, input);
}
break;
default:
throw new CorruptIndexException("invalid index sort field type ID: " + sortTypeID, input);
}
byte b = input.readByte();
boolean reverse;
if (b == 0) {
reverse = true;
} else if (b == 1) {
reverse = false;
} else {
throw new CorruptIndexException("invalid index sort reverse: " + b, input);
}
if (sortedSetSelector != null) {
sortFields[i] = new SortedSetSortField(fieldName, reverse, sortedSetSelector);
} else if (sortedNumericSelector != null) {
sortFields[i] = new SortedNumericSortField(fieldName, sortType, reverse, sortedNumericSelector);
} else {
sortFields[i] = new SortField(fieldName, sortType, reverse);
}
Object missingValue;
b = input.readByte();
if (b == 0) {
missingValue = null;
} else {
switch(sortType) {
case STRING:
if (b == 1) {
missingValue = SortField.STRING_LAST;
} else if (b == 2) {
missingValue = SortField.STRING_FIRST;
} else {
throw new CorruptIndexException("invalid missing value flag: " + b, input);
}
break;
case LONG:
if (b != 1) {
throw new CorruptIndexException("invalid missing value flag: " + b, input);
}
missingValue = input.readLong();
break;
case INT:
if (b != 1) {
throw new CorruptIndexException("invalid missing value flag: " + b, input);
}
missingValue = input.readInt();
break;
case DOUBLE:
if (b != 1) {
throw new CorruptIndexException("invalid missing value flag: " + b, input);
}
missingValue = Double.longBitsToDouble(input.readLong());
break;
case FLOAT:
if (b != 1) {
throw new CorruptIndexException("invalid missing value flag: " + b, input);
}
missingValue = Float.intBitsToFloat(input.readInt());
break;
default:
throw new AssertionError("unhandled sortType=" + sortType);
}
}
if (missingValue != null) {
sortFields[i].setMissingValue(missingValue);
}
}
indexSort = new Sort(sortFields);
} else if (numSortFields < 0) {
throw new CorruptIndexException("invalid index sort field count: " + numSortFields, input);
} else {
indexSort = null;
}
si = new SegmentInfo(dir, version, null, segment, docCount, isCompoundFile, null, diagnostics, segmentID, attributes, indexSort);
si.setFiles(files);
} catch (Throwable exception) {
priorE = exception;
} finally {
CodecUtil.checkFooter(input, priorE);
}
return si;
}
}
@Override
public void write(Directory dir, SegmentInfo info, IOContext ioContext) throws IOException {
throw new UnsupportedOperationException("This format can only be used for reading");
}
/** File extension used to store {@link SegmentInfo}. */
public final static String SI_EXTENSION = "si";
static final String CODEC_NAME = "Lucene62SegmentInfo";
static final int VERSION_START = 0;
static final int VERSION_MULTI_VALUED_SORT = 1;
static final int VERSION_CURRENT = VERSION_MULTI_VALUED_SORT;
}
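A hypothetical usage sketch (not part of this commit) of the read-only API above, as a caller would load a 6.2-era .si file:
import java.io.IOException;
import org.apache.lucene.codecs.lucene62.Lucene62SegmentInfoFormat;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
final class ReadSegmentInfoSketch {
  /** Reads segment metadata; write() above throws, so reading is the only supported path. */
  static SegmentInfo readSi(Directory dir, String segmentName, byte[] segmentID) throws IOException {
    return new Lucene62SegmentInfoFormat().read(dir, segmentName, segmentID, IOContext.READONCE);
  }
}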

View File: package-info.java (org.apache.lucene.codecs.lucene62)

@@ -1,24 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Components from the Lucene 6.2 index format.
* See {@link org.apache.lucene.codecs.lucene70} for an overview
* of the current index format.
*/
package org.apache.lucene.codecs.lucene62;

View File: package-info.java (org.apache.lucene.codecs.lucene70)

@@ -128,7 +128,7 @@
* <p>Each segment index maintains the following:</p>
* <ul>
* <li>
* {@link org.apache.lucene.codecs.lucene62.Lucene62SegmentInfoFormat Segment info}.
* {@link org.apache.lucene.codecs.lucene70.Lucene70SegmentInfoFormat Segment info}.
* This contains metadata about a segment, such as the number of documents,
* what files it uses.
* </li>
@@ -233,7 +233,7 @@
* file.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene62.Lucene62SegmentInfoFormat Segment Info}</td>
* <td>{@link org.apache.lucene.codecs.lucene70.Lucene70SegmentInfoFormat Segment Info}</td>
* <td>.si</td>
* <td>Stores metadata about a segment</td>
* </tr>

View File: BKDReader.java

@@ -26,7 +26,6 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
/** Handles intersection of a multi-dimensional shape in byte[] space with a block KD-tree previously written with {@link BKDWriter}.
@@ -48,21 +47,14 @@ public final class BKDReader extends PointValues implements Accountable {
final int version;
protected final int packedBytesLength;
// Used for 6.4.0+ index format:
final byte[] packedIndex;
// Used for Legacy (pre-6.4.0) index format, to hold a compact form of the index:
final private byte[] splitPackedValues;
final int bytesPerIndexEntry;
final long[] leafBlockFPs;
/** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */
public BKDReader(IndexInput in) throws IOException {
version = CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT);
numDims = in.readVInt();
maxPointsInLeafNode = in.readVInt();
bytesPerDim = in.readVInt();
bytesPerIndexEntry = numDims == 1 && version >= BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D ? bytesPerDim : bytesPerDim + 1;
packedBytesLength = numDims * bytesPerDim;
// Read index:
@@ -85,112 +77,114 @@ public final class BKDReader extends PointValues implements Accountable {
pointCount = in.readVLong();
docCount = in.readVInt();
if (version >= BKDWriter.VERSION_PACKED_INDEX) {
int numBytes = in.readVInt();
packedIndex = new byte[numBytes];
in.readBytes(packedIndex, 0, numBytes);
leafBlockFPs = null;
splitPackedValues = null;
} else {
// legacy un-packed index
splitPackedValues = new byte[bytesPerIndexEntry*numLeaves];
in.readBytes(splitPackedValues, 0, splitPackedValues.length);
// Read the file pointers to the start of each leaf block:
long[] leafBlockFPs = new long[numLeaves];
long lastFP = 0;
for(int i=0;i<numLeaves;i++) {
long delta = in.readVLong();
leafBlockFPs[i] = lastFP + delta;
lastFP += delta;
}
// Possibly rotate the leaf block FPs, if the index is not a fully balanced binary tree (this only happens
// if it was created by BKDWriter.merge or OneDimWriter). In this case the leaf nodes may straddle the two bottom
// levels of the binary tree:
if (numDims == 1 && numLeaves > 1) {
int levelCount = 2;
while (true) {
if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
int lastLevel = 2*(numLeaves - levelCount);
assert lastLevel >= 0;
if (lastLevel != 0) {
// Last level is partially filled, so we must rotate the leaf FPs to match. We do this here at read
// time, after loading, so that we can still delta-code the FPs on disk at write time. For example,
// numLeaves=5 gives levelCount=4 and lastLevel=2, so [a,b,c,d,e] is rotated to [c,d,e,a,b]:
long[] newLeafBlockFPs = new long[numLeaves];
System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
leafBlockFPs = newLeafBlockFPs;
}
break;
}
levelCount *= 2;
}
}
this.leafBlockFPs = leafBlockFPs;
packedIndex = null;
}
int numBytes = in.readVInt();
packedIndex = new byte[numBytes];
in.readBytes(packedIndex, 0, numBytes);
this.in = in;
}
long getMinLeafBlockFP() {
if (packedIndex != null) {
return new ByteArrayDataInput(packedIndex).readVLong();
} else {
long minFP = Long.MAX_VALUE;
for(long fp : leafBlockFPs) {
minFP = Math.min(minFP, fp);
}
return minFP;
}
return new ByteArrayDataInput(packedIndex).readVLong();
}
/** Used to walk the in-heap index
*
* @lucene.internal */
public abstract class IndexTree implements Cloneable {
protected int nodeID;
/** Used to walk the in-heap index. The format takes advantage of the limited
* access pattern to the BKD tree at search time, i.e. starting at the root
* node and recursing downwards one child at a time.
* @lucene.internal */
public class IndexTree implements Cloneable {
private int nodeID;
// level is 1-based so that we can do level-1 w/o checking each time:
protected int level;
protected int splitDim;
protected final byte[][] splitPackedValueStack;
private int level;
private int splitDim;
private final byte[][] splitPackedValueStack;
// used to read the packed byte[]
private final ByteArrayDataInput in;
// holds the minimum (left most) leaf block file pointer for each level we've recursed to:
private final long[] leafBlockFPStack;
// holds the address, in the packed byte[] index, of the left-node of each level:
private final int[] leftNodePositions;
// holds the address, in the packed byte[] index, of the right-node of each level:
private final int[] rightNodePositions;
// holds the splitDim for each level:
private final int[] splitDims;
// true if the per-dim delta we read for the node at this level is a negative offset vs. the last split on this dim; this is a packed
// 2D array, i.e. to access array[level][dim] you read from negativeDeltas[level*numDims+dim]. this will be true if the last time we
// split on this dimension, we next pushed to the left sub-tree:
private final boolean[] negativeDeltas;
// holds the packed per-level split values; the intersect method uses this to save the cell min/max as it recurses:
private final byte[][] splitValuesStack;
// scratch value to return from getPackedValue:
private final BytesRef scratch;
protected IndexTree() {
IndexTree() {
int treeDepth = getTreeDepth();
splitPackedValueStack = new byte[treeDepth+1][];
nodeID = 1;
level = 1;
splitPackedValueStack[level] = new byte[packedBytesLength];
leafBlockFPStack = new long[treeDepth+1];
leftNodePositions = new int[treeDepth+1];
rightNodePositions = new int[treeDepth+1];
splitValuesStack = new byte[treeDepth+1][];
splitDims = new int[treeDepth+1];
negativeDeltas = new boolean[numDims*(treeDepth+1)];
in = new ByteArrayDataInput(packedIndex);
splitValuesStack[0] = new byte[packedBytesLength];
readNodeData(false);
scratch = new BytesRef();
scratch.length = bytesPerDim;
}
public void pushLeft() {
int nodePosition = leftNodePositions[level];
nodeID *= 2;
level++;
if (splitPackedValueStack[level] == null) {
splitPackedValueStack[level] = new byte[packedBytesLength];
}
System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims);
assert splitDim != -1;
negativeDeltas[level*numDims+splitDim] = true;
in.setPosition(nodePosition);
readNodeData(true);
}
/** Clone, but you are not allowed to pop up past the point where the clone happened. */
public abstract IndexTree clone();
@Override
public IndexTree clone() {
IndexTree index = new IndexTree();
index.nodeID = nodeID;
index.level = level;
index.splitDim = splitDim;
index.leafBlockFPStack[level] = leafBlockFPStack[level];
index.leftNodePositions[level] = leftNodePositions[level];
index.rightNodePositions[level] = rightNodePositions[level];
index.splitValuesStack[index.level] = splitValuesStack[index.level].clone();
System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims);
index.splitDims[level] = splitDims[level];
return index;
}
public void pushRight() {
int nodePosition = rightNodePositions[level];
nodeID = nodeID * 2 + 1;
level++;
if (splitPackedValueStack[level] == null) {
splitPackedValueStack[level] = new byte[packedBytesLength];
}
System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims);
assert splitDim != -1;
negativeDeltas[level*numDims+splitDim] = false;
in.setPosition(nodePosition);
readNodeData(false);
}
public void pop() {
nodeID /= 2;
level--;
splitDim = -1;
splitDim = splitDims[level];
//System.out.println(" pop nodeID=" + nodeID);
}
@@ -219,10 +213,18 @@ public final class BKDReader extends PointValues implements Accountable {
}
/** Only valid after pushLeft or pushRight, not pop! */
public abstract BytesRef getSplitDimValue();
public BytesRef getSplitDimValue() {
assert isLeafNode() == false;
scratch.bytes = splitValuesStack[level];
scratch.offset = splitDim * bytesPerDim;
return scratch;
}
/** Only valid after pushLeft or pushRight, not pop! */
public abstract long getLeafBlockFP();
public long getLeafBlockFP() {
assert isLeafNode(): "nodeID=" + nodeID + " is not a leaf";
return leafBlockFPStack[level];
}
/** Return the number of leaves below the current node. */
public int getNumLeaves() {
@@ -258,180 +260,6 @@ public final class BKDReader extends PointValues implements Accountable {
return leftCount + rightCount;
}
}
}
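To make the "limited access pattern" in the IndexTree javadoc above concrete, here is a hedged sketch (not in this commit) of the only traversal the packed index supports: start at the root, push into one child at a time, pop back up:
import org.apache.lucene.util.bkd.BKDReader;
final class IndexTreeWalkSketch {
  /** Depth-first walk over every leaf block, in the same order intersect() visits them. */
  static void walk(BKDReader.IndexTree index) {
    if (index.isLeafNode()) {
      if (index.nodeExists()) {
        System.out.println("leaf fp=" + index.getLeafBlockFP()); // valid only on a leaf
      }
    } else {
      index.pushLeft();   // descend; reads the left child's node data from the packed byte[]
      walk(index);
      index.pop();
      index.pushRight();  // then the right child, re-positioning in the packed byte[]
      walk(index);
      index.pop();
    }
  }
}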
/** Reads the original simple yet heap-heavy index format */
private final class LegacyIndexTree extends IndexTree {
private long leafBlockFP;
private final byte[] splitDimValue = new byte[bytesPerDim];
private final BytesRef scratch = new BytesRef();
public LegacyIndexTree() {
setNodeData();
scratch.bytes = splitDimValue;
scratch.length = bytesPerDim;
}
@Override
public LegacyIndexTree clone() {
LegacyIndexTree index = new LegacyIndexTree();
index.nodeID = nodeID;
index.level = level;
index.splitDim = splitDim;
index.leafBlockFP = leafBlockFP;
index.splitPackedValueStack[index.level] = splitPackedValueStack[index.level].clone();
return index;
}
@Override
public void pushLeft() {
super.pushLeft();
setNodeData();
}
@Override
public void pushRight() {
super.pushRight();
setNodeData();
}
private void setNodeData() {
if (isLeafNode()) {
leafBlockFP = leafBlockFPs[nodeID - leafNodeOffset];
splitDim = -1;
} else {
leafBlockFP = -1;
int address = nodeID * bytesPerIndexEntry;
if (numDims == 1) {
splitDim = 0;
if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
// skip over wastefully encoded 0 splitDim:
assert splitPackedValues[address] == 0;
address++;
}
} else {
splitDim = splitPackedValues[address++] & 0xff;
}
System.arraycopy(splitPackedValues, address, splitDimValue, 0, bytesPerDim);
}
}
@Override
public long getLeafBlockFP() {
assert isLeafNode();
return leafBlockFP;
}
@Override
public BytesRef getSplitDimValue() {
assert isLeafNode() == false;
return scratch;
}
@Override
public void pop() {
super.pop();
leafBlockFP = -1;
}
}
/** Reads the new packed byte[] index format which can be up to ~63% smaller than the legacy index format on 20M NYC taxis tests. This
* format takes advantage of the limited access pattern to the BKD tree at search time, i.e. starting at the root node and recursing
* downwards one child at a time. */
private final class PackedIndexTree extends IndexTree {
// used to read the packed byte[]
private final ByteArrayDataInput in;
// holds the minimum (left most) leaf block file pointer for each level we've recursed to:
private final long[] leafBlockFPStack;
// holds the address, in the packed byte[] index, of the left-node of each level:
private final int[] leftNodePositions;
// holds the address, in the packed byte[] index, of the right-node of each level:
private final int[] rightNodePositions;
// holds the splitDim for each level:
private final int[] splitDims;
// true if the per-dim delta we read for the node at this level is a negative offset vs. the last split on this dim; this is a packed
// 2D array, i.e. to access array[level][dim] you read from negativeDeltas[level*numDims+dim]. this will be true if the last time we
// split on this dimension, we next pushed to the left sub-tree:
private final boolean[] negativeDeltas;
// holds the packed per-level split values; the intersect method uses this to save the cell min/max as it recurses:
private final byte[][] splitValuesStack;
// scratch value to return from getPackedValue:
private final BytesRef scratch;
public PackedIndexTree() {
int treeDepth = getTreeDepth();
leafBlockFPStack = new long[treeDepth+1];
leftNodePositions = new int[treeDepth+1];
rightNodePositions = new int[treeDepth+1];
splitValuesStack = new byte[treeDepth+1][];
splitDims = new int[treeDepth+1];
negativeDeltas = new boolean[numDims*(treeDepth+1)];
in = new ByteArrayDataInput(packedIndex);
splitValuesStack[0] = new byte[packedBytesLength];
readNodeData(false);
scratch = new BytesRef();
scratch.length = bytesPerDim;
}
@Override
public PackedIndexTree clone() {
PackedIndexTree index = new PackedIndexTree();
index.nodeID = nodeID;
index.level = level;
index.splitDim = splitDim;
index.leafBlockFPStack[level] = leafBlockFPStack[level];
index.leftNodePositions[level] = leftNodePositions[level];
index.rightNodePositions[level] = rightNodePositions[level];
index.splitValuesStack[index.level] = splitValuesStack[index.level].clone();
System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims);
index.splitDims[level] = splitDims[level];
return index;
}
@Override
public void pushLeft() {
int nodePosition = leftNodePositions[level];
super.pushLeft();
System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims);
assert splitDim != -1;
negativeDeltas[level*numDims+splitDim] = true;
in.setPosition(nodePosition);
readNodeData(true);
}
@Override
public void pushRight() {
int nodePosition = rightNodePositions[level];
super.pushRight();
System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims);
assert splitDim != -1;
negativeDeltas[level*numDims+splitDim] = false;
in.setPosition(nodePosition);
readNodeData(false);
}
@Override
public void pop() {
super.pop();
splitDim = splitDims[level];
}
@Override
public long getLeafBlockFP() {
assert isLeafNode(): "nodeID=" + nodeID + " is not a leaf";
return leafBlockFPStack[level];
}
@Override
public BytesRef getSplitDimValue() {
assert isLeafNode() == false;
scratch.bytes = splitValuesStack[level];
scratch.offset = splitDim * bytesPerDim;
return scratch;
}
private void readNodeData(boolean isLeft) {
@@ -559,12 +387,7 @@ public final class BKDReader extends PointValues implements Accountable {
/** Create a new {@link IntersectState} */
public IntersectState getIntersectState(IntersectVisitor visitor) {
IndexTree index;
if (packedIndex != null) {
index = new PackedIndexTree();
} else {
index = new LegacyIndexTree();
}
IndexTree index = new IndexTree();
return new IntersectState(in.clone(), numDims,
packedBytesLength,
maxPointsInLeafNode,
@@ -590,11 +413,7 @@ public final class BKDReader extends PointValues implements Accountable {
int count = in.readVInt();
// No need to call grow(), it has been called up-front
if (version < BKDWriter.VERSION_COMPRESSED_DOC_IDS) {
DocIdsWriter.readInts32(in, count, visitor);
} else {
DocIdsWriter.readInts(in, count, visitor);
}
DocIdsWriter.readInts(in, count, visitor);
}
int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
@@ -603,11 +422,7 @@ public final class BKDReader extends PointValues implements Accountable {
// How many points are stored in this leaf cell:
int count = in.readVInt();
if (version < BKDWriter.VERSION_COMPRESSED_DOC_IDS) {
DocIdsWriter.readInts32(in, count, docIDs);
} else {
DocIdsWriter.readInts(in, count, docIDs);
}
DocIdsWriter.readInts(in, count, docIDs);
return count;
}
@@ -617,9 +432,7 @@ public final class BKDReader extends PointValues implements Accountable {
readCommonPrefixes(commonPrefixLengths, scratchPackedValue, in);
int compressedDim = version < BKDWriter.VERSION_COMPRESSED_VALUES
? -1
: readCompressedDim(in);
int compressedDim = readCompressedDim(in);
if (compressedDim == -1) {
visitRawDocValues(commonPrefixLengths, scratchPackedValue, in, docIDs, count, visitor);
@@ -803,11 +616,7 @@ public final class BKDReader extends PointValues implements Accountable {
@Override
public long ramBytesUsed() {
if (packedIndex != null) {
return packedIndex.length;
} else {
return RamUsageEstimator.sizeOf(splitPackedValues) + RamUsageEstimator.sizeOf(leafBlockFPs);
}
return packedIndex.length;
}
@Override

View File: BKDWriter.java

@@ -82,12 +82,8 @@ import org.apache.lucene.util.StringHelper;
public class BKDWriter implements Closeable {
public static final String CODEC_NAME = "BKD";
public static final int VERSION_START = 0;
public static final int VERSION_COMPRESSED_DOC_IDS = 1;
public static final int VERSION_COMPRESSED_VALUES = 2;
public static final int VERSION_IMPLICIT_SPLIT_DIM_1D = 3;
public static final int VERSION_PACKED_INDEX = 4;
public static final int VERSION_CURRENT = VERSION_PACKED_INDEX;
public static final int VERSION_START = 4; // version used by Lucene 7.0
public static final int VERSION_CURRENT = VERSION_START;
/** How many bytes each doc takes in the fixed-width offline format */
private final int bytesPerDoc;

View File: DocIdsWriter.java

@@ -142,7 +142,7 @@ class DocIdsWriter {
}
}
static void readInts32(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
private static void readInts32(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
for (int i = 0; i < count; i++) {
visitor.visit(in.readInt());
}

View File: FST.java

@@ -26,7 +26,6 @@ import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
@@ -103,27 +102,8 @@ public final class FST<T> implements Accountable {
// Increment version to change it
private static final String FILE_FORMAT_NAME = "FST";
private static final int VERSION_START = 0;
/** Changed numBytesPerArc for array'd case from byte to int. */
private static final int VERSION_INT_NUM_BYTES_PER_ARC = 1;
/** Write BYTE2 labels as 2-byte short, not vInt. */
private static final int VERSION_SHORT_BYTE2_LABELS = 2;
/** Added optional packed format. */
private static final int VERSION_PACKED = 3;
/** Changed from int to vInt for encoding arc targets.
* Also changed maxBytesPerArc from int to vInt in the array case. */
private static final int VERSION_VINT_TARGET = 4;
/** Don't store arcWithOutputCount anymore */
private static final int VERSION_NO_NODE_ARC_COUNTS = 5;
private static final int VERSION_PACKED_REMOVED = 6;
private static final int VERSION_CURRENT = VERSION_PACKED_REMOVED;
private static final int VERSION_START = 6;
private static final int VERSION_CURRENT = VERSION_START;
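The version branches deleted below (and the matching ones removed from BKDReader and BKDWriter above) are unreachable once VERSION_START is bumped, because checkHeader enforces the accepted range before any data is read. A hedged sketch of that gate, with hypothetical names:
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
final class VersionGateSketch {
  static final String FILE_FORMAT_NAME = "FST";
  static final int VERSION_START = 6;   // oldest version this reader still accepts
  static final int VERSION_CURRENT = 6;
  /** Throws for any header version below VERSION_START, so the reading code that
   *  follows never needs a version < VERSION_* branch. */
  static int checkVersion(DataInput in) throws IOException {
    return CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT);
  }
}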
// Never serialized; just used to represent the virtual
// final node w/ no arcs:
@@ -285,12 +265,7 @@ public final class FST<T> implements Accountable {
// NOTE: only reads most recent format; we don't have
// back-compat promise for FSTs (they are experimental):
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_CURRENT);
if (version < VERSION_PACKED_REMOVED) {
if (in.readByte() == 1) {
throw new CorruptIndexException("Cannot read packed FSTs anymore", in);
}
}
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT);
if (in.readByte() == 1) {
// accepts empty string
// 1 KB blocks:
@@ -325,11 +300,6 @@ public final class FST<T> implements Accountable {
throw new IllegalStateException("invalid input type " + t);
}
startNode = in.readVLong();
if (version < VERSION_NO_NODE_ARC_COUNTS) {
in.readVLong();
in.readVLong();
in.readVLong();
}
long numBytes = in.readVLong();
if (numBytes > 1 << maxBlockBits) {
@@ -768,11 +738,7 @@ public final class FST<T> implements Accountable {
if (b == ARCS_AS_FIXED_ARRAY) {
// array: jump straight to end
arc.numArcs = in.readVInt();
if (version >= VERSION_VINT_TARGET) {
arc.bytesPerArc = in.readVInt();
} else {
arc.bytesPerArc = in.readInt();
}
arc.bytesPerArc = in.readVInt();
//System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc);
arc.posArcsStart = in.getPosition();
arc.arcIdx = arc.numArcs - 2;
@@ -808,13 +774,7 @@ public final class FST<T> implements Accountable {
}
private long readUnpackedNodeTarget(BytesReader in) throws IOException {
long target;
if (version < VERSION_VINT_TARGET) {
target = in.readInt();
} else {
target = in.readVLong();
}
return target;
return in.readVLong();
}
/**
@@ -857,11 +817,7 @@ public final class FST<T> implements Accountable {
//System.out.println(" fixedArray");
// this is first arc in a fixed-array
arc.numArcs = in.readVInt();
if (version >= VERSION_VINT_TARGET) {
arc.bytesPerArc = in.readVInt();
} else {
arc.bytesPerArc = in.readInt();
}
arc.bytesPerArc = in.readVInt();
arc.arcIdx = -1;
arc.nextArc = arc.posArcsStart = in.getPosition();
//System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos);
@@ -920,11 +876,7 @@ public final class FST<T> implements Accountable {
in.readVInt();
// Skip bytesPerArc:
if (version >= VERSION_VINT_TARGET) {
in.readVInt();
} else {
in.readInt();
}
in.readVInt();
} else {
in.setPosition(pos);
}
@@ -1092,11 +1044,7 @@ public final class FST<T> implements Accountable {
if (in.readByte() == ARCS_AS_FIXED_ARRAY) {
// Arcs are full array; do binary search:
arc.numArcs = in.readVInt();
if (version >= VERSION_VINT_TARGET) {
arc.bytesPerArc = in.readVInt();
} else {
arc.bytesPerArc = in.readInt();
}
arc.bytesPerArc = in.readVInt();
arc.posArcsStart = in.getPosition();
int low = 0;
int high = arc.numArcs-1;