mirror of https://github.com/apache/lucene.git
remove unnecessary deprecated classes
This commit is contained in:
parent
13acba8b4e
commit
ada714972c
|
@ -1,170 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene50;
|
||||
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompoundFormat;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.NormsFormat;
|
||||
import org.apache.lucene.codecs.PointsFormat;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
||||
|
||||
/**
|
||||
* Implements the Lucene 5.0 index format, with configurable per-field postings
|
||||
* and docvalues formats.
|
||||
* <p>
|
||||
* If you want to reuse functionality of this codec in another codec, extend
|
||||
* {@link FilterCodec}.
|
||||
*
|
||||
* @see org.apache.lucene.codecs.lucene50 package documentation for file format details.
|
||||
* @deprecated Only for reading old 5.0-5.2 segments
|
||||
*/
|
||||
@Deprecated
|
||||
public class Lucene50Codec extends Codec {
|
||||
private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
|
||||
private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
|
||||
private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
|
||||
private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
|
||||
private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();
|
||||
|
||||
private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
|
||||
@Override
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
return Lucene50Codec.this.getPostingsFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
|
||||
@Override
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
return Lucene50Codec.this.getDocValuesFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final StoredFieldsFormat storedFieldsFormat;
|
||||
|
||||
/**
|
||||
* Instantiates a new codec.
|
||||
*/
|
||||
public Lucene50Codec() {
|
||||
this(Mode.BEST_SPEED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new codec, specifying the stored fields compression
|
||||
* mode to use.
|
||||
* @param mode stored fields compression mode to use for newly
|
||||
* flushed/merged segments.
|
||||
*/
|
||||
public Lucene50Codec(Mode mode) {
|
||||
super("Lucene50");
|
||||
this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final StoredFieldsFormat storedFieldsFormat() {
|
||||
return storedFieldsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final TermVectorsFormat termVectorsFormat() {
|
||||
return vectorsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final PostingsFormat postingsFormat() {
|
||||
return postingsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final FieldInfosFormat fieldInfosFormat() {
|
||||
return fieldInfosFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final LiveDocsFormat liveDocsFormat() {
|
||||
return liveDocsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final CompoundFormat compoundFormat() {
|
||||
return compoundFormat;
|
||||
}
|
||||
|
||||
/** Returns the postings format that should be used for writing
|
||||
* new segments of <code>field</code>.
|
||||
*
|
||||
* The default implementation always returns "Lucene50".
|
||||
* <p>
|
||||
* <b>WARNING:</b> if you subclass, you are responsible for index
|
||||
* backwards compatibility: future version of Lucene are only
|
||||
* guaranteed to be able to read the default implementation.
|
||||
*/
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
return defaultFormat;
|
||||
}
|
||||
|
||||
/** Returns the docvalues format that should be used for writing
|
||||
* new segments of <code>field</code>.
|
||||
*
|
||||
* The default implementation always returns "Lucene50".
|
||||
* <p>
|
||||
* <b>WARNING:</b> if you subclass, you are responsible for index
|
||||
* backwards compatibility: future version of Lucene are only
|
||||
* guaranteed to be able to read the default implementation.
|
||||
*/
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
return defaultDVFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final DocValuesFormat docValuesFormat() {
|
||||
return docValuesFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final PointsFormat pointsFormat() {
|
||||
return PointsFormat.EMPTY;
|
||||
}
|
||||
|
||||
private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene50");
|
||||
private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene50");
|
||||
|
||||
private final NormsFormat normsFormat = new Lucene50NormsFormat();
|
||||
|
||||
@Override
|
||||
public NormsFormat normsFormat() {
|
||||
return normsFormat;
|
||||
}
|
||||
}
|
|
@ -1,658 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene50;
|
||||
|
||||
|
||||
import java.io.Closeable; // javadocs
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.LongsRef;
|
||||
import org.apache.lucene.util.MathUtil;
|
||||
import org.apache.lucene.util.PagedBytes;
|
||||
import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.packed.DirectWriter;
|
||||
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat.*;
|
||||
|
||||
/** writer for {@link Lucene50DocValuesFormat} */
|
||||
class Lucene50DocValuesConsumer extends DocValuesConsumer implements Closeable {
|
||||
|
||||
IndexOutput data, meta;
|
||||
final int maxDoc;
|
||||
|
||||
/** expert: Creates a new writer */
|
||||
public Lucene50DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
|
||||
boolean success = false;
|
||||
try {
|
||||
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
|
||||
data = state.directory.createOutput(dataName, state.context);
|
||||
CodecUtil.writeIndexHeader(data, dataCodec, Lucene50DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
|
||||
meta = state.directory.createOutput(metaName, state.context);
|
||||
CodecUtil.writeIndexHeader(meta, metaCodec, Lucene50DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
maxDoc = state.segmentInfo.maxDoc();
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
|
||||
addNumericField(field, values, true);
|
||||
}
|
||||
|
||||
void addNumericField(FieldInfo field, Iterable<Number> values, boolean optimizeStorage) throws IOException {
|
||||
long count = 0;
|
||||
long minValue = Long.MAX_VALUE;
|
||||
long maxValue = Long.MIN_VALUE;
|
||||
long gcd = 0;
|
||||
long missingCount = 0;
|
||||
long zeroCount = 0;
|
||||
// TODO: more efficient?
|
||||
HashSet<Long> uniqueValues = null;
|
||||
if (optimizeStorage) {
|
||||
uniqueValues = new HashSet<>();
|
||||
|
||||
for (Number nv : values) {
|
||||
final long v;
|
||||
if (nv == null) {
|
||||
v = 0;
|
||||
missingCount++;
|
||||
zeroCount++;
|
||||
} else {
|
||||
v = nv.longValue();
|
||||
if (v == 0) {
|
||||
zeroCount++;
|
||||
}
|
||||
}
|
||||
|
||||
if (gcd != 1) {
|
||||
if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) {
|
||||
// in that case v - minValue might overflow and make the GCD computation return
|
||||
// wrong results. Since these extreme values are unlikely, we just discard
|
||||
// GCD computation for them
|
||||
gcd = 1;
|
||||
} else if (count != 0) { // minValue needs to be set first
|
||||
gcd = MathUtil.gcd(gcd, v - minValue);
|
||||
}
|
||||
}
|
||||
|
||||
minValue = Math.min(minValue, v);
|
||||
maxValue = Math.max(maxValue, v);
|
||||
|
||||
if (uniqueValues != null) {
|
||||
if (uniqueValues.add(v)) {
|
||||
if (uniqueValues.size() > 256) {
|
||||
uniqueValues = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
++count;
|
||||
}
|
||||
} else {
|
||||
for (Number nv : values) {
|
||||
long v = nv.longValue();
|
||||
minValue = Math.min(minValue, v);
|
||||
maxValue = Math.max(maxValue, v);
|
||||
++count;
|
||||
}
|
||||
}
|
||||
|
||||
final long delta = maxValue - minValue;
|
||||
final int deltaBitsRequired = DirectWriter.unsignedBitsRequired(delta);
|
||||
final int tableBitsRequired = uniqueValues == null
|
||||
? Integer.MAX_VALUE
|
||||
: DirectWriter.bitsRequired(uniqueValues.size() - 1);
|
||||
|
||||
final int format;
|
||||
if (uniqueValues != null
|
||||
&& count <= Integer.MAX_VALUE
|
||||
&& (uniqueValues.size() == 1
|
||||
|| (uniqueValues.size() == 2 && missingCount > 0 && zeroCount == missingCount))) {
|
||||
// either one unique value C or two unique values: "missing" and C
|
||||
format = CONST_COMPRESSED;
|
||||
} else if (uniqueValues != null && tableBitsRequired < deltaBitsRequired) {
|
||||
format = TABLE_COMPRESSED;
|
||||
} else if (gcd != 0 && gcd != 1) {
|
||||
final long gcdDelta = (maxValue - minValue) / gcd;
|
||||
final long gcdBitsRequired = DirectWriter.unsignedBitsRequired(gcdDelta);
|
||||
format = gcdBitsRequired < deltaBitsRequired ? GCD_COMPRESSED : DELTA_COMPRESSED;
|
||||
} else {
|
||||
format = DELTA_COMPRESSED;
|
||||
}
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene50DocValuesFormat.NUMERIC);
|
||||
meta.writeVInt(format);
|
||||
if (missingCount == 0) {
|
||||
meta.writeLong(ALL_LIVE);
|
||||
} else if (missingCount == count) {
|
||||
meta.writeLong(ALL_MISSING);
|
||||
} else {
|
||||
meta.writeLong(data.getFilePointer());
|
||||
writeMissingBitset(values);
|
||||
}
|
||||
meta.writeLong(data.getFilePointer());
|
||||
meta.writeVLong(count);
|
||||
|
||||
switch (format) {
|
||||
case CONST_COMPRESSED:
|
||||
// write the constant (nonzero value in the n=2 case, singleton value otherwise)
|
||||
meta.writeLong(minValue < 0 ? Collections.min(uniqueValues) : Collections.max(uniqueValues));
|
||||
break;
|
||||
case GCD_COMPRESSED:
|
||||
meta.writeLong(minValue);
|
||||
meta.writeLong(gcd);
|
||||
final long maxDelta = (maxValue - minValue) / gcd;
|
||||
final int bits = DirectWriter.unsignedBitsRequired(maxDelta);
|
||||
meta.writeVInt(bits);
|
||||
final DirectWriter quotientWriter = DirectWriter.getInstance(data, count, bits);
|
||||
for (Number nv : values) {
|
||||
long value = nv == null ? 0 : nv.longValue();
|
||||
quotientWriter.add((value - minValue) / gcd);
|
||||
}
|
||||
quotientWriter.finish();
|
||||
break;
|
||||
case DELTA_COMPRESSED:
|
||||
final long minDelta = delta < 0 ? 0 : minValue;
|
||||
meta.writeLong(minDelta);
|
||||
meta.writeVInt(deltaBitsRequired);
|
||||
final DirectWriter writer = DirectWriter.getInstance(data, count, deltaBitsRequired);
|
||||
for (Number nv : values) {
|
||||
long v = nv == null ? 0 : nv.longValue();
|
||||
writer.add(v - minDelta);
|
||||
}
|
||||
writer.finish();
|
||||
break;
|
||||
case TABLE_COMPRESSED:
|
||||
final Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
|
||||
Arrays.sort(decode);
|
||||
final HashMap<Long,Integer> encode = new HashMap<>();
|
||||
meta.writeVInt(decode.length);
|
||||
for (int i = 0; i < decode.length; i++) {
|
||||
meta.writeLong(decode[i]);
|
||||
encode.put(decode[i], i);
|
||||
}
|
||||
meta.writeVInt(tableBitsRequired);
|
||||
final DirectWriter ordsWriter = DirectWriter.getInstance(data, count, tableBitsRequired);
|
||||
for (Number nv : values) {
|
||||
ordsWriter.add(encode.get(nv == null ? 0 : nv.longValue()));
|
||||
}
|
||||
ordsWriter.finish();
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
meta.writeLong(data.getFilePointer());
|
||||
}
|
||||
|
||||
// TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
|
||||
// but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode)
|
||||
void writeMissingBitset(Iterable<?> values) throws IOException {
|
||||
byte bits = 0;
|
||||
int count = 0;
|
||||
for (Object v : values) {
|
||||
if (count == 8) {
|
||||
data.writeByte(bits);
|
||||
count = 0;
|
||||
bits = 0;
|
||||
}
|
||||
if (v != null) {
|
||||
bits |= 1 << (count & 7);
|
||||
}
|
||||
count++;
|
||||
}
|
||||
if (count > 0) {
|
||||
data.writeByte(bits);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
|
||||
// write the byte[] data
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene50DocValuesFormat.BINARY);
|
||||
int minLength = Integer.MAX_VALUE;
|
||||
int maxLength = Integer.MIN_VALUE;
|
||||
final long startFP = data.getFilePointer();
|
||||
long count = 0;
|
||||
long missingCount = 0;
|
||||
for(BytesRef v : values) {
|
||||
final int length;
|
||||
if (v == null) {
|
||||
length = 0;
|
||||
missingCount++;
|
||||
} else {
|
||||
length = v.length;
|
||||
}
|
||||
minLength = Math.min(minLength, length);
|
||||
maxLength = Math.max(maxLength, length);
|
||||
if (v != null) {
|
||||
data.writeBytes(v.bytes, v.offset, v.length);
|
||||
}
|
||||
count++;
|
||||
}
|
||||
meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED);
|
||||
if (missingCount == 0) {
|
||||
meta.writeLong(ALL_LIVE);
|
||||
} else if (missingCount == count) {
|
||||
meta.writeLong(ALL_MISSING);
|
||||
} else {
|
||||
meta.writeLong(data.getFilePointer());
|
||||
writeMissingBitset(values);
|
||||
}
|
||||
meta.writeVInt(minLength);
|
||||
meta.writeVInt(maxLength);
|
||||
meta.writeVLong(count);
|
||||
meta.writeLong(startFP);
|
||||
|
||||
// if minLength == maxLength, it's a fixed-length byte[], we are done (the addresses are implicit)
|
||||
// otherwise, we need to record the length fields...
|
||||
if (minLength != maxLength) {
|
||||
meta.writeLong(data.getFilePointer());
|
||||
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
||||
meta.writeVInt(MONOTONIC_BLOCK_SIZE);
|
||||
|
||||
final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, MONOTONIC_BLOCK_SIZE);
|
||||
long addr = 0;
|
||||
writer.add(addr);
|
||||
for (BytesRef v : values) {
|
||||
if (v != null) {
|
||||
addr += v.length;
|
||||
}
|
||||
writer.add(addr);
|
||||
}
|
||||
writer.finish();
|
||||
}
|
||||
}
|
||||
|
||||
/** expert: writes a value dictionary for a sorted/sortedset field */
|
||||
private void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
|
||||
// first check if it's a "fixed-length" terms dict
|
||||
int minLength = Integer.MAX_VALUE;
|
||||
int maxLength = Integer.MIN_VALUE;
|
||||
long numValues = 0;
|
||||
for (BytesRef v : values) {
|
||||
minLength = Math.min(minLength, v.length);
|
||||
maxLength = Math.max(maxLength, v.length);
|
||||
numValues++;
|
||||
}
|
||||
if (minLength == maxLength) {
|
||||
// no index needed: direct addressing by mult
|
||||
addBinaryField(field, values);
|
||||
} else if (numValues < REVERSE_INTERVAL_COUNT) {
|
||||
// low cardinality: waste a few KB of ram, but can't really use fancy index etc
|
||||
addBinaryField(field, values);
|
||||
} else {
|
||||
assert numValues > 0; // we don't have to handle the empty case
|
||||
// header
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene50DocValuesFormat.BINARY);
|
||||
meta.writeVInt(BINARY_PREFIX_COMPRESSED);
|
||||
meta.writeLong(-1L);
|
||||
// now write the bytes: sharing prefixes within a block
|
||||
final long startFP = data.getFilePointer();
|
||||
// currently, we have to store the delta from expected for every 1/nth term
|
||||
// we could avoid this, but it's not much and less overall RAM than the previous approach!
|
||||
RAMOutputStream addressBuffer = new RAMOutputStream();
|
||||
MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, MONOTONIC_BLOCK_SIZE);
|
||||
// buffers up 16 terms
|
||||
RAMOutputStream bytesBuffer = new RAMOutputStream();
|
||||
// buffers up block header
|
||||
RAMOutputStream headerBuffer = new RAMOutputStream();
|
||||
BytesRefBuilder lastTerm = new BytesRefBuilder();
|
||||
lastTerm.grow(maxLength);
|
||||
long count = 0;
|
||||
int suffixDeltas[] = new int[INTERVAL_COUNT];
|
||||
for (BytesRef v : values) {
|
||||
int termPosition = (int) (count & INTERVAL_MASK);
|
||||
if (termPosition == 0) {
|
||||
termAddresses.add(data.getFilePointer() - startFP);
|
||||
// abs-encode first term
|
||||
headerBuffer.writeVInt(v.length);
|
||||
headerBuffer.writeBytes(v.bytes, v.offset, v.length);
|
||||
lastTerm.copyBytes(v);
|
||||
} else {
|
||||
// prefix-code: we only share at most 255 characters, to encode the length as a single
|
||||
// byte and have random access. Larger terms just get less compression.
|
||||
int sharedPrefix = Math.min(255, StringHelper.bytesDifference(lastTerm.get(), v));
|
||||
bytesBuffer.writeByte((byte) sharedPrefix);
|
||||
bytesBuffer.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix);
|
||||
// we can encode one smaller, because terms are unique.
|
||||
suffixDeltas[termPosition] = v.length - sharedPrefix - 1;
|
||||
}
|
||||
|
||||
count++;
|
||||
// flush block
|
||||
if ((count & INTERVAL_MASK) == 0) {
|
||||
flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas);
|
||||
}
|
||||
}
|
||||
// flush trailing crap
|
||||
int leftover = (int) (count & INTERVAL_MASK);
|
||||
if (leftover > 0) {
|
||||
Arrays.fill(suffixDeltas, leftover, suffixDeltas.length, 0);
|
||||
flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas);
|
||||
}
|
||||
final long indexStartFP = data.getFilePointer();
|
||||
// write addresses of indexed terms
|
||||
termAddresses.finish();
|
||||
addressBuffer.writeTo(data);
|
||||
addressBuffer = null;
|
||||
termAddresses = null;
|
||||
meta.writeVInt(minLength);
|
||||
meta.writeVInt(maxLength);
|
||||
meta.writeVLong(count);
|
||||
meta.writeLong(startFP);
|
||||
meta.writeLong(indexStartFP);
|
||||
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
||||
meta.writeVInt(MONOTONIC_BLOCK_SIZE);
|
||||
addReverseTermIndex(field, values, maxLength);
|
||||
}
|
||||
}
|
||||
|
||||
// writes term dictionary "block"
|
||||
// first term is absolute encoded as vint length + bytes.
|
||||
// lengths of subsequent N terms are encoded as either N bytes or N shorts.
|
||||
// in the double-byte case, the first byte is indicated with -1.
|
||||
// subsequent terms are encoded as byte suffixLength + bytes.
|
||||
private void flushTermsDictBlock(RAMOutputStream headerBuffer, RAMOutputStream bytesBuffer, int suffixDeltas[]) throws IOException {
|
||||
boolean twoByte = false;
|
||||
for (int i = 1; i < suffixDeltas.length; i++) {
|
||||
if (suffixDeltas[i] > 254) {
|
||||
twoByte = true;
|
||||
}
|
||||
}
|
||||
if (twoByte) {
|
||||
headerBuffer.writeByte((byte)255);
|
||||
for (int i = 1; i < suffixDeltas.length; i++) {
|
||||
headerBuffer.writeShort((short) suffixDeltas[i]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 1; i < suffixDeltas.length; i++) {
|
||||
headerBuffer.writeByte((byte) suffixDeltas[i]);
|
||||
}
|
||||
}
|
||||
headerBuffer.writeTo(data);
|
||||
headerBuffer.reset();
|
||||
bytesBuffer.writeTo(data);
|
||||
bytesBuffer.reset();
|
||||
}
|
||||
|
||||
// writes reverse term index: used for binary searching a term into a range of 64 blocks
|
||||
// for every 64 blocks (1024 terms) we store a term, trimming any suffix unnecessary for comparison
|
||||
// terms are written as a contiguous byte[], but never spanning 2^15 byte boundaries.
|
||||
private void addReverseTermIndex(FieldInfo field, final Iterable<BytesRef> values, int maxLength) throws IOException {
|
||||
long count = 0;
|
||||
BytesRefBuilder priorTerm = new BytesRefBuilder();
|
||||
priorTerm.grow(maxLength);
|
||||
BytesRef indexTerm = new BytesRef();
|
||||
long startFP = data.getFilePointer();
|
||||
PagedBytes pagedBytes = new PagedBytes(15);
|
||||
MonotonicBlockPackedWriter addresses = new MonotonicBlockPackedWriter(data, MONOTONIC_BLOCK_SIZE);
|
||||
|
||||
for (BytesRef b : values) {
|
||||
int termPosition = (int) (count & REVERSE_INTERVAL_MASK);
|
||||
if (termPosition == 0) {
|
||||
int len = StringHelper.sortKeyLength(priorTerm.get(), b);
|
||||
indexTerm.bytes = b.bytes;
|
||||
indexTerm.offset = b.offset;
|
||||
indexTerm.length = len;
|
||||
addresses.add(pagedBytes.copyUsingLengthPrefix(indexTerm));
|
||||
} else if (termPosition == REVERSE_INTERVAL_MASK) {
|
||||
priorTerm.copyBytes(b);
|
||||
}
|
||||
count++;
|
||||
}
|
||||
addresses.finish();
|
||||
long numBytes = pagedBytes.getPointer();
|
||||
pagedBytes.freeze(true);
|
||||
PagedBytesDataInput in = pagedBytes.getDataInput();
|
||||
meta.writeLong(startFP);
|
||||
data.writeVLong(numBytes);
|
||||
data.copyBytes(in, numBytes);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene50DocValuesFormat.SORTED);
|
||||
addTermsDict(field, values);
|
||||
addNumericField(field, docToOrd, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addSortedNumericField(FieldInfo field, final Iterable<Number> docToValueCount, final Iterable<Number> values) throws IOException {
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene50DocValuesFormat.SORTED_NUMERIC);
|
||||
if (isSingleValued(docToValueCount)) {
|
||||
meta.writeVInt(SORTED_SINGLE_VALUED);
|
||||
// The field is single-valued, we can encode it as NUMERIC
|
||||
addNumericField(field, singletonView(docToValueCount, values, null));
|
||||
} else {
|
||||
final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToValueCount, values);
|
||||
if (uniqueValueSets != null) {
|
||||
meta.writeVInt(SORTED_SET_TABLE);
|
||||
|
||||
// write the set_id -> values mapping
|
||||
writeDictionary(uniqueValueSets);
|
||||
|
||||
// write the doc -> set_id as a numeric field
|
||||
addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), false);
|
||||
} else {
|
||||
meta.writeVInt(SORTED_WITH_ADDRESSES);
|
||||
// write the stream of values as a numeric field
|
||||
addNumericField(field, values, true);
|
||||
// write the doc -> ord count as a absolute index to the stream
|
||||
addAddresses(field, docToValueCount);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, final Iterable<Number> docToOrdCount, final Iterable<Number> ords) throws IOException {
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene50DocValuesFormat.SORTED_SET);
|
||||
|
||||
if (isSingleValued(docToOrdCount)) {
|
||||
meta.writeVInt(SORTED_SINGLE_VALUED);
|
||||
// The field is single-valued, we can encode it as SORTED
|
||||
addSortedField(field, values, singletonView(docToOrdCount, ords, -1L));
|
||||
} else {
|
||||
final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToOrdCount, ords);
|
||||
if (uniqueValueSets != null) {
|
||||
meta.writeVInt(SORTED_SET_TABLE);
|
||||
|
||||
// write the set_id -> ords mapping
|
||||
writeDictionary(uniqueValueSets);
|
||||
|
||||
// write the ord -> byte[] as a binary field
|
||||
addTermsDict(field, values);
|
||||
|
||||
// write the doc -> set_id as a numeric field
|
||||
addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), false);
|
||||
} else {
|
||||
meta.writeVInt(SORTED_WITH_ADDRESSES);
|
||||
|
||||
// write the ord -> byte[] as a binary field
|
||||
addTermsDict(field, values);
|
||||
|
||||
// write the stream of ords as a numeric field
|
||||
// NOTE: we could return an iterator that delta-encodes these within a doc
|
||||
addNumericField(field, ords, false);
|
||||
|
||||
// write the doc -> ord count as a absolute index to the stream
|
||||
addAddresses(field, docToOrdCount);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private SortedSet<LongsRef> uniqueValueSets(Iterable<Number> docToValueCount, Iterable<Number> values) {
|
||||
Set<LongsRef> uniqueValueSet = new HashSet<>();
|
||||
LongsRef docValues = new LongsRef(256);
|
||||
|
||||
Iterator<Number> valueCountIterator = docToValueCount.iterator();
|
||||
Iterator<Number> valueIterator = values.iterator();
|
||||
int totalDictSize = 0;
|
||||
while (valueCountIterator.hasNext()) {
|
||||
docValues.length = valueCountIterator.next().intValue();
|
||||
if (docValues.length > 256) {
|
||||
return null;
|
||||
}
|
||||
for (int i = 0; i < docValues.length; ++i) {
|
||||
docValues.longs[i] = valueIterator.next().longValue();
|
||||
}
|
||||
if (uniqueValueSet.contains(docValues)) {
|
||||
continue;
|
||||
}
|
||||
totalDictSize += docValues.length;
|
||||
if (totalDictSize > 256) {
|
||||
return null;
|
||||
}
|
||||
uniqueValueSet.add(new LongsRef(Arrays.copyOf(docValues.longs, docValues.length), 0, docValues.length));
|
||||
}
|
||||
assert valueIterator.hasNext() == false;
|
||||
return new TreeSet<>(uniqueValueSet);
|
||||
}
|
||||
|
||||
private void writeDictionary(SortedSet<LongsRef> uniqueValueSets) throws IOException {
|
||||
int lengthSum = 0;
|
||||
for (LongsRef longs : uniqueValueSets) {
|
||||
lengthSum += longs.length;
|
||||
}
|
||||
|
||||
meta.writeInt(lengthSum);
|
||||
for (LongsRef valueSet : uniqueValueSets) {
|
||||
for (int i = 0; i < valueSet.length; ++i) {
|
||||
meta.writeLong(valueSet.longs[valueSet.offset + i]);
|
||||
}
|
||||
}
|
||||
|
||||
meta.writeInt(uniqueValueSets.size());
|
||||
for (LongsRef valueSet : uniqueValueSets) {
|
||||
meta.writeInt(valueSet.length);
|
||||
}
|
||||
}
|
||||
|
||||
private Iterable<Number> docToSetId(SortedSet<LongsRef> uniqueValueSets, Iterable<Number> docToValueCount, Iterable<Number> values) {
|
||||
final Map<LongsRef, Integer> setIds = new HashMap<>();
|
||||
int i = 0;
|
||||
for (LongsRef set : uniqueValueSets) {
|
||||
setIds.put(set, i++);
|
||||
}
|
||||
assert i == uniqueValueSets.size();
|
||||
|
||||
return new Iterable<Number>() {
|
||||
|
||||
@Override
|
||||
public Iterator<Number> iterator() {
|
||||
final Iterator<Number> valueCountIterator = docToValueCount.iterator();
|
||||
final Iterator<Number> valueIterator = values.iterator();
|
||||
final LongsRef docValues = new LongsRef(256);
|
||||
return new Iterator<Number>() {
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return valueCountIterator.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Number next() {
|
||||
docValues.length = valueCountIterator.next().intValue();
|
||||
for (int i = 0; i < docValues.length; ++i) {
|
||||
docValues.longs[i] = valueIterator.next().longValue();
|
||||
}
|
||||
final Integer id = setIds.get(docValues);
|
||||
assert id != null;
|
||||
return id;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// writes addressing information as MONOTONIC_COMPRESSED integer
|
||||
private void addAddresses(FieldInfo field, Iterable<Number> values) throws IOException {
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene50DocValuesFormat.NUMERIC);
|
||||
meta.writeVInt(MONOTONIC_COMPRESSED);
|
||||
meta.writeLong(-1L);
|
||||
meta.writeLong(data.getFilePointer());
|
||||
meta.writeVLong(maxDoc);
|
||||
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
||||
meta.writeVInt(MONOTONIC_BLOCK_SIZE);
|
||||
|
||||
final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, MONOTONIC_BLOCK_SIZE);
|
||||
long addr = 0;
|
||||
writer.add(addr);
|
||||
for (Number v : values) {
|
||||
addr += v.longValue();
|
||||
writer.add(addr);
|
||||
}
|
||||
writer.finish();
|
||||
meta.writeLong(data.getFilePointer());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
boolean success = false;
|
||||
try {
|
||||
if (meta != null) {
|
||||
meta.writeVInt(-1); // write EOF marker
|
||||
CodecUtil.writeFooter(meta); // write checksum
|
||||
}
|
||||
if (data != null) {
|
||||
CodecUtil.writeFooter(data); // write checksum
|
||||
}
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(data, meta);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(data, meta);
|
||||
}
|
||||
meta = data = null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,115 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene50;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
||||
/**
|
||||
* Lucene 5.0 Doc values format.
|
||||
* @deprecated Only for reading old 5.0-5.3 segments
|
||||
*/
|
||||
@Deprecated
|
||||
public class Lucene50DocValuesFormat extends DocValuesFormat {
|
||||
|
||||
/** Sole Constructor */
|
||||
public Lucene50DocValuesFormat() {
|
||||
super("Lucene50");
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
return new Lucene50DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
return new Lucene50DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
|
||||
}
|
||||
|
||||
static final String DATA_CODEC = "Lucene50DocValuesData";
|
||||
static final String DATA_EXTENSION = "dvd";
|
||||
static final String META_CODEC = "Lucene50DocValuesMetadata";
|
||||
static final String META_EXTENSION = "dvm";
|
||||
static final int VERSION_START = 0;
|
||||
static final int VERSION_SORTEDSET_TABLE = 1;
|
||||
static final int VERSION_CURRENT = VERSION_SORTEDSET_TABLE;
|
||||
|
||||
// indicates docvalues type
|
||||
static final byte NUMERIC = 0;
|
||||
static final byte BINARY = 1;
|
||||
static final byte SORTED = 2;
|
||||
static final byte SORTED_SET = 3;
|
||||
static final byte SORTED_NUMERIC = 4;
|
||||
|
||||
// address terms in blocks of 16 terms
|
||||
static final int INTERVAL_SHIFT = 4;
|
||||
static final int INTERVAL_COUNT = 1 << INTERVAL_SHIFT;
|
||||
static final int INTERVAL_MASK = INTERVAL_COUNT - 1;
|
||||
|
||||
// build reverse index from every 1024th term
|
||||
static final int REVERSE_INTERVAL_SHIFT = 10;
|
||||
static final int REVERSE_INTERVAL_COUNT = 1 << REVERSE_INTERVAL_SHIFT;
|
||||
static final int REVERSE_INTERVAL_MASK = REVERSE_INTERVAL_COUNT - 1;
|
||||
|
||||
// for conversion from reverse index to block
|
||||
static final int BLOCK_INTERVAL_SHIFT = REVERSE_INTERVAL_SHIFT - INTERVAL_SHIFT;
|
||||
static final int BLOCK_INTERVAL_COUNT = 1 << BLOCK_INTERVAL_SHIFT;
|
||||
static final int BLOCK_INTERVAL_MASK = BLOCK_INTERVAL_COUNT - 1;
|
||||
|
||||
/** Compressed using packed blocks of ints. */
|
||||
static final int DELTA_COMPRESSED = 0;
|
||||
/** Compressed by computing the GCD. */
|
||||
static final int GCD_COMPRESSED = 1;
|
||||
/** Compressed by giving IDs to unique values. */
|
||||
static final int TABLE_COMPRESSED = 2;
|
||||
/** Compressed with monotonically increasing values */
|
||||
static final int MONOTONIC_COMPRESSED = 3;
|
||||
/** Compressed with constant value (uses only missing bitset) */
|
||||
static final int CONST_COMPRESSED = 4;
|
||||
|
||||
/** Uncompressed binary, written directly (fixed length). */
|
||||
static final int BINARY_FIXED_UNCOMPRESSED = 0;
|
||||
/** Uncompressed binary, written directly (variable length). */
|
||||
static final int BINARY_VARIABLE_UNCOMPRESSED = 1;
|
||||
/** Compressed binary with shared prefixes */
|
||||
static final int BINARY_PREFIX_COMPRESSED = 2;
|
||||
|
||||
/** Standard storage for sorted set values with 1 level of indirection:
|
||||
* {@code docId -> address -> ord}. */
|
||||
static final int SORTED_WITH_ADDRESSES = 0;
|
||||
/** Single-valued sorted set values, encoded as sorted values, so no level
|
||||
* of indirection: {@code docId -> ord}. */
|
||||
static final int SORTED_SINGLE_VALUED = 1;
|
||||
/** Compressed giving IDs to unique sets of values:
|
||||
* {@code docId -> setId -> ords} */
|
||||
static final int SORTED_SET_TABLE = 2;
|
||||
|
||||
/** placeholder for missing offset that means there are no missing values */
|
||||
static final int ALL_LIVE = -1;
|
||||
/** placeholder for missing offset that means all values are missing */
|
||||
static final int ALL_MISSING = -2;
|
||||
|
||||
// addressing uses 16k blocks
|
||||
static final int MONOTONIC_BLOCK_SIZE = 16384;
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,62 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene50;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.NormsConsumer;
|
||||
import org.apache.lucene.codecs.NormsFormat;
|
||||
import org.apache.lucene.codecs.NormsProducer;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
||||
/**
|
||||
* Lucene 5.0 Score normalization format.
|
||||
* @deprecated Only for reading old 5.0-5.2 segments
|
||||
*/
|
||||
@Deprecated
|
||||
class Lucene50NormsFormat extends NormsFormat {
|
||||
|
||||
/** Sole Constructor */
|
||||
public Lucene50NormsFormat() {}
|
||||
|
||||
@Override
|
||||
public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
|
||||
throw new UnsupportedOperationException("this codec can only be used for reading");
|
||||
}
|
||||
|
||||
@Override
|
||||
public NormsProducer normsProducer(SegmentReadState state) throws IOException {
|
||||
return new Lucene50NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
|
||||
}
|
||||
|
||||
static final String DATA_CODEC = "Lucene50NormsData";
|
||||
static final String DATA_EXTENSION = "nvd";
|
||||
static final String METADATA_CODEC = "Lucene50NormsMetadata";
|
||||
static final String METADATA_EXTENSION = "nvm";
|
||||
static final int VERSION_START = 0;
|
||||
static final int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
static final byte DELTA_COMPRESSED = 0;
|
||||
static final byte TABLE_COMPRESSED = 1;
|
||||
static final byte CONST_COMPRESSED = 2;
|
||||
static final byte UNCOMPRESSED = 3;
|
||||
static final byte INDIRECT = 4;
|
||||
static final byte PATCHED_BITSET = 5;
|
||||
static final byte PATCHED_TABLE = 6;
|
||||
}
|
|
@ -1,481 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene50;
|
||||
|
||||
|
||||
import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.CONST_COMPRESSED;
|
||||
import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.DELTA_COMPRESSED;
|
||||
import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.INDIRECT;
|
||||
import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.PATCHED_BITSET;
|
||||
import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.PATCHED_TABLE;
|
||||
import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.TABLE_COMPRESSED;
|
||||
import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.UNCOMPRESSED;
|
||||
import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.VERSION_CURRENT;
|
||||
import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.VERSION_START;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.NormsProducer;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.Accountables;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.SparseFixedBitSet;
|
||||
import org.apache.lucene.util.packed.BlockPackedReader;
|
||||
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* Reader for {@link Lucene50NormsFormat}
|
||||
* @deprecated Only for reading old 5.0-5.2 segments
|
||||
*/
|
||||
@Deprecated
|
||||
final class Lucene50NormsProducer extends NormsProducer {
|
||||
// metadata maps (just file pointers and minimal stuff)
|
||||
private final Map<String,NormsEntry> norms = new HashMap<>();
|
||||
private final IndexInput data;
|
||||
|
||||
// ram instances we have already loaded
|
||||
final Map<String,Norms> instances = new HashMap<>();
|
||||
|
||||
private final AtomicLong ramBytesUsed;
|
||||
private final AtomicInteger activeCount = new AtomicInteger();
|
||||
private final int maxDoc;
|
||||
|
||||
private final boolean merging;
|
||||
|
||||
// clone for merge: when merging we don't do any instances.put()s
|
||||
Lucene50NormsProducer(Lucene50NormsProducer original) {
|
||||
assert Thread.holdsLock(original);
|
||||
norms.putAll(original.norms);
|
||||
data = original.data.clone();
|
||||
instances.putAll(original.instances);
|
||||
ramBytesUsed = new AtomicLong(original.ramBytesUsed.get());
|
||||
activeCount.set(original.activeCount.get());
|
||||
maxDoc = original.maxDoc;
|
||||
merging = true;
|
||||
}
|
||||
|
||||
Lucene50NormsProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
|
||||
merging = false;
|
||||
maxDoc = state.segmentInfo.maxDoc();
|
||||
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
|
||||
ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass()));
|
||||
int version = -1;
|
||||
|
||||
// read in the entries from the metadata file.
|
||||
try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) {
|
||||
Throwable priorE = null;
|
||||
try {
|
||||
version = CodecUtil.checkIndexHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
readFields(in, state.fieldInfos);
|
||||
} catch (Throwable exception) {
|
||||
priorE = exception;
|
||||
} finally {
|
||||
CodecUtil.checkFooter(in, priorE);
|
||||
}
|
||||
}
|
||||
|
||||
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
|
||||
this.data = state.directory.openInput(dataName, state.context);
|
||||
boolean success = false;
|
||||
try {
|
||||
final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
if (version != version2) {
|
||||
throw new CorruptIndexException("Format versions mismatch: meta=" + version + ",data=" + version2, data);
|
||||
}
|
||||
|
||||
// NOTE: data file is too costly to verify checksum against all the bytes on open,
|
||||
// but for now we at least verify proper structure of the checksum footer: which looks
|
||||
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
|
||||
// such as file truncation.
|
||||
CodecUtil.retrieveChecksum(data);
|
||||
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(this.data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
|
||||
int fieldNumber = meta.readVInt();
|
||||
while (fieldNumber != -1) {
|
||||
FieldInfo info = infos.fieldInfo(fieldNumber);
|
||||
if (info == null) {
|
||||
throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
|
||||
} else if (!info.hasNorms()) {
|
||||
throw new CorruptIndexException("Invalid field: " + info.name, meta);
|
||||
}
|
||||
NormsEntry entry = readEntry(info, meta);
|
||||
norms.put(info.name, entry);
|
||||
fieldNumber = meta.readVInt();
|
||||
}
|
||||
}
|
||||
|
||||
private NormsEntry readEntry(FieldInfo info, IndexInput meta) throws IOException {
|
||||
NormsEntry entry = new NormsEntry();
|
||||
entry.count = meta.readVInt();
|
||||
entry.format = meta.readByte();
|
||||
entry.offset = meta.readLong();
|
||||
switch(entry.format) {
|
||||
case CONST_COMPRESSED:
|
||||
case UNCOMPRESSED:
|
||||
case TABLE_COMPRESSED:
|
||||
case DELTA_COMPRESSED:
|
||||
break;
|
||||
case PATCHED_BITSET:
|
||||
case PATCHED_TABLE:
|
||||
case INDIRECT:
|
||||
if (meta.readVInt() != info.number) {
|
||||
throw new CorruptIndexException("indirect norms entry for field: " + info.name + " is corrupt", meta);
|
||||
}
|
||||
entry.nested = readEntry(info, meta);
|
||||
break;
|
||||
default:
|
||||
throw new CorruptIndexException("Unknown format: " + entry.format, meta);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized NumericDocValues getNorms(FieldInfo field) throws IOException {
|
||||
Norms instance = instances.get(field.name);
|
||||
if (instance == null) {
|
||||
instance = loadNorms(norms.get(field.name));
|
||||
if (!merging) {
|
||||
instances.put(field.name, instance);
|
||||
activeCount.incrementAndGet();
|
||||
ramBytesUsed.addAndGet(instance.ramBytesUsed());
|
||||
}
|
||||
}
|
||||
return instance;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
return ramBytesUsed.get();
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized Collection<Accountable> getChildResources() {
|
||||
return Accountables.namedAccountables("field", instances);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkIntegrity() throws IOException {
|
||||
CodecUtil.checksumEntireFile(data);
|
||||
}
|
||||
|
||||
private Norms loadNorms(NormsEntry entry) throws IOException {
|
||||
switch(entry.format) {
|
||||
case CONST_COMPRESSED: {
|
||||
final long v = entry.offset;
|
||||
return new Norms() {
|
||||
@Override
|
||||
public long get(int docID) {
|
||||
return v;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
return 8;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "constant";
|
||||
}
|
||||
};
|
||||
}
|
||||
case UNCOMPRESSED: {
|
||||
data.seek(entry.offset);
|
||||
final byte bytes[] = new byte[entry.count];
|
||||
data.readBytes(bytes, 0, bytes.length);
|
||||
return new Norms() {
|
||||
@Override
|
||||
public long get(int docID) {
|
||||
return bytes[docID];
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
return RamUsageEstimator.sizeOf(bytes);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "byte array";
|
||||
}
|
||||
};
|
||||
}
|
||||
case DELTA_COMPRESSED: {
|
||||
data.seek(entry.offset);
|
||||
int packedIntsVersion = data.readVInt();
|
||||
int blockSize = data.readVInt();
|
||||
final BlockPackedReader reader = new BlockPackedReader(data, packedIntsVersion, blockSize, entry.count, false);
|
||||
return new Norms() {
|
||||
@Override
|
||||
public long get(int docID) {
|
||||
return reader.get(docID);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
return reader.ramBytesUsed();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<Accountable> getChildResources() {
|
||||
return Collections.singleton(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "delta compressed";
|
||||
}
|
||||
};
|
||||
}
|
||||
case TABLE_COMPRESSED: {
|
||||
data.seek(entry.offset);
|
||||
int packedIntsVersion = data.readVInt();
|
||||
final int formatID = data.readVInt();
|
||||
final int bitsPerValue = data.readVInt();
|
||||
|
||||
if (bitsPerValue != 1 && bitsPerValue != 2 && bitsPerValue != 4) {
|
||||
throw new CorruptIndexException("TABLE_COMPRESSED only supports bpv=1, bpv=2 and bpv=4, got=" + bitsPerValue, data);
|
||||
}
|
||||
int size = 1 << bitsPerValue;
|
||||
final byte decode[] = new byte[size];
|
||||
final int ordsSize = data.readVInt();
|
||||
for (int i = 0; i < ordsSize; ++i) {
|
||||
decode[i] = data.readByte();
|
||||
}
|
||||
for (int i = ordsSize; i < size; ++i) {
|
||||
decode[i] = 0;
|
||||
}
|
||||
|
||||
final PackedInts.Reader ordsReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatID), packedIntsVersion, entry.count, bitsPerValue);
|
||||
return new Norms() {
|
||||
@Override
|
||||
public long get(int docID) {
|
||||
return decode[(int)ordsReader.get(docID)];
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
return RamUsageEstimator.sizeOf(decode) + ordsReader.ramBytesUsed();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<Accountable> getChildResources() {
|
||||
return Collections.singleton(ordsReader);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "table compressed";
|
||||
}
|
||||
};
|
||||
}
|
||||
case INDIRECT: {
|
||||
data.seek(entry.offset);
|
||||
final long common = data.readLong();
|
||||
int packedIntsVersion = data.readVInt();
|
||||
int blockSize = data.readVInt();
|
||||
final MonotonicBlockPackedReader live = MonotonicBlockPackedReader.of(data, packedIntsVersion, blockSize, entry.count, false);
|
||||
final Norms nestedInstance = loadNorms(entry.nested);
|
||||
final int upperBound = entry.count-1;
|
||||
return new Norms() {
|
||||
@Override
|
||||
public long get(int docID) {
|
||||
int low = 0;
|
||||
int high = upperBound;
|
||||
while (low <= high) {
|
||||
int mid = (low + high) >>> 1;
|
||||
long doc = live.get(mid);
|
||||
|
||||
if (doc < docID) {
|
||||
low = mid + 1;
|
||||
} else if (doc > docID) {
|
||||
high = mid - 1;
|
||||
} else {
|
||||
return nestedInstance.get(mid);
|
||||
}
|
||||
}
|
||||
return common;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
return live.ramBytesUsed() + nestedInstance.ramBytesUsed();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<Accountable> getChildResources() {
|
||||
            List<Accountable> children = new ArrayList<>();
            children.add(Accountables.namedAccountable("keys", live));
            children.add(Accountables.namedAccountable("values", nestedInstance));
            return Collections.unmodifiableList(children);
          }

          @Override
          public String toString() {
            return "indirect";
          }
        };
      }
      case PATCHED_BITSET: {
        data.seek(entry.offset);
        final long common = data.readLong();
        int packedIntsVersion = data.readVInt();
        int blockSize = data.readVInt();
        MonotonicBlockPackedReader live = MonotonicBlockPackedReader.of(data, packedIntsVersion, blockSize, entry.count, true);
        final SparseFixedBitSet set = new SparseFixedBitSet(maxDoc);
        for (int i = 0; i < live.size(); i++) {
          int doc = (int) live.get(i);
          set.set(doc);
        }
        Norms nestedInstance = loadNorms(entry.nested);
        return new Norms() {
          @Override
          public long get(int docID) {
            if (set.get(docID)) {
              return nestedInstance.get(docID);
            } else {
              return common;
            }
          }

          @Override
          public long ramBytesUsed() {
            return set.ramBytesUsed() + nestedInstance.ramBytesUsed();
          }

          @Override
          public Collection<Accountable> getChildResources() {
            List<Accountable> children = new ArrayList<>();
            children.add(Accountables.namedAccountable("keys", set));
            children.add(Accountables.namedAccountable("values", nestedInstance));
            return Collections.unmodifiableList(children);
          }

          @Override
          public String toString() {
            return "patched bitset";
          }
        };
      }
      case PATCHED_TABLE: {
        data.seek(entry.offset);
        int packedIntsVersion = data.readVInt();
        final int formatID = data.readVInt();
        final int bitsPerValue = data.readVInt();

        if (bitsPerValue != 2 && bitsPerValue != 4) {
          throw new CorruptIndexException("PATCHED_TABLE only supports bpv=2 and bpv=4, got=" + bitsPerValue, data);
        }
        final int size = 1 << bitsPerValue;
        final int ordsSize = data.readVInt();
        final byte decode[] = new byte[ordsSize];
        assert ordsSize + 1 == size;
        for (int i = 0; i < ordsSize; ++i) {
          decode[i] = data.readByte();
        }

        final PackedInts.Reader ordsReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatID), packedIntsVersion, entry.count, bitsPerValue);
        final Norms nestedInstance = loadNorms(entry.nested);

        return new Norms() {
          @Override
          public long get(int docID) {
            int ord = (int) ordsReader.get(docID);
            try {
              // doing a try/catch here eliminates a seemingly unavoidable branch in hotspot...
              return decode[ord];
            } catch (IndexOutOfBoundsException e) {
              return nestedInstance.get(docID);
            }
          }

          @Override
          public long ramBytesUsed() {
            return RamUsageEstimator.sizeOf(decode) + ordsReader.ramBytesUsed() + nestedInstance.ramBytesUsed();
          }

          @Override
          public Collection<Accountable> getChildResources() {
            List<Accountable> children = new ArrayList<>();
            children.add(Accountables.namedAccountable("common", ordsReader));
            children.add(Accountables.namedAccountable("uncommon", nestedInstance));
            return Collections.unmodifiableList(children);
          }

          @Override
          public String toString() {
            return "patched table";
          }
        };
      }
      default:
        throw new AssertionError();
    }
  }

  @Override
  public void close() throws IOException {
    data.close();
  }

  static class NormsEntry {
    byte format;
    long offset;
    int count;
    NormsEntry nested;
  }

  static abstract class Norms extends NumericDocValues implements Accountable {
  }

  @Override
  public synchronized NormsProducer getMergeInstance() throws IOException {
    return new Lucene50NormsProducer(this);
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "(fields=" + norms.size() + ",active=" + activeCount.get() + ")";
  }
}
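For readers skimming the removed producer above: the PATCHED_TABLE branch keeps only the handful of most frequent norm values in a tiny decode table and lets every other ordinal fall through to a nested reader. A minimal, self-contained sketch of that lookup idea follows; the class and field names are illustrative assumptions and are not part of the deleted file.

import java.util.function.IntToLongFunction;

// Hypothetical illustration of the "patched table" lookup: ordinals below the
// table size decode directly; the sentinel ordinal (== decode.length) falls
// back to a nested per-document source holding the uncommon values.
final class PatchedTableSketch {
  private final byte[] decode;              // the few common values (e.g. 3 or 15 entries)
  private final int[] ords;                 // per-document ordinals (sentinel = decode.length)
  private final IntToLongFunction fallback; // nested reader for uncommon values

  PatchedTableSketch(byte[] decode, int[] ords, IntToLongFunction fallback) {
    this.decode = decode;
    this.ords = ords;
    this.fallback = fallback;
  }

  long get(int docID) {
    int ord = ords[docID];
    try {
      // same trick as the removed code: let the rare out-of-range ordinal
      // surface as an exception instead of paying for an explicit branch
      return decode[ord];
    } catch (IndexOutOfBoundsException e) {
      return fallback.applyAsLong(docID);
    }
  }
}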
@@ -1,176 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene53;

import java.util.Objects;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;

/**
 * Implements the Lucene 5.3 index format, with configurable per-field postings
 * and docvalues formats.
 * <p>
 * If you want to reuse functionality of this codec in another codec, extend
 * {@link FilterCodec}.
 *
 * @see org.apache.lucene.codecs.lucene53 package documentation for file format details.
 * @deprecated Only for reading old 5.3 segments
 */
@Deprecated
public class Lucene53Codec extends Codec {
  private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
  private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
  private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
  private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
  private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();

  private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      return Lucene53Codec.this.getPostingsFormatForField(field);
    }
  };

  private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
    @Override
    public DocValuesFormat getDocValuesFormatForField(String field) {
      return Lucene53Codec.this.getDocValuesFormatForField(field);
    }
  };

  private final StoredFieldsFormat storedFieldsFormat;

  /**
   * Instantiates a new codec.
   */
  public Lucene53Codec() {
    this(Mode.BEST_SPEED);
  }

  /**
   * Instantiates a new codec, specifying the stored fields compression
   * mode to use.
   * @param mode stored fields compression mode to use for newly
   *             flushed/merged segments.
   */
  public Lucene53Codec(Mode mode) {
    super("Lucene53");
    this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
  }

  @Override
  public final StoredFieldsFormat storedFieldsFormat() {
    return storedFieldsFormat;
  }

  @Override
  public final TermVectorsFormat termVectorsFormat() {
    return vectorsFormat;
  }

  @Override
  public final PostingsFormat postingsFormat() {
    return postingsFormat;
  }

  @Override
  public final FieldInfosFormat fieldInfosFormat() {
    return fieldInfosFormat;
  }

  @Override
  public final SegmentInfoFormat segmentInfoFormat() {
    return segmentInfosFormat;
  }

  @Override
  public final LiveDocsFormat liveDocsFormat() {
    return liveDocsFormat;
  }

  @Override
  public final CompoundFormat compoundFormat() {
    return compoundFormat;
  }

  /** Returns the postings format that should be used for writing
   *  new segments of <code>field</code>.
   *
   *  The default implementation always returns "Lucene50".
   *  <p>
   *  <b>WARNING:</b> if you subclass, you are responsible for index
   *  backwards compatibility: future versions of Lucene are only
   *  guaranteed to be able to read the default implementation.
   */
  public PostingsFormat getPostingsFormatForField(String field) {
    return defaultFormat;
  }

  /** Returns the docvalues format that should be used for writing
   *  new segments of <code>field</code>.
   *
   *  The default implementation always returns "Lucene50".
   *  <p>
   *  <b>WARNING:</b> if you subclass, you are responsible for index
   *  backwards compatibility: future versions of Lucene are only
   *  guaranteed to be able to read the default implementation.
   */
  public DocValuesFormat getDocValuesFormatForField(String field) {
    return defaultDVFormat;
  }

  @Override
  public final DocValuesFormat docValuesFormat() {
    return docValuesFormat;
  }

  @Override
  public final PointsFormat pointsFormat() {
    return PointsFormat.EMPTY;
  }

  private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene50");
  private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene50");

  private final NormsFormat normsFormat = new Lucene53NormsFormat();

  @Override
  public final NormsFormat normsFormat() {
    return normsFormat;
  }
}
@@ -1,25 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Lucene 5.3 file format.
</body>
</html>
@@ -1,178 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene54;

import java.util.Objects;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene53.Lucene53NormsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;

/**
 * Implements the Lucene 5.4 index format, with configurable per-field postings
 * and docvalues formats.
 * <p>
 * If you want to reuse functionality of this codec in another codec, extend
 * {@link FilterCodec}.
 *
 * @see org.apache.lucene.codecs.lucene54 package documentation for file format details.
 * @lucene.experimental
 * @deprecated Only for 5.x back compat
 */
@Deprecated
public class Lucene54Codec extends Codec {
  private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
  private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
  private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
  private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
  private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();

  private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      return Lucene54Codec.this.getPostingsFormatForField(field);
    }
  };

  private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
    @Override
    public DocValuesFormat getDocValuesFormatForField(String field) {
      return Lucene54Codec.this.getDocValuesFormatForField(field);
    }
  };

  private final StoredFieldsFormat storedFieldsFormat;

  /**
   * Instantiates a new codec.
   */
  public Lucene54Codec() {
    this(Mode.BEST_SPEED);
  }

  /**
   * Instantiates a new codec, specifying the stored fields compression
   * mode to use.
   * @param mode stored fields compression mode to use for newly
   *             flushed/merged segments.
   */
  public Lucene54Codec(Mode mode) {
    super("Lucene54");
    this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
  }

  @Override
  public final StoredFieldsFormat storedFieldsFormat() {
    return storedFieldsFormat;
  }

  @Override
  public final TermVectorsFormat termVectorsFormat() {
    return vectorsFormat;
  }

  @Override
  public final PostingsFormat postingsFormat() {
    return postingsFormat;
  }

  @Override
  public final FieldInfosFormat fieldInfosFormat() {
    return fieldInfosFormat;
  }

  @Override
  public final SegmentInfoFormat segmentInfoFormat() {
    return segmentInfosFormat;
  }

  @Override
  public final LiveDocsFormat liveDocsFormat() {
    return liveDocsFormat;
  }

  @Override
  public final CompoundFormat compoundFormat() {
    return compoundFormat;
  }

  /** Returns the postings format that should be used for writing
   *  new segments of <code>field</code>.
   *
   *  The default implementation always returns "Lucene50".
   *  <p>
   *  <b>WARNING:</b> if you subclass, you are responsible for index
   *  backwards compatibility: future versions of Lucene are only
   *  guaranteed to be able to read the default implementation.
   */
  public PostingsFormat getPostingsFormatForField(String field) {
    return defaultFormat;
  }

  /** Returns the docvalues format that should be used for writing
   *  new segments of <code>field</code>.
   *
   *  The default implementation always returns "Lucene54".
   *  <p>
   *  <b>WARNING:</b> if you subclass, you are responsible for index
   *  backwards compatibility: future versions of Lucene are only
   *  guaranteed to be able to read the default implementation.
   */
  public DocValuesFormat getDocValuesFormatForField(String field) {
    return defaultDVFormat;
  }

  @Override
  public final DocValuesFormat docValuesFormat() {
    return docValuesFormat;
  }

  @Override
  public final PointsFormat pointsFormat() {
    return PointsFormat.EMPTY;
  }

  private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene50");
  private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene54");

  private final NormsFormat normsFormat = new Lucene53NormsFormat();

  @Override
  public final NormsFormat normsFormat() {
    return normsFormat;
  }
}
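The getPostingsFormatForField javadoc above describes the intended customization point: subclass and return a different format per field, at the cost of owning backwards compatibility yourself. A hedged sketch of such a subclass follows; the field name "id" and the format name "Memory" are illustrative assumptions, not part of this commit.

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene54.Lucene54Codec;

// Hypothetical subclass: route one field to a non-default postings format,
// as the per-field javadoc above describes; everything else uses the default.
@SuppressWarnings("deprecation")
public class PerFieldExampleCodec extends Lucene54Codec {
  @Override
  public PostingsFormat getPostingsFormatForField(String field) {
    if ("id".equals(field)) {
      return PostingsFormat.forName("Memory"); // assumed format name, for illustration
    }
    return super.getPostingsFormatForField(field);
  }
}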
@@ -1,25 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Lucene 5.4 file format.
</body>
</html>
@@ -13,7 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

org.apache.lucene.codecs.lucene50.Lucene50Codec
org.apache.lucene.codecs.lucene53.Lucene53Codec
org.apache.lucene.codecs.lucene54.Lucene54Codec
org.apache.lucene.codecs.lucene60.Lucene60Codec
@@ -13,4 +13,3 @@
# See the License for the specific language governing permissions and
# limitations under the License.

org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat
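The two hunks above trim what appear to be the SPI registration lists for Codec and DocValuesFormat, which is how format names resolve at runtime. A hedged reminder of that lookup follows; the helper class is illustrative, only Codec.forName itself is part of the real API.

import org.apache.lucene.codecs.Codec;

// Hypothetical lookup helper: codec names resolve through Java SPI using the
// registration lists edited above. Once the deprecated names are unregistered,
// only the remaining names (e.g. "Lucene60") resolve; asking for a dropped
// name fails with an exception instead of loading the old class.
final class CodecLookupSketch {
  static Codec requireCodec(String name) {
    return Codec.forName(name);
  }
}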
@@ -1,403 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene50;

import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.FilterIterator;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedInts.FormatAndBits;

import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.VERSION_CURRENT;

/**
 * Writer for {@link Lucene50NormsFormat}
 * @deprecated Only for testing old 5.0-5.2 segments
 */
@Deprecated
final class Lucene50NormsConsumer extends NormsConsumer {
  static final int BLOCK_SIZE = 1 << 14;

  // threshold for indirect encoding, computed as 1 - 1/log2(maxint)
  // norms are only read for matching postings... so this is the threshold
  // where n log n operations < maxdoc (e.g. it performs similar to other fields)
  static final float INDIRECT_THRESHOLD = 1 - 1 / 31F;

  IndexOutput data, meta;

  Lucene50NormsConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    boolean success = false;
    try {
      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
      data = state.directory.createOutput(dataName, state.context);
      CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
      meta = state.directory.createOutput(metaName, state.context);
      CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  // we explicitly use only certain bits per value and a specified format, so we statically check this will work
  static {
    assert PackedInts.Format.PACKED_SINGLE_BLOCK.isSupported(1);
    assert PackedInts.Format.PACKED_SINGLE_BLOCK.isSupported(2);
    assert PackedInts.Format.PACKED_SINGLE_BLOCK.isSupported(4);
  }

  @Override
  public void addNormsField(FieldInfo field, Iterable<Number> values) throws IOException {
    writeNormsField(field, values, 0);
  }

  private void writeNormsField(FieldInfo field, Iterable<Number> values, int level) throws IOException {
    assert level <= 1; // we only "recurse" once in the indirect case
    meta.writeVInt(field.number);
    NormMap uniqueValues = new NormMap();
    int count = 0;

    for (Number nv : values) {
      if (nv == null) {
        throw new IllegalStateException("illegal norms data for field " + field.name + ", got null for value: " + count);
      }
      final long v = nv.longValue();

      if (uniqueValues != null) {
        if (v >= Byte.MIN_VALUE && v <= Byte.MAX_VALUE) {
          if (uniqueValues.add((byte) v)) {
            if (uniqueValues.size > 256) {
              uniqueValues = null;
            }
          }
        } else {
          // anything outside an 8 bit float comes from a custom scorer, which is an extreme edge case
          uniqueValues = null;
        }
      }
      count++;
    }

    if (uniqueValues == null) {
      addDeltaCompressed(values, count);
    } else if (uniqueValues.size == 1) {
      // 0 bpv
      addConstant(uniqueValues.values[0]);
    } else {
      // small number of unique values: this is the typical case
      uniqueValues.optimizeOrdinals();

      int numCommonValues = -1;
      int commonValuesCount = 0;
      if (level == 0 && count > 256) {
        float threshold_count = count * INDIRECT_THRESHOLD;
        if (uniqueValues.freqs[0] > threshold_count) {
          numCommonValues = 1;
        } else if ((commonValuesCount = sum(uniqueValues.freqs, 0, 3)) > threshold_count && uniqueValues.size > 4) {
          numCommonValues = 3;
        } else if ((commonValuesCount = sum(uniqueValues.freqs, 0, 15)) > threshold_count && uniqueValues.size > 16) {
          numCommonValues = 15;
        }
      }

      if (numCommonValues == -1) {
        // no pattern in values, just find the most efficient way to pack the values
        FormatAndBits compression = fastestFormatAndBits(uniqueValues.size - 1);
        if (compression.bitsPerValue == 8) {
          addUncompressed(values, count);
        } else {
          addTableCompressed(values, compression, count, uniqueValues);
        }

      } else if (numCommonValues == 1) {
        byte commonValue = uniqueValues.values[0];
        if (commonValue == 0) {
          // if the common value is missing, don't waste RAM on a bitset, since we won't be searching those docs
          addIndirect(field, values, count, uniqueValues, 0);
        } else {
          // otherwise, write a sparse bitset, where 1 indicates 'uncommon value'.
          addPatchedBitset(field, values, count, uniqueValues);
        }
      } else {
        addPatchedTable(field, values, numCommonValues, commonValuesCount, count, uniqueValues);
      }
    }
  }

  private int sum(int[] freqs, int start, int end) {
    int accum = 0;
    for (int i = start; i < end; ++i) {
      accum += freqs[i];
    }
    return accum;
  }

  private FormatAndBits fastestFormatAndBits(int max) {
    // we only use bpv=1,2,4,8
    PackedInts.Format format = PackedInts.Format.PACKED_SINGLE_BLOCK;
    int bitsPerValue = PackedInts.bitsRequired(max);
    if (bitsPerValue == 3) {
      bitsPerValue = 4;
    } else if (bitsPerValue > 4) {
      bitsPerValue = 8;
    }
    return new FormatAndBits(format, bitsPerValue);
  }

  private void addConstant(byte constant) throws IOException {
    meta.writeVInt(0);
    meta.writeByte(Lucene50NormsFormat.CONST_COMPRESSED);
    meta.writeLong(constant);
  }

  private void addUncompressed(Iterable<Number> values, int count) throws IOException {
    meta.writeVInt(count);
    meta.writeByte(Lucene50NormsFormat.UNCOMPRESSED); // uncompressed byte[]
    meta.writeLong(data.getFilePointer());
    for (Number nv : values) {
      data.writeByte(nv.byteValue());
    }
  }

  private void addTableCompressed(Iterable<Number> values, FormatAndBits compression, int count, NormMap uniqueValues) throws IOException {
    meta.writeVInt(count);
    meta.writeByte(Lucene50NormsFormat.TABLE_COMPRESSED); // table-compressed
    meta.writeLong(data.getFilePointer());

    writeTable(values, compression, count, uniqueValues, uniqueValues.size);
  }

  private void writeTable(Iterable<Number> values, FormatAndBits compression, int count, NormMap uniqueValues, int numOrds) throws IOException {
    data.writeVInt(PackedInts.VERSION_CURRENT);
    data.writeVInt(compression.format.getId());
    data.writeVInt(compression.bitsPerValue);

    data.writeVInt(numOrds);
    for (int i = 0; i < numOrds; i++) {
      data.writeByte(uniqueValues.values[i]);
    }

    final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, compression.format, count, compression.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
    for (Number nv : values) {
      int ord = uniqueValues.ord(nv.byteValue());
      if (ord < numOrds) {
        writer.add(ord);
      } else {
        writer.add(numOrds); // collapses all ords >= numOrds into a single value
      }
    }
    writer.finish();
  }

  private void addDeltaCompressed(Iterable<Number> values, int count) throws IOException {
    meta.writeVInt(count);
    meta.writeByte(Lucene50NormsFormat.DELTA_COMPRESSED); // delta-compressed
    meta.writeLong(data.getFilePointer());
    data.writeVInt(PackedInts.VERSION_CURRENT);
    data.writeVInt(BLOCK_SIZE);

    final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
    for (Number nv : values) {
      writer.add(nv.longValue());
    }
    writer.finish();
  }

  // encodes only uncommon values in a sparse bitset
  // access is constant time, and the common case is predictable
  // exceptions nest either to CONST (if there are only 2 values), or INDIRECT (if there are > 2 values)
  private void addPatchedBitset(FieldInfo field, final Iterable<Number> values, int count, NormMap uniqueValues) throws IOException {
    int commonCount = uniqueValues.freqs[0];

    meta.writeVInt(count - commonCount);
    meta.writeByte(Lucene50NormsFormat.PATCHED_BITSET);
    meta.writeLong(data.getFilePointer());

    // write docs with value
    writeDocsWithValue(values, uniqueValues, 0);

    // write exceptions: only two cases make sense
    // bpv = 1 (folded into sparse bitset already)
    // bpv > 1 (add indirect exception table)
    meta.writeVInt(field.number);
    if (uniqueValues.size == 2) {
      // special case: implicit in bitset
      addConstant(uniqueValues.values[1]);
    } else {
      // exception table
      addIndirect(field, values, count, uniqueValues, 0);
    }
  }

  // encodes common values in a table, and the rest of the values as exceptions using INDIRECT.
  // the exceptions should not be accessed very often, since the values are uncommon
  private void addPatchedTable(FieldInfo field, final Iterable<Number> values, final int numCommonValues, int commonValuesCount, int count, final NormMap uniqueValues) throws IOException {
    meta.writeVInt(count);
    meta.writeByte(Lucene50NormsFormat.PATCHED_TABLE);
    meta.writeLong(data.getFilePointer());

    assert numCommonValues == 3 || numCommonValues == 15;
    FormatAndBits compression = fastestFormatAndBits(numCommonValues);

    writeTable(values, compression, count, uniqueValues, numCommonValues);

    meta.writeVInt(field.number);
    addIndirect(field, values, count - commonValuesCount, uniqueValues, numCommonValues);
  }

  // encodes values as sparse array: keys[] and values[]
  // access is log(N) where N = keys.length (slow!)
  // so this is only appropriate as an exception table for patched, or when common value is 0 (won't be accessed by searching)
  private void addIndirect(FieldInfo field, final Iterable<Number> values, int count, final NormMap uniqueValues, final int minOrd) throws IOException {
    int commonCount = uniqueValues.freqs[minOrd];

    meta.writeVInt(count - commonCount);
    meta.writeByte(Lucene50NormsFormat.INDIRECT);
    meta.writeLong(data.getFilePointer());

    // write docs with value
    writeDocsWithValue(values, uniqueValues, minOrd);

    // write actual values
    writeNormsField(field, new Iterable<Number>() {
      @Override
      public Iterator<Number> iterator() {
        return new FilterIterator<Number, Number>(values.iterator()) {
          @Override
          protected boolean predicateFunction(Number value) {
            return uniqueValues.ord(value.byteValue()) > minOrd;
          }
        };
      }
    }, 1);
  }

  private void writeDocsWithValue(final Iterable<Number> values, NormMap uniqueValues, int minOrd) throws IOException {
    data.writeLong(uniqueValues.values[minOrd]);
    data.writeVInt(PackedInts.VERSION_CURRENT);
    data.writeVInt(BLOCK_SIZE);

    // write docs with value
    final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
    int doc = 0;
    for (Number n : values) {
      int ord = uniqueValues.ord(n.byteValue());
      if (ord > minOrd) {
        writer.add(doc);
      }
      doc++;
    }
    writer.finish();
  }

  @Override
  public void close() throws IOException {
    boolean success = false;
    try {
      if (meta != null) {
        meta.writeVInt(-1); // write EOF marker
        CodecUtil.writeFooter(meta); // write checksum
      }
      if (data != null) {
        CodecUtil.writeFooter(data); // write checksum
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(data, meta);
      } else {
        IOUtils.closeWhileHandlingException(data, meta);
      }
      meta = data = null;
    }
  }

  // specialized deduplication of long->ord for norms: 99.99999% of the time this will be a single-byte range.
  static class NormMap {
    // we use short: at most we will add 257 values to this map before it's rejected as too big above.
    private final short[] ords = new short[256];
    final int[] freqs = new int[257];
    final byte[] values = new byte[257];
    int size;

    {
      Arrays.fill(ords, (short) -1);
    }

    // adds an item to the mapping. returns true if actually added
    public boolean add(byte l) {
      assert size <= 256; // once we add > 256 values, we nullify the map in addNumericField and don't use this strategy
      int index = (int) l + 128;
      short previous = ords[index];
      if (previous < 0) {
        short slot = (short) size;
        ords[index] = slot;
        freqs[slot]++;
        values[slot] = l;
        size++;
        return true;
      } else {
        freqs[previous]++;
        return false;
      }
    }

    public int ord(byte value) {
      return ords[(int) value + 128];
    }

    // reassign ordinals so higher frequencies have lower ordinals
    public void optimizeOrdinals() {
      new InPlaceMergeSorter() {
        @Override
        protected int compare(int i, int j) {
          return freqs[j] - freqs[i]; // sort descending
        }
        @Override
        protected void swap(int i, int j) {
          // swap ordinal i with ordinal j
          ords[(int) values[i] + 128] = (short) j;
          ords[(int) values[j] + 128] = (short) i;

          int tmpFreq = freqs[i];
          byte tmpValue = values[i];
          freqs[i] = freqs[j];
          values[i] = values[j];
          freqs[j] = tmpFreq;
          values[j] = tmpValue;
        }
      }.sort(0, size);
    }
  }

}
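The INDIRECT_THRESHOLD comments above carry the key reasoning: a sparse "patched" encoding is only chosen once the most frequent value or values cover more than (1 - 1/31) of the documents. A hedged, standalone restatement of that decision follows; the class and method names are illustrative, not from the removed file.

// Hypothetical restatement of the decision made in writeNormsField above.
// freqs is assumed to be sorted by descending frequency (as after
// optimizeOrdinals()) and to have at least 16 slots, like the fixed
// 257-slot arrays in NormMap.
final class IndirectThresholdSketch {
  static final float INDIRECT_THRESHOLD = 1 - 1 / 31F;

  static int numCommonValues(int[] freqs, int uniqueCount, int docCount) {
    float threshold = docCount * INDIRECT_THRESHOLD;
    if (freqs[0] > threshold) {
      return 1;   // one dominant value: bitset/indirect encodings apply
    }
    int top3 = freqs[0] + freqs[1] + freqs[2];
    if (top3 > threshold && uniqueCount > 4) {
      return 3;   // patched table with a 2-bit decode table
    }
    int top15 = 0;
    for (int i = 0; i < 15; i++) {
      top15 += freqs[i];
    }
    if (top15 > threshold && uniqueCount > 16) {
      return 15;  // patched table with a 4-bit decode table
    }
    return -1;    // no dominant pattern: plain table or uncompressed bytes
  }
}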
@@ -1,41 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene50;

import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;

/**
 * Codec for testing 5.0 index format
 * @deprecated Only for testing old 5.0-5.2 segments
 */
@Deprecated
final class Lucene50RWCodec extends Lucene50Codec {
  private final NormsFormat normsFormat = new Lucene50RWNormsFormat();
  private final SegmentInfoFormat segmentInfoFormat = new Lucene50RWSegmentInfoFormat();

  @Override
  public NormsFormat normsFormat() {
    return normsFormat;
  }

  @Override
  public SegmentInfoFormat segmentInfoFormat() {
    return segmentInfoFormat;
  }
}
@@ -1,36 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene50;

import java.io.IOException;

import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.index.SegmentWriteState;

/**
 * Read-write version of 5.0 norms format for testing
 * @deprecated for test purposes only
 */
@Deprecated
final class Lucene50RWNormsFormat extends Lucene50NormsFormat {

  @Override
  public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
    return new Lucene50NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }
}
@@ -1,281 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene50;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.asserting.AssertingCodec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;

/**
 * Tests Lucene50DocValuesFormat
 */
public class TestLucene50DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
  private final Codec codec = TestUtil.alwaysDocValuesFormat(new Lucene50DocValuesFormat());

  @Override
  protected Codec getCodec() {
    return codec;
  }

  // TODO: these big methods can easily blow up some of the other ram-hungry codecs...
  // for now just keep them here, as we want to test this for this format.

  @Slow
  public void testSortedSetVariableLengthBigVsStoredFields() throws Exception {
    int numIterations = atLeast(1);
    for (int i = 0; i < numIterations; i++) {
      doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16, 100);
    }
  }

  @Nightly
  public void testSortedSetVariableLengthManyVsStoredFields() throws Exception {
    int numIterations = atLeast(1);
    for (int i = 0; i < numIterations; i++) {
      doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16, 100);
    }
  }

  @Slow
  public void testSortedVariableLengthBigVsStoredFields() throws Exception {
    int numIterations = atLeast(1);
    for (int i = 0; i < numIterations; i++) {
      doTestSortedVsStoredFields(atLeast(300), 1, 32766);
    }
  }

  @Nightly
  public void testSortedVariableLengthManyVsStoredFields() throws Exception {
    int numIterations = atLeast(1);
    for (int i = 0; i < numIterations; i++) {
      doTestSortedVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500);
    }
  }

  @Slow
  public void testTermsEnumFixedWidth() throws Exception {
    int numIterations = atLeast(1);
    for (int i = 0; i < numIterations; i++) {
      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 10, 10);
    }
  }

  @Slow
  public void testTermsEnumVariableWidth() throws Exception {
    int numIterations = atLeast(1);
    for (int i = 0; i < numIterations; i++) {
      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 1, 500);
    }
  }

  @Nightly
  public void testTermsEnumRandomMany() throws Exception {
    int numIterations = atLeast(1);
    for (int i = 0; i < numIterations; i++) {
      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), 1, 500);
    }
  }

  // TODO: try to refactor this and some termsenum tests into the base class.
  // to do this we need to fix the test class to get a DVF not a Codec so we can setup
  // the postings format correctly.
  private void doTestTermsEnumRandom(int numDocs, int minLength, int maxLength) throws Exception {
    Directory dir = newFSDirectory(createTempDir());
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    conf.setMergeScheduler(new SerialMergeScheduler());
    // set to duel against a codec which has ordinals:
    final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
    final DocValuesFormat dv = new Lucene50DocValuesFormat();
    conf.setCodec(new AssertingCodec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        return pf;
      }

      @Override
      public DocValuesFormat getDocValuesFormatForField(String field) {
        return dv;
      }
    });
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);

    // index some docs
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
      doc.add(idField);
      final int length = TestUtil.nextInt(random(), minLength, maxLength);
      int numValues = random().nextInt(17);
      // create a random list of strings
      List<String> values = new ArrayList<>();
      for (int v = 0; v < numValues; v++) {
        values.add(TestUtil.randomSimpleString(random(), minLength, length));
      }

      // add in any order to the indexed field
      ArrayList<String> unordered = new ArrayList<>(values);
      Collections.shuffle(unordered, random());
      for (String v : values) {
        doc.add(newStringField("indexed", v, Field.Store.NO));
      }

      // add in any order to the dv field
      ArrayList<String> unordered2 = new ArrayList<>(values);
      Collections.shuffle(unordered2, random());
      for (String v : unordered2) {
        doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
      }

      writer.addDocument(doc);
      if (random().nextInt(31) == 0) {
        writer.commit();
      }
    }

    // delete some docs
    int numDeletions = random().nextInt(numDocs/10);
    for (int i = 0; i < numDeletions; i++) {
      int id = random().nextInt(numDocs);
      writer.deleteDocuments(new Term("id", Integer.toString(id)));
    }

    // compare per-segment
    DirectoryReader ir = writer.getReader();
    for (LeafReaderContext context : ir.leaves()) {
      LeafReader r = context.reader();
      Terms terms = r.terms("indexed");
      if (terms != null) {
        SortedSetDocValues ssdv = r.getSortedSetDocValues("dv");
        assertEquals(terms.size(), ssdv.getValueCount());
        TermsEnum expected = terms.iterator();
        TermsEnum actual = r.getSortedSetDocValues("dv").termsEnum();
        assertEquals(terms.size(), expected, actual);

        doTestSortedSetEnumAdvanceIndependently(ssdv);
      }
    }
    ir.close();

    writer.forceMerge(1);

    // now compare again after the merge
    ir = writer.getReader();
    LeafReader ar = getOnlyLeafReader(ir);
    Terms terms = ar.terms("indexed");
    if (terms != null) {
      assertEquals(terms.size(), ar.getSortedSetDocValues("dv").getValueCount());
      TermsEnum expected = terms.iterator();
      TermsEnum actual = ar.getSortedSetDocValues("dv").termsEnum();
      assertEquals(terms.size(), expected, actual);
    }
    ir.close();

    writer.close();
    dir.close();
  }

  private void assertEquals(long numOrds, TermsEnum expected, TermsEnum actual) throws Exception {
    BytesRef ref;

    // sequential next() through all terms
    while ((ref = expected.next()) != null) {
      assertEquals(ref, actual.next());
      assertEquals(expected.ord(), actual.ord());
      assertEquals(expected.term(), actual.term());
    }
    assertNull(actual.next());

    // sequential seekExact(ord) through all terms
    for (long i = 0; i < numOrds; i++) {
      expected.seekExact(i);
      actual.seekExact(i);
      assertEquals(expected.ord(), actual.ord());
      assertEquals(expected.term(), actual.term());
    }

    // sequential seekExact(BytesRef) through all terms
    for (long i = 0; i < numOrds; i++) {
      expected.seekExact(i);
      assertTrue(actual.seekExact(expected.term()));
      assertEquals(expected.ord(), actual.ord());
      assertEquals(expected.term(), actual.term());
    }

    // sequential seekCeil(BytesRef) through all terms
    for (long i = 0; i < numOrds; i++) {
      expected.seekExact(i);
      assertEquals(SeekStatus.FOUND, actual.seekCeil(expected.term()));
      assertEquals(expected.ord(), actual.ord());
      assertEquals(expected.term(), actual.term());
    }

    // random seekExact(ord)
    for (long i = 0; i < numOrds; i++) {
      long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1);
      expected.seekExact(randomOrd);
      actual.seekExact(randomOrd);
      assertEquals(expected.ord(), actual.ord());
      assertEquals(expected.term(), actual.term());
    }

    // random seekExact(BytesRef)
    for (long i = 0; i < numOrds; i++) {
      long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1);
      expected.seekExact(randomOrd);
      actual.seekExact(expected.term());
      assertEquals(expected.ord(), actual.ord());
      assertEquals(expected.term(), actual.term());
    }

    // random seekCeil(BytesRef)
    for (long i = 0; i < numOrds; i++) {
      BytesRef target = new BytesRef(TestUtil.randomUnicodeString(random()));
      SeekStatus expectedStatus = expected.seekCeil(target);
      assertEquals(expectedStatus, actual.seekCeil(target));
      if (expectedStatus != SeekStatus.END) {
        assertEquals(expected.ord(), actual.ord());
        assertEquals(expected.term(), actual.term());
      }
    }
  }
}
@@ -1,130 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene50;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.NormMap;
import org.apache.lucene.index.BaseNormsFormatTestCase;
import org.apache.lucene.util.TestUtil;

/**
 * Tests Lucene50NormsFormat
 */
public class TestLucene50NormsFormat extends BaseNormsFormatTestCase {
  private final Codec codec = new Lucene50RWCodec();

  @Override
  protected Codec getCodec() {
    return codec;
  }

  // NormMap is rather complicated, doing domain encoding / tracking frequencies etc.
  // test it directly some here...

  public void testNormMapSimple() {
    NormMap map = new NormMap();
    map.add((byte) 4);
    map.add((byte) 10);
    map.add((byte) 5);
    map.add((byte) 10);
    assertEquals(3, map.size);

    // first come, first serve ord assignment
    assertEquals(0, map.ord((byte) 4));
    assertEquals(1, map.ord((byte) 10));
    assertEquals(2, map.ord((byte) 5));

    assertEquals(4, map.values[0]);
    assertEquals(10, map.values[1]);
    assertEquals(5, map.values[2]);

    assertEquals(1, map.freqs[0]);
    assertEquals(2, map.freqs[1]);
    assertEquals(1, map.freqs[2]);

    // optimizing reorders the ordinals
    map.optimizeOrdinals();
    assertEquals(0, map.ord((byte) 10));
    assertEquals(1, map.ord((byte) 4));
    assertEquals(2, map.ord((byte) 5));

    assertEquals(10, map.values[0]);
    assertEquals(4, map.values[1]);
    assertEquals(5, map.values[2]);

    assertEquals(2, map.freqs[0]);
    assertEquals(1, map.freqs[1]);
    assertEquals(1, map.freqs[2]);
  }

  public void testNormMapRandom() {
    Set<Byte> uniqueValuesSet = new HashSet<>();
    int numUniqValues = TestUtil.nextInt(random(), 1, 256);
    for (int i = 0; i < numUniqValues; i++) {
      uniqueValuesSet.add(Byte.valueOf((byte) TestUtil.nextInt(random(), Byte.MIN_VALUE, Byte.MAX_VALUE)));
    }
    Byte uniqueValues[] = uniqueValuesSet.toArray(new Byte[uniqueValuesSet.size()]);

    Map<Byte,Integer> freqs = new HashMap<>();
    NormMap map = new NormMap();
    int numdocs = TestUtil.nextInt(random(), 1, 100000);
    for (int i = 0; i < numdocs; i++) {
      byte value = uniqueValues[random().nextInt(uniqueValues.length)];
      // now add to both expected and actual
      map.add(value);
      if (freqs.containsKey(value)) {
        freqs.put(value, freqs.get(value) + 1);
      } else {
        freqs.put(value, 1);
      }
    }

    assertEquals(freqs.size(), map.size);
    for (Map.Entry<Byte,Integer> kv : freqs.entrySet()) {
      byte value = kv.getKey();
      int freq = kv.getValue();
      int ord = map.ord(value);
      assertEquals(freq, map.freqs[ord]);
      assertEquals(value, map.values[ord]);
    }

    // optimizing should reorder ordinals from greatest to least frequency
    map.optimizeOrdinals();
    // recheck consistency
    assertEquals(freqs.size(), map.size);
    for (Map.Entry<Byte,Integer> kv : freqs.entrySet()) {
      byte value = kv.getKey();
      int freq = kv.getValue();
      int ord = map.ord(value);
      assertEquals(freq, map.freqs[ord]);
      assertEquals(value, map.values[ord]);
    }
    // also check descending freq
    int prevFreq = map.freqs[0];
    for (int i = 1; i < map.size; ++i) {
      assertTrue(prevFreq >= map.freqs[i]);
      prevFreq = map.freqs[i];
    }
  }
}