mirror of https://github.com/apache/lucene.git
LUCENE-5743: Add Lucene49NormsFormat
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1601606 13f79535-47bb-0310-9956-ffa450edef68
parent 84e1847228
commit 6384ae9fb7
@@ -129,6 +129,9 @@ New Features
   from Directory. Add Lucene49Codec and Lucene49DocValuesFormat that make
   use of these. (Robert Muir)
 
+* LUCENE-5743: Add Lucene49NormsFormat, which can compress in some cases
+  such as very short fields. (Ryan Ernst, Adrien Grand, Robert Muir)
+
 Changes in Backwards Compatibility Policy
 
 * LUCENE-5634: Add reuse argument to IndexableField.tokenStream. This
@@ -0,0 +1,32 @@
package org.apache.lucene.codecs.diskdv;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec;
import org.apache.lucene.index.BaseNormsFormatTestCase;

/** Tests DiskNormsFormat */
public class TestDiskNormsFormat extends BaseNormsFormatTestCase {
  private final Codec codec = new CheapBastardCodec();

  @Override
  protected Codec getCodec() {
    return codec;
  }
}
@@ -0,0 +1,31 @@
package org.apache.lucene.codecs.simpletext;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseNormsFormatTestCase;

/** Tests SimpleTextNormsFormat */
public class TestSimpleTextNormsFormat extends BaseNormsFormatTestCase {
  private final Codec codec = new SimpleTextCodec();

  @Override
  protected Codec getCodec() {
    return codec;
  }
}
@@ -131,7 +131,7 @@ public class Lucene49Codec extends Codec {
   private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41");
   private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene49");
 
-  private final NormsFormat normsFormat = new Lucene42NormsFormat();
+  private final NormsFormat normsFormat = new Lucene49NormsFormat();
 
   @Override
   public final NormsFormat normsFormat() {
@@ -19,6 +19,7 @@ package org.apache.lucene.codecs.lucene49;
 
 import java.io.Closeable; // javadocs
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -199,6 +200,7 @@ public class Lucene49DocValuesConsumer extends DocValuesConsumer implements Clos
           break;
         case TABLE_COMPRESSED:
           final Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
+          Arrays.sort(decode);
           final HashMap<Long,Integer> encode = new HashMap<>();
           meta.writeVInt(decode.length);
           for (int i = 0; i < decode.length; i++) {
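The Arrays.sort(decode) added above feeds the table-compression scheme shared with the new norms writer below: collect the distinct values, sort them into a small lookup table, and store each document's value as a bit-packed ordinal into that table. A minimal standalone sketch of that round trip in plain Java, with an int[] standing in for the PackedInts ordinals (names here are illustrative, not Lucene API):

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeSet;

// Sketch: table compression writes a sorted table of distinct values once,
// then one small ordinal per document instead of the full value.
public class TableCompressionSketch {
  public static void main(String[] args) {
    long[] values = {3, 20, 3, 3, 7, 20, 3};    // per-document values, few distinct

    TreeSet<Long> unique = new TreeSet<>();     // sorted set of distinct values
    for (long v : values) {
      unique.add(v);
    }
    long[] decode = new long[unique.size()];    // the on-disk lookup table
    int j = 0;
    for (long v : unique) {
      decode[j++] = v;
    }

    Map<Long,Integer> encode = new HashMap<>(); // inverse map: value -> ordinal
    for (int i = 0; i < decode.length; i++) {
      encode.put(decode[i], i);
    }

    int[] ords = new int[values.length];        // Lucene bit-packs these ordinals
    for (int i = 0; i < values.length; i++) {
      ords[i] = encode.get(values[i]);
    }

    // reading a document's value back is a single table lookup
    for (int docID = 0; docID < ords.length; docID++) {
      assert decode[ords[docID]] == values[docID];
    }
    System.out.println(Arrays.toString(ords));  // prints [0, 2, 0, 0, 1, 2, 0]
  }
}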
@@ -0,0 +1,208 @@
package org.apache.lucene.codecs.lucene49;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;

import static org.apache.lucene.codecs.lucene49.Lucene49NormsFormat.VERSION_CURRENT;

/**
 * Writer for {@link Lucene49NormsFormat}
 */
class Lucene49NormsConsumer extends DocValuesConsumer {
  static final byte DELTA_COMPRESSED = 0;
  static final byte TABLE_COMPRESSED = 1;
  static final byte CONST_COMPRESSED = 2;
  static final byte UNCOMPRESSED = 3;
  static final int BLOCK_SIZE = 16384;

  IndexOutput data, meta;
  final int maxDoc;

  Lucene49NormsConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    maxDoc = state.segmentInfo.getDocCount();
    boolean success = false;
    try {
      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
      data = state.directory.createOutput(dataName, state.context);
      CodecUtil.writeHeader(data, dataCodec, VERSION_CURRENT);
      String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
      meta = state.directory.createOutput(metaName, state.context);
      CodecUtil.writeHeader(meta, metaCodec, VERSION_CURRENT);
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  // we explicitly use only certain bits per value and a specified format, so we statically check this will work
  static {
    assert PackedInts.Format.PACKED_SINGLE_BLOCK.isSupported(1);
    assert PackedInts.Format.PACKED_SINGLE_BLOCK.isSupported(2);
    assert PackedInts.Format.PACKED_SINGLE_BLOCK.isSupported(4);
  }

  @Override
  public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
    meta.writeVInt(field.number);
    long minValue = Long.MAX_VALUE;
    long maxValue = Long.MIN_VALUE;
    // TODO: more efficient?
    HashSet<Long> uniqueValues = new HashSet<>();

    long count = 0;
    for (Number nv : values) {
      if (nv == null) {
        throw new IllegalStateException("illegal norms data for field " + field.name + ", got null for value: " + count);
      }
      final long v = nv.longValue();

      minValue = Math.min(minValue, v);
      maxValue = Math.max(maxValue, v);

      if (uniqueValues != null) {
        if (uniqueValues.add(v)) {
          if (uniqueValues.size() > 256) {
            uniqueValues = null;
          }
        }
      }
      ++count;
    }

    if (count != maxDoc) {
      throw new IllegalStateException("illegal norms data for field " + field.name + ", expected " + maxDoc + " values, got " + count);
    }

    if (uniqueValues != null && uniqueValues.size() == 1) {
      // 0 bpv
      meta.writeByte(CONST_COMPRESSED);
      meta.writeLong(minValue);
    } else if (uniqueValues != null) {
      // small number of unique values: this is the typical case:
      // we only use bpv=1,2,4,8
      PackedInts.Format format = PackedInts.Format.PACKED_SINGLE_BLOCK;
      int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1);
      if (bitsPerValue == 3) {
        bitsPerValue = 4;
      } else if (bitsPerValue > 4) {
        bitsPerValue = 8;
      }

      if (bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
        meta.writeByte(UNCOMPRESSED); // uncompressed byte[]
        meta.writeLong(data.getFilePointer());
        for (Number nv : values) {
          data.writeByte(nv == null ? 0 : (byte) nv.longValue());
        }
      } else {
        meta.writeByte(TABLE_COMPRESSED); // table-compressed
        meta.writeLong(data.getFilePointer());
        data.writeVInt(PackedInts.VERSION_CURRENT);

        Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
        Arrays.sort(decode);
        final HashMap<Long,Integer> encode = new HashMap<>();
        // upgrade to power of two sized array
        int size = 1 << bitsPerValue;
        data.writeVInt(size);
        for (int i = 0; i < decode.length; i++) {
          data.writeLong(decode[i]);
          encode.put(decode[i], i);
        }
        for (int i = decode.length; i < size; i++) {
          data.writeLong(0);
        }

        data.writeVInt(format.getId());
        data.writeVInt(bitsPerValue);

        final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, format, maxDoc, bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
        for (Number nv : values) {
          writer.add(encode.get(nv.longValue()));
        }
        writer.finish();
      }
    } else {
      meta.writeByte(DELTA_COMPRESSED); // delta-compressed
      meta.writeLong(data.getFilePointer());
      data.writeVInt(PackedInts.VERSION_CURRENT);
      data.writeVInt(BLOCK_SIZE);

      final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
      for (Number nv : values) {
        writer.add(nv.longValue());
      }
      writer.finish();
    }
  }

  @Override
  public void close() throws IOException {
    boolean success = false;
    try {
      if (meta != null) {
        meta.writeVInt(-1); // write EOF marker
        CodecUtil.writeFooter(meta); // write checksum
      }
      if (data != null) {
        CodecUtil.writeFooter(data); // write checksum
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(data, meta);
      } else {
        IOUtils.closeWhileHandlingException(data, meta);
      }
      meta = data = null;
    }
  }

  @Override
  public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, final Iterable<Number> docToOrdCount, final Iterable<Number> ords) throws IOException {
    throw new UnsupportedOperationException();
  }
}
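To make the consumer's branching above easier to follow, here is the same per-field format decision distilled into one standalone function. This is an assumed-equivalent rewrite for illustration, not code from this commit; the returned bytes are the format constants defined above (DELTA_COMPRESSED=0, TABLE_COMPRESSED=1, CONST_COMPRESSED=2, UNCOMPRESSED=3):

// Sketch of Lucene49NormsConsumer's per-field format choice. numUnique is the
// number of distinct norm values seen (the consumer stops counting past 256).
static byte chooseFormat(int numUnique, long min, long max) {
  if (numUnique == 1) {
    return 2;                   // CONST_COMPRESSED: the value lives in the metadata
  }
  if (numUnique <= 256) {
    // PackedInts.bitsRequired(numUnique - 1), then round up to a width
    // PACKED_SINGLE_BLOCK supports: 1, 2, 4, or a full byte.
    int bpv = Math.max(1, 64 - Long.numberOfLeadingZeros(numUnique - 1));
    if (bpv == 3) {
      bpv = 4;
    } else if (bpv > 4) {
      bpv = 8;
    }
    if (bpv == 8 && min >= Byte.MIN_VALUE && max <= Byte.MAX_VALUE) {
      return 3;                 // UNCOMPRESSED: 8-bit ordinals buy nothing over raw bytes
    }
    return 1;                   // TABLE_COMPRESSED: bit-packed ordinals into a sorted table
  }
  return 0;                     // DELTA_COMPRESSED: BlockPackedWriter over the raw values
}

Typical norms cluster into very few distinct values, so most fields land in the constant or table cases, which is presumably where the "very short fields" savings in the CHANGES entry come from.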
@@ -0,0 +1,121 @@
package org.apache.lucene.codecs.lucene49;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Lucene 4.9 Score normalization format.
 * <p>
 * Encodes normalization values with these strategies:
 * <p>
 * <ul>
 *    <li>Uncompressed: when values fit into a single byte and would require more than 4 bits
 *        per value, they are just encoded as an uncompressed byte array.
 *    <li>Constant: when there is only one value present for the entire field, no actual data
 *        is written: this constant is encoded in the metadata.
 *    <li>Table-compressed: when the number of unique values is very small (< 64), and
 *        when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
 *        a lookup table is written instead. Each per-document entry is instead the ordinal
 *        to this table, and those ordinals are compressed with bitpacking ({@link PackedInts}).
 *    <li>Delta-compressed: per-document integers written as deltas from the minimum value,
 *        compressed with bitpacking. For more information, see {@link BlockPackedWriter}.
 *        This is only used when norms larger than one byte are present.
 * </ul>
 * <p>
 * Files:
 * <ol>
 *   <li><tt>.nvd</tt>: Norms data</li>
 *   <li><tt>.nvm</tt>: Norms metadata</li>
 * </ol>
 * <ol>
 *   <li><a name="nvm" id="nvm"></a>
 *   <p>The Norms metadata or .nvm file.</p>
 *   <p>For each norms field, this stores metadata, such as the offset into the
 *      Norms data (.nvd)</p>
 *   <p>Norms metadata (.nvm) --> Header,<Entry><sup>NumFields</sup>,Footer</p>
 *   <ul>
 *     <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
 *     <li>Entry --> FieldNumber,Type,Offset</li>
 *     <li>FieldNumber --> {@link DataOutput#writeVInt vInt}</li>
 *     <li>Type --> {@link DataOutput#writeByte Byte}</li>
 *     <li>Offset --> {@link DataOutput#writeLong Int64}</li>
 *     <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
 *   </ul>
 *   <p>FieldNumber of -1 indicates the end of metadata.</p>
 *   <p>Offset is the pointer to the start of the data in the norms data (.nvd), or the singleton value for Constant</p>
 *   <p>Type indicates how Numeric values will be compressed:
 *   <ul>
 *     <li>0 --> delta-compressed. For each block of 16k integers, every integer is delta-encoded
 *         from the minimum value within the block.
 *     <li>1 --> table-compressed. When the number of unique numeric values is small and it would save space,
 *         a lookup table of unique values is written, followed by the ordinal for each document.
 *     <li>2 --> constant. When there is a single value for the entire field.
 *     <li>3 --> uncompressed: Values written as a simple byte[].
 *   </ul>
 *   <li><a name="nvd" id="nvd"></a>
 *   <p>The Norms data or .nvd file.</p>
 *   <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>
 *   <p>Norms data (.nvd) --> Header,<Uncompressed | TableCompressed | DeltaCompressed><sup>NumFields</sup>,Footer</p>
 *   <ul>
 *     <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
 *     <li>Uncompressed --> {@link DataOutput#writeByte Byte}<sup>maxDoc</sup></li>
 *     <li>TableCompressed --> PackedIntsVersion,Table,BitPackedData</li>
 *     <li>Table --> TableSize, {@link DataOutput#writeLong int64}<sup>TableSize</sup></li>
 *     <li>BitpackedData --> {@link PackedInts}</li>
 *     <li>DeltaCompressed --> PackedIntsVersion,BlockSize,DeltaCompressedData</li>
 *     <li>DeltaCompressedData --> {@link BlockPackedWriter BlockPackedWriter(blockSize=16k)}</li>
 *     <li>PackedIntsVersion,BlockSize,TableSize --> {@link DataOutput#writeVInt vInt}</li>
 *     <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
 *   </ul>
 * </ol>
 * @lucene.experimental
 */
public class Lucene49NormsFormat extends NormsFormat {

  /** Sole Constructor */
  public Lucene49NormsFormat() {}

  @Override
  public DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException {
    return new Lucene49NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }

  @Override
  public DocValuesProducer normsProducer(SegmentReadState state) throws IOException {
    return new Lucene49NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }

  private static final String DATA_CODEC = "Lucene49NormsData";
  private static final String DATA_EXTENSION = "nvd";
  private static final String METADATA_CODEC = "Lucene49NormsMetadata";
  private static final String METADATA_EXTENSION = "nvm";
  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;
}
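The metadata layout documented above is small enough to mock end to end. A self-contained sketch of the .nvm entry stream follows (hypothetical class and helper names, codec header and footer omitted; Lucene's vInt is the usual 7-bits-per-byte varint, reproduced here by hand):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Sketch of the .nvm entry stream: per field, FieldNumber (vInt), Type (byte),
// Offset (int64); a FieldNumber vInt of -1 terminates the list.
public class NormsMetaSketch {
  // Lucene-style vInt: 7 data bits per byte, high bit set on continuation bytes.
  static void writeVInt(DataOutputStream out, int i) throws IOException {
    while ((i & ~0x7F) != 0) {
      out.writeByte((i & 0x7F) | 0x80);
      i >>>= 7;
    }
    out.writeByte(i);
  }

  static int readVInt(DataInputStream in) throws IOException {
    int b = in.readByte();
    int i = b & 0x7F;
    for (int shift = 7; (b & 0x80) != 0; shift += 7) {
      b = in.readByte();
      i |= (b & 0x7F) << shift;
    }
    return i;
  }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    DataOutputStream meta = new DataOutputStream(bytes);
    writeVInt(meta, 5);        // FieldNumber
    meta.writeByte(2);         // Type: CONST_COMPRESSED
    meta.writeLong(42L);       // Offset doubles as the constant value here
    writeVInt(meta, -1);       // EOF marker, as in Lucene49NormsConsumer.close()

    DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
    for (int field = readVInt(in); field != -1; field = readVInt(in)) {
      byte type = in.readByte();
      long offset = in.readLong();
      System.out.println("field=" + field + " type=" + type + " offset=" + offset);
    }
  }
}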
@@ -0,0 +1,233 @@
package org.apache.lucene.codecs.lucene49;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.BlockPackedReader;
import org.apache.lucene.util.packed.PackedInts;

import static org.apache.lucene.codecs.lucene49.Lucene49NormsFormat.VERSION_START;
import static org.apache.lucene.codecs.lucene49.Lucene49NormsFormat.VERSION_CURRENT;
import static org.apache.lucene.codecs.lucene49.Lucene49NormsConsumer.CONST_COMPRESSED;
import static org.apache.lucene.codecs.lucene49.Lucene49NormsConsumer.DELTA_COMPRESSED;
import static org.apache.lucene.codecs.lucene49.Lucene49NormsConsumer.TABLE_COMPRESSED;
import static org.apache.lucene.codecs.lucene49.Lucene49NormsConsumer.UNCOMPRESSED;

/**
 * Reader for {@link Lucene49NormsFormat}
 */
class Lucene49NormsProducer extends DocValuesProducer {
  // metadata maps (just file pointers and minimal stuff)
  private final Map<Integer,NormsEntry> norms = new HashMap<>();
  private final IndexInput data;
  private final int version;

  // ram instances we have already loaded
  final Map<Integer,NumericDocValues> instances = new HashMap<>();

  private final int maxDoc;
  private final AtomicLong ramBytesUsed;

  Lucene49NormsProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    maxDoc = state.segmentInfo.getDocCount();
    String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
    // read in the entries from the metadata file.
    ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context);
    boolean success = false;
    ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass()));
    try {
      version = CodecUtil.checkHeader(in, metaCodec, VERSION_START, VERSION_CURRENT);
      readFields(in, state.fieldInfos);
      CodecUtil.checkFooter(in);
      success = true;
    } finally {
      if (success) {
        IOUtils.close(in);
      } else {
        IOUtils.closeWhileHandlingException(in);
      }
    }

    String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
    this.data = state.directory.openInput(dataName, state.context);
    success = false;
    try {
      final int version2 = CodecUtil.checkHeader(data, dataCodec, VERSION_START, VERSION_CURRENT);
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }

      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this.data);
      }
    }
  }

  private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
    int fieldNumber = meta.readVInt();
    while (fieldNumber != -1) {
      FieldInfo info = infos.fieldInfo(fieldNumber);
      if (info == null) {
        throw new CorruptIndexException("Invalid field number: " + fieldNumber + " (resource=" + meta + ")");
      } else if (!info.hasNorms()) {
        throw new CorruptIndexException("Invalid field: " + info.name + " (resource=" + meta + ")");
      }
      NormsEntry entry = new NormsEntry();
      entry.format = meta.readByte();
      entry.offset = meta.readLong();
      switch (entry.format) {
        case CONST_COMPRESSED:
        case UNCOMPRESSED:
        case TABLE_COMPRESSED:
        case DELTA_COMPRESSED:
          break;
        default:
          throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta);
      }
      norms.put(fieldNumber, entry);
      fieldNumber = meta.readVInt();
    }
  }

  @Override
  public synchronized NumericDocValues getNumeric(FieldInfo field) throws IOException {
    NumericDocValues instance = instances.get(field.number);
    if (instance == null) {
      instance = loadNorms(field);
      instances.put(field.number, instance);
    }
    return instance;
  }

  @Override
  public long ramBytesUsed() {
    return ramBytesUsed.get();
  }

  @Override
  public void checkIntegrity() throws IOException {
    CodecUtil.checksumEntireFile(data);
  }

  private NumericDocValues loadNorms(FieldInfo field) throws IOException {
    NormsEntry entry = norms.get(field.number);
    switch (entry.format) {
      case CONST_COMPRESSED:
        final long v = entry.offset;
        return new NumericDocValues() {
          @Override
          public long get(int docID) {
            return v;
          }
        };
      case UNCOMPRESSED:
        data.seek(entry.offset);
        final byte bytes[] = new byte[maxDoc];
        data.readBytes(bytes, 0, bytes.length);
        ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(bytes));
        return new NumericDocValues() {
          @Override
          public long get(int docID) {
            return bytes[docID];
          }
        };
      case DELTA_COMPRESSED:
        data.seek(entry.offset);
        int packedIntsVersion = data.readVInt();
        int blockSize = data.readVInt();
        final BlockPackedReader reader = new BlockPackedReader(data, packedIntsVersion, blockSize, maxDoc, false);
        ramBytesUsed.addAndGet(reader.ramBytesUsed());
        return reader;
      case TABLE_COMPRESSED:
        data.seek(entry.offset);
        int packedVersion = data.readVInt();
        int size = data.readVInt();
        if (size > 256) {
          throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + data);
        }
        final long decode[] = new long[size];
        for (int i = 0; i < decode.length; i++) {
          decode[i] = data.readLong();
        }
        final int formatID = data.readVInt();
        final int bitsPerValue = data.readVInt();
        final PackedInts.Reader ordsReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatID), packedVersion, maxDoc, bitsPerValue);
        ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(decode) + ordsReader.ramBytesUsed());
        return new NumericDocValues() {
          @Override
          public long get(int docID) {
            return decode[(int)ordsReader.get(docID)];
          }
        };
      default:
        throw new AssertionError();
    }
  }

  @Override
  public BinaryDocValues getBinary(FieldInfo field) throws IOException {
    throw new IllegalStateException();
  }

  @Override
  public SortedDocValues getSorted(FieldInfo field) throws IOException {
    throw new IllegalStateException();
  }

  @Override
  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
    throw new IllegalStateException();
  }

  @Override
  public Bits getDocsWithField(FieldInfo field) throws IOException {
    throw new IllegalStateException();
  }

  @Override
  public void close() throws IOException {
    data.close();
  }

  static class NormsEntry {
    byte format;
    long offset;
  }
}
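Note the shape of getNumeric above: norms are decoded lazily, once per field, and cached in the instances map under the producer's lock; each returned NumericDocValues is just a closure over whatever loadNorms materialized (a single long for CONST_COMPRESSED, a byte[] for UNCOMPRESSED, a decode table plus packed ordinals for TABLE_COMPRESSED). A stripped-down sketch of that caching pattern, with a hypothetical loader standing in for loadNorms:

import java.util.HashMap;
import java.util.Map;

// Sketch of the lazy per-field cache used by Lucene49NormsProducer.getNumeric():
// the first request pays the decode cost, later requests hit the map.
class NormsCacheSketch {
  interface Norms { long get(int docID); }   // stands in for NumericDocValues

  private final Map<Integer,Norms> instances = new HashMap<>();

  synchronized Norms getNumeric(int fieldNumber) {
    Norms instance = instances.get(fieldNumber);
    if (instance == null) {
      instance = loadNorms(fieldNumber);     // hypothetical loader
      instances.put(fieldNumber, instance);
    }
    return instance;
  }

  private Norms loadNorms(int fieldNumber) {
    // CONST_COMPRESSED case: nothing is read from .nvd at all; the metadata
    // offset itself is the value, so the instance is a trivial closure.
    final long constant = 42L;               // would come from NormsEntry.offset
    return docID -> constant;
  }
}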
@@ -173,7 +173,7 @@ term occurs in each document. Note that this will not exist if all fields in
 all documents omit position data.
 </li>
 <li>
-{@link org.apache.lucene.codecs.lucene42.Lucene42NormsFormat Normalization factors}.
+{@link org.apache.lucene.codecs.lucene49.Lucene49NormsFormat Normalization factors}.
 For each field in each document, a value is stored
 that is multiplied into the score for hits on that field.
 </li>
@@ -289,7 +289,7 @@ systems that frequently run out of file handles.</td>
 <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
 </tr>
 <tr>
-<td>{@link org.apache.lucene.codecs.lucene42.Lucene42NormsFormat Norms}</td>
+<td>{@link org.apache.lucene.codecs.lucene49.Lucene49NormsFormat Norms}</td>
 <td>.nvd, .nvm</td>
 <td>Encodes length and boost factors for docs and fields</td>
 </tr>
@@ -0,0 +1,38 @@
package org.apache.lucene.codecs.lucene40;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseNormsFormatTestCase;
import org.junit.BeforeClass;

/** Tests Lucene40's norms format */
public class TestLucene40NormsFormat extends BaseNormsFormatTestCase {
  final Codec codec = new Lucene40RWCodec();

  @Override
  protected Codec getCodec() {
    return codec;
  }

  @BeforeClass
  public static void beforeClass() {
    OLD_FORMAT_IMPERSONATION_IS_ACTIVE = true; // explicitly instantiates ancient codec
  }
}
@@ -0,0 +1,38 @@
package org.apache.lucene.codecs.lucene42;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseNormsFormatTestCase;
import org.junit.BeforeClass;

/** Tests Lucene42's norms format */
public class TestLucene42NormsFormat extends BaseNormsFormatTestCase {
  final Codec codec = new Lucene42RWCodec();

  @Override
  protected Codec getCodec() {
    return codec;
  }

  @BeforeClass
  public static void beforeClass() {
    OLD_FORMAT_IMPERSONATION_IS_ACTIVE = true; // explicitly instantiates ancient codec
  }
}
@@ -0,0 +1,33 @@
package org.apache.lucene.codecs.lucene49;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseNormsFormatTestCase;

/**
 * Tests Lucene49NormsFormat
 */
public class TestLucene49NormsFormat extends BaseNormsFormatTestCase {
  final Codec codec = new Lucene49Codec();

  @Override
  protected Codec getCodec() {
    return codec;
  }
}
@@ -0,0 +1,30 @@
package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;

/** Tests the codec configuration defined by LuceneTestCase randomly
 */
public class TestNormsFormat extends BaseNormsFormatTestCase {

  @Override
  protected Codec getCodec() {
    return Codec.getDefault();
  }
}
@@ -0,0 +1,183 @@
package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Random;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.TestUtil;

/**
 * Abstract class to do basic tests for a norms format.
 * NOTE: This test focuses on the norms impl, nothing else.
 * The [stretch] goal is for this test to be
 * so thorough in testing a new NormsFormat that if this
 * test passes, then all Lucene/Solr tests should also pass.  I.e.,
 * if there is some bug in a given NormsFormat that this
 * test fails to catch then this test needs to be improved! */
public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCase {

  public void testByteRange() throws Exception {
    int iterations = atLeast(1);
    final Random r = random();
    for (int i = 0; i < iterations; i++) {
      doTestNormsVersusStoredFields(new LongProducer() {
        @Override
        long next() {
          return TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
        }
      });
    }
  }

  public void testLongRange() throws Exception {
    int iterations = atLeast(1);
    final Random r = random();
    for (int i = 0; i < iterations; i++) {
      doTestNormsVersusStoredFields(new LongProducer() {
        @Override
        long next() {
          return TestUtil.nextLong(r, Long.MIN_VALUE, Long.MAX_VALUE);
        }
      });
    }
  }

  public void testFewValues() throws Exception {
    int iterations = atLeast(1);
    final Random r = random();
    for (int i = 0; i < iterations; i++) {
      doTestNormsVersusStoredFields(new LongProducer() {
        @Override
        long next() {
          return r.nextBoolean() ? 20 : 3;
        }
      });
    }
  }

  public void testAllZeros() throws Exception {
    int iterations = atLeast(1);
    final Random r = random();
    for (int i = 0; i < iterations; i++) {
      doTestNormsVersusStoredFields(new LongProducer() {
        @Override
        long next() {
          return 0;
        }
      });
    }
  }

  private void doTestNormsVersusStoredFields(LongProducer longs) throws Exception {
    int numDocs = atLeast(500);
    long norms[] = new long[numDocs];
    for (int i = 0; i < numDocs; i++) {
      norms[i] = longs.next();
    }

    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
    conf.setSimilarity(new CannedNormSimilarity(norms));
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
    Document doc = new Document();
    Field idField = new StringField("id", "", Field.Store.NO);
    Field storedField = newTextField("stored", "", Field.Store.YES);
    doc.add(idField);
    doc.add(storedField);

    for (int i = 0; i < numDocs; i++) {
      idField.setStringValue(Integer.toString(i));
      long value = norms[i];
      storedField.setStringValue(Long.toString(value));
      writer.addDocument(doc);
      if (random().nextInt(31) == 0) {
        writer.commit();
      }
    }

    // delete some docs
    int numDeletions = random().nextInt(numDocs/10);
    for (int i = 0; i < numDeletions; i++) {
      int id = random().nextInt(numDocs);
      writer.deleteDocuments(new Term("id", Integer.toString(id)));
    }

    writer.shutdown();

    // compare
    DirectoryReader ir = DirectoryReader.open(dir);
    for (AtomicReaderContext context : ir.leaves()) {
      AtomicReader r = context.reader();
      NumericDocValues docValues = r.getNormValues("stored");
      for (int i = 0; i < r.maxDoc(); i++) {
        long storedValue = Long.parseLong(r.document(i).get("stored"));
        assertEquals(storedValue, docValues.get(i));
      }
    }
    ir.close();
    dir.close();
  }

  static abstract class LongProducer {
    abstract long next();
  }

  static class CannedNormSimilarity extends Similarity {
    final long norms[];
    int index = 0;

    CannedNormSimilarity(long norms[]) {
      this.norms = norms;
    }

    @Override
    public long computeNorm(FieldInvertState state) {
      return norms[index++];
    }

    @Override
    public SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
      throw new UnsupportedOperationException();
    }

    @Override
    public SimScorer simScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
      throw new UnsupportedOperationException();
    }
  }

  @Override
  protected void addRandomFields(Document doc) {
    // TODO: improve
    doc.add(new TextField("foobar", "boo", Field.Store.NO));
  }
}