Inline skip data into postings lists (#13585)

This updates the postings format to inline skip data into postings. The new format is generally similar to the current `Lucene99PostingsFormat`, e.g. it shares the same block encoding logic, but it has a few differences:
 - Skip data is inlined into postings to make the access pattern more sequential.
 - There are only 2 levels of skip data: on every block (128 docs) and on every 32 blocks (4,096 docs), as sketched below.
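
As a rough illustration of the new access pattern, the following sketch models a reader advancing over inlined, two-level skip data. All names here are hypothetical, this is not the actual `Lucene912PostingsFormat` code; the point is that every skip read moves forward sequentially through the postings, first in 4,096-doc steps, then in 128-doc steps:

```java
import java.util.ArrayList;
import java.util.List;

/** Hypothetical model of two-level inlined skip data (illustration only). */
final class InlinedSkipModel {
  static final int BLOCK_DOCS = 128;   // level 0: a skip entry is inlined before every block
  static final int LEVEL1_BLOCKS = 32; // level 1: every 32 blocks, i.e. 4,096 docs

  /** One block of up to 128 docs; lastDoc stands in for the inlined skip entry. */
  record Block(int lastDoc, int[] docs) {}

  private final List<Block> blocks = new ArrayList<>();

  int advance(int target) {
    int b = 0;
    // Level 1: jump 4,096 docs at a time while the whole group ends before target
    // (in the real format this bound comes from the inlined level-1 entry at the group start).
    while (b + LEVEL1_BLOCKS <= blocks.size()
        && blocks.get(b + LEVEL1_BLOCKS - 1).lastDoc() < target) {
      b += LEVEL1_BLOCKS;
    }
    // Level 0: jump 128 docs at a time using the per-block entries.
    while (b < blocks.size() && blocks.get(b).lastDoc() < target) {
      b++;
    }
    if (b == blocks.size()) {
      return Integer.MAX_VALUE; // NO_MORE_DOCS
    }
    // Decode only the single block that may contain the target.
    for (int doc : blocks.get(b).docs()) {
      if (doc >= target) {
        return doc;
      }
    }
    return Integer.MAX_VALUE; // unreachable when lastDoc is consistent with docs
  }
}
```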

In general, I found that inlining skip data may slightly slow down queries that don't need skip data at all (e.g. `CountOrXXX` tasks, which never advance or consult impacts) and slightly speed up queries that advance by small intervals. Since the highest level only allows skipping 4,096 docs at once, advancing by large intervals is slower, e.g. jumping over one million docs now takes on the order of 1,000,000 / 4,096 ≈ 244 top-level reads instead of a handful of multi-level jumps, but data suggests that this doesn't significantly hurt performance.
Adrien Grand 2024-07-31 17:18:28 +02:00 committed by GitHub
parent 5226e282b4
commit b4a8810b7a
74 changed files with 6816 additions and 811 deletions

View File

@@ -23,7 +23,7 @@ configure(project(":lucene:core")) {
description "Regenerate gen_ForUtil.py"
group "generation"
def genDir = file("src/java/org/apache/lucene/codecs/lucene99")
def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
def genScript = file("${genDir}/gen_ForUtil.py")
def genOutput = file("${genDir}/ForUtil.java")
@@ -96,5 +96,30 @@ configure(project(":lucene:backward-codecs")) {
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
task generateForUtil99Internal() {
description "Regenerate gen_ForUtil.py"
group "generation"
def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene99")
def genScript = file("${genDir}/gen_ForUtil.py")
def genOutput = file("${genDir}/ForUtil.java")
inputs.file genScript
outputs.file genOutput
doLast {
quietExec {
workingDir genDir
executable project.externalTool("python3")
args = [ '-B', genScript ]
}
}
}
regenerate.dependsOn wrapWithPersistentChecksums(generateForUtil99Internal, [
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
}

View File

@@ -317,6 +317,12 @@ Optimizations
by 1) using a confined Arena where appropriate, and 2) grouping files from the same segment to a
single shared Arena. (Chris Hegarty, Michael Gibney, Uwe Schindler)
* GITHUB#13585: Lucene912PostingsFormat, the new default postings format, now
only has 2 levels of skip data, which are inlined into postings instead of
being stored at the end of postings lists. This translates into better
performance for queries that need skipping such as conjunctions.
(Adrien Grand)
Changes in runtime behavior
---------------------

View File

@@ -0,0 +1,4 @@
{
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/ForUtil.java": "f31797842f047626df6a1a6b97167bec60269fec",
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/gen_ForUtil.py": "325f2610974b0e76e278b6445405a098a3763feb"
}

View File

@@ -35,6 +35,7 @@ module org.apache.lucene.backward_codecs {
exports org.apache.lucene.backward_codecs.lucene92;
exports org.apache.lucene.backward_codecs.lucene94;
exports org.apache.lucene.backward_codecs.lucene95;
exports org.apache.lucene.backward_codecs.lucene99;
exports org.apache.lucene.backward_codecs.packed;
exports org.apache.lucene.backward_codecs.store;
@@ -43,7 +44,8 @@ module org.apache.lucene.backward_codecs {
provides org.apache.lucene.codecs.PostingsFormat with
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat,
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat,
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat;
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat,
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat;
provides org.apache.lucene.codecs.KnnVectorsFormat with
org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat,
org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat,
@@ -59,5 +61,6 @@ module org.apache.lucene.backward_codecs {
org.apache.lucene.backward_codecs.lucene91.Lucene91Codec,
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec,
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec;
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec,
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
}

View File

@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.DataInput;

View File

@@ -14,12 +14,33 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.util.Objects;
import org.apache.lucene.codecs.*;
import org.apache.lucene.codecs.lucene90.*;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@@ -98,7 +119,7 @@ public class Lucene99Codec extends Codec {
super("Lucene99");
this.storedFieldsFormat =
new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
this.defaultPostingsFormat = new Lucene99PostingsFormat();
this.defaultPostingsFormat = new Lucene912PostingsFormat();
this.defaultDVFormat = new Lucene90DocValuesFormat();
this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat();
}

View File

@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
@@ -24,7 +24,6 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.IndexOptions;
@@ -339,7 +338,7 @@ import org.apache.lucene.util.packed.PackedInts;
*
* @lucene.experimental
*/
public final class Lucene99PostingsFormat extends PostingsFormat {
public class Lucene99PostingsFormat extends PostingsFormat {
/**
* Filename extension for document number, frequencies, and skip data. See chapter: <a
@@ -374,28 +373,9 @@ public final class Lucene99PostingsFormat extends PostingsFormat {
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
private final int minTermBlockSize;
private final int maxTermBlockSize;
/** Creates {@code Lucene99PostingsFormat} with default settings. */
public Lucene99PostingsFormat() {
this(
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
/**
* Creates {@code Lucene99PostingsFormat} with custom values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
*
* @see
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
*/
public Lucene99PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("Lucene99");
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
this.minTermBlockSize = minTermBlockSize;
this.maxTermBlockSize = maxTermBlockSize;
}
@Override
@@ -405,19 +385,7 @@ public final class Lucene99PostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret =
new Lucene90BlockTreeTermsWriter(
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
throw new UnsupportedOperationException();
}
@Override

View File

@@ -14,23 +14,23 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import static org.apache.lucene.codecs.lucene99.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_START;
import static org.apache.lucene.backward_codecs.lucene99.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_START;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;

View File

@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import java.util.AbstractList;

View File

@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import java.util.Arrays;
@@ -61,6 +61,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
private long lastDocPointer;
private int lastPosBufferUpto;
/** Sole constructor. */
public Lucene99SkipReader(
IndexInput skipStream,
int maxSkipLevels,
@@ -98,6 +99,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
return df % ForUtil.BLOCK_SIZE == 0 ? df - 1 : df;
}
/** Initialize state. */
public void init(
long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df)
throws IOException {
@@ -125,22 +127,27 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
return lastDocPointer;
}
/** Returns the pointer in the pos file. */
public long getPosPointer() {
return lastPosPointer;
}
/** Return the start offset in the position block. */
public int getPosBufferUpto() {
return lastPosBufferUpto;
}
/** Returns the pointer in the pay file. */
public long getPayPointer() {
return lastPayPointer;
}
/** Return the number of bytes in the pay block that belong to docs from the previous block. */
public int getPayloadByteUpto() {
return lastPayloadByteUpto;
}
/** Return the next skip doc; no skipping can be performed until this doc. */
public int getNextSkipDoc() {
return skipDoc[0];
}
@@ -199,7 +206,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
return delta;
}
// The default impl skips impacts
/** Read impacts. The default implementation skips them. */
protected void readImpacts(int level, IndexInput skipStream) throws IOException {
skipStream.skipBytes(skipStream.readVInt());
}

View File

@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import java.util.Arrays;
@@ -66,6 +66,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
/** Sole constructor. */
public Lucene99SkipWriter(
int maxSkipLevels,
int blockSize,
@@ -92,6 +93,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
}
}
/** Reset state for the given index options. */
public void setField(
boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
this.fieldHasPositions = fieldHasPositions;
@@ -211,6 +213,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
competitiveFreqNorms.clear();
}
/** Write impacts to the given output. */
public static void writeImpacts(CompetitiveImpactAccumulator acc, DataOutput out)
throws IOException {
Collection<Impact> impacts = acc.getCompetitiveFreqNormPairs();

View File

@@ -0,0 +1,134 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.LongHeap;
import org.apache.lucene.util.packed.PackedInts;
/** Utility class to encode sequences of 128 small positive integers. */
final class PForUtil {
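  // The exception count is stored in the 3 high bits of the token byte, hence at most 7 exceptions per block.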
private static final int MAX_EXCEPTIONS = 7;
static boolean allEqual(long[] l) {
for (int i = 1; i < ForUtil.BLOCK_SIZE; ++i) {
if (l[i] != l[0]) {
return false;
}
}
return true;
}
private final ForUtil forUtil;
PForUtil(ForUtil forUtil) {
assert ForUtil.BLOCK_SIZE <= 256 : "blocksize must fit in one byte. got " + ForUtil.BLOCK_SIZE;
this.forUtil = forUtil;
}
/** Encode 128 integers from {@code longs} into {@code out}. */
void encode(long[] longs, DataOutput out) throws IOException {
// Determine the top MAX_EXCEPTIONS + 1 values
final LongHeap top = new LongHeap(MAX_EXCEPTIONS + 1);
for (int i = 0; i <= MAX_EXCEPTIONS; ++i) {
top.push(longs[i]);
}
long topValue = top.top();
for (int i = MAX_EXCEPTIONS + 1; i < ForUtil.BLOCK_SIZE; ++i) {
if (longs[i] > topValue) {
topValue = top.updateTop(longs[i]);
}
}
long max = 0L;
for (int i = 1; i <= top.size(); ++i) {
max = Math.max(max, top.get(i));
}
final int maxBitsRequired = PackedInts.bitsRequired(max);
// We store the patch on a byte, so we can't decrease the number of bits required by more than 8
final int patchedBitsRequired =
Math.max(PackedInts.bitsRequired(topValue), maxBitsRequired - 8);
int numExceptions = 0;
final long maxUnpatchedValue = (1L << patchedBitsRequired) - 1;
for (int i = 2; i <= top.size(); ++i) {
if (top.get(i) > maxUnpatchedValue) {
numExceptions++;
}
}
final byte[] exceptions = new byte[numExceptions * 2];
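      // Each exception is a (position, high bits) byte pair, written after the packed values.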
if (numExceptions > 0) {
int exceptionCount = 0;
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
if (longs[i] > maxUnpatchedValue) {
exceptions[exceptionCount * 2] = (byte) i;
exceptions[exceptionCount * 2 + 1] = (byte) (longs[i] >>> patchedBitsRequired);
longs[i] &= maxUnpatchedValue;
exceptionCount++;
}
}
assert exceptionCount == numExceptions : exceptionCount + " " + numExceptions;
}
if (allEqual(longs) && maxBitsRequired <= 8) {
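      // Fast path: after patching, all values are equal; write the common value once as a vLong (bits-per-value = 0 in the token).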
for (int i = 0; i < numExceptions; ++i) {
exceptions[2 * i + 1] =
(byte) (Byte.toUnsignedLong(exceptions[2 * i + 1]) << patchedBitsRequired);
}
out.writeByte((byte) (numExceptions << 5));
out.writeVLong(longs[0]);
} else {
final int token = (numExceptions << 5) | patchedBitsRequired;
out.writeByte((byte) token);
forUtil.encode(longs, patchedBitsRequired, out);
}
out.writeBytes(exceptions, exceptions.length);
}
/** Decode 128 integers into {@code longs}. */
void decode(DataInput in, long[] longs) throws IOException {
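    // Token byte: low 5 bits = bits per packed value (0 means all 128 values are equal), high 3 bits = exception count.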
final int token = Byte.toUnsignedInt(in.readByte());
final int bitsPerValue = token & 0x1f;
final int numExceptions = token >>> 5;
if (bitsPerValue == 0) {
Arrays.fill(longs, 0, ForUtil.BLOCK_SIZE, in.readVLong());
} else {
forUtil.decode(bitsPerValue, in, longs);
}
for (int i = 0; i < numExceptions; ++i) {
longs[Byte.toUnsignedInt(in.readByte())] |=
Byte.toUnsignedLong(in.readByte()) << bitsPerValue;
}
}
/** Skip 128 integers. */
void skip(DataInput in) throws IOException {
final int token = Byte.toUnsignedInt(in.readByte());
final int bitsPerValue = token & 0x1f;
final int numExceptions = token >>> 5;
if (bitsPerValue == 0) {
in.readVLong();
in.skipBytes((numExceptions << 1));
} else {
in.skipBytes(forUtil.numBytes(bitsPerValue) + (numExceptions << 1));
}
}
}
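
For reference, a round trip through this class could look like the following test-style sketch. This is a hypothetical snippet, not part of the commit; it would live in the same package since PForUtil is package-private, and `ByteBuffersDataOutput` is used only as a convenient in-memory buffer:

import org.apache.lucene.store.ByteBuffersDataOutput;

long[] values = new long[ForUtil.BLOCK_SIZE];
for (int i = 0; i < values.length; ++i) {
  values[i] = i % 7; // mostly small values
}
values[42] = 1_000; // a single outlier becomes a patched exception
ByteBuffersDataOutput out = new ByteBuffersDataOutput();
PForUtil pforUtil = new PForUtil(new ForUtil());
pforUtil.encode(values, out); // writes the token byte, packed values, then exception pairs; patches `values` in place
long[] decoded = new long[ForUtil.BLOCK_SIZE];
pforUtil.decode(out.toDataInput(), decoded); // restores the original 128 values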

View File

@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.IndexInput;

View File

@@ -0,0 +1,524 @@
#! /usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from math import gcd
"""Code generation for ForUtil.java"""
MAX_SPECIALIZED_BITS_PER_VALUE = 24
OUTPUT_FILE = "ForUtil.java"
PRIMITIVE_SIZE = [8, 16, 32]
HEADER = """// This file has been automatically generated, DO NOT EDIT
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
// Inspired from https://fulmicoton.com/posts/bitpacking/
// Encodes multiple integers in a long to get SIMD-like speedups.
// If bitsPerValue <= 8 then we pack 8 ints per long
// else if bitsPerValue <= 16 we pack 4 ints per long
// else we pack 2 ints per long
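// e.g. with bitsPerValue = 1, the 128 values of a block fit in 2 longs (16 bytes)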
final class ForUtil {
static final int BLOCK_SIZE = 128;
private static final int BLOCK_SIZE_LOG2 = 7;
private static long expandMask32(long mask32) {
return mask32 | (mask32 << 32);
}
private static long expandMask16(long mask16) {
return expandMask32(mask16 | (mask16 << 16));
}
private static long expandMask8(long mask8) {
return expandMask16(mask8 | (mask8 << 8));
}
private static long mask32(int bitsPerValue) {
return expandMask32((1L << bitsPerValue) - 1);
}
private static long mask16(int bitsPerValue) {
return expandMask16((1L << bitsPerValue) - 1);
}
private static long mask8(int bitsPerValue) {
return expandMask8((1L << bitsPerValue) - 1);
}
private static void expand8(long[] arr) {
for (int i = 0; i < 16; ++i) {
long l = arr[i];
arr[i] = (l >>> 56) & 0xFFL;
arr[16 + i] = (l >>> 48) & 0xFFL;
arr[32 + i] = (l >>> 40) & 0xFFL;
arr[48 + i] = (l >>> 32) & 0xFFL;
arr[64 + i] = (l >>> 24) & 0xFFL;
arr[80 + i] = (l >>> 16) & 0xFFL;
arr[96 + i] = (l >>> 8) & 0xFFL;
arr[112 + i] = l & 0xFFL;
}
}
private static void expand8To32(long[] arr) {
for (int i = 0; i < 16; ++i) {
long l = arr[i];
arr[i] = (l >>> 24) & 0x000000FF000000FFL;
arr[16 + i] = (l >>> 16) & 0x000000FF000000FFL;
arr[32 + i] = (l >>> 8) & 0x000000FF000000FFL;
arr[48 + i] = l & 0x000000FF000000FFL;
}
}
private static void collapse8(long[] arr) {
for (int i = 0; i < 16; ++i) {
arr[i] =
(arr[i] << 56)
| (arr[16 + i] << 48)
| (arr[32 + i] << 40)
| (arr[48 + i] << 32)
| (arr[64 + i] << 24)
| (arr[80 + i] << 16)
| (arr[96 + i] << 8)
| arr[112 + i];
}
}
private static void expand16(long[] arr) {
for (int i = 0; i < 32; ++i) {
long l = arr[i];
arr[i] = (l >>> 48) & 0xFFFFL;
arr[32 + i] = (l >>> 32) & 0xFFFFL;
arr[64 + i] = (l >>> 16) & 0xFFFFL;
arr[96 + i] = l & 0xFFFFL;
}
}
private static void expand16To32(long[] arr) {
for (int i = 0; i < 32; ++i) {
long l = arr[i];
arr[i] = (l >>> 16) & 0x0000FFFF0000FFFFL;
arr[32 + i] = l & 0x0000FFFF0000FFFFL;
}
}
private static void collapse16(long[] arr) {
for (int i = 0; i < 32; ++i) {
arr[i] = (arr[i] << 48) | (arr[32 + i] << 32) | (arr[64 + i] << 16) | arr[96 + i];
}
}
private static void expand32(long[] arr) {
for (int i = 0; i < 64; ++i) {
long l = arr[i];
arr[i] = l >>> 32;
arr[64 + i] = l & 0xFFFFFFFFL;
}
}
private static void collapse32(long[] arr) {
for (int i = 0; i < 64; ++i) {
arr[i] = (arr[i] << 32) | arr[64 + i];
}
}
private static void prefixSum8(long[] arr, long base) {
expand8To32(arr);
prefixSum32(arr, base);
}
private static void prefixSum16(long[] arr, long base) {
// We need to move to the next primitive size to avoid overflows
expand16To32(arr);
prefixSum32(arr, base);
}
private static void prefixSum32(long[] arr, long base) {
arr[0] += base << 32;
innerPrefixSum32(arr);
expand32(arr);
final long l = arr[BLOCK_SIZE/2-1];
for (int i = BLOCK_SIZE/2; i < BLOCK_SIZE; ++i) {
arr[i] += l;
}
}
// For some reason unrolling seems to help
private static void innerPrefixSum32(long[] arr) {
arr[1] += arr[0];
arr[2] += arr[1];
arr[3] += arr[2];
arr[4] += arr[3];
arr[5] += arr[4];
arr[6] += arr[5];
arr[7] += arr[6];
arr[8] += arr[7];
arr[9] += arr[8];
arr[10] += arr[9];
arr[11] += arr[10];
arr[12] += arr[11];
arr[13] += arr[12];
arr[14] += arr[13];
arr[15] += arr[14];
arr[16] += arr[15];
arr[17] += arr[16];
arr[18] += arr[17];
arr[19] += arr[18];
arr[20] += arr[19];
arr[21] += arr[20];
arr[22] += arr[21];
arr[23] += arr[22];
arr[24] += arr[23];
arr[25] += arr[24];
arr[26] += arr[25];
arr[27] += arr[26];
arr[28] += arr[27];
arr[29] += arr[28];
arr[30] += arr[29];
arr[31] += arr[30];
arr[32] += arr[31];
arr[33] += arr[32];
arr[34] += arr[33];
arr[35] += arr[34];
arr[36] += arr[35];
arr[37] += arr[36];
arr[38] += arr[37];
arr[39] += arr[38];
arr[40] += arr[39];
arr[41] += arr[40];
arr[42] += arr[41];
arr[43] += arr[42];
arr[44] += arr[43];
arr[45] += arr[44];
arr[46] += arr[45];
arr[47] += arr[46];
arr[48] += arr[47];
arr[49] += arr[48];
arr[50] += arr[49];
arr[51] += arr[50];
arr[52] += arr[51];
arr[53] += arr[52];
arr[54] += arr[53];
arr[55] += arr[54];
arr[56] += arr[55];
arr[57] += arr[56];
arr[58] += arr[57];
arr[59] += arr[58];
arr[60] += arr[59];
arr[61] += arr[60];
arr[62] += arr[61];
arr[63] += arr[62];
}
private final long[] tmp = new long[BLOCK_SIZE / 2];
/** Encode 128 integers from {@code longs} into {@code out}. */
void encode(long[] longs, int bitsPerValue, DataOutput out) throws IOException {
final int nextPrimitive;
final int numLongs;
if (bitsPerValue <= 8) {
nextPrimitive = 8;
numLongs = BLOCK_SIZE / 8;
collapse8(longs);
} else if (bitsPerValue <= 16) {
nextPrimitive = 16;
numLongs = BLOCK_SIZE / 4;
collapse16(longs);
} else {
nextPrimitive = 32;
numLongs = BLOCK_SIZE / 2;
collapse32(longs);
}
final int numLongsPerShift = bitsPerValue * 2;
int idx = 0;
int shift = nextPrimitive - bitsPerValue;
for (int i = 0; i < numLongsPerShift; ++i) {
tmp[i] = longs[idx++] << shift;
}
for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) {
for (int i = 0; i < numLongsPerShift; ++i) {
tmp[i] |= longs[idx++] << shift;
}
}
final int remainingBitsPerLong = shift + bitsPerValue;
final long maskRemainingBitsPerLong;
if (nextPrimitive == 8) {
maskRemainingBitsPerLong = MASKS8[remainingBitsPerLong];
} else if (nextPrimitive == 16) {
maskRemainingBitsPerLong = MASKS16[remainingBitsPerLong];
} else {
maskRemainingBitsPerLong = MASKS32[remainingBitsPerLong];
}
int tmpIdx = 0;
int remainingBitsPerValue = bitsPerValue;
while (idx < numLongs) {
if (remainingBitsPerValue >= remainingBitsPerLong) {
remainingBitsPerValue -= remainingBitsPerLong;
tmp[tmpIdx++] |= (longs[idx] >>> remainingBitsPerValue) & maskRemainingBitsPerLong;
if (remainingBitsPerValue == 0) {
idx++;
remainingBitsPerValue = bitsPerValue;
}
} else {
final long mask1, mask2;
if (nextPrimitive == 8) {
mask1 = MASKS8[remainingBitsPerValue];
mask2 = MASKS8[remainingBitsPerLong - remainingBitsPerValue];
} else if (nextPrimitive == 16) {
mask1 = MASKS16[remainingBitsPerValue];
mask2 = MASKS16[remainingBitsPerLong - remainingBitsPerValue];
} else {
mask1 = MASKS32[remainingBitsPerValue];
mask2 = MASKS32[remainingBitsPerLong - remainingBitsPerValue];
}
tmp[tmpIdx] |= (longs[idx++] & mask1) << (remainingBitsPerLong - remainingBitsPerValue);
remainingBitsPerValue = bitsPerValue - remainingBitsPerLong + remainingBitsPerValue;
tmp[tmpIdx++] |= (longs[idx] >>> remainingBitsPerValue) & mask2;
}
}
for (int i = 0; i < numLongsPerShift; ++i) {
out.writeLong(tmp[i]);
}
}
/** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. */
int numBytes(int bitsPerValue) {
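    // 128 values * bitsPerValue bits = 16 * bitsPerValue bytes.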
return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
}
private static void decodeSlow(int bitsPerValue, DataInput in, long[] tmp, long[] longs)
throws IOException {
final int numLongs = bitsPerValue << 1;
in.readLongs(tmp, 0, numLongs);
final long mask = MASKS32[bitsPerValue];
int longsIdx = 0;
int shift = 32 - bitsPerValue;
for (; shift >= 0; shift -= bitsPerValue) {
shiftLongs(tmp, numLongs, longs, longsIdx, shift, mask);
longsIdx += numLongs;
}
final int remainingBitsPerLong = shift + bitsPerValue;
final long mask32RemainingBitsPerLong = MASKS32[remainingBitsPerLong];
int tmpIdx = 0;
int remainingBits = remainingBitsPerLong;
for (; longsIdx < BLOCK_SIZE / 2; ++longsIdx) {
int b = bitsPerValue - remainingBits;
long l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b;
while (b >= remainingBitsPerLong) {
b -= remainingBitsPerLong;
l |= (tmp[tmpIdx++] & mask32RemainingBitsPerLong) << b;
}
if (b > 0) {
l |= (tmp[tmpIdx] >>> (remainingBitsPerLong - b)) & MASKS32[b];
remainingBits = remainingBitsPerLong - b;
} else {
remainingBits = remainingBitsPerLong;
}
longs[longsIdx] = l;
}
}
/**
* The pattern that this shiftLongs method applies is recognized by the C2 compiler, which
* generates SIMD instructions for it in order to shift multiple longs at once.
*/
private static void shiftLongs(long[] a, int count, long[] b, int bi, int shift, long mask) {
for (int i = 0; i < count; ++i) {
b[bi + i] = (a[i] >>> shift) & mask;
}
}
"""
def writeRemainderWithSIMDOptimize(bpv, next_primitive, remaining_bits_per_long, o, num_values, f):
iteration = 1
num_longs = bpv * num_values / remaining_bits_per_long
while num_longs % 2 == 0 and num_values % 2 == 0:
num_longs /= 2
num_values /= 2
iteration *= 2
f.write(' shiftLongs(tmp, %d, tmp, 0, 0, MASK%d_%d);\n' % (iteration * num_longs, next_primitive, remaining_bits_per_long))
f.write(' for (int iter = 0, tmpIdx = 0, longsIdx = %d; iter < %d; ++iter, tmpIdx += %d, longsIdx += %d) {\n' %(o, iteration, num_longs, num_values))
tmp_idx = 0
b = bpv
b -= remaining_bits_per_long
f.write(' long l0 = tmp[tmpIdx + %d] << %d;\n' %(tmp_idx, b))
tmp_idx += 1
while b >= remaining_bits_per_long:
b -= remaining_bits_per_long
f.write(' l0 |= tmp[tmpIdx + %d] << %d;\n' %(tmp_idx, b))
tmp_idx += 1
f.write(' longs[longsIdx + 0] = l0;\n')
f.write(' }\n')
def writeRemainder(bpv, next_primitive, remaining_bits_per_long, o, num_values, f):
iteration = 1
num_longs = bpv * num_values / remaining_bits_per_long
while num_longs % 2 == 0 and num_values % 2 == 0:
num_longs /= 2
num_values /= 2
iteration *= 2
f.write(' for (int iter = 0, tmpIdx = 0, longsIdx = %d; iter < %d; ++iter, tmpIdx += %d, longsIdx += %d) {\n' %(o, iteration, num_longs, num_values))
i = 0
remaining_bits = 0
tmp_idx = 0
for i in range(int(num_values)):
b = bpv
if remaining_bits == 0:
b -= remaining_bits_per_long
f.write(' long l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits_per_long, b))
else:
b -= remaining_bits
f.write(' long l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b))
tmp_idx += 1
while b >= remaining_bits_per_long:
b -= remaining_bits_per_long
f.write(' l%d |= (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits_per_long, b))
tmp_idx += 1
if b > 0:
f.write(' l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_long-b, next_primitive, b))
remaining_bits = remaining_bits_per_long-b
f.write(' longs[longsIdx + %d] = l%d;\n' %(i, i))
f.write(' }\n')
def writeDecode(bpv, f):
next_primitive = 32
if bpv <= 8:
next_primitive = 8
elif bpv <= 16:
next_primitive = 16
f.write(' private static void decode%d(DataInput in, long[] tmp, long[] longs) throws IOException {\n' %bpv)
num_values_per_long = 64 / next_primitive
if bpv == next_primitive:
f.write(' in.readLongs(longs, 0, %d);\n' %(bpv*2))
else:
f.write(' in.readLongs(tmp, 0, %d);\n' %(bpv*2))
shift = next_primitive - bpv
o = 0
while shift >= 0:
f.write(' shiftLongs(tmp, %d, longs, %d, %d, MASK%d_%d);\n' %(bpv*2, o, shift, next_primitive, bpv))
o += bpv*2
shift -= bpv
if shift + bpv > 0:
if bpv % (next_primitive % bpv) == 0:
writeRemainderWithSIMDOptimize(bpv, next_primitive, shift + bpv, o, 128/num_values_per_long - o, f)
else:
writeRemainder(bpv, next_primitive, shift + bpv, o, 128/num_values_per_long - o, f)
f.write(' }\n')
if __name__ == '__main__':
f = open(OUTPUT_FILE, 'w')
f.write(HEADER)
for primitive_size in PRIMITIVE_SIZE:
f.write(' private static final long[] MASKS%d = new long[%d];\n' %(primitive_size, primitive_size))
f.write('\n')
f.write(' static {\n')
for primitive_size in PRIMITIVE_SIZE:
f.write(' for (int i = 0; i < %d; ++i) {\n' %primitive_size)
f.write(' MASKS%d[i] = mask%d(i);\n' %(primitive_size, primitive_size))
f.write(' }\n')
f.write(' }')
f.write("""
// mark values in array as final longs to avoid the cost of reading array, arrays should only be
// used when the idx is a variable
""")
for primitive_size in PRIMITIVE_SIZE:
for bpv in range(1, min(MAX_SPECIALIZED_BITS_PER_VALUE + 1, primitive_size)):
if bpv * 2 != primitive_size or primitive_size == 8:
f.write(' private static final long MASK%d_%d = MASKS%d[%d];\n' %(primitive_size, bpv, primitive_size, bpv))
f.write("""
/** Decode 128 integers into {@code longs}. */
void decode(int bitsPerValue, DataInput in, long[] longs) throws IOException {
switch (bitsPerValue) {
""")
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
next_primitive = 32
if bpv <= 8:
next_primitive = 8
elif bpv <= 16:
next_primitive = 16
f.write(' case %d:\n' %bpv)
f.write(' decode%d(in, tmp, longs);\n' %bpv)
f.write(' expand%d(longs);\n' %next_primitive)
f.write(' break;\n')
f.write(' default:\n')
f.write(' decodeSlow(bitsPerValue, in, tmp, longs);\n')
f.write(' expand32(longs);\n')
f.write(' break;\n')
f.write(' }\n')
f.write(' }\n')
f.write("""
/**
* Delta-decode 128 integers into {@code longs}.
*/
void decodeAndPrefixSum(int bitsPerValue, DataInput in, long base, long[] longs) throws IOException {
switch (bitsPerValue) {
""")
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
next_primitive = 32
if bpv <= 8:
next_primitive = 8
elif bpv <= 16:
next_primitive = 16
f.write(' case %d:\n' %bpv)
f.write(' decode%d(in, tmp, longs);\n' %bpv)
f.write(' prefixSum%d(longs, base);\n' %next_primitive)
f.write(' break;\n')
f.write(' default:\n')
f.write(' decodeSlow(bitsPerValue, in, tmp, longs);\n')
f.write(' prefixSum32(longs, base);\n')
f.write(' break;\n')
f.write(' }\n')
f.write(' }\n')
f.write('\n')
for i in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
writeDecode(i, f)
if i < MAX_SPECIALIZED_BITS_PER_VALUE:
f.write('\n')
f.write('}\n')

View File

@@ -0,0 +1,428 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Lucene 9.9 file format.
*
* <h2>Apache Lucene - Index File Formats</h2>
*
* <div>
*
* <ul>
* <li><a href="#Introduction">Introduction</a>
* <li><a href="#Definitions">Definitions</a>
* <ul>
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
* <li><a href="#Types_of_Fields">Types of Fields</a>
* <li><a href="#Segments">Segments</a>
* <li><a href="#Document_Numbers">Document Numbers</a>
* </ul>
* <li><a href="#Overview">Index Structure Overview</a>
* <li><a href="#File_Naming">File Naming</a>
* <li><a href="#file-names">Summary of File Extensions</a>
* <ul>
* <li><a href="#Lock_File">Lock File</a>
* <li><a href="#History">History</a>
* <li><a href="#Limitations">Limitations</a>
* </ul>
* </ul>
*
* </div> <a id="Introduction"></a>
*
* <h3>Introduction</h3>
*
* <div>
*
* <p>This document defines the index file formats used in this version of Lucene. If you are using
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
* with the version you are using.
*
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
* </div> <a id="Definitions"></a>
*
* <h3>Definitions</h3>
*
* <div>
*
* <p>The fundamental concepts in Lucene are index, document, field and term.
*
* <p>An index contains a sequence of documents.
*
* <ul>
* <li>A document is a sequence of fields.
* <li>A field is a named sequence of terms.
* <li>A term is a sequence of bytes.
* </ul>
*
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
* are represented as a pair: the string naming the field, and the bytes within the field. <a
* id="Inverted_Indexing"></a>
*
* <h4>Inverted Indexing</h4>
*
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
* search more efficient. Lucene's terms index falls into the family of indexes known as an
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
* This is the inverse of the natural relationship, in which documents list terms. <a
* id="Types_of_Fields"></a>
*
* <h4>Types of Fields</h4>
*
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
* may be both stored and indexed.
*
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
* useful for certain identifier fields to be indexed literally.
*
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
* Fields. <a id="Segments"></a>
*
* <h4>Segments</h4>
*
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
* fully independent index, which could be searched separately. Indexes evolve by:
*
* <ol>
* <li>Creating new segments for newly added documents.
* <li>Merging existing segments.
* </ol>
*
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
* composed of a set of segments. <a id="Document_Numbers"></a>
*
* <h4>Document Numbers</h4>
*
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
* document added to an index is numbered zero, and each subsequent document added gets a number one
* greater than the previous.
*
* <p>Note that a document's number may change, so caution should be taken when storing these
* numbers outside of Lucene. In particular, numbers may change in the following situations:
*
* <ul>
* <li>
* <p>The numbers stored in each segment are unique only within the segment, and must be
* converted before they can be used in a larger context. The standard technique is to
* allocate each segment a range of values, based on the range of numbers used in that
* segment. To convert a document number from a segment to an external value, the segment's
* <i>base</i> document number is added. To convert an external value back to a
* segment-specific value, the segment is identified by the range that the external value is
* in, and the segment's base value is subtracted. For example two five document segments
* might be combined, so that the first segment has a base value of zero, and the second of
* five. Document three from the second segment would have an external value of eight.
* <li>
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
* removed as the index evolves through merging. Deleted documents are dropped when segments
* are merged. A freshly-merged segment thus has no gaps in its numbering.
* </ul>
*
* </div> <a id="Overview"></a>
*
* <h3>Index Structure Overview</h3>
*
* <div>
*
* <p>Each segment index maintains the following:
*
* <ul>
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
* contains metadata about a segment, such as the number of documents, what files it uses, and
* information about how the segment is sorted
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
* contains metadata about the set of named fields used in the index.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
* This contains, for each document, a list of attribute-value pairs, where the attributes are
* field names. These are used to store auxiliary information about the document, such as its
* title, url, or an identifier to access a database. The set of stored fields are what is
* returned for each hit when searching. This is keyed by document number.
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term dictionary}.
* A dictionary containing all of the terms used in all of the indexed fields of all of the
* documents. The dictionary also contains the number of documents which contain the term, and
* pointers to the term's frequency and proximity data.
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Frequency
* data}. For each term in the dictionary, the numbers of all the documents that contain that
* term, and the frequency of the term in that document, unless frequencies are omitted
* ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Proximity
* data}. For each term in the dictionary, the positions that the term occurs in each
* document. Note that this will not exist if all fields in all documents omit position data.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
* each field in each document, a value is stored that is multiplied into the score for hits
* on that field.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
* field in each document, the term vector (sometimes called document vector) may be stored. A
* term vector consists of term text and term frequency. To add Term Vectors to your index see
* the {@link org.apache.lucene.document.Field Field} constructors
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
* stored values, these are also keyed by document number, but are generally intended to be
* loaded into main memory for fast access. Whereas stored values are generally intended for
* summary results from searches, per-document values are useful for things like scoring
* factors.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
* optional file indicating which documents are live.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
* intersection (2D, 3D).
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
* vector format stores numeric vectors in a format optimized for random access and
* computation, supporting high-dimensional nearest-neighbor search.
* </ul>
*
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
*
* <h3>File Naming</h3>
*
* <div>
*
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
* correspond to the different file formats described below. When using the Compound File format
* (default for small segments) these files (except for the Segment info file, the Lock file, and
* Deleted documents file) are collapsed into a single .cfs file (see below for details)
*
* <p>Typically, all segments in an index are stored in a single directory, although this is not
* required.
*
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
* never before used filename. This is achieved using a simple generations approach. For example,
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
*
* <h3>Summary of File Extensions</h3>
*
* <div>
*
* <p>The following table summarizes the names and extensions of the files in Lucene:
*
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
* <caption>lucene filenames by extension</caption>
* <tr>
* <th>Name</th>
* <th>Extension</th>
* <th>Brief Description</th>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
* <td>segments_N</td>
* <td>Stores information about a commit point</td>
* </tr>
* <tr>
* <td><a href="#Lock_File">Lock File</a></td>
* <td>write.lock</td>
* <td>The Write lock prevents multiple IndexWriters from writing to the same
* file.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}</td>
* <td>.si</td>
* <td>Stores metadata about a segment</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
* <td>.cfs, .cfe</td>
* <td>An optional "virtual" file consisting of all the other index files for
* systems that frequently run out of file handles.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
* <td>.fnm</td>
* <td>Stores information about the fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
* <td>.fdx</td>
* <td>Contains pointers to field data</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
* <td>.fdt</td>
* <td>The stored fields for documents</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Dictionary}</td>
* <td>.tim</td>
* <td>The term dictionary, stores term info</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Index}</td>
* <td>.tip</td>
* <td>The index into the Term Dictionary</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Frequencies}</td>
* <td>.doc</td>
* <td>Contains the list of docs which contain each term along with frequency</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Positions}</td>
* <td>.pos</td>
* <td>Stores position information about where a term occurs in the index</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Payloads}</td>
* <td>.pay</td>
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
* <td>.nvd, .nvm</td>
* <td>Encodes length and boost factors for docs and fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
* <td>.dvd, .dvm</td>
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
* <td>.tvx</td>
* <td>Stores offset into the document data file</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
* <td>.tvd</td>
* <td>Contains term vector data.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
* <td>.liv</td>
* <td>Info about what documents are live</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
* <td>.dii, .dim</td>
* <td>Holds indexed points</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}</td>
* <td>.vec, .vem, .veq, .vex</td>
* <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data,
* <code>.vem</code> the vector metadata, <code>.veq</code> the quantized vector data, and <code>.vex</code> the
* hnsw graph data.</td>
* </tr>
* </table>
*
* </div> <a id="Lock_File"></a>
*
* <h3>Lock File</h3>
*
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
* lock directory is different from the index directory then the write lock will be named
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
* directory. When this file is present, a writer is currently modifying the index (adding or
* removing documents). This lock file ensures that only one writer is modifying the index at a
* time. <a id="History"></a>
*
* <h3>History</h3>
*
* <p>Compatibility notes are provided in this document, describing how file formats have changed
* from prior versions:
*
* <ul>
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
* written in the new file format (meaning no specific "upgrade" process is needed). But note
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
* store (vectors &amp; stored fields) files. This allows for faster indexing in certain
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
* change in 2.1).
* <li>In version 2.4, Strings are now written as a true UTF-8 byte sequence, not Java's modified
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
* details.
* <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData may be passed to
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
* details. Also, diagnostics were added to each segment written recording details about why
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
* read, but on merge the new segment will write them, uncompressed). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
* <li>In version 3.1, segments record the code version that created them. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
* Additionally segments track explicitly whether or not they have term vectors. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
* <li>In version 3.2, numeric fields are written natively to the stored fields file; previously
* they were stored in text format only.
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
* was introduced. Normalization factors need no longer be a single byte, they can be any
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
* the postings lists. Payloads can be stored in the term vectors.
* <li>In version 4.1, the format of the postings list changed to use either of FOR compression or
* variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
* were changed to inline directly into the term dictionary. Stored fields are compressed by
* default.
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
* type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
* allow updating NumericDocValues fields.
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
* checksum of the file.
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
* suitable for faceting/sorting/analytics.
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
* for binary fields and ord indexes for multi-valued fields.
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
* <li>In version 6.2, a new segment info format reads/writes the index sort, to support index
* sorting.
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
* an iterator API.
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
* if they may not produce high enough scores. Additionally doc values and norms have been
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
* elements to skip when advancing in the data.
* <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
* performant encoding that is vectorized.
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
* user-defined sorts to be used.
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
* smaller stored fields.
* <li>In version 9.0, vector-valued fields were added.
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
* <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
* IndexDISI. ordToDoc mappings were added to .vem.
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
* layer and not writing the node ids for the zeroth layer.
* <li>In version 9.9, vector scalar quantization support was added, allowing the HNSW vector
* format to utilize int8 quantized vectors for float32 vector search.
* </ul>
*
* <a id="Limitations"></a>
*
* <h3>Limitations</h3>
*
* <div>
*
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
* index file format and the current implementation. Eventually these should be replaced with either
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
* VInt} values which have no limit. </div>
*/
package org.apache.lucene.backward_codecs.lucene99;

View File

@@ -22,3 +22,4 @@ org.apache.lucene.backward_codecs.lucene91.Lucene91Codec
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec

View File

@ -16,3 +16,4 @@
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat

View File

@ -23,12 +23,11 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.backward_codecs.lucene90.Lucene90ScoreSkipReader.MutableImpactList;
import org.apache.lucene.backward_codecs.lucene99.Lucene99SkipWriter;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SkipWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
@ -77,22 +76,6 @@ public class TestLucene90PostingsFormat extends BasePostingsFormatTestCase {
d.close();
}
private void shouldFail(int minItemsInBlock, int maxItemsInBlock) {
expectThrows(
IllegalArgumentException.class,
() -> {
new Lucene99PostingsFormat(minItemsInBlock, maxItemsInBlock);
});
}
public void testInvalidBlockSizes() throws Exception {
shouldFail(0, 0);
shouldFail(10, 8);
shouldFail(-1, 10);
shouldFail(10, -1);
shouldFail(10, 12);
}
public void testImpactSerialization() throws IOException {
// omit norms and omit freqs
doTestImpactSerialization(Collections.singletonList(new Impact(1, 1L)));

View File

@ -14,22 +14,22 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import static org.apache.lucene.codecs.lucene99.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.backward_codecs.lucene99.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;

View File

@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
public class Lucene99RWPostingsFormat extends Lucene99PostingsFormat {
private final int minTermBlockSize;
private final int maxTermBlockSize;
/** Creates {@code Lucene99RWPostingsFormat} with default settings. */
public Lucene99RWPostingsFormat() {
this(
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
/**
* Creates {@code Lucene99RWPostingsFormat} with custom values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
*
* @see
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
*/
public Lucene99RWPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super();
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
this.minTermBlockSize = minTermBlockSize;
this.maxTermBlockSize = maxTermBlockSize;
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret =
new Lucene90BlockTreeTermsWriter(
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
}

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;

View File

@ -19,7 +19,6 @@ package org.apache.lucene.backward_codecs.lucene99;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
public class TestLucene99HnswScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase {

View File

@ -14,22 +14,26 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import static org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.readImpacts;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99ScoreSkipReader.readImpacts;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.backward_codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -41,7 +45,7 @@ import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.BytesRef;
public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99PostingsFormat());
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99RWPostingsFormat());
@Override
protected Codec getCodec() {
@ -77,7 +81,7 @@ public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
expectThrows(
IllegalArgumentException.class,
() -> {
new Lucene99PostingsFormat(minItemsInBlock, maxItemsInBlock);
new Lucene99RWPostingsFormat(minItemsInBlock, maxItemsInBlock);
});
}

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.Directory;

View File

@ -20,9 +20,9 @@ import static org.apache.lucene.backward_index.TestBasicBackwardsCompatibility.a
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.document.Document;

View File

@ -23,13 +23,13 @@ import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene99PostingsWriter}. */
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene912PostingsWriter}. */
public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
private final int minTermBlockSize;
@ -67,7 +67,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
boolean success = false;
try {
@ -84,7 +84,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
boolean success = false;
try {
FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state);

View File

@ -24,7 +24,7 @@ import java.util.TreeMap;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
@ -54,7 +54,7 @@ import org.apache.lucene.util.automaton.TransitionAccessor;
// - or: longer dense skip lists than just next byte?
/**
* Wraps {@link Lucene99PostingsFormat} format for on-disk storage, but then at read time loads and
* Wraps {@link Lucene912PostingsFormat} format for on-disk storage, but then at read time loads and
* stores all terms and postings directly in RAM as byte[], int[].
*
* <p><b>WARNING</b>: This is exceptionally RAM intensive: it makes no effort to compress the
@ -97,12 +97,12 @@ public final class DirectPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return PostingsFormat.forName("Lucene99").fieldsConsumer(state);
return PostingsFormat.forName("Lucene912").fieldsConsumer(state);
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
FieldsProducer postings = PostingsFormat.forName("Lucene99").fieldsProducer(state);
FieldsProducer postings = PostingsFormat.forName("Lucene912").fieldsProducer(state);
if (state.context.context() != IOContext.Context.MERGE) {
FieldsProducer loadedPostings;
try {

View File

@ -22,8 +22,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
@ -41,7 +41,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
boolean success = false;
try {
@ -57,7 +57,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
boolean success = false;
try {
FieldsProducer ret = new FSTTermsReader(state, postingsReader);

View File

@ -17,13 +17,13 @@
package org.apache.lucene.codecs.uniformsplit;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.TermState;
@ -34,7 +34,7 @@ import org.apache.lucene.util.RamUsageEstimator;
/**
* {@link TermState} serializer which encodes each file pointer as a delta relative to a base file
* pointer. It differs from {@link Lucene99PostingsWriter#encodeTerm} which encodes each file
* pointer. It differs from {@link Lucene912PostingsWriter#encodeTerm} which encodes each file
* pointer as a delta relative to the previous file pointer.
*
* <p>It automatically sets the base file pointer to the first valid file pointer for doc start FP,
@ -95,7 +95,7 @@ public class DeltaBaseTermStateSerializer implements Accountable {
/**
* Writes a {@link BlockTermState} to the provided {@link DataOutput}.
*
* <p>Simpler variant of {@link Lucene99PostingsWriter#encodeTerm(DataOutput, FieldInfo,
* <p>Simpler variant of {@link Lucene912PostingsWriter#encodeTerm(DataOutput, FieldInfo,
* BlockTermState, boolean)}.
*/
public void writeTermState(
@ -140,15 +140,12 @@ public class DeltaBaseTermStateSerializer implements Accountable {
termStatesOutput.writeVLong(intTermState.lastPosBlockOffset);
}
}
if (intTermState.skipOffset != -1) {
termStatesOutput.writeVLong(intTermState.skipOffset);
}
}
/**
* Reads a {@link BlockTermState} from the provided {@link DataInput}.
*
* <p>Simpler variant of {@link Lucene99PostingsReader#decodeTerm(DataInput, FieldInfo,
* <p>Simpler variant of {@link Lucene912PostingsReader#decodeTerm(DataInput, FieldInfo,
* BlockTermState, boolean)}.
*
* @param reuse {@link BlockTermState} to reuse; or null to create a new one.
@ -190,9 +187,6 @@ public class DeltaBaseTermStateSerializer implements Accountable {
intTermState.lastPosBlockOffset = termStatesInput.readVLong();
}
}
if (intTermState.docFreq > BLOCK_SIZE) {
intTermState.skipOffset = termStatesInput.readVLong();
}
return intTermState;
}
@ -210,7 +204,6 @@ public class DeltaBaseTermStateSerializer implements Accountable {
termState.docStartFP = 0;
termState.posStartFP = 0;
termState.payStartFP = 0;
termState.skipOffset = -1;
termState.lastPosBlockOffset = -1;
termState.singletonDocID = -1;

View File

@ -23,8 +23,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
@ -113,7 +113,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer termsWriter =
@ -130,7 +130,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
boolean success = false;
try {
FieldsProducer termsReader =

View File

@ -28,7 +28,7 @@
* org.apache.lucene.search.PhraseQuery})
* <li>Quite efficient for {@link org.apache.lucene.search.PrefixQuery}
* <li>Not efficient for spell-check and {@link org.apache.lucene.search.FuzzyQuery}, in this case
* prefer {@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat}
* prefer {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat}
* </ul>
*/
package org.apache.lucene.codecs.uniformsplit;

View File

@ -22,7 +22,7 @@ import java.io.IOException;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnByteVectorField;
@ -42,7 +42,7 @@ import org.apache.lucene.tests.index.BaseIndexFileFormatTestCase;
public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase {
@Override
protected Codec getCodec() {
return new Lucene99Codec() {
return new Lucene912Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return new HnswBitVectorsFormat();

View File

@ -17,7 +17,7 @@
package org.apache.lucene.codecs.lucene90.tests;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
/** Test utility class to create mock {@link IntBlockTermState}. */
public class MockTermStateFactory {

View File

@ -1,4 +1,4 @@
{
"lucene/core/src/java/org/apache/lucene/codecs/lucene99/ForUtil.java": "1292ad354d255b1272ffd3db684aa2ddb2bc49ec",
"lucene/core/src/java/org/apache/lucene/codecs/lucene99/gen_ForUtil.py": "ab7b63a1b73986cc04e43de1c8f474b97aef5116"
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java": "5ff856e80cab30f9e5704aa89f3197f017d07624",
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py": "3ccf92b3ddbff6340a13e8a55090bfb900dc7be2"
}

View File

@ -15,7 +15,7 @@
* limitations under the License.
*/
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
/** Lucene Core. */
@SuppressWarnings("module") // the test framework is compiled after the core...
@ -33,6 +33,7 @@ module org.apache.lucene.core {
exports org.apache.lucene.codecs.lucene94;
exports org.apache.lucene.codecs.lucene95;
exports org.apache.lucene.codecs.lucene99;
exports org.apache.lucene.codecs.lucene912;
exports org.apache.lucene.codecs.perfield;
exports org.apache.lucene.codecs;
exports org.apache.lucene.document;
@ -71,7 +72,7 @@ module org.apache.lucene.core {
provides org.apache.lucene.analysis.TokenizerFactory with
org.apache.lucene.analysis.standard.StandardTokenizerFactory;
provides org.apache.lucene.codecs.Codec with
Lucene99Codec;
Lucene912Codec;
provides org.apache.lucene.codecs.DocValuesFormat with
org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
provides org.apache.lucene.codecs.KnnVectorsFormat with
@ -79,7 +80,7 @@ module org.apache.lucene.core {
org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat,
org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat;
provides org.apache.lucene.codecs.PostingsFormat with
org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
provides org.apache.lucene.index.SortFieldProvider with
org.apache.lucene.search.SortField.Provider,
org.apache.lucene.search.SortedNumericSortField.Provider,

View File

@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
return LOADER;
}
static Codec defaultCodec = LOADER.lookup("Lucene99");
static Codec defaultCodec = LOADER.lookup("Lucene912");
}
private final String name;

View File

@ -18,8 +18,6 @@ package org.apache.lucene.codecs;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
@ -106,7 +104,7 @@ public final class CompetitiveImpactAccumulator {
}
/** Get the list of competitive freq and norm pairs, ordered by increasing freq and norm. */
public Collection<Impact> getCompetitiveFreqNormPairs() {
public List<Impact> getCompetitiveFreqNormPairs() {
List<Impact> impacts = new ArrayList<>();
int maxFreqForLowerNorms = 0;
for (int i = 0; i < maxFreqs.length; ++i) {
@ -126,7 +124,7 @@ public final class CompetitiveImpactAccumulator {
for (Impact impact : impacts) {
add(impact, freqNormPairs);
}
return Collections.unmodifiableSet(freqNormPairs);
return List.copyOf(freqNormPairs);
}
private void add(Impact newEntry, TreeSet<Impact> freqNormPairs) {

View File

@ -49,9 +49,9 @@ import org.apache.lucene.util.packed.DirectMonotonicWriter;
*
* <pre class="prettyprint">
* // the default: for high performance
* indexWriterConfig.setCodec(new Lucene99Codec(Mode.BEST_SPEED));
* indexWriterConfig.setCodec(new Lucene912Codec(Mode.BEST_SPEED));
* // instead for higher compression (but slower):
* // indexWriterConfig.setCodec(new Lucene99Codec(Mode.BEST_COMPRESSION));
* // indexWriterConfig.setCodec(new Lucene912Codec(Mode.BEST_COMPRESSION));
* </pre>
*
* <p><b>File formats</b>

View File

@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.packed.PackedInts;
/** Utility class to encode/decode increasing sequences of 128 integers. */
public class ForDeltaUtil {
// IDENTITY_PLUS_ONE[i] == i+1
private static final long[] IDENTITY_PLUS_ONE = new long[ForUtil.BLOCK_SIZE];
static {
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
IDENTITY_PLUS_ONE[i] = i + 1;
}
}
private static void prefixSumOfOnes(long[] arr, long base) {
System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE);
// This loop gets auto-vectorized
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
arr[i] += base;
}
}
private final ForUtil forUtil;
ForDeltaUtil(ForUtil forUtil) {
this.forUtil = forUtil;
}
/**
* Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code
* longs} are expected to be deltas between consecutive values.
*/
void encodeDeltas(long[] longs, DataOutput out) throws IOException {
if (longs[0] == 1 && PForUtil.allEqual(longs)) { // happens with very dense postings
out.writeByte((byte) 0);
} else {
long or = 0;
for (long l : longs) {
or |= l;
}
assert or != 0;
final int bitsPerValue = PackedInts.bitsRequired(or);
out.writeByte((byte) bitsPerValue);
forUtil.encode(longs, bitsPerValue, out);
}
}
/** Decode deltas, compute the prefix sum and add {@code base} to all decoded longs. */
void decodeAndPrefixSum(DataInput in, long base, long[] longs) throws IOException {
final int bitsPerValue = Byte.toUnsignedInt(in.readByte());
if (bitsPerValue == 0) {
prefixSumOfOnes(longs, base);
} else {
forUtil.decodeAndPrefixSum(bitsPerValue, in, base, longs);
}
}
void skip(DataInput in) throws IOException {
final int bitsPerValue = Byte.toUnsignedInt(in.readByte());
in.skipBytes(forUtil.numBytes(bitsPerValue));
}
}
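A minimal round-trip sketch of the contract above, for illustration only: it is not part of this patch, assumes same-package access (ForUtil and ForDeltaUtil are package-private), and uses an in-memory ByteBuffersDataOutput.
import java.util.Arrays;
import org.apache.lucene.store.ByteBuffersDataOutput;
long[] deltas = new long[ForUtil.BLOCK_SIZE];
Arrays.fill(deltas, 1L); // all deltas are 1: the "very dense postings" fast path
ByteBuffersDataOutput out = new ByteBuffersDataOutput();
ForDeltaUtil forDeltaUtil = new ForDeltaUtil(new ForUtil());
forDeltaUtil.encodeDeltas(deltas, out); // writes a single marker byte in this case
long[] docs = new long[ForUtil.BLOCK_SIZE];
forDeltaUtil.decodeAndPrefixSum(out.toDataInput(), 41, docs);
// docs now holds 42, 43, ..., 41 + ForUtil.BLOCK_SIZE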

View File

@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.codecs.lucene912;
import java.io.IOException;
import org.apache.lucene.store.DataInput;

View File

@ -0,0 +1,217 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
import java.util.Objects;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
/**
* Implements the Lucene 9.12 index format
*
* <p>If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}.
*
* @see org.apache.lucene.codecs.lucene912 package documentation for file format details.
* @lucene.experimental
*/
public class Lucene912Codec extends Codec {
/** Configuration option for the codec. */
public enum Mode {
/** Trade compression ratio for retrieval speed. */
BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED),
/** Trade retrieval speed for compression ratio. */
BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION);
private final Lucene90StoredFieldsFormat.Mode storedMode;
private Mode(Lucene90StoredFieldsFormat.Mode storedMode) {
this.storedMode = Objects.requireNonNull(storedMode);
}
}
private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene99SegmentInfoFormat();
private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat();
private final CompoundFormat compoundFormat = new Lucene90CompoundFormat();
private final NormsFormat normsFormat = new Lucene90NormsFormat();
private final PostingsFormat defaultPostingsFormat;
private final PostingsFormat postingsFormat =
new PerFieldPostingsFormat() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return Lucene912Codec.this.getPostingsFormatForField(field);
}
};
private final DocValuesFormat defaultDVFormat;
private final DocValuesFormat docValuesFormat =
new PerFieldDocValuesFormat() {
@Override
public DocValuesFormat getDocValuesFormatForField(String field) {
return Lucene912Codec.this.getDocValuesFormatForField(field);
}
};
private final KnnVectorsFormat defaultKnnVectorsFormat;
private final KnnVectorsFormat knnVectorsFormat =
new PerFieldKnnVectorsFormat() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return Lucene912Codec.this.getKnnVectorsFormatForField(field);
}
};
private final StoredFieldsFormat storedFieldsFormat;
/** Instantiates a new codec. */
public Lucene912Codec() {
this(Mode.BEST_SPEED);
}
/**
* Instantiates a new codec, specifying the stored fields compression mode to use.
*
* @param mode stored fields compression mode to use for newly flushed/merged segments.
*/
public Lucene912Codec(Mode mode) {
super("Lucene912");
this.storedFieldsFormat =
new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
this.defaultPostingsFormat = new Lucene912PostingsFormat();
this.defaultDVFormat = new Lucene90DocValuesFormat();
this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat();
}
@Override
public final StoredFieldsFormat storedFieldsFormat() {
return storedFieldsFormat;
}
@Override
public final TermVectorsFormat termVectorsFormat() {
return vectorsFormat;
}
@Override
public final PostingsFormat postingsFormat() {
return postingsFormat;
}
@Override
public final FieldInfosFormat fieldInfosFormat() {
return fieldInfosFormat;
}
@Override
public final SegmentInfoFormat segmentInfoFormat() {
return segmentInfosFormat;
}
@Override
public final LiveDocsFormat liveDocsFormat() {
return liveDocsFormat;
}
@Override
public final CompoundFormat compoundFormat() {
return compoundFormat;
}
@Override
public final PointsFormat pointsFormat() {
return new Lucene90PointsFormat();
}
@Override
public final KnnVectorsFormat knnVectorsFormat() {
return knnVectorsFormat;
}
/**
* Returns the postings format that should be used for writing new segments of <code>field</code>.
*
* <p>The default implementation always returns "Lucene912".
*
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
* future versions of Lucene are only guaranteed to be able to read the default implementation.
*/
public PostingsFormat getPostingsFormatForField(String field) {
return defaultPostingsFormat;
}
/**
* Returns the docvalues format that should be used for writing new segments of
* <code>field</code>.
*
* <p>The default implementation always returns "Lucene90".
*
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
* future versions of Lucene are only guaranteed to be able to read the default implementation.
*/
public DocValuesFormat getDocValuesFormatForField(String field) {
return defaultDVFormat;
}
/**
* Returns the vectors format that should be used for writing new segments of <code>field</code>.
*
* <p>The default implementation always returns "Lucene99HnswVectorsFormat".
*
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
* future versions of Lucene are only guaranteed to be able to read the default implementation.
*/
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return defaultKnnVectorsFormat;
}
@Override
public final DocValuesFormat docValuesFormat() {
return docValuesFormat;
}
@Override
public final NormsFormat normsFormat() {
return normsFormat;
}
}
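A usage sketch for the per-field hooks above, for illustration only: the analyzer variable and the "id" field name are made up, and imports are elided.
IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setCodec(
    new Lucene912Codec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        // Route a hypothetical primary-key field explicitly; all other fields
        // keep the default Lucene912 postings format.
        if ("id".equals(field)) {
          return PostingsFormat.forName("Lucene912");
        }
        return super.getPostingsFormatForField(field);
      }
    });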

View File

@ -0,0 +1,492 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
/**
* Lucene 9.12 postings format, which encodes postings in packed integer blocks for fast decode.
*
* <p>Basic idea:
*
* <ul>
* <li><b>Packed Blocks and VInt Blocks</b>:
* <p>In packed blocks, integers are encoded with the same bit width ({@link PackedInts packed
* format}): the block size (i.e. number of integers inside block) is fixed (currently 128).
* Additionally, blocks whose values are all equal are encoded in an optimized way.
* <p>In VInt blocks, integers are encoded as {@link DataOutput#writeVInt VInt}: the block
* size is variable.
* <li><b>Block structure</b>:
* <p>When the postings are long enough, Lucene912PostingsFormat will try to encode most
* integer data as a packed block.
* <p>Take a term with 259 documents as an example: the first 256 document IDs are encoded
* as two packed blocks, while the remaining 3 are encoded as one VInt block.
* <p>Different kinds of data are always encoded separately into different packed blocks, but
* may possibly be interleaved into the same VInt block.
* <p>This strategy is applied to pairs: &lt;document number, frequency&gt;, &lt;position,
* payload length&gt;, &lt;position, offset start, offset length&gt;, and &lt;position,
* payload length, offset start, offset length&gt;.
* <li><b>Skip data</b>:
* <p>Skip data is interleaved with blocks on 2 levels. Level 0 skip data is interleaved
* between every packed block. Level 1 skip data is interleaved between every 32 packed
* blocks (a sketch of how a reader advances through these two levels follows this list).
* <li><b>Positions, Payloads, and Offsets</b>:
* <p>A position is an integer indicating where the term occurs within one document. A payload
* is a blob of metadata associated with the current position. An offset is a pair of
* integers indicating the tokenized start/end offsets for the given term at the current
* position: it is
* essentially a specialized payload.
* <p>When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets
* (assuming a null payload contributes one count). As mentioned in block structure, it is
* possible to encode these three either combined or separately.
* <p>In all cases, payloads and offsets are stored together. When encoded as a packed block,
* position data is separated out as .pos, while payloads and offsets are encoded in .pay
* (payload metadata will also be stored directly in .pay). When encoded as VInt blocks, all
* these three are stored interleaved into the .pos (so is payload metadata).
* <p>With this strategy, the majority of payload and offset data will be outside the .pos file.
* So for queries that require only position data, running on a full index with payloads and
* offsets, this reduces disk pre-fetches.
* </ul>
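*
* <p>Conceptually, a reader advances through these two levels along the lines of the sketch
* below. This is a hedged illustration only: the member and method names (level1LastDocID,
* skipLevel1, and so on) are made up here, not the actual Lucene912PostingsReader code.
*
* <pre class="prettyprint">
* void advanceTo(int target) throws IOException {
*   while (level1LastDocID &lt; target) {
*     skipLevel1(); // jump over 32 blocks = 4,096 docs at once, reading Level1SkipData
*   }
*   while (level0LastDocID &lt; target) {
*     skipLevel0(); // jump over one block = 128 docs, reading Level0SkipData
*   }
*   refillDocBlock(); // decode the current 128-doc block and scan it linearly
* }
* </pre>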
*
* <p>Files and detailed format:
*
* <ul>
* <li><code>.tim</code>: <a href="#Termdictionary">Term Dictionary</a>
* <li><code>.tip</code>: <a href="#Termindex">Term Index</a>
* <li><code>.doc</code>: <a href="#Frequencies">Frequencies and Skip Data</a>
* <li><code>.pos</code>: <a href="#Positions">Positions</a>
* <li><code>.pay</code>: <a href="#Payloads">Payloads and Offsets</a>
* </ul>
*
* <a id="Termdictionary"></a>
*
* <dl>
* <dd><b>Term Dictionary</b>
* <p>The .tim file contains the list of terms in each field along with per-term statistics
* (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the
* .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsWriter} for more details on
* the format.
* <p>NOTE: The term dictionary can plug into different postings implementations: the postings
* writer/reader are actually responsible for encoding and decoding the PostingsHeader and
* TermMetadata sections described here:
* <ul>
* <li>PostingsHeader --&gt; Header, PackedBlockSize
* <li>TermMetadata --&gt; (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?,
* PayFPDelta?
* <li>Header, --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}
* <li>PackedBlockSize, SingletonDocID --&gt; {@link DataOutput#writeVInt VInt}
* <li>DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta --&gt; {@link
* DataOutput#writeVLong VLong}
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}
* </ul>
* <p>Notes:
* <ul>
* <li>Header is a {@link CodecUtil#writeIndexHeader IndexHeader} storing the version
* information for the postings.
* <li>PackedBlockSize is the fixed block size for packed blocks. In packed block, bit width
* is determined by the largest integer. A smaller block size results in smaller variance
* among integer widths, hence smaller indexes. A larger block size results in more
* efficient bulk I/O, hence better acceleration. This value should always be a multiple
* of 64, currently fixed as 128 as a tradeoff. It is also the skip interval used to
* accelerate {@link org.apache.lucene.index.PostingsEnum#advance(int)}.
* <li>DocFPDelta determines the position of this term's TermFreqs within the .doc file. In
* particular, it is the difference in file offset between this term's data and the previous
* term's data (or zero, for the first term in the block). On disk it is stored as the
* difference from the previous value in the sequence.
* <li>PosFPDelta determines the position of this term's TermPositions within the .pos file,
* while PayFPDelta determines the position of this term's &lt;TermPayloads,
* TermOffsets?&gt; within the .pay file. Similar to DocFPDelta, each is the difference
* between two file positions (or omitted, for fields that omit payloads and offsets).
* <li>PosVIntBlockFPDelta determines the position of this term's last TermPosition in the
* last pos packed block within the .pos file. It is a synonym for PayVIntBlockFPDelta or
* OffsetVIntBlockFPDelta. It is used to indicate whether it is necessary to load the
* following payloads and offsets from .pos instead of .pay. Every time a new block of
* positions is to be loaded, the PostingsReader will use this value to check whether the
* current block is in packed or VInt format: when packed, payloads and offsets are
* fetched from .pay, otherwise from .pos. (This value is neglected when the total number
* of positions, i.e. totalTermFreq, is less than or equal to PackedBlockSize.)
* <li>SingletonDocID is an optimization when a term only appears in one document. In this
* case, instead of writing a file pointer to the .doc file (DocFPDelta), and then a
* VIntBlock at that location, the single document ID is written to the term dictionary.
* </ul>
* </dl>
*
* <a id="Termindex"></a>
*
* <dl>
* <dd><b>Term Index</b>
* <p>The .tip file contains an index into the term dictionary, so that it can be accessed
* randomly. See {@link Lucene90BlockTreeTermsWriter} for more details on the format.
* </dl>
*
* <a id="Frequencies"></a>
*
* <dl>
* <dd><b>Frequencies and Skip Data</b>
* <p>The .doc file contains the lists of documents which contain each term, along with the
* frequency of the term in that document (except when frequencies are omitted: {@link
* IndexOptions#DOCS}). Skip data is interleaved with the postings: level 0 skip data
* precedes every packed block of 128 doc IDs, and level 1 skip data precedes every group
* of 32 packed blocks.
* <ul>
* <li>docFile(.doc) --&gt; Header, &lt;TermFreqs&gt;<sup>TermCount</sup>, Footer
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}
* <li>TermFreqs --&gt; &lt;PackedBlock32&gt; <sup>PackedDocBlockNum/32</sup>, VIntBlock?
* <li>PackedBlock32 --&gt; Level1SkipData, &lt;PackedBlock&gt; <sup>32</sup>
* <li>PackedBlock --&gt; Level0SkipData, PackedDocDeltaBlock, PackedFreqBlock?
* <li>VIntBlock --&gt;
* &lt;DocDelta[,Freq?]&gt;<sup>DocFreq-PackedBlockSize*PackedDocBlockNum</sup>
* <li>Level1SkipData --&gt; DocDelta, DocFPDelta, Skip1NumBytes?, ImpactLength?, Impacts?,
* PosFPDelta?, NextPosUpto?, PayFPDelta?, NextPayByteUpto?
* <li>Level0SkipData --&gt; Skip0NumBytes, DocDelta, DocFPDelta, PackedBlockLength,
* ImpactLength?, Impacts?, PosFPDelta?, NextPosUpto?, PayFPDelta?, NextPayByteUpto?
* <li>PackedFreqBlock --&gt; {@link PackedInts PackedInts}, uses patching
* <li>PackedDocDeltaBlock --&gt; {@link PackedInts PackedInts}, does not use patching
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}
* </ul>
* <p>Notes:
* <ul>
* <li>PackedDocDeltaBlock is conceptually generated in two steps:
* <ol>
* <li>Calculate the difference between each document number and the previous one, and
* get a d-gaps list (for the first document, use the absolute value);
* <li>For those d-gaps from first one to
* PackedDocBlockNum*PackedBlockSize<sup>th</sup>, separately encode as packed
* blocks.
* </ol>
* If frequencies are not omitted, PackedFreqBlock will be generated without the d-gap step.
* <li>VIntBlock stores remaining d-gaps (along with frequencies when possible) with a
* format that encodes DocDelta and Freq:
* <p>DocDelta: if frequencies are indexed, this determines both the document number and
* the frequency. In particular, DocDelta/2 is the difference between this document
* number and the previous document number (or zero when this is the first document in a
* TermFreqs). When DocDelta is odd, the frequency is one. When DocDelta is even, the
* frequency is read as another VInt. If frequencies are omitted, DocDelta contains the
* gap (not multiplied by 2) between document numbers and no frequency information is
* stored.
* <p>For example, the TermFreqs for a term which occurs once in document seven and
* three times in document eleven, with frequencies indexed, would be the following
* sequence of VInts:
* <p>15, 8, 3
* <p>If frequencies were omitted ({@link IndexOptions#DOCS}) it would be this sequence
* of VInts instead:
* <p>7, 4 (a decoding sketch follows this section)
* <li>PackedDocBlockNum is the number of packed blocks for the current term's doc IDs or
* frequencies. In particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize)
* <li>In skip data, DocDelta is the delta between the last doc of the previous block (or
* -1 if there is no previous block) and the last doc of this block. This makes it possible
* to know by how much to increment the doc ID when the block gets skipped.
* <li>Skip0NumBytes is the length of skip data at level 0. Encoding it makes it possible
* to quickly skip over skip data when it is not needed, e.g. if only using nextDoc(). It
* is also used when only the first fields of skip data are needed, in order to skip over
* the remaining fields without reading them.
* <li>ImpactLength and Impacts are only stored if frequencies are indexed.
* <li>Since positions and payloads are also block encoded, a skip should jump to the
* related block first, then fetch the values according to the in-block offset. PosFPSkip
* and PayFPSkip record the file offsets of the related blocks in .pos and .pay,
* respectively, while PosBlockOffset indicates which value to fetch inside the related
* block (PayBlockOffset is unnecessary since it is always equal to PosBlockOffset). As
* with DocFPSkip, the file offsets are relative to the start of the current term's
* TermFreqs, and stored as a difference sequence.
* <li>PayByteUpto indicates the start offset of the current payload. It is equivalent to
* the sum of the payload lengths in the current block up to PosBlockOffset.
* <li>ImpactLength is the total length of CompetitiveFreqDelta and CompetitiveNormDelta
* pairs. CompetitiveFreqDelta and CompetitiveNormDelta are used to safely skip score
* calculation for uncompetitive documents; See {@link
* org.apache.lucene.codecs.CompetitiveImpactAccumulator} for more details.
* </ul>
* </dl>
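*
* <p>For illustration, the VIntBlock tail described in the notes above can be decoded along
* the lines of the sketch below ({@code decodeVIntTail} is a hypothetical helper, not the
* actual Lucene912PostingsReader code). Fed the example sequence 15, 8, 3 with frequencies
* indexed, it computes (doc=7, freq=1) and (doc=11, freq=3).
*
* <pre class="prettyprint">
* static void decodeVIntTail(DataInput in, int count, boolean hasFreqs) throws IOException {
*   int doc = 0; // gaps are relative to 0 for the first document
*   for (int i = 0; i != count; ++i) {
*     int code = in.readVInt();
*     if (hasFreqs) {
*       doc += code / 2; // DocDelta/2 is the doc-ID gap
*       int freq = (code % 2 == 1) ? 1 : in.readVInt(); // odd DocDelta means freq == 1
*     } else {
*       doc += code; // DocDelta is the raw gap when frequencies are omitted
*     }
*   }
* }
* </pre>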
*
* <a id="Positions"></a>
*
* <dl>
* <dd><b>Positions</b>
* <p>The .pos file contains the lists of positions that each term occurs at within documents.
* It also sometimes stores part of the payloads and offsets for speedup.
* <ul>
* <li>PosFile(.pos) --&gt; Header, &lt;TermPositions&gt; <sup>TermCount</sup>, Footer
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}
* <li>TermPositions --&gt; &lt;PackedPosDeltaBlock&gt; <sup>PackedPosBlockNum</sup>,
* VIntBlock?
* <li>VIntBlock --&gt; &lt;PositionDelta[, PayloadLength?], PayloadData?, OffsetDelta?,
* OffsetLength?&gt;<sup>PosVIntCount</sup>
* <li>PackedPosDeltaBlock --&gt; {@link PackedInts PackedInts}
* <li>PositionDelta, OffsetDelta, OffsetLength --&gt; {@link DataOutput#writeVInt VInt}
* <li>PayloadData --&gt; {@link DataOutput#writeByte byte}<sup>PayLength</sup>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}
* </ul>
* <p>Notes:
* <ul>
* <li>TermPositions are ordered by term (terms are implicit, from the term dictionary),
* and position values for each term-document pair are incremental, and ordered by
* document number.
* <li>PackedPosBlockNum is the number of packed blocks for the current term's positions,
* payloads or offsets. In particular, PackedPosBlockNum =
* floor(totalTermFreq/PackedBlockSize)
* <li>PosVIntCount is the number of positions encoded as VInt format. In particular,
* PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize
* <li>PackedPosDeltaBlock is generated by the same procedure as PackedDocDeltaBlock in
* chapter <a href="#Frequencies">Frequencies and Skip Data</a>.
* <li>PositionDelta is, if payloads are disabled for the term's field, the difference
* between the position of the current occurrence in the document and the previous
* occurrence (or zero, if this is the first occurrence in this document). If payloads
* are enabled for the term's field, then PositionDelta/2 is the difference between the
* current and the previous position. If payloads are enabled and PositionDelta is odd,
* then PayloadLength is stored, indicating the length of the payload at the current
* term position.
* <li>For example, the TermPositions for a term which occurs as the fourth term in one
* document, and as the fifth and ninth term in a subsequent document, would be the
* following sequence of VInts (payloads disabled):
* <p>4, 5, 4 (a decoding sketch follows this section)
* <li>PayloadData is metadata associated with the current term position. If PayloadLength
* is stored at the current position, then it indicates the length of this payload. If
* PayloadLength is not stored, then this payload has the same length as the payload at
* the previous position.
* <li>OffsetDelta/2 is the difference between this position's startOffset and that of the
* previous occurrence (or zero, if this is the first occurrence in this document). If
* OffsetDelta is odd, then the length (endOffset-startOffset) differs from the previous
* occurrence and an OffsetLength follows. Offset data is only written for {@link
* IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
* </ul>
* </dl>
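*
* <p>For illustration, positions within one document can be reconstructed along the lines of
* the sketch below when payloads are disabled ({@code decodePositions} is a hypothetical
* helper, not the actual reader code). Fed the VInts 5, 4 for the second document of the
* example above, it returns positions 5 and 9.
*
* <pre class="prettyprint">
* static int[] decodePositions(DataInput in, int freq) throws IOException {
*   int[] positions = new int[freq];
*   int pos = 0; // position deltas restart from 0 on each new document
*   for (int i = 0; i != freq; ++i) {
*     pos += in.readVInt(); // PositionDelta (payloads disabled)
*     positions[i] = pos;
*   }
*   return positions;
* }
* </pre>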
*
* <a id="Payloads"></a>
*
* <dl>
* <dd><b>Payloads and Offsets</b>
* <p>The .pay file will store payloads and offsets associated with certain term-document
* positions. Some payloads and offsets will be separated out into the .pos file, for performance
* reasons.
* <ul>
* <li>PayFile(.pay): --&gt; Header, &lt;TermPayloads?, TermOffsets?&gt;
* <sup>TermCount</sup>, Footer
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}
* <li>TermPayloads --&gt; &lt;PackedPayLengthBlock, SumPayLength, PayData&gt;
* <sup>PackedPayBlockNum</sup>
* <li>TermOffsets --&gt; &lt;PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock&gt;
* <sup>PackedPayBlockNum</sup>
* <li>PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --&gt;
* {@link PackedInts PackedInts}
* <li>SumPayLength --&gt; {@link DataOutput#writeVInt VInt}
* <li>PayData --&gt; {@link DataOutput#writeByte byte}<sup>SumPayLength</sup>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}
* </ul>
* <p>Notes:
* <ul>
* <li>The order of TermPayloads/TermOffsets will be the same as TermPositions, note that
* part of the payloads/offsets are stored in .pos.
* <li>PackedPayLengthBlock and PackedOffsetLengthBlock are generated by the same
* procedure as PackedFreqBlock in chapter <a href="#Frequencies">Frequencies and Skip
* Data</a>, while PackedOffsetStartDeltaBlock follows the same procedure as
* PackedDocDeltaBlock.
* <li>PackedPayBlockNum is always equal to PackedPosBlockNum, for the same term. It is
* also a synonym for PackedOffsetBlockNum.
* <li>SumPayLength is the total length of payloads written within one block; it should be
* the sum of the PayLengths in one packed block.
* <li>PayLength in PackedPayLengthBlock is the length of each payload associated with the
* current position.
* </ul>
* </dl>
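*
* <p>As a reading-order illustration, one TermPayloads group is consumed per packed block of
* positions, along the lines of the hedged sketch below (helper and variable names are made
* up, not the actual reader code):
*
* <pre class="prettyprint">
* readPackedBlock(in, payloadLengths);         // PackedPayLengthBlock: 128 payload lengths
* int sumPayLength = in.readVInt();            // SumPayLength: total payload bytes in block
* in.readBytes(payloadBytes, 0, sumPayLength); // PayData: the concatenated payload bytes
* </pre>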
*
* @lucene.experimental
*/
public final class Lucene912PostingsFormat extends PostingsFormat {
/** Filename extension for some small metadata about how postings are encoded. */
public static final String META_EXTENSION = "psm";
/**
* Filename extension for document number, frequencies, and skip data. See chapter: <a
* href="#Frequencies">Frequencies and Skip Data</a>
*/
public static final String DOC_EXTENSION = "doc";
/** Filename extension for positions. See chapter: <a href="#Positions">Positions</a> */
public static final String POS_EXTENSION = "pos";
/**
* Filename extension for payloads and offsets. See chapter: <a href="#Payloads">Payloads and
* Offsets</a>
*/
public static final String PAY_EXTENSION = "pay";
/** Size of blocks. */
public static final int BLOCK_SIZE = ForUtil.BLOCK_SIZE;
public static final int BLOCK_MASK = BLOCK_SIZE - 1;
/** We insert skip data on every block and on every LEVEL1_FACTOR=32 blocks. */
public static final int LEVEL1_FACTOR = 32;
/** Total number of docs covered by level 1 skip data: 32 * 128 = 4,096 */
public static final int LEVEL1_NUM_DOCS = LEVEL1_FACTOR * BLOCK_SIZE;
public static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1;
static final String TERMS_CODEC = "Lucene90PostingsWriterTerms";
static final String META_CODEC = "Lucene912PostingsWriterMeta";
static final String DOC_CODEC = "Lucene912PostingsWriterDoc";
static final String POS_CODEC = "Lucene912PostingsWriterPos";
static final String PAY_CODEC = "Lucene912PostingsWriterPay";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
private final int minTermBlockSize;
private final int maxTermBlockSize;
/** Creates {@code Lucene912PostingsFormat} with default settings. */
public Lucene912PostingsFormat() {
this(
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
/**
* Creates {@code Lucene912PostingsFormat} with custom values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
*
* @see
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
*/
public Lucene912PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("Lucene912");
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
this.minTermBlockSize = minTermBlockSize;
this.maxTermBlockSize = maxTermBlockSize;
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret =
new Lucene90BlockTreeTermsWriter(
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
boolean success = false;
try {
FieldsProducer ret = new Lucene90BlockTreeTermsReader(postingsReader, state);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsReader);
}
}
}
/**
* Holds all state required for {@link Lucene912PostingsReader} to produce a {@link
* org.apache.lucene.index.PostingsEnum} without re-seeking the terms dict.
*
* @lucene.internal
*/
public static final class IntBlockTermState extends BlockTermState {
/** file pointer to the start of the doc ids enumeration, in {@link #DOC_EXTENSION} file */
public long docStartFP;
/** file pointer to the start of the positions enumeration, in {@link #POS_EXTENSION} file */
public long posStartFP;
/** file pointer to the start of the payloads enumeration, in {@link #PAY_EXTENSION} file */
public long payStartFP;
/**
* file offset for the last position in the last block, if there are more than {@link
* ForUtil#BLOCK_SIZE} positions; otherwise -1
*
* <p>One might think to use total term frequency to track how many positions are left to read
* as we decode the blocks, and decode the last block differently when num_left_positions &lt;
* BLOCK_SIZE. Unfortunately this won't work since the tracking will be messed up when we skip
* blocks as the skipper will only tell us new position offset (start of block) and number of
* positions to skip for that block, without telling us how many positions it has skipped.
*/
public long lastPosBlockOffset;
/**
* docid when there is a single pulsed posting, otherwise -1. freq is always implicitly
* totalTermFreq in this case.
*/
public int singletonDocID;
/** Sole constructor. */
public IntBlockTermState() {
lastPosBlockOffset = -1;
singletonDocID = -1;
}
@Override
public IntBlockTermState clone() {
IntBlockTermState other = new IntBlockTermState();
other.copyFrom(this);
return other;
}
@Override
public void copyFrom(TermState _other) {
super.copyFrom(_other);
IntBlockTermState other = (IntBlockTermState) _other;
docStartFP = other.docStartFP;
posStartFP = other.posStartFP;
payStartFP = other.payStartFP;
lastPosBlockOffset = other.lastPosBlockOffset;
singletonDocID = other.singletonDocID;
}
@Override
public String toString() {
return super.toString()
+ " docStartFP="
+ docStartFP
+ " posStartFP="
+ posStartFP
+ " payStartFP="
+ payStartFP
+ " lastPosBlockOffset="
+ lastPosBlockOffset
+ " singletonDocID="
+ singletonDocID;
}
}
}
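For context, a hedged sketch of opting into this format per field through the codec; the
`analyzer` variable and the codec-subclassing pattern are assumptions for illustration, not part
of this change:
IndexWriterConfig cfg = new IndexWriterConfig(analyzer);
cfg.setCodec(new Lucene912Codec() {
  @Override
  public PostingsFormat getPostingsFormatForField(String field) {
    // custom min/max term-dictionary block sizes, validated by the constructor above
    return new Lucene912PostingsFormat(32, 64);
  }
});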

View File

@ -0,0 +1,681 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.LEVEL1_MASK;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.META_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/** Writer for {@link Lucene912PostingsFormat}. */
public class Lucene912PostingsWriter extends PushPostingsWriterBase {
static final IntBlockTermState EMPTY_STATE = new IntBlockTermState();
IndexOutput metaOut;
IndexOutput docOut;
IndexOutput posOut;
IndexOutput payOut;
IntBlockTermState lastState;
// Holds starting file pointers for current term:
private long docStartFP;
private long posStartFP;
private long payStartFP;
final long[] docDeltaBuffer;
final long[] freqBuffer;
private int docBufferUpto;
final long[] posDeltaBuffer;
final long[] payloadLengthBuffer;
final long[] offsetStartDeltaBuffer;
final long[] offsetLengthBuffer;
private int posBufferUpto;
private byte[] payloadBytes;
private int payloadByteUpto;
private int level0LastDocID;
private long level0LastPosFP;
private long level0LastPayFP;
private int level1LastDocID;
private long level1LastPosFP;
private long level1LastPayFP;
private int docID;
private int lastDocID;
private int lastPosition;
private int lastStartOffset;
private int docCount;
private final PForUtil pforUtil;
private final ForDeltaUtil forDeltaUtil;
private boolean fieldHasNorms;
private NumericDocValues norms;
private final CompetitiveImpactAccumulator level0FreqNormAccumulator =
new CompetitiveImpactAccumulator();
private final CompetitiveImpactAccumulator level1CompetitiveFreqNormAccumulator =
new CompetitiveImpactAccumulator();
private int maxNumImpactsAtLevel0;
private int maxImpactNumBytesAtLevel0;
private int maxNumImpactsAtLevel1;
private int maxImpactNumBytesAtLevel1;
/** Scratch output that we use to be able to prepend the encoded length, e.g. impacts. */
private final ByteBuffersDataOutput scratchOutput = ByteBuffersDataOutput.newResettableInstance();
/**
* Output for a single block. This is useful to be able to prepend skip data before each block,
* which can only be computed once the block is encoded. The content is then typically copied to
* {@link #level1Output}.
*/
private final ByteBuffersDataOutput level0Output = ByteBuffersDataOutput.newResettableInstance();
/**
* Output for groups of 32 blocks. This is useful to prepend skip data for these 32 blocks, which
* can only be done once we have encoded these 32 blocks. The content is then typically copied to
* {@link #docOut}.
*/
private final ByteBuffersDataOutput level1Output = ByteBuffersDataOutput.newResettableInstance();
/** Sole constructor. */
public Lucene912PostingsWriter(SegmentWriteState state) throws IOException {
String metaFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, Lucene912PostingsFormat.META_EXTENSION);
String docFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, Lucene912PostingsFormat.DOC_EXTENSION);
metaOut = state.directory.createOutput(metaFileName, state.context);
IndexOutput posOut = null;
IndexOutput payOut = null;
boolean success = false;
try {
docOut = state.directory.createOutput(docFileName, state.context);
CodecUtil.writeIndexHeader(
metaOut, META_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(
docOut, DOC_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
final ForUtil forUtil = new ForUtil();
forDeltaUtil = new ForDeltaUtil(forUtil);
pforUtil = new PForUtil(forUtil);
if (state.fieldInfos.hasProx()) {
posDeltaBuffer = new long[BLOCK_SIZE];
String posFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, Lucene912PostingsFormat.POS_EXTENSION);
posOut = state.directory.createOutput(posFileName, state.context);
CodecUtil.writeIndexHeader(
posOut, POS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
if (state.fieldInfos.hasPayloads()) {
payloadBytes = new byte[128];
payloadLengthBuffer = new long[BLOCK_SIZE];
} else {
payloadBytes = null;
payloadLengthBuffer = null;
}
if (state.fieldInfos.hasOffsets()) {
offsetStartDeltaBuffer = new long[BLOCK_SIZE];
offsetLengthBuffer = new long[BLOCK_SIZE];
} else {
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
}
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
String payFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name,
state.segmentSuffix,
Lucene912PostingsFormat.PAY_EXTENSION);
payOut = state.directory.createOutput(payFileName, state.context);
CodecUtil.writeIndexHeader(
payOut, PAY_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
}
} else {
posDeltaBuffer = null;
payloadLengthBuffer = null;
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
payloadBytes = null;
}
this.payOut = payOut;
this.posOut = posOut;
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(metaOut, docOut, posOut, payOut);
}
}
docDeltaBuffer = new long[BLOCK_SIZE];
freqBuffer = new long[BLOCK_SIZE];
}
@Override
public IntBlockTermState newTermState() {
return new IntBlockTermState();
}
@Override
public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException {
CodecUtil.writeIndexHeader(
termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
termsOut.writeVInt(BLOCK_SIZE);
}
@Override
public void setField(FieldInfo fieldInfo) {
super.setField(fieldInfo);
lastState = EMPTY_STATE;
fieldHasNorms = fieldInfo.hasNorms();
}
@Override
public void startTerm(NumericDocValues norms) {
docStartFP = docOut.getFilePointer();
if (writePositions) {
posStartFP = posOut.getFilePointer();
level1LastPosFP = level0LastPosFP = posStartFP;
if (writePayloads || writeOffsets) {
payStartFP = payOut.getFilePointer();
level1LastPayFP = level0LastPayFP = payStartFP;
}
}
lastDocID = -1;
level0LastDocID = -1;
level1LastDocID = -1;
this.norms = norms;
if (writeFreqs) {
level0FreqNormAccumulator.clear();
}
}
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
if (docBufferUpto == BLOCK_SIZE) {
flushDocBlock(false);
docBufferUpto = 0;
}
final int docDelta = docID - lastDocID;
if (docID < 0 || docDelta <= 0) {
throw new CorruptIndexException(
"docs out of order (" + docID + " <= " + lastDocID + " )", docOut);
}
docDeltaBuffer[docBufferUpto] = docDelta;
if (writeFreqs) {
freqBuffer[docBufferUpto] = termDocFreq;
}
this.docID = docID;
lastPosition = 0;
lastStartOffset = 0;
if (writeFreqs) {
long norm;
if (fieldHasNorms) {
boolean found = norms.advanceExact(docID);
if (found == false) {
// This can happen if indexing hits a problem after adding a doc to the
// postings but before buffering the norm. Such documents are written
// deleted and will go away on the first merge.
norm = 1L;
} else {
norm = norms.longValue();
assert norm != 0 : docID;
}
} else {
norm = 1L;
}
level0FreqNormAccumulator.add(termDocFreq, norm);
}
}
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset)
throws IOException {
if (position > IndexWriter.MAX_POSITION) {
throw new CorruptIndexException(
"position="
+ position
+ " is too large (> IndexWriter.MAX_POSITION="
+ IndexWriter.MAX_POSITION
+ ")",
docOut);
}
if (position < 0) {
throw new CorruptIndexException("position=" + position + " is < 0", docOut);
}
posDeltaBuffer[posBufferUpto] = position - lastPosition;
if (writePayloads) {
if (payload == null || payload.length == 0) {
// no payload
payloadLengthBuffer[posBufferUpto] = 0;
} else {
payloadLengthBuffer[posBufferUpto] = payload.length;
if (payloadByteUpto + payload.length > payloadBytes.length) {
payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
}
System.arraycopy(
payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
payloadByteUpto += payload.length;
}
}
if (writeOffsets) {
assert startOffset >= lastStartOffset;
assert endOffset >= startOffset;
offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset;
offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
lastStartOffset = startOffset;
}
posBufferUpto++;
lastPosition = position;
if (posBufferUpto == BLOCK_SIZE) {
pforUtil.encode(posDeltaBuffer, posOut);
if (writePayloads) {
pforUtil.encode(payloadLengthBuffer, payOut);
payOut.writeVInt(payloadByteUpto);
payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
payloadByteUpto = 0;
}
if (writeOffsets) {
pforUtil.encode(offsetStartDeltaBuffer, payOut);
pforUtil.encode(offsetLengthBuffer, payOut);
}
posBufferUpto = 0;
}
}
@Override
public void finishDoc() throws IOException {
docBufferUpto++;
docCount++;
lastDocID = docID;
}
/**
* Special vints that are encoded on 2 bytes if they require 15 bits or less. VInt becomes
* especially slow when the number of bytes is variable, so this special layout helps in the case
* when the number likely requires 15 bits or less
*/
static void writeVInt15(DataOutput out, int v) throws IOException {
assert v >= 0;
writeVLong15(out, v);
}
/**
* @see #writeVInt15(DataOutput, int)
*/
static void writeVLong15(DataOutput out, long v) throws IOException {
assert v >= 0;
if ((v & ~0x7FFFL) == 0) {
out.writeShort((short) v);
} else {
out.writeShort((short) (0x8000 | (v & 0x7FFF)));
out.writeVLong(v >> 15);
}
}
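// Hedged sketch of the matching decode (it lives in the reader, which is not shown here): read
// the short as an unsigned 16-bit value; if the high bit is set, the low 15 bits are combined
// with a trailing vlong:
//
//   long v = in.readShort() & 0xFFFFL;
//   if ((v & 0x8000L) != 0) {
//     v = (v & 0x7FFFL) | (in.readVLong() << 15);
//   }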
private void flushDocBlock(boolean finishTerm) throws IOException {
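// On-disk shape of a full level-0 block, as assembled below: a vlong skip-data length, the
// block's last doc delta (vint15) and encoded size (vlong15), then impacts, optional pos/pay
// file pointers, the bit-packed doc deltas and, if freqs are enabled, the packed freqs.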
assert docBufferUpto != 0;
if (docBufferUpto < BLOCK_SIZE) {
assert finishTerm;
PostingsUtil.writeVIntBlock(
level0Output, docDeltaBuffer, freqBuffer, docBufferUpto, writeFreqs);
} else {
if (writeFreqs) {
List<Impact> impacts = level0FreqNormAccumulator.getCompetitiveFreqNormPairs();
if (impacts.size() > maxNumImpactsAtLevel0) {
maxNumImpactsAtLevel0 = impacts.size();
}
writeImpacts(impacts, scratchOutput);
assert level0Output.size() == 0;
if (scratchOutput.size() > maxImpactNumBytesAtLevel0) {
maxImpactNumBytesAtLevel0 = Math.toIntExact(scratchOutput.size());
}
level0Output.writeVLong(scratchOutput.size());
scratchOutput.copyTo(level0Output);
scratchOutput.reset();
if (writePositions) {
level0Output.writeVLong(posOut.getFilePointer() - level0LastPosFP);
level0Output.writeByte((byte) posBufferUpto);
level0LastPosFP = posOut.getFilePointer();
if (writeOffsets || writePayloads) {
level0Output.writeVLong(payOut.getFilePointer() - level0LastPayFP);
level0Output.writeVInt(payloadByteUpto);
level0LastPayFP = payOut.getFilePointer();
}
}
}
long numSkipBytes = level0Output.size();
forDeltaUtil.encodeDeltas(docDeltaBuffer, level0Output);
if (writeFreqs) {
pforUtil.encode(freqBuffer, level0Output);
}
// docID - level0LastDocID is at least 128, so it can never fit in a single byte as a vint.
// Even if we subtracted 128, only extremely dense blocks would be eligible for a single byte,
// so let's go with 2 bytes right away.
writeVInt15(scratchOutput, docID - level0LastDocID);
writeVLong15(scratchOutput, level0Output.size());
numSkipBytes += scratchOutput.size();
level1Output.writeVLong(numSkipBytes);
scratchOutput.copyTo(level1Output);
scratchOutput.reset();
}
level0Output.copyTo(level1Output);
level0Output.reset();
level0LastDocID = docID;
if (writeFreqs) {
level1CompetitiveFreqNormAccumulator.addAll(level0FreqNormAccumulator);
level0FreqNormAccumulator.clear();
}
if ((docCount & LEVEL1_MASK) == 0) { // true every 32 blocks (4,096 docs)
writeLevel1SkipData();
level1LastDocID = docID;
level1CompetitiveFreqNormAccumulator.clear();
} else if (finishTerm) {
level1Output.copyTo(docOut);
level1Output.reset();
level1CompetitiveFreqNormAccumulator.clear();
}
}
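// The level-1 entry written below is laid out as: a vint doc delta, then, when freqs are
// written, a vlong total length, two shorts (skip-data length and impact length), the impacts,
// optional pos/pay file pointers, and finally the 32 buffered blocks; without freqs, just a
// vlong length followed by the buffered blocks.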
private void writeLevel1SkipData() throws IOException {
docOut.writeVInt(docID - level1LastDocID);
long numImpactBytes = scratchOutput.size();
final long level1End;
if (writeFreqs) {
List<Impact> impacts = level1CompetitiveFreqNormAccumulator.getCompetitiveFreqNormPairs();
if (impacts.size() > maxNumImpactsAtLevel1) {
maxNumImpactsAtLevel1 = impacts.size();
}
writeImpacts(impacts, scratchOutput);
numImpactBytes = scratchOutput.size();
if (numImpactBytes > maxImpactNumBytesAtLevel1) {
maxImpactNumBytesAtLevel1 = Math.toIntExact(numImpactBytes);
}
if (writePositions) {
scratchOutput.writeVLong(posOut.getFilePointer() - level1LastPosFP);
scratchOutput.writeByte((byte) posBufferUpto);
level1LastPosFP = posOut.getFilePointer();
if (writeOffsets || writePayloads) {
scratchOutput.writeVLong(payOut.getFilePointer() - level1LastPayFP);
scratchOutput.writeVInt(payloadByteUpto);
level1LastPayFP = payOut.getFilePointer();
}
}
final long level1Len = 2 * Short.BYTES + scratchOutput.size() + level1Output.size();
docOut.writeVLong(level1Len);
level1End = docOut.getFilePointer() + level1Len;
// There are at most 128 impacts, that require at most 2 bytes each
assert numImpactBytes <= Short.MAX_VALUE;
// Like impacts plus a few vlongs, still way under the max short value
assert scratchOutput.size() + Short.BYTES <= Short.MAX_VALUE;
docOut.writeShort((short) (scratchOutput.size() + Short.BYTES));
docOut.writeShort((short) numImpactBytes);
scratchOutput.copyTo(docOut);
scratchOutput.reset();
} else {
docOut.writeVLong(level1Output.size());
level1End = docOut.getFilePointer() + level1Output.size();
}
level1Output.copyTo(docOut);
level1Output.reset();
assert docOut.getFilePointer() == level1End : docOut.getFilePointer() + " " + level1End;
}
static void writeImpacts(Collection<Impact> impacts, DataOutput out) throws IOException {
Impact previous = new Impact(0, 0);
for (Impact impact : impacts) {
assert impact.freq > previous.freq;
assert Long.compareUnsigned(impact.norm, previous.norm) > 0;
int freqDelta = impact.freq - previous.freq - 1;
long normDelta = impact.norm - previous.norm - 1;
if (normDelta == 0) {
// most of time, norm only increases by 1, so we can fold everything in a single byte
out.writeVInt(freqDelta << 1);
} else {
out.writeVInt((freqDelta << 1) | 1);
out.writeZLong(normDelta);
}
previous = impact;
}
}
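// Hedged sketch of how a reader would reverse this encoding, accumulating deltas until the
// impact bytes are exhausted:
//
//   int freq = 0;
//   long norm = 0;
//   while (/* more impact bytes remain */) {
//     int code = in.readVInt();
//     freq += 1 + (code >>> 1);
//     norm += 1 + ((code & 1) != 0 ? in.readZLong() : 0);
//   }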
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(BlockTermState _state) throws IOException {
IntBlockTermState state = (IntBlockTermState) _state;
assert state.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
assert state.docFreq == docCount : state.docFreq + " vs " + docCount;
// docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to
// it.
final int singletonDocID;
if (state.docFreq == 1) {
// pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq
singletonDocID = (int) docDeltaBuffer[0] - 1;
} else {
singletonDocID = -1;
flushDocBlock(true);
}
final long lastPosBlockOffset;
if (writePositions) {
// totalTermFreq is just the total number of positions (or payloads, or offsets)
// associated with the current term.
assert state.totalTermFreq != -1;
if (state.totalTermFreq > BLOCK_SIZE) {
// record file offset for last pos in last block
lastPosBlockOffset = posOut.getFilePointer() - posStartFP;
} else {
lastPosBlockOffset = -1;
}
if (posBufferUpto > 0) {
assert posBufferUpto < BLOCK_SIZE;
// TODO: should we send offsets/payloads to
// .pay...? seems wasteful (have to store extra
// vLong for low (< BLOCK_SIZE) DF terms = vast vast
// majority)
// vInt encode the remaining positions/payloads/offsets:
int lastPayloadLength = -1; // force first payload length to be written
int lastOffsetLength = -1; // force first offset length to be written
int payloadBytesReadUpto = 0;
for (int i = 0; i < posBufferUpto; i++) {
final int posDelta = (int) posDeltaBuffer[i];
if (writePayloads) {
final int payloadLength = (int) payloadLengthBuffer[i];
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
posOut.writeVInt((posDelta << 1) | 1);
posOut.writeVInt(payloadLength);
} else {
posOut.writeVInt(posDelta << 1);
}
if (payloadLength != 0) {
posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
payloadBytesReadUpto += payloadLength;
}
} else {
posOut.writeVInt(posDelta);
}
if (writeOffsets) {
int delta = (int) offsetStartDeltaBuffer[i];
int length = (int) offsetLengthBuffer[i];
if (length == lastOffsetLength) {
posOut.writeVInt(delta << 1);
} else {
posOut.writeVInt(delta << 1 | 1);
posOut.writeVInt(length);
lastOffsetLength = length;
}
}
}
if (writePayloads) {
assert payloadBytesReadUpto == payloadByteUpto;
payloadByteUpto = 0;
}
}
} else {
lastPosBlockOffset = -1;
}
state.docStartFP = docStartFP;
state.posStartFP = posStartFP;
state.payStartFP = payStartFP;
state.singletonDocID = singletonDocID;
state.lastPosBlockOffset = lastPosBlockOffset;
docBufferUpto = 0;
posBufferUpto = 0;
lastDocID = -1;
docCount = 0;
}
@Override
public void encodeTerm(
DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute)
throws IOException {
IntBlockTermState state = (IntBlockTermState) _state;
if (absolute) {
lastState = EMPTY_STATE;
assert lastState.docStartFP == 0;
}
if (lastState.singletonDocID != -1
&& state.singletonDocID != -1
&& state.docStartFP == lastState.docStartFP) {
// With runs of rare values such as ID fields, the increment of pointers in the docs file is
// often 0.
// Furthermore some ID schemes like auto-increment IDs or Flake IDs are monotonic, so we
// encode the delta
// between consecutive doc IDs to save space.
final long delta = (long) state.singletonDocID - lastState.singletonDocID;
out.writeVLong((BitUtil.zigZagEncode(delta) << 1) | 0x01);
} else {
out.writeVLong((state.docStartFP - lastState.docStartFP) << 1);
if (state.singletonDocID != -1) {
out.writeVInt(state.singletonDocID);
}
}
if (writePositions) {
out.writeVLong(state.posStartFP - lastState.posStartFP);
if (writePayloads || writeOffsets) {
out.writeVLong(state.payStartFP - lastState.payStartFP);
}
}
if (writePositions) {
if (state.lastPosBlockOffset != -1) {
out.writeVLong(state.lastPosBlockOffset);
}
}
lastState = state;
}
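// Hedged sketch of the decode side: the reader first reads a vlong code; if (code & 1) != 0, the
// term is a singleton that shares the previous docStartFP and its docID is recovered as
// lastSingletonDocID + BitUtil.zigZagDecode(code >>> 1); otherwise docStartFP advances by
// (code >>> 1) and the singleton docID, when docFreq == 1, follows as a vint.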
@Override
public void close() throws IOException {
// TODO: add a finish() at least to PushBase? DV too...?
boolean success = false;
try {
if (docOut != null) {
CodecUtil.writeFooter(docOut);
}
if (posOut != null) {
CodecUtil.writeFooter(posOut);
}
if (payOut != null) {
CodecUtil.writeFooter(payOut);
}
if (metaOut != null) {
metaOut.writeInt(maxNumImpactsAtLevel0);
metaOut.writeInt(maxImpactNumBytesAtLevel0);
metaOut.writeInt(maxNumImpactsAtLevel1);
metaOut.writeInt(maxImpactNumBytesAtLevel1);
metaOut.writeLong(docOut.getFilePointer());
if (posOut != null) {
metaOut.writeLong(posOut.getFilePointer());
if (payOut != null) {
metaOut.writeLong(payOut.getFilePointer());
}
}
CodecUtil.writeFooter(metaOut);
}
success = true;
} finally {
if (success) {
IOUtils.close(metaOut, docOut, posOut, payOut);
} else {
IOUtils.closeWhileHandlingException(metaOut, docOut, posOut, payOut);
}
metaOut = docOut = posOut = payOut = null;
}
}
}

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.codecs.lucene912;
import java.io.IOException;
import java.util.Arrays;

View File

@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
import java.io.IOException;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
/** Utility class to encode/decode postings block. */
final class PostingsUtil {
/**
* Read values that have been written using variable-length encoding and group-varint encoding
* instead of bit-packing.
*/
static void readVIntBlock(
IndexInput docIn,
long[] docBuffer,
long[] freqBuffer,
int num,
boolean indexHasFreq,
boolean decodeFreq)
throws IOException {
docIn.readGroupVInts(docBuffer, num);
if (indexHasFreq && decodeFreq) {
for (int i = 0; i < num; ++i) {
freqBuffer[i] = docBuffer[i] & 0x01;
docBuffer[i] >>= 1;
if (freqBuffer[i] == 0) {
freqBuffer[i] = docIn.readVInt();
}
}
} else if (indexHasFreq) {
for (int i = 0; i < num; ++i) {
docBuffer[i] >>= 1;
}
}
}
/** Write freq buffer with variable-length encoding and doc buffer with group-varint encoding. */
static void writeVIntBlock(
DataOutput docOut, long[] docBuffer, long[] freqBuffer, int num, boolean writeFreqs)
throws IOException {
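// Freqs are folded into the doc deltas: the low bit is 1 when freq == 1 (the common case),
// otherwise 0, with the actual freq written as a trailing vint after the group-varint block.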
if (writeFreqs) {
for (int i = 0; i < num; i++) {
docBuffer[i] = (docBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
}
}
docOut.writeGroupVInts(docBuffer, num);
if (writeFreqs) {
for (int i = 0; i < num; i++) {
final int freq = (int) freqBuffer[i];
if (freq != 1) {
docOut.writeVInt(freq);
}
}
}
}
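// Hedged round-trip sketch (illustrative only; `out`/`in` are assumed to be a DataOutput and an
// IndexInput over the same bytes):
//
//   long[] docs = {3, 2, 7};   // doc deltas
//   long[] freqs = {1, 5, 1};  // freqs; 1 is the common case and costs no extra byte
//   writeVIntBlock(out, docs, freqs, 3, true);     // encodes docs in place, then writes
//   readVIntBlock(in, docs, freqs, 3, true, true); // restores the original deltas and freqs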
}

View File

@ -40,7 +40,7 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.codecs.lucene912;
import java.io.IOException;
import org.apache.lucene.store.DataInput;

View File

@ -0,0 +1,431 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Lucene 9.12 file format.
*
* <h2>Apache Lucene - Index File Formats</h2>
*
* <div>
*
* <ul>
* <li><a href="#Introduction">Introduction</a>
* <li><a href="#Definitions">Definitions</a>
* <ul>
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
* <li><a href="#Types_of_Fields">Types of Fields</a>
* <li><a href="#Segments">Segments</a>
* <li><a href="#Document_Numbers">Document Numbers</a>
* </ul>
* <li><a href="#Overview">Index Structure Overview</a>
* <li><a href="#File_Naming">File Naming</a>
* <li><a href="#file-names">Summary of File Extensions</a>
* <ul>
* <li><a href="#Lock_File">Lock File</a>
* <li><a href="#History">History</a>
* <li><a href="#Limitations">Limitations</a>
* </ul>
* </ul>
*
* </div> <a id="Introduction"></a>
*
* <h3>Introduction</h3>
*
* <div>
*
* <p>This document defines the index file formats used in this version of Lucene. If you are using
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
* with the version you are using.
*
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
* </div> <a id="Definitions"></a>
*
* <h3>Definitions</h3>
*
* <div>
*
* <p>The fundamental concepts in Lucene are index, document, field and term.
*
* <p>An index contains a sequence of documents.
*
* <ul>
* <li>A document is a sequence of fields.
* <li>A field is a named sequence of terms.
* <li>A term is a sequence of bytes.
* </ul>
*
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
* are represented as a pair: the string naming the field, and the bytes within the field. <a
* id="Inverted_Indexing"></a>
*
* <h4>Inverted Indexing</h4>
*
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
* search more efficient. Lucene's terms index falls into the family of indexes known as an
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
* This is the inverse of the natural relationship, in which documents list terms. <a
* id="Types_of_Fields"></a>
*
* <h4>Types of Fields</h4>
*
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
* may be both stored and indexed.
*
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
* useful for certain identifier fields to be indexed literally.
*
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
* Fields. <a id="Segments"></a>
*
* <h4>Segments</h4>
*
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
* fully independent index, which could be searched separately. Indexes evolve by:
*
* <ol>
* <li>Creating new segments for newly added documents.
* <li>Merging existing segments.
* </ol>
*
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
* composed of a set of segments. <a id="Document_Numbers"></a>
*
* <h4>Document Numbers</h4>
*
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
* document added to an index is numbered zero, and each subsequent document added gets a number one
* greater than the previous.
*
* <p>Note that a document's number may change, so caution should be taken when storing these
* numbers outside of Lucene. In particular, numbers may change in the following situations:
*
* <ul>
* <li>
* <p>The numbers stored in each segment are unique only within the segment, and must be
* converted before they can be used in a larger context. The standard technique is to
* allocate each segment a range of values, based on the range of numbers used in that
* segment. To convert a document number from a segment to an external value, the segment's
* <i>base</i> document number is added. To convert an external value back to a
* segment-specific value, the segment is identified by the range that the external value is
* in, and the segment's base value is subtracted. For example two five document segments
* might be combined, so that the first segment has a base value of zero, and the second of
* five. Document three from the second segment would have an external value of eight.
* <li>
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
* removed as the index evolves through merging. Deleted documents are dropped when segments
* are merged. A freshly-merged segment thus has no gaps in its numbering.
* </ul>
*
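* <p>For illustration only (names hypothetical, not part of any file format), the base-value
* conversion described above amounts to:
*
* <pre>{@code
* int globalDoc = segmentDocBase + localDoc;   // e.g. base 5 + local 3 = global 8
* int localAgain = globalDoc - segmentDocBase; // back to the segment-local number
* }</pre>
*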
* </div> <a id="Overview"></a>
*
* <h3>Index Structure Overview</h3>
*
* <div>
*
* <p>Each segment index maintains the following:
*
* <ul>
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
* contains metadata about a segment, such as the number of documents, what files it uses, and
* information about how the segment is sorted
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
* contains metadata about the set of named fields used in the index.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
* This contains, for each document, a list of attribute-value pairs, where the attributes are
* field names. These are used to store auxiliary information about the document, such as its
* title, url, or an identifier to access a database. The set of stored fields are what is
* returned for each hit when searching. This is keyed by document number.
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A
* dictionary containing all of the terms used in all of the indexed fields of all of the
* documents. The dictionary also contains the number of documents which contain the term, and
* pointers to the term's frequency and proximity data.
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For
* each term in the dictionary, the numbers of all the documents that contain that term, and
* the frequency of the term in that document, unless frequencies are omitted ({@link
* org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For
* each term in the dictionary, the positions that the term occurs in each document. Note that
* this will not exist if all fields in all documents omit position data.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
* each field in each document, a value is stored that is multiplied into the score for hits
* on that field.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
* field in each document, the term vector (sometimes called document vector) may be stored. A
* term vector consists of term text and term frequency. To add Term Vectors to your index see
* the {@link org.apache.lucene.document.Field Field} constructors
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
* stored values, these are also keyed by document number, but are generally intended to be
* loaded into main memory for fast access. Whereas stored values are generally intended for
* summary results from searches, per-document values are useful for things like scoring
* factors.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
* optional file indicating which documents are live.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
* intersection (2D, 3D).
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
* vector format stores numeric vectors in a format optimized for random access and
* computation, supporting high-dimensional nearest-neighbor search.
* </ul>
*
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
*
* <h3>File Naming</h3>
*
* <div>
*
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
* correspond to the different file formats described below. When using the Compound File format
* (default for small segments) these files (except for the Segment info file, the Lock file, and
* Deleted documents file) are collapsed into a single .cfs file (see below for details)
*
* <p>Typically, all segments in an index are stored in a single directory, although this is not
* required.
*
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
* never before used filename. This is achieved using a simple generations approach. For example,
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
*
* <h3>Summary of File Extensions</h3>
*
* <div>
*
* <p>The following table summarizes the names and extensions of the files in Lucene:
*
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
* <caption>lucene filenames by extension</caption>
* <tr>
* <th>Name</th>
* <th>Extension</th>
* <th>Brief Description</th>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
* <td>segments_N</td>
* <td>Stores information about a commit point</td>
* </tr>
* <tr>
* <td><a href="#Lock_File">Lock File</a></td>
* <td>write.lock</td>
* <td>The Write lock prevents multiple IndexWriters from writing to the same
* file.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}</td>
* <td>.si</td>
* <td>Stores metadata about a segment</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
* <td>.cfs, .cfe</td>
* <td>An optional "virtual" file consisting of all the other index files for
* systems that frequently run out of file handles.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
* <td>.fnm</td>
* <td>Stores information about the fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
* <td>.fdx</td>
* <td>Contains pointers to field data</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
* <td>.fdt</td>
* <td>The stored fields for documents</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary}</td>
* <td>.tim</td>
* <td>The term dictionary, stores term info</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index}</td>
* <td>.tip</td>
* <td>The index into the Term Dictionary</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies}</td>
* <td>.doc</td>
* <td>Contains the list of docs which contain each term along with frequency</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions}</td>
* <td>.pos</td>
* <td>Stores position information about where a term occurs in the index</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads}</td>
* <td>.pay</td>
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
* <td>.nvd, .nvm</td>
* <td>Encodes length and boost factors for docs and fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
* <td>.dvd, .dvm</td>
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
* <td>.tvx</td>
* <td>Stores offset into the document data file</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
* <td>.tvd</td>
* <td>Contains term vector data.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
* <td>.liv</td>
* <td>Info about what documents are live</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
* <td>.dii, .dim</td>
* <td>Holds indexed points</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}</td>
* <td>.vec, .vem, .veq, .vex</td>
* <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data,
* <code>.vem</code> the vector metadata, <code>.veq</code> the quantized vector data, and <code>.vex</code> the
* hnsw graph data.</td>
* </tr>
* </table>
*
* </div> <a id="Lock_File"></a>
*
* <h3>Lock File</h3>
*
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
* lock directory is different from the index directory then the write lock will be named
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
* directory. When this file is present, a writer is currently modifying the index (adding or
* removing documents). This lock file ensures that only one writer is modifying the index at a
* time. <a id="History"></a>
*
* <h3>History</h3>
*
* <p>Compatibility notes are provided in this document, describing how file formats have changed
* from prior versions:
*
* <ul>
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
* written in the new file format (meaning no specific "upgrade" process is needed). But note
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
* store (vectors &amp; stored fields) files. This allows for faster indexing in certain
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
* change in 2.1).
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
* details.
* <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData may be passed to
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
* details. Also, diagnostics were added to each segment written recording details about why
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
* read, but on merge the new segment will write them, uncompressed). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
* <li>In version 3.1, segments records the code version that created them. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
* Additionally segments track explicitly whether or not they have term vectors. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
* <li>In version 3.2, numeric fields are written as natively to stored fields file, previously
* they were stored in text format only.
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
* was introduced. Normalization factors need no longer be a single byte, they can be any
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
* the postings lists. Payloads can be stored in the term vectors.
* <li>In version 4.1, the format of the postings list changed to use either of FOR compression or
* variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
* were changed to inline directly into the term dictionary. Stored fields are compressed by
* default.
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
* type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
* allow updating NumericDocValues fields.
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
* checksum of the file.
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
* suitable for faceting/sorting/analytics.
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
* for binary fields and ord indexes for multi-valued fields.
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index
* sorting.
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
* an iterator API.
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
* if they may not produce high enough scores. Additionally doc values and norms has been
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
* elements to skip when advancing in the data.
* <li>In version 8.4, postings, positions, offsets and payload lengths have move to a more
* performant encoding that is vectorized.
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
* user-defined sorts to be used
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
* smaller stored fields.
* <li>In version 9.0, vector-valued fields were added.
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
* <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
* IndexDISI. ordToDoc mappings was added to .vem.
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
* layer and not writing the node ids for the zeroth layer.
* <li>In version 9.9, Vector scalar quantization support was added. Allowing the HNSW vector
* format to utilize int8 quantized vectors for float32 vector search.
* <li>In version 9.12, skip data was refactored to have only two levels: every 128 docs and every
* 4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that
* need skipping, especially conjunctions.
* </ul>
*
* <a id="Limitations"></a>
*
* <h3>Limitations</h3>
*
* <div>
*
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
* index file format and the current implementation. Eventually these should be replaced with either
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
* VInt} values which have no limit. </div>
*/
package org.apache.lucene.codecs.lucene912;

View File

@ -15,414 +15,5 @@
* limitations under the License.
*/
/**
* Lucene 9.9 file format.
*
* <h2>Apache Lucene - Index File Formats</h2>
*
* <div>
*
* <ul>
* <li><a href="#Introduction">Introduction</a>
* <li><a href="#Definitions">Definitions</a>
* <ul>
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
* <li><a href="#Types_of_Fields">Types of Fields</a>
* <li><a href="#Segments">Segments</a>
* <li><a href="#Document_Numbers">Document Numbers</a>
* </ul>
* <li><a href="#Overview">Index Structure Overview</a>
* <li><a href="#File_Naming">File Naming</a>
* <li><a href="#file-names">Summary of File Extensions</a>
* <ul>
* <li><a href="#Lock_File">Lock File</a>
* <li><a href="#History">History</a>
* <li><a href="#Limitations">Limitations</a>
* </ul>
* </ul>
*
* </div> <a id="Introduction"></a>
*
* <h3>Introduction</h3>
*
* <div>
*
* <p>This document defines the index file formats used in this version of Lucene. If you are using
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
* with the version you are using.
*
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
* </div> <a id="Definitions"></a>
*
* <h3>Definitions</h3>
*
* <div>
*
* <p>The fundamental concepts in Lucene are index, document, field and term.
*
* <p>An index contains a sequence of documents.
*
* <ul>
* <li>A document is a sequence of fields.
* <li>A field is a named sequence of terms.
* <li>A term is a sequence of bytes.
* </ul>
*
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
* are represented as a pair: the string naming the field, and the bytes within the field. <a
* id="Inverted_Indexing"></a>
*
* <h4>Inverted Indexing</h4>
*
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
* search more efficient. Lucene's terms index falls into the family of indexes known as an
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
* This is the inverse of the natural relationship, in which documents list terms. <a
* id="Types_of_Fields"></a>
*
* <h4>Types of Fields</h4>
*
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
* may be both stored and indexed.
*
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
* useful for certain identifier fields to be indexed literally.
*
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
* Fields. <a id="Segments"></a>
*
* <h4>Segments</h4>
*
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
* fully independent index, which could be searched separately. Indexes evolve by:
*
* <ol>
* <li>Creating new segments for newly added documents.
* <li>Merging existing segments.
* </ol>
*
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
* composed of a set of segments. <a id="Document_Numbers"></a>
*
* <h4>Document Numbers</h4>
*
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
* document added to an index is numbered zero, and each subsequent document added gets a number one
* greater than the previous.
*
* <p>Note that a document's number may change, so caution should be taken when storing these
* numbers outside of Lucene. In particular, numbers may change in the following situations:
*
* <ul>
* <li>
* <p>The numbers stored in each segment are unique only within the segment, and must be
* converted before they can be used in a larger context. The standard technique is to
* allocate each segment a range of values, based on the range of numbers used in that
* segment. To convert a document number from a segment to an external value, the segment's
* <i>base</i> document number is added. To convert an external value back to a
* segment-specific value, the segment is identified by the range that the external value is
* in, and the segment's base value is subtracted. For example two five document segments
* might be combined, so that the first segment has a base value of zero, and the second of
* five. Document three from the second segment would have an external value of eight.
* <li>
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
* removed as the index evolves through merging. Deleted documents are dropped when segments
* are merged. A freshly-merged segment thus has no gaps in its numbering.
* </ul>
*
* </div> <a id="Overview"></a>
*
* <h3>Index Structure Overview</h3>
*
* <div>
*
* <p>Each segment index maintains the following:
*
* <ul>
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
* contains metadata about a segment, such as the number of documents, what files it uses, and
* information about how the segment is sorted
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
* contains metadata about the set of named fields used in the index.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
* This contains, for each document, a list of attribute-value pairs, where the attributes are
* field names. These are used to store auxiliary information about the document, such as its
* title, url, or an identifier to access a database. The set of stored fields are what is
* returned for each hit when searching. This is keyed by document number.
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Term dictionary}. A
* dictionary containing all of the terms used in all of the indexed fields of all of the
* documents. The dictionary also contains the number of documents which contain the term, and
* pointers to the term's frequency and proximity data.
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Term Frequency data}. For
* each term in the dictionary, the numbers of all the documents that contain that term, and
* the frequency of the term in that document, unless frequencies are omitted ({@link
* org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Term Proximity data}. For
* each term in the dictionary, the positions that the term occurs in each document. Note that
* this will not exist if all fields in all documents omit position data.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
* each field in each document, a value is stored that is multiplied into the score for hits
* on that field.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
* field in each document, the term vector (sometimes called document vector) may be stored. A
* term vector consists of term text and term frequency. To add Term Vectors to your index see
* the {@link org.apache.lucene.document.Field Field} constructors
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
* stored values, these are also keyed by document number, but are generally intended to be
* loaded into main memory for fast access. Whereas stored values are generally intended for
* summary results from searches, per-document values are useful for things like scoring
* factors.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
* optional file indicating which documents are live.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
* intersection (2D, 3D).
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
* vector format stores numeric vectors in a format optimized for random access and
* computation, supporting high-dimensional nearest-neighbor search.
* </ul>
*
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
*
* <h3>File Naming</h3>
*
* <div>
*
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
* correspond to the different file formats described below. When using the Compound File format
* (default for small segments) these files (except for the Segment info file, the Lock file, and
* Deleted documents file) are collapsed into a single .cfs file (see below for details)
*
* <p>Typically, all segments in an index are stored in a single directory, although this is not
* required.
*
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
* never before used filename. This is achieved using a simple generations approach. For example,
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
*
* <h3>Summary of File Extensions</h3>
*
* <div>
*
* <p>The following table summarizes the names and extensions of the files in Lucene:
*
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
* <caption>lucene filenames by extension</caption>
* <tr>
* <th>Name</th>
* <th>Extension</th>
* <th>Brief Description</th>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
* <td>segments_N</td>
* <td>Stores information about a commit point</td>
* </tr>
* <tr>
* <td><a href="#Lock_File">Lock File</a></td>
* <td>write.lock</td>
 * <td>The Write lock prevents multiple IndexWriters from writing to the same
 * index.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}</td>
* <td>.si</td>
* <td>Stores metadata about a segment</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
* <td>.cfs, .cfe</td>
* <td>An optional "virtual" file consisting of all the other index files for
* systems that frequently run out of file handles.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
* <td>.fnm</td>
* <td>Stores information about the fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
* <td>.fdx</td>
* <td>Contains pointers to field data</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
* <td>.fdt</td>
* <td>The stored fields for documents</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Term Dictionary}</td>
* <td>.tim</td>
* <td>The term dictionary, stores term info</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Term Index}</td>
* <td>.tip</td>
* <td>The index into the Term Dictionary</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Frequencies}</td>
* <td>.doc</td>
* <td>Contains the list of docs which contain each term along with frequency</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Positions}</td>
* <td>.pos</td>
* <td>Stores position information about where a term occurs in the index</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Payloads}</td>
* <td>.pay</td>
 * <td>Stores additional per-position metadata such as character offsets and user payloads</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
* <td>.nvd, .nvm</td>
* <td>Encodes length and boost factors for docs and fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
* <td>.dvd, .dvm</td>
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
* <td>.tvx</td>
* <td>Stores offset into the document data file</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
* <td>.tvd</td>
* <td>Contains term vector data.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
* <td>.liv</td>
* <td>Info about what documents are live</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
* <td>.dii, .dim</td>
* <td>Holds indexed points</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}</td>
 * <td>.vec, .vem, .veq, .vex</td>
 * <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data,
 * <code>.vem</code> the vector metadata, <code>.veq</code> the quantized vector data, and <code>.vex</code> the
 * HNSW graph data.</td>
* </tr>
* </table>
*
* </div> <a id="Lock_File"></a>
*
* <h3>Lock File</h3>
*
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
* lock directory is different from the index directory then the write lock will be named
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
* directory. When this file is present, a writer is currently modifying the index (adding or
* removing documents). This lock file ensures that only one writer is modifying the index at a
* time. <a id="History"></a>
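 *
 * <p>As a usage sketch (the directory path is arbitrary), the single-writer guarantee can be
 * observed by attempting to open a second writer on the same directory:
 *
 * <pre>{@code
 * Directory dir = FSDirectory.open(Paths.get("/tmp/index")); // write.lock is created inside
 * IndexWriter first = new IndexWriter(dir, new IndexWriterConfig());
 * // While "first" holds write.lock, opening a second writer fails:
 * // new IndexWriter(dir, new IndexWriterConfig()) -> LockObtainFailedException
 * first.close(); // releases write.lock so another writer may open
 * }</pre>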
*
* <h3>History</h3>
*
* <p>Compatibility notes are provided in this document, describing how file formats have changed
* from prior versions:
*
* <ul>
 * <li>In version 2.1, the file format was changed to allow lock-less commits (i.e., no more commit
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
* written in the new file format (meaning no specific "upgrade" process is needed). But note
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
* store (vectors &amp; stored fields) files. This allows for faster indexing in certain
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
* change in 2.1).
 * <li>In version 2.4, Strings are now written as a true UTF-8 byte sequence, not Java's modified
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
* details.
* <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData may be passed to
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
* details. Also, diagnostics were added to each segment written recording details about why
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
* read, but on merge the new segment will write them, uncompressed). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
* <li>In version 3.1, segments records the code version that created them. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
* Additionally segments track explicitly whether or not they have term vectors. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
 * <li>In version 3.2, numeric fields are now written natively to the stored fields file;
 * previously they were stored in text format only.
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
* was introduced. Normalization factors need no longer be a single byte, they can be any
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
* the postings lists. Payloads can be stored in the term vectors.
 * <li>In version 4.1, the format of the postings list changed to use either FOR compression or
 * variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
 * are now inlined directly into the term dictionary. Stored fields are compressed by
 * default.
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
* type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
* allow updating NumericDocValues fields.
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
* checksum of the file.
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
* suitable for faceting/sorting/analytics.
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
* for binary fields and ord indexes for multi-valued fields.
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
 * <li>In version 6.2, a new segment info format was added that reads/writes the index sort, to
 * support index sorting.
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
* an iterator API.
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
 * if they may not produce high enough scores. Additionally, doc values and norms have been
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
* elements to skip when advancing in the data.
 * <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
 * performant encoding that is vectorized.
 * <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
 * user-defined sorts to be used.
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
* smaller stored fields.
* <li>In version 9.0, vector-valued fields were added.
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
 * <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
 * IndexDISI. An ordToDoc mapping was added to .vem.
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
* layer and not writing the node ids for the zeroth layer.
 * <li>In version 9.9, vector scalar quantization support was added, allowing the HNSW vector
 * format to use int8-quantized vectors for float32 vector search.
* </ul>
*
* <a id="Limitations"></a>
*
* <h3>Limitations</h3>
*
* <div>
*
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
* index file format and the current implementation. Eventually these should be replaced with either
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
* VInt} values which have no limit. </div>
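 *
 * <p>As a simplified illustration of why VInt values have no fixed limit (this is a sketch, not
 * Lucene's actual implementation): each byte carries 7 payload bits plus a continuation bit, so
 * the encoding grows by one byte per additional 7 bits of magnitude:
 *
 * <pre>{@code
 * static void writeVInt(java.io.OutputStream out, int i) throws java.io.IOException {
 *   while ((i & ~0x7F) != 0) {      // more than 7 bits remain
 *     out.write((i & 0x7F) | 0x80); // low 7 bits, continuation bit set
 *     i >>>= 7;
 *   }
 *   out.write(i);                   // final byte, continuation bit clear
 * }
 * }</pre>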
*/
/** Lucene 9.9 file format. */
package org.apache.lucene.codecs.lucene99;

View File

@ -21,8 +21,8 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
@ -399,10 +399,10 @@ public class PhraseQuery extends Query {
/**
* A guess of the average number of simple operations for the initial seek and buffer refill per
* document for the positions of a term. See also {@link
* Lucene99PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
* Lucene912PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
*
* <p>Aside: Instead of being constant this could depend among others on {@link
* Lucene99PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
* Lucene912PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
* TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs),
* {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block
* size of the device storing the index.
@ -411,7 +411,7 @@ public class PhraseQuery extends Query {
/**
* Number of simple operations in {@link
* Lucene99PostingsReader.BlockImpactsPostingsEnum#nextPosition()} when no seek or buffer refill
* Lucene912PostingsReader.BlockImpactsPostingsEnum#nextPosition()} when no seek or buffer refill
* is done.
*/
private static final int TERM_OPS_PER_POS = 7;

View File

@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.lucene.codecs.lucene99.Lucene99Codec
org.apache.lucene.codecs.lucene912.Lucene912Codec

View File

@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat

View File

@ -18,7 +18,7 @@ package org.apache.lucene.codecs.lucene90;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
@ -31,7 +31,7 @@ import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase;
public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
@Override
protected Codec getCodec() {
return new Lucene99Codec(Lucene99Codec.Mode.BEST_COMPRESSION);
return new Lucene912Codec(Lucene912Codec.Mode.BEST_COMPRESSION);
}
/**
@ -42,7 +42,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie
for (int i = 0; i < 10; i++) {
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setCodec(
new Lucene99Codec(RandomPicks.randomFrom(random(), Lucene99Codec.Mode.values())));
new Lucene912Codec(RandomPicks.randomFrom(random(), Lucene912Codec.Mode.values())));
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig());
Document doc = new Document();
doc.add(new StoredField("field1", "value1"));
@ -72,7 +72,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie
expectThrows(
NullPointerException.class,
() -> {
new Lucene99Codec(null);
new Lucene912Codec(null);
});
expectThrows(

View File

@ -0,0 +1,157 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader.MutableImpactList;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.index.BasePostingsFormatTestCase;
import org.apache.lucene.tests.util.TestUtil;
public class TestLucene912PostingsFormat extends BasePostingsFormatTestCase {
@Override
protected Codec getCodec() {
return TestUtil.alwaysPostingsFormat(new Lucene912PostingsFormat());
}
public void testVInt15() throws IOException {
byte[] bytes = new byte[5];
ByteArrayDataOutput out = new ByteArrayDataOutput(bytes);
ByteArrayDataInput in = new ByteArrayDataInput();
for (int i : new int[] {0, 1, 127, 128, 32767, 32768, Integer.MAX_VALUE}) {
out.reset(bytes);
Lucene912PostingsWriter.writeVInt15(out, i);
in.reset(bytes, 0, out.getPosition());
assertEquals(i, Lucene912PostingsReader.readVInt15(in));
assertEquals(out.getPosition(), in.getPosition());
}
}
public void testVLong15() throws IOException {
byte[] bytes = new byte[9];
ByteArrayDataOutput out = new ByteArrayDataOutput(bytes);
ByteArrayDataInput in = new ByteArrayDataInput();
for (long i : new long[] {0, 1, 127, 128, 32767, 32768, Integer.MAX_VALUE, Long.MAX_VALUE}) {
out.reset(bytes);
Lucene912PostingsWriter.writeVLong15(out, i);
in.reset(bytes, 0, out.getPosition());
assertEquals(i, Lucene912PostingsReader.readVLong15(in));
assertEquals(out.getPosition(), in.getPosition());
}
}
/** Make sure the final sub-block(s) are not skipped. */
public void testFinalBlock() throws Exception {
Directory d = newDirectory();
IndexWriter w = new IndexWriter(d, new IndexWriterConfig(new MockAnalyzer(random())));
for (int i = 0; i < 25; i++) {
Document doc = new Document();
doc.add(newStringField("field", Character.toString((char) (97 + i)), Field.Store.NO));
doc.add(newStringField("field", "z" + Character.toString((char) (97 + i)), Field.Store.NO));
w.addDocument(doc);
}
w.forceMerge(1);
DirectoryReader r = DirectoryReader.open(w);
assertEquals(1, r.leaves().size());
FieldReader field = (FieldReader) r.leaves().get(0).reader().terms("field");
// We should see exactly two blocks: one root block (prefix empty string) and one block for z*
// terms (prefix z):
Stats stats = field.getStats();
assertEquals(0, stats.floorBlockCount);
assertEquals(2, stats.nonFloorBlockCount);
r.close();
w.close();
d.close();
}
public void testImpactSerialization() throws IOException {
// omit norms and omit freqs
doTestImpactSerialization(Collections.singletonList(new Impact(1, 1L)));
// omit freqs
doTestImpactSerialization(Collections.singletonList(new Impact(1, 42L)));
// omit freqs with very large norms
doTestImpactSerialization(Collections.singletonList(new Impact(1, -100L)));
// omit norms
doTestImpactSerialization(Collections.singletonList(new Impact(30, 1L)));
// omit norms with large freq
doTestImpactSerialization(Collections.singletonList(new Impact(500, 1L)));
// freqs and norms, basic
doTestImpactSerialization(
Arrays.asList(
new Impact(1, 7L),
new Impact(3, 9L),
new Impact(7, 10L),
new Impact(15, 11L),
new Impact(20, 13L),
new Impact(28, 14L)));
// freqs and norms, high values
doTestImpactSerialization(
Arrays.asList(
new Impact(2, 2L),
new Impact(10, 10L),
new Impact(12, 50L),
new Impact(50, -100L),
new Impact(1000, -80L),
new Impact(1005, -3L)));
}
private void doTestImpactSerialization(List<Impact> impacts) throws IOException {
CompetitiveImpactAccumulator acc = new CompetitiveImpactAccumulator();
for (Impact impact : impacts) {
acc.add(impact.freq, impact.norm);
}
try (Directory dir = newDirectory()) {
try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
Lucene912PostingsWriter.writeImpacts(acc.getCompetitiveFreqNormPairs(), out);
}
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
byte[] b = new byte[Math.toIntExact(in.length())];
in.readBytes(b, 0, b.length);
List<Impact> impacts2 =
Lucene912PostingsReader.readImpacts(
new ByteArrayDataInput(b),
new MutableImpactList(impacts.size() + random().nextInt(3)));
assertEquals(impacts, impacts2);
}
}
}
}

View File

@ -29,6 +29,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.KnnFloatVectorField;
@ -79,7 +80,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
@Override
protected Codec getCodec() {
return new Lucene99Codec() {
return new Lucene912Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return format;
@ -95,7 +96,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
dir,
newIndexWriterConfig()
.setCodec(
new Lucene99Codec() {
new Lucene912Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return new Lucene99HnswScalarQuantizedVectorsFormat(

View File

@ -27,6 +27,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -51,7 +52,7 @@ import org.apache.lucene.util.quantization.ScalarQuantizer;
public class TestLucene99ScalarQuantizedVectorScorer extends LuceneTestCase {
private static Codec getCodec(int bits, boolean compress) {
return new Lucene99Codec() {
return new Lucene912Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return new Lucene99HnswScalarQuantizedVectorsFormat(

View File

@ -28,6 +28,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.KnnFloatVectorField;
@ -67,7 +68,7 @@ public class TestLucene99ScalarQuantizedVectorsFormat extends BaseKnnVectorsForm
@Override
protected Codec getCodec() {
return new Lucene99Codec() {
return new Lucene912Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return format;

View File

@ -1,195 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FilterIndexInput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.apache.lucene.tests.store.MockDirectoryWrapper;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.BytesRef;
/** Tests lazy skipping on the proximity file. */
public class TestLazyProxSkipping extends LuceneTestCase {
private IndexSearcher searcher;
private int seeksCounter = 0;
private String field = "tokens";
private String term1 = "xx";
private String term2 = "yy";
private String term3 = "zz";
private class SeekCountingDirectory extends MockDirectoryWrapper {
public SeekCountingDirectory(Directory delegate) {
super(random(), delegate);
}
@Override
public IndexInput openInput(String name, IOContext context) throws IOException {
IndexInput ii = super.openInput(name, context);
if (name.endsWith(".prx") || name.endsWith(".pos")) {
// we decorate the proxStream with a wrapper class that allows us to count the number of
// calls to seek()
ii = new SeeksCountingStream(ii);
}
return ii;
}
}
private void createIndex(int numHits) throws IOException {
int numDocs = 500;
final Analyzer analyzer =
new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, true));
}
};
Directory directory = new SeekCountingDirectory(new ByteBuffersDirectory());
// note: test explicitly disables payloads
IndexWriter writer =
new IndexWriter(
directory,
newIndexWriterConfig(analyzer)
.setMaxBufferedDocs(10)
.setMergePolicy(newLogMergePolicy(false)));
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
String content;
if (i % (numDocs / numHits) == 0) {
// add a document that matches the query "term1 term2"
content = this.term1 + " " + this.term2;
} else if (i % 15 == 0) {
// add a document that only contains term1
content = this.term1 + " " + this.term1;
} else {
// add a document that contains term2 but not term1
content = this.term3 + " " + this.term2;
}
doc.add(newTextField(this.field, content, Field.Store.YES));
writer.addDocument(doc);
}
// make sure the index has only a single segment
writer.forceMerge(1);
writer.close();
LeafReader reader = getOnlyLeafReader(DirectoryReader.open(directory));
this.searcher = newSearcher(reader);
}
private ScoreDoc[] search() throws IOException {
// create PhraseQuery "term1 term2" and search
PhraseQuery pq = new PhraseQuery(field, term1, term2);
return this.searcher.search(pq, 1000).scoreDocs;
}
private void performTest(int numHits) throws IOException {
createIndex(numHits);
this.seeksCounter = 0;
ScoreDoc[] hits = search();
// verify that the right number of docs was found
assertEquals(numHits, hits.length);
// check that the number of calls to seek() does not exceed the number of hits
assertTrue(this.seeksCounter > 0);
assertTrue(
"seeksCounter=" + this.seeksCounter + " numHits=" + numHits,
this.seeksCounter <= numHits + 1);
searcher.getIndexReader().close();
}
public void testLazySkipping() throws IOException {
final String fieldFormat = TestUtil.getPostingsFormat(this.field);
assumeFalse("This test cannot run with Direct postings format", fieldFormat.equals("Direct"));
assumeFalse(
"This test cannot run with SimpleText postings format", fieldFormat.equals("SimpleText"));
// test whether only the minimum number of seeks is performed
performTest(5);
performTest(10);
}
public void testSeek() throws IOException {
Directory directory = newDirectory();
IndexWriter writer =
new IndexWriter(directory, newIndexWriterConfig(new MockAnalyzer(random())));
for (int i = 0; i < 10; i++) {
Document doc = new Document();
doc.add(newTextField(this.field, "a b", Field.Store.YES));
writer.addDocument(doc);
}
writer.close();
IndexReader reader = DirectoryReader.open(directory);
PostingsEnum tp = MultiTerms.getTermPostingsEnum(reader, this.field, new BytesRef("b"));
for (int i = 0; i < 10; i++) {
tp.nextDoc();
assertEquals(tp.docID(), i);
assertEquals(tp.nextPosition(), 1);
}
tp = MultiTerms.getTermPostingsEnum(reader, this.field, new BytesRef("a"));
for (int i = 0; i < 10; i++) {
tp.nextDoc();
assertEquals(tp.docID(), i);
assertEquals(tp.nextPosition(), 0);
}
reader.close();
directory.close();
}
// Simply extends IndexInput in a way that we are able to count the number
// of invocations of seek()
class SeeksCountingStream extends FilterIndexInput {
SeeksCountingStream(IndexInput input) {
super("SeekCountingStream(" + input + ")", input);
}
@Override
public void seek(long pos) throws IOException {
TestLazyProxSkipping.this.seeksCounter++;
in.seek(pos);
}
@Override
public SeeksCountingStream clone() {
return new SeeksCountingStream(in.clone());
}
}
}

View File

@ -21,8 +21,8 @@ import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
@ -261,10 +261,10 @@ class TermIntervalsSource extends IntervalsSource {
/**
* A guess of the average number of simple operations for the initial seek and buffer refill per
* document for the positions of a term. See also {@link
* Lucene99PostingsReader.EverythingEnum#nextPosition()}.
* Lucene912PostingsReader.EverythingEnum#nextPosition()}.
*
* <p>Aside: Instead of being constant this could depend among others on {@link
* Lucene99PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
* Lucene912PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
* TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs),
* {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block
* size of the device storing the index.
@ -272,7 +272,7 @@ class TermIntervalsSource extends IntervalsSource {
private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
/**
* Number of simple operations in {@link Lucene99PostingsReader.EverythingEnum#nextPosition()}
* Number of simple operations in {@link Lucene912PostingsReader.EverythingEnum#nextPosition()}
* when no seek or buffer refill is done.
*/
private static final int TERM_OPS_PER_POS = 7;

View File

@ -31,7 +31,8 @@ module org.apache.lucene.suggest {
org.apache.lucene.search.suggest.document.Completion50PostingsFormat,
org.apache.lucene.search.suggest.document.Completion84PostingsFormat,
org.apache.lucene.search.suggest.document.Completion90PostingsFormat,
org.apache.lucene.search.suggest.document.Completion99PostingsFormat;
org.apache.lucene.search.suggest.document.Completion99PostingsFormat,
org.apache.lucene.search.suggest.document.Completion912PostingsFormat;
provides org.apache.lucene.analysis.TokenFilterFactory with
org.apache.lucene.search.suggest.analyzing.SuggestStopFilterFactory;
}

View File

@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest.document;
import org.apache.lucene.codecs.PostingsFormat;
/**
* {@link CompletionPostingsFormat} for {@link
* org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat}
*
* @lucene.experimental
*/
public class Completion912PostingsFormat extends CompletionPostingsFormat {
/** Creates a {@link Completion912PostingsFormat} that will load the completion FST on-heap. */
public Completion912PostingsFormat() {
this(FSTLoadMode.ON_HEAP);
}
/**
* Creates a {@link Completion912PostingsFormat} that will use the provided <code>fstLoadMode
* </code> to determine if the completion FST should be loaded on or off heap.
*/
public Completion912PostingsFormat(FSTLoadMode fstLoadMode) {
super("Completion912", fstLoadMode);
}
@Override
protected PostingsFormat delegatePostingsFormat() {
return PostingsFormat.forName("Lucene912");
}
}
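As a usage sketch (hedged: the OFF_HEAP load mode is an arbitrary choice here, and the wiring mirrors the TestSuggestField change further down), the format can be installed by wrapping the default codec:
Codec codec =
    new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) {
      // Serve the completion format for postings; everything else delegates to the default codec.
      final PostingsFormat postingsFormat =
          new Completion912PostingsFormat(CompletionPostingsFormat.FSTLoadMode.OFF_HEAP);
      @Override
      public PostingsFormat postingsFormat() {
        return postingsFormat;
      }
    };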

View File

@ -19,8 +19,10 @@ package org.apache.lucene.search.suggest.document;
import org.apache.lucene.codecs.PostingsFormat;
/**
* {@link CompletionPostingsFormat} for {@link
* org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat}
* {@link org.apache.lucene.search.suggest.document.CompletionPostingsFormat} for {@code
* org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat}. This format is only used for
 * backward-compatibility of the index format and cannot be used to write data; use {@link
 * Completion912PostingsFormat} on new indices.
*
* @lucene.experimental
*/

View File

@ -34,3 +34,4 @@ org.apache.lucene.search.suggest.document.Completion50PostingsFormat
org.apache.lucene.search.suggest.document.Completion84PostingsFormat
org.apache.lucene.search.suggest.document.Completion90PostingsFormat
org.apache.lucene.search.suggest.document.Completion99PostingsFormat
org.apache.lucene.search.suggest.document.Completion912PostingsFormat
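Once listed in this services file, the format becomes discoverable by name through Java's SPI machinery; a minimal sketch, assuming the suggest module is on the classpath:
// Resolves via META-INF/services; the name matches super("Completion912", ...) above.
PostingsFormat completion = PostingsFormat.forName("Completion912");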

View File

@ -951,7 +951,7 @@ public class TestSuggestField extends LuceneTestCase {
new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) {
final CompletionPostingsFormat.FSTLoadMode fstLoadMode =
RandomPicks.randomFrom(random(), CompletionPostingsFormat.FSTLoadMode.values());
final PostingsFormat postingsFormat = new Completion99PostingsFormat(fstLoadMode);
final PostingsFormat postingsFormat = new Completion912PostingsFormat(fstLoadMode);
@Override
public PostingsFormat postingsFormat() {

View File

@ -28,9 +28,9 @@ import org.apache.lucene.codecs.blockterms.FixedGapTermsIndexReader;
import org.apache.lucene.codecs.blockterms.FixedGapTermsIndexWriter;
import org.apache.lucene.codecs.blockterms.TermsIndexReaderBase;
import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; // javadocs
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
@ -38,7 +38,7 @@ import org.apache.lucene.index.SegmentWriteState;
// any PostingsFormat and make it ord-able...
/**
* Customized version of {@link Lucene99PostingsFormat} that uses {@link FixedGapTermsIndexWriter}.
* Customized version of {@link Lucene912PostingsFormat} that uses {@link FixedGapTermsIndexWriter}.
*/
public final class LuceneFixedGap extends PostingsFormat {
final int termIndexInterval;
@ -54,7 +54,7 @@ public final class LuceneFixedGap extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase docs = new Lucene99PostingsWriter(state);
PostingsWriterBase docs = new Lucene912PostingsWriter(state);
// TODO: should we make the terms index more easily
// pluggable? Ie so that this codec would record which
@ -91,7 +91,7 @@ public final class LuceneFixedGap extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postings = new Lucene99PostingsReader(state);
PostingsReaderBase postings = new Lucene912PostingsReader(state);
TermsIndexReaderBase indexReader;
boolean success = false;

View File

@ -29,9 +29,9 @@ import org.apache.lucene.codecs.blockterms.TermsIndexReaderBase;
import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase;
import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexReader;
import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; // javadocs
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
@ -39,7 +39,7 @@ import org.apache.lucene.index.SegmentWriteState;
// any PostingsFormat and make it ord-able...
/**
* Customized version of {@link Lucene99PostingsFormat} that uses {@link
* Customized version of {@link Lucene912PostingsFormat} that uses {@link
* VariableGapTermsIndexWriter} with a fixed interval, but forcing high docfreq terms to be indexed
* terms.
*/
@ -59,7 +59,7 @@ public final class LuceneVarGapDocFreqInterval extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase docs = new Lucene99PostingsWriter(state);
PostingsWriterBase docs = new Lucene912PostingsWriter(state);
// TODO: should we make the terms index more easily
// pluggable? Ie so that this codec would record which
@ -100,7 +100,7 @@ public final class LuceneVarGapDocFreqInterval extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postings = new Lucene99PostingsReader(state);
PostingsReaderBase postings = new Lucene912PostingsReader(state);
TermsIndexReaderBase indexReader;
boolean success = false;

View File

@ -29,9 +29,9 @@ import org.apache.lucene.codecs.blockterms.TermsIndexReaderBase;
import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase;
import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexReader;
import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; // javadocs
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
@ -39,7 +39,7 @@ import org.apache.lucene.index.SegmentWriteState;
// any PostingsFormat and make it ord-able...
/**
* Customized version of {@link Lucene99PostingsFormat} that uses {@link
* Customized version of {@link Lucene912PostingsFormat} that uses {@link
* VariableGapTermsIndexWriter} with a fixed interval.
*/
public final class LuceneVarGapFixedInterval extends PostingsFormat {
@ -56,7 +56,7 @@ public final class LuceneVarGapFixedInterval extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase docs = new Lucene99PostingsWriter(state);
PostingsWriterBase docs = new Lucene912PostingsWriter(state);
// TODO: should we make the terms index more easily
// pluggable? Ie so that this codec would record which
@ -95,7 +95,7 @@ public final class LuceneVarGapFixedInterval extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postings = new Lucene99PostingsReader(state);
PostingsReaderBase postings = new Lucene912PostingsReader(state);
TermsIndexReaderBase indexReader;
boolean success = false;

View File

@ -37,8 +37,8 @@ import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsReader;
import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.codecs.memory.FSTTermsReader;
import org.apache.lucene.codecs.memory.FSTTermsWriter;
import org.apache.lucene.index.FieldInfo;
@ -121,7 +121,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat {
random.nextInt(); // consume a random for buffersize
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
final FieldsConsumer fields;
final int t1 = random.nextInt(4);
@ -289,7 +289,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat {
System.out.println("MockRandomCodec: readBufferSize=" + readBufferSize);
}
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
final FieldsProducer fields;
final int t1 = random.nextInt(4);

View File

@ -23,8 +23,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.codecs.uniformsplit.BlockDecoder;
import org.apache.lucene.codecs.uniformsplit.BlockEncoder;
import org.apache.lucene.codecs.uniformsplit.IndexDictionary;
@ -67,7 +67,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState segmentWriteState) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(segmentWriteState);
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(segmentWriteState);
boolean success = false;
try {
FieldsConsumer fieldsConsumer = createFieldsConsumer(segmentWriteState, postingsWriter);
@ -145,7 +145,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState segmentReadState) throws IOException {
PostingsReaderBase postingsReader = new Lucene99PostingsReader(segmentReadState);
PostingsReaderBase postingsReader = new Lucene912PostingsReader(segmentReadState);
boolean success = false;
try {
FieldsProducer fieldsProducer = createFieldsProducer(segmentReadState, postingsReader);

View File

@ -38,7 +38,7 @@ import java.util.TimeZone;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.tests.codecs.asserting.AssertingCodec;
@ -188,9 +188,9 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
} else if ("Compressing".equals(TEST_CODEC)
|| ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
codec = CompressingCodec.randomInstance(random);
} else if ("Lucene99".equals(TEST_CODEC)
|| ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene99"))) {
codec = new Lucene99Codec(RandomPicks.randomFrom(random, Lucene99Codec.Mode.values()));
} else if ("Lucene912".equals(TEST_CODEC)
|| ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene912"))) {
codec = new Lucene912Codec(RandomPicks.randomFrom(random, Lucene912Codec.Mode.values()));
} else if (!"random".equals(TEST_CODEC)) {
codec = Codec.forName(TEST_CODEC);
} else if ("random".equals(TEST_POSTINGSFORMAT)) {

View File

@ -56,9 +56,9 @@ import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
import org.apache.lucene.document.BinaryDocValuesField;
@ -1231,7 +1231,7 @@ public final class TestUtil {
* different from {@link Codec#getDefault()} because that is randomized.
*/
public static Codec getDefaultCodec() {
return new Lucene99Codec();
return new Lucene912Codec();
}
/**
@ -1239,7 +1239,7 @@ public final class TestUtil {
* Lucene.
*/
public static PostingsFormat getDefaultPostingsFormat() {
return new Lucene99PostingsFormat();
return new Lucene912PostingsFormat();
}
/**
@ -1250,7 +1250,7 @@ public final class TestUtil {
*/
public static PostingsFormat getDefaultPostingsFormat(
int minItemsPerBlock, int maxItemsPerBlock) {
return new Lucene99PostingsFormat(minItemsPerBlock, maxItemsPerBlock);
return new Lucene912PostingsFormat(minItemsPerBlock, maxItemsPerBlock);
}
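For example, a test could force this format for all fields (a sketch; 25/48 are arbitrary block-tree bounds and newIndexWriterConfig is the LuceneTestCase helper):
IndexWriterConfig iwc =
    newIndexWriterConfig()
        .setCodec(TestUtil.alwaysPostingsFormat(TestUtil.getDefaultPostingsFormat(25, 48)));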
/** Returns a random postings format that supports term ordinals */