LUCENE-9322: Add Lucene90 codec, including VectorFormat

This commit adds support for dense floating-point VectorFields.
The new VectorValues class provides access to the indexed vectors.
Michael Sokolov 2020-09-26 07:11:04 -04:00
parent 85b58c262a
commit c02f07f2d5
81 changed files with 4567 additions and 918 deletions
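As a quick orientation, here is a minimal sketch of how the new API is meant to be used. The VectorField class is named in the commit message but its constructor shape, the DOT_PRODUCT enum constant, and the LeafReader#getVectorValues accessor are assumptions inferred from the diff below, not verified parts of this patch:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.VectorField;   // assumed location of the new field class
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSetIterator;

static void indexAndReadVectors(IndexWriter writer, LeafReader reader) throws Exception {
  // index one dense float vector per document
  Document doc = new Document();
  doc.add(new VectorField("embedding", new float[] {0.1f, 0.2f, 0.3f, 0.4f},
      VectorValues.ScoreFunction.DOT_PRODUCT)); // DOT_PRODUCT assumed; only NONE appears in this diff
  writer.addDocument(doc);
  writer.commit();

  // read the vectors back in docid order; VectorValues iterates like a
  // DocIdSetIterator (see nextDoc()/NO_MORE_DOCS in the classes below)
  VectorValues vectors = reader.getVectorValues("embedding"); // accessor name assumed
  for (int d = vectors.nextDoc(); d != DocIdSetIterator.NO_MORE_DOCS; d = vectors.nextDoc()) {
    float[] v = vectors.vectorValue();
    // ... consume v
  }
}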

.dir-locals.el (new file)

@ -0,0 +1,3 @@
;; set up Lucene style for emacs
((java-mode . ((c-basic-offset . 2))))

.gitignore

@ -7,7 +7,8 @@ build
dist
lib
test-lib
-/*~
+*~
+.#*
/build.properties
/.idea
lucene/**/*.iml

(gradle build file)

@ -157,7 +157,8 @@ configure(project(":lucene:backward-codecs")) {
"org.apache.lucene.codecs.lucene60",
"org.apache.lucene.codecs.lucene80",
"org.apache.lucene.codecs.lucene84",
"org.apache.lucene.codecs.lucene86"
"org.apache.lucene.codecs.lucene86",
"org.apache.lucene.codecs.lucene87"
]
}
}

Lucene80Codec.java

@ -20,6 +20,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
@ -128,4 +129,9 @@ public class Lucene80Codec extends Codec {
public final NormsFormat normsFormat() {
return normsFormat;
}
@Override
public final VectorFormat vectorFormat() {
return VectorFormat.EMPTY;
}
}

Lucene84Codec.java

@ -23,6 +23,7 @@ import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
@ -136,6 +137,11 @@ public class Lucene84Codec extends Codec {
return new Lucene60PointsFormat();
}
@Override
public VectorFormat vectorFormat() {
return VectorFormat.EMPTY;
}
/** Returns the postings format that should be used for writing
* new segments of <code>field</code>.
*

Lucene86Codec.java

@ -24,6 +24,7 @@ import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
@ -136,6 +137,11 @@ public class Lucene86Codec extends Codec {
return pointsFormat;
}
@Override
public final VectorFormat vectorFormat() {
return VectorFormat.EMPTY;
}
/** Returns the postings format that should be used for writing
* new segments of <code>field</code>.
*

Lucene87Codec.java

@ -31,6 +31,7 @@ import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
@ -137,6 +138,9 @@ public class Lucene87Codec extends Codec {
return pointsFormat;
}
@Override
public final VectorFormat vectorFormat() {
return VectorFormat.EMPTY;
}
/** Returns the postings format that should be used for writing
* new segments of <code>field</code>.
*

lucene87/package.html (new file)

@ -0,0 +1,42 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Lucene 8.7 file format.
</body>
</html>

META-INF/services/org.apache.lucene.codecs.Codec

@ -16,3 +16,4 @@
org.apache.lucene.codecs.lucene80.Lucene80Codec
org.apache.lucene.codecs.lucene84.Lucene84Codec
org.apache.lucene.codecs.lucene86.Lucene86Codec
org.apache.lucene.codecs.lucene87.Lucene87Codec
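Registering the class in this service file is what makes Lucene's name-based codec lookup work. A brief sketch using the Codec#forName method that appears later in this diff (analyzer choice illustrative):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.IndexWriterConfig;

Codec codec = Codec.forName("Lucene87");   // resolved via the SPI entries above
IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer()).setCodec(codec);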

CreateIndexTask.java

@ -29,7 +29,7 @@ import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene87.Lucene87Codec;
+import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexDeletionPolicy;
@ -138,7 +138,7 @@ public class CreateIndexTask extends PerfTask {
if (defaultCodec == null && postingsFormat != null) {
try {
final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat);
-iwConf.setCodec(new Lucene87Codec() {
+iwConf.setCodec(new Lucene90Codec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return postingsFormatChosen;

SimpleTextCodec.java

@ -21,6 +21,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
@ -46,6 +47,7 @@ public final class SimpleTextCodec extends Codec {
private final DocValuesFormat dvFormat = new SimpleTextDocValuesFormat();
private final CompoundFormat compoundFormat = new SimpleTextCompoundFormat();
private final PointsFormat pointsFormat = new SimpleTextPointsFormat();
private final VectorFormat vectorFormat = new SimpleTextVectorFormat();
public SimpleTextCodec() {
super("SimpleText");
@ -100,4 +102,9 @@ public final class SimpleTextCodec extends Codec {
public PointsFormat pointsFormat() {
return pointsFormat;
}
@Override
public VectorFormat vectorFormat() {
return vectorFormat;
}
}

SimpleTextFieldInfosFormat.java

@ -30,6 +30,7 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -67,6 +68,8 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat {
static final BytesRef DATA_DIM_COUNT = new BytesRef(" data dimensional count ");
static final BytesRef INDEX_DIM_COUNT = new BytesRef(" index dimensional count ");
static final BytesRef DIM_NUM_BYTES = new BytesRef(" dimensional num bytes ");
static final BytesRef VECTOR_NUM_DIMS = new BytesRef(" vector number of dimensions ");
static final BytesRef VECTOR_SCORE_FUNC = new BytesRef(" vector score function ");
static final BytesRef SOFT_DELETES = new BytesRef(" soft-deletes ");
@Override
@ -146,13 +149,23 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat {
assert StringHelper.startsWith(scratch.get(), DIM_NUM_BYTES);
int dimensionalNumBytes = Integer.parseInt(readString(DIM_NUM_BYTES.length, scratch));
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), VECTOR_NUM_DIMS);
int vectorNumDimensions = Integer.parseInt(readString(VECTOR_NUM_DIMS.length, scratch));
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), VECTOR_SCORE_FUNC);
String scoreFunction = readString(VECTOR_SCORE_FUNC.length, scratch);
VectorValues.ScoreFunction vectorDistFunc = distanceFunction(scoreFunction);
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SOFT_DELETES);
boolean isSoftDeletesField = Boolean.parseBoolean(readString(SOFT_DELETES.length, scratch));
infos[i] = new FieldInfo(name, fieldNumber, storeTermVector,
omitNorms, storePayloads, indexOptions, docValuesType, dvGen, Collections.unmodifiableMap(atts),
-dimensionalCount, indexDimensionalCount, dimensionalNumBytes, isSoftDeletesField);
+dimensionalCount, indexDimensionalCount, dimensionalNumBytes,
+vectorNumDimensions, vectorDistFunc, isSoftDeletesField);
}
SimpleTextUtil.checkFooter(input);
@ -172,6 +185,10 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat {
public DocValuesType docValuesType(String dvType) {
return DocValuesType.valueOf(dvType);
}
public VectorValues.ScoreFunction distanceFunction(String scoreFunction) {
return VectorValues.ScoreFunction.valueOf(scoreFunction);
}
private String readString(int offset, BytesRefBuilder scratch) {
return new String(scratch.bytes(), offset, scratch.length()-offset, StandardCharsets.UTF_8);
@ -253,6 +270,14 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat {
SimpleTextUtil.write(out, Integer.toString(fi.getPointNumBytes()), scratch);
SimpleTextUtil.writeNewline(out);
SimpleTextUtil.write(out, VECTOR_NUM_DIMS);
SimpleTextUtil.write(out, Integer.toString(fi.getVectorDimension()), scratch);
SimpleTextUtil.writeNewline(out);
SimpleTextUtil.write(out, VECTOR_SCORE_FUNC);
SimpleTextUtil.write(out, fi.getVectorScoreFunction().name(), scratch);
SimpleTextUtil.writeNewline(out);
SimpleTextUtil.write(out, SOFT_DELETES);
SimpleTextUtil.write(out, Boolean.toString(fi.isSoftDeletesField()), scratch);
SimpleTextUtil.writeNewline(out);
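With these changes, a field that carries vectors gains two entries in the human-readable field-infos file. For a hypothetical 4-dimensional field with no score function, the new lines would render roughly as follows (the keys carry their leading and trailing spaces, as defined by the BytesRef constants above):

 vector number of dimensions 4
 vector score function NONE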

SimpleTextVectorFormat.java (new file)

@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.VectorWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
/** For debugging, curiosity, transparency only!! Do not use this codec in production.
*
* <p>This codec stores all data in a single human-readable text file (_N.vec). You can view this in
* any text editor, and even edit it to alter your index.
*
* @lucene.experimental */
public final class SimpleTextVectorFormat extends VectorFormat {
@Override
public VectorWriter fieldsWriter(SegmentWriteState state) throws IOException {
return new SimpleTextVectorWriter(state);
}
@Override
public VectorReader fieldsReader(SegmentReadState state) throws IOException {
return new SimpleTextVectorReader(state);
}
/** Extension of vector data file */
static final String VECTOR_EXTENSION = "vec";
/** Extension of vector metadata file */
static final String META_EXTENSION = "gri";
}

SimpleTextVectorReader.java (new file)

@ -0,0 +1,304 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.BufferedChecksumIndexInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.StringHelper;
import static org.apache.lucene.codecs.simpletext.SimpleTextVectorWriter.*;
/**
* Reads vector values from a simple text format. All vectors are read up front and cached in RAM in order to support
* random access.
* <b>FOR RECREATIONAL USE ONLY</b>
* @lucene.experimental
*/
public class SimpleTextVectorReader extends VectorReader {
private static final BytesRef EMPTY = new BytesRef("");
private final SegmentReadState readState;
private final IndexInput dataIn;
private final BytesRefBuilder scratch = new BytesRefBuilder();
private final Map<String, FieldEntry> fieldEntries = new HashMap<>();
SimpleTextVectorReader(SegmentReadState readState) throws IOException {
this.readState = readState;
String metaFileName = IndexFileNames.segmentFileName(readState.segmentInfo.name, readState.segmentSuffix, SimpleTextVectorFormat.META_EXTENSION);
try (ChecksumIndexInput in = readState.directory.openChecksumInput(metaFileName, IOContext.DEFAULT)) {
int fieldNumber = readInt(in, FIELD_NUMBER);
while (fieldNumber != -1) {
String fieldName = readString(in, FIELD_NAME);
String scoreFunctionName = readString(in, SCORE_FUNCTION);
VectorValues.ScoreFunction scoreFunction = VectorValues.ScoreFunction.valueOf(scoreFunctionName);
long vectorDataOffset = readLong(in, VECTOR_DATA_OFFSET);
long vectorDataLength = readLong(in, VECTOR_DATA_LENGTH);
int dimension = readInt(in, VECTOR_DIMENSION);
int size = readInt(in, SIZE);
int[] docIds = new int[size];
for (int i = 0; i < size; i++) {
docIds[i] = readInt(in, EMPTY);
}
assert fieldEntries.containsKey(fieldName) == false;
fieldEntries.put(fieldName, new FieldEntry(dimension, scoreFunction, vectorDataOffset, vectorDataLength, docIds));
fieldNumber = readInt(in, FIELD_NUMBER);
}
SimpleTextUtil.checkFooter(in);
}
String vectorFileName = IndexFileNames.segmentFileName(readState.segmentInfo.name, readState.segmentSuffix, SimpleTextVectorFormat.VECTOR_EXTENSION);
dataIn = readState.directory.openInput(vectorFileName, IOContext.DEFAULT);
}
@Override
public VectorValues getVectorValues(String field) throws IOException {
FieldInfo info = readState.fieldInfos.fieldInfo(field);
if (info == null) {
throw new IllegalStateException("No vectors indexed for field=\"" + field + "\"");
}
int dimension = info.getVectorDimension();
if (dimension == 0) {
return VectorValues.EMPTY;
}
FieldEntry fieldEntry = fieldEntries.get(field);
if (fieldEntry == null) {
throw new IllegalStateException("No entry found for vector field=\"" + field + "\"");
}
if (dimension != fieldEntry.dimension) {
throw new IllegalStateException("Inconsistent vector dimension for field=\"" + field + "\"; " + dimension + " != " + fieldEntry.dimension);
}
IndexInput bytesSlice = dataIn.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength);
return new SimpleTextVectorValues(fieldEntry, bytesSlice);
}
@Override
public void checkIntegrity() throws IOException {
IndexInput clone = dataIn.clone();
clone.seek(0);
// checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included in SimpleTextUtil.CHECKSUM):
long footerStartPos = dataIn.length() - (SimpleTextUtil.CHECKSUM.length + 21);
ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
while (true) {
SimpleTextUtil.readLine(input, scratch);
if (input.getFilePointer() >= footerStartPos) {
// Make sure we landed at precisely the right location:
if (input.getFilePointer() != footerStartPos) {
throw new CorruptIndexException("SimpleText failure: footer does not start at expected position current=" + input.getFilePointer() + " vs expected=" + footerStartPos, input);
}
SimpleTextUtil.checkFooter(input);
break;
}
}
}
@Override
public long ramBytesUsed() {
return 0;
}
@Override
public void close() throws IOException {
dataIn.close();
}
private static class FieldEntry {
final int dimension;
final VectorValues.ScoreFunction scoreFunction;
final long vectorDataOffset;
final long vectorDataLength;
final int[] ordToDoc;
FieldEntry(int dimension, VectorValues.ScoreFunction scoreFunction,
long vectorDataOffset, long vectorDataLength, int[] ordToDoc) {
this.dimension = dimension;
this.scoreFunction = scoreFunction;
this.vectorDataOffset = vectorDataOffset;
this.vectorDataLength = vectorDataLength;
this.ordToDoc = ordToDoc;
}
int size() {
return ordToDoc.length;
}
}
private static class SimpleTextVectorValues extends VectorValues implements VectorValues.RandomAccess {
private final BytesRefBuilder scratch = new BytesRefBuilder();
private final FieldEntry entry;
private final IndexInput in;
private final BytesRef binaryValue;
private final float[][] values;
int curOrd;
SimpleTextVectorValues(FieldEntry entry, IndexInput in) throws IOException {
this.entry = entry;
this.in = in;
values = new float[entry.size()][entry.dimension];
binaryValue = new BytesRef(entry.dimension * Float.BYTES);
binaryValue.length = binaryValue.bytes.length;
curOrd = -1;
readAllVectors();
}
@Override
public int dimension() {
return entry.dimension;
}
@Override
public int size() {
return entry.size();
}
@Override
public ScoreFunction scoreFunction() {
return entry.scoreFunction;
}
@Override
public float[] vectorValue() {
return values[curOrd];
}
@Override
public BytesRef binaryValue() {
ByteBuffer.wrap(binaryValue.bytes).asFloatBuffer().get(values[curOrd]);
return binaryValue;
}
@Override
public RandomAccess randomAccess() {
return this;
}
@Override
public int docID() {
if (curOrd == -1) {
return -1;
}
return entry.ordToDoc[curOrd];
}
@Override
public int nextDoc() throws IOException {
if (++curOrd < entry.size()) {
return docID();
}
return NO_MORE_DOCS;
}
@Override
public int advance(int target) throws IOException {
return slowAdvance(target);
}
@Override
public long cost() {
return size();
}
private void readAllVectors() throws IOException {
for (int i = 0; i < values.length; i++) {
readVector(values[i]);
}
}
private void readVector(float[] value) throws IOException {
SimpleTextUtil.readLine(in, scratch);
// skip leading " [" and strip trailing "]"
String s = new BytesRef(scratch.bytes(), 2, scratch.length() - 3).utf8ToString();
String[] floatStrings = s.split(",");
assert floatStrings.length == value.length : " read " + s + " when expecting " + value.length + " floats";
for (int i = 0; i < floatStrings.length; i++) {
value[i] = Float.parseFloat(floatStrings[i]);
}
}
@Override
public float[] vectorValue(int targetOrd) throws IOException {
return values[targetOrd];
}
@Override
public BytesRef binaryValue(int targetOrd) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public TopDocs search(float[] target, int k, int fanout) throws IOException {
throw new UnsupportedOperationException();
}
}
private int readInt(IndexInput in, BytesRef field) throws IOException {
SimpleTextUtil.readLine(in, scratch);
return parseInt(field);
}
private long readLong(IndexInput in, BytesRef field) throws IOException {
SimpleTextUtil.readLine(in, scratch);
return parseLong(field);
}
private String readString(IndexInput in, BytesRef field) throws IOException {
SimpleTextUtil.readLine(in, scratch);
return stripPrefix(field);
}
private boolean startsWith(BytesRef prefix) {
return StringHelper.startsWith(scratch.get(), prefix);
}
private int parseInt(BytesRef prefix) {
assert startsWith(prefix);
return Integer.parseInt(stripPrefix(prefix));
}
private long parseLong(BytesRef prefix) {
assert startsWith(prefix);
return Long.parseLong(stripPrefix(prefix));
}
private String stripPrefix(BytesRef prefix) {
int prefixLen = prefix.length;
return new String(scratch.bytes(), prefixLen, scratch.length() - prefixLen, StandardCharsets.UTF_8);
}
}

SimpleTextVectorWriter.java (new file)

@ -0,0 +1,148 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.codecs.VectorWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
/**
* Writes vector-valued fields in a plain text format
*/
public class SimpleTextVectorWriter extends VectorWriter {
static final BytesRef FIELD_NUMBER = new BytesRef("field-number ");
static final BytesRef FIELD_NAME = new BytesRef("field-name ");
static final BytesRef SCORE_FUNCTION = new BytesRef("score-function ");
static final BytesRef VECTOR_DATA_OFFSET = new BytesRef("vector-data-offset ");
static final BytesRef VECTOR_DATA_LENGTH = new BytesRef("vector-data-length ");
static final BytesRef VECTOR_DIMENSION = new BytesRef("vector-dimension ");
static final BytesRef SIZE = new BytesRef("size ");
private final IndexOutput meta, vectorData;
private final BytesRefBuilder scratch = new BytesRefBuilder();
SimpleTextVectorWriter(SegmentWriteState state) throws IOException {
assert state.fieldInfos.hasVectorValues();
String metaFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, SimpleTextVectorFormat.META_EXTENSION);
meta = state.directory.createOutput(metaFileName, state.context);
String vectorDataFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, SimpleTextVectorFormat.VECTOR_EXTENSION);
vectorData = state.directory.createOutput(vectorDataFileName, state.context);
}
@Override
public void writeField(FieldInfo fieldInfo, VectorValues vectors) throws IOException {
long vectorDataOffset = vectorData.getFilePointer();
List<Integer> docIds = new ArrayList<>();
int docV, ord = 0;
for (docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc(), ord++) {
writeVectorValue(vectors);
docIds.add(docV);
}
long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds);
}
private void writeVectorValue(VectorValues vectors) throws IOException {
// write vector value
float[] value = vectors.vectorValue();
assert value.length == vectors.dimension();
write(vectorData, Arrays.toString(value));
newline(vectorData);
}
private void writeMeta(FieldInfo field, long vectorDataOffset, long vectorDataLength, List<Integer> docIds) throws IOException {
writeField(meta, FIELD_NUMBER, field.number);
writeField(meta, FIELD_NAME, field.name);
writeField(meta, SCORE_FUNCTION, field.getVectorScoreFunction().name());
writeField(meta, VECTOR_DATA_OFFSET, vectorDataOffset);
writeField(meta, VECTOR_DATA_LENGTH, vectorDataLength);
writeField(meta, VECTOR_DIMENSION, field.getVectorDimension());
writeField(meta, SIZE, docIds.size());
for (Integer docId : docIds) {
writeInt(meta, docId);
newline(meta);
}
writeField(meta, FIELD_NUMBER, -1);
}
@Override
public void finish() throws IOException {
SimpleTextUtil.writeChecksum(meta, scratch);
SimpleTextUtil.writeChecksum(vectorData, scratch);
}
@Override
public void close() throws IOException {
IOUtils.close(vectorData, meta);
}
private void writeField(IndexOutput out, BytesRef fieldName, int value) throws IOException {
write(out, fieldName);
writeInt(out, value);
newline(out);
}
private void writeField(IndexOutput out, BytesRef fieldName, long value) throws IOException {
write(out, fieldName);
writeLong(out, value);
newline(out);
}
private void writeField(IndexOutput out, BytesRef fieldName, String value) throws IOException {
write(out, fieldName);
write(out, value);
newline(out);
}
private void write(IndexOutput out, String s) throws IOException {
SimpleTextUtil.write(out, s, scratch);
}
private void writeInt(IndexOutput out, int x) throws IOException {
SimpleTextUtil.write(out, Integer.toString(x), scratch);
}
private void writeLong(IndexOutput out, long x) throws IOException {
SimpleTextUtil.write(out, Long.toString(x), scratch);
}
private void write(IndexOutput out, BytesRef b) throws IOException {
SimpleTextUtil.write(out, b);
}
private void newline(IndexOutput out) throws IOException {
SimpleTextUtil.writeNewline(out);
}
}
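Putting writeField and writeMeta together, both output files are plain text. For a hypothetical 2-dimensional field named "embedding" with vectors on docs 0 and 3, the metadata file (_N.gri) and data file (_N.vec) would look roughly like this (offsets, lengths and the checksum value are illustrative):

_N.gri:
field-number 0
field-name embedding
score-function NONE
vector-data-offset 0
vector-data-length 22
vector-dimension 2
size 2
0
3
field-number -1
checksum 00000000000840953065

_N.vec:
[0.1, 0.2]
[0.3, 0.4]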

TestBlockWriter.java

@ -24,6 +24,7 @@ import org.apache.lucene.codecs.lucene84.MockTermStateFactory;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.ByteBuffersIndexOutput;
import org.apache.lucene.util.BytesRef;
@ -120,6 +121,8 @@ public class TestBlockWriter extends LuceneTestCase {
0,
0,
0,
0,
VectorValues.ScoreFunction.NONE,
true
);
}

TestSTBlockReader.java

@ -42,6 +42,7 @@ import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
@ -203,6 +204,8 @@ public class TestSTBlockReader extends LuceneTestCase {
0,
0,
0,
0,
VectorValues.ScoreFunction.NONE,
false
);
}

Codec.java

@ -56,8 +56,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
return LOADER;
}
// TODO: should we use this, or maybe a system property is better?
static Codec defaultCodec = LOADER.lookup("Lucene87");
static Codec defaultCodec = LOADER.lookup("Lucene90");
}
private final String name;
@ -110,6 +109,9 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
/** Encodes/decodes points index */
public abstract PointsFormat pointsFormat();
/** Encodes/decodes numeric vector fields */
public abstract VectorFormat vectorFormat();
/** looks up a codec by name */
public static Codec forName(String name) {

FilterCodec.java

@ -108,4 +108,9 @@ public abstract class FilterCodec extends Codec {
public PointsFormat pointsFormat() {
return delegate.pointsFormat();
}
@Override
public VectorFormat vectorFormat() {
return delegate.vectorFormat();
}
}
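FilterCodec makes it easy to swap in a different vector implementation while delegating everything else. A sketch, under the assumption that you would want this at all; the codec name and the use of SimpleTextVectorFormat are purely illustrative, and a custom codec must also be registered via SPI (as in the META-INF/services file above) for its segments to be readable later:

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextVectorFormat;

public final class TextVectorsCodec extends FilterCodec {
  public TextVectorsCodec() {
    super("TextVectorsCodec", Codec.forName("Lucene90"));
  }
  @Override
  public VectorFormat vectorFormat() {
    // human-readable vectors, for debugging only, per SimpleTextVectorFormat's javadoc
    return new SimpleTextVectorFormat();
  }
}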

VectorFormat.java (new file)

@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs;
import java.io.IOException;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.VectorValues;
/**
* Encodes/decodes per-document vector and any associated indexing structures required to support nearest-neighbor search
*/
public abstract class VectorFormat {
/** Sole constructor */
protected VectorFormat() {}
/**
* Returns a {@link VectorWriter} to write the vectors to the index.
*/
public abstract VectorWriter fieldsWriter(SegmentWriteState state) throws IOException;
/**
* Returns a {@link VectorReader} to read the vectors from the index.
*/
public abstract VectorReader fieldsReader(SegmentReadState state) throws IOException;
/**
* EMPTY throws an exception when written. It acts as a sentinel indicating a Codec that does not support vectors.
*/
public static final VectorFormat EMPTY = new VectorFormat() {
@Override
public VectorWriter fieldsWriter(SegmentWriteState state) {
throw new UnsupportedOperationException("Attempt to write EMPTY VectorValues: maybe you forgot to use codec=Lucene90");
}
@Override
public VectorReader fieldsReader(SegmentReadState state) {
return new VectorReader() {
@Override
public void checkIntegrity() {
}
@Override
public VectorValues getVectorValues(String field) {
return VectorValues.EMPTY;
}
@Override
public void close() throws IOException {
}
@Override
public long ramBytesUsed() {
return 0;
}
};
}
};
}

VectorReader.java (new file)

@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs;
import java.io.Closeable;
import java.io.IOException;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.util.Accountable;
/**
* Reads vectors from an index.
*/
public abstract class VectorReader implements Closeable, Accountable {
/** Sole constructor */
protected VectorReader() {}
/**
* Checks consistency of this reader.
* <p>
* Note that this may be costly in terms of I/O, e.g.
* may involve computing a checksum value against large data files.
* @lucene.internal
*/
public abstract void checkIntegrity() throws IOException;
/** Returns the {@link VectorValues} for the given {@code field} */
public abstract VectorValues getVectorValues(String field) throws IOException;
/**
* Returns an instance optimized for merging. This instance may only be
* consumed in the thread that called {@link #getMergeInstance()}.
* <p>
* The default implementation returns {@code this} */
public VectorReader getMergeInstance() {
return this;
}
}

VectorWriter.java (new file)

@ -0,0 +1,283 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.index.DocIDMerger;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
/**
* Writes vectors to an index.
*/
public abstract class VectorWriter implements Closeable {
/** Sole constructor */
protected VectorWriter() {}
/** Write all values contained in the provided reader */
public abstract void writeField(FieldInfo fieldInfo, VectorValues values) throws IOException;
/** Called once at the end before close */
public abstract void finish() throws IOException;
/** Merge the vector values from multiple segments, for all fields */
public void merge(MergeState mergeState) throws IOException {
for (int i = 0; i < mergeState.fieldInfos.length; i++) {
VectorReader reader = mergeState.vectorReaders[i];
assert reader != null || mergeState.fieldInfos[i].hasVectorValues() == false;
if (reader != null) {
reader.checkIntegrity();
}
}
for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) {
if (fieldInfo.hasVectorValues()) {
mergeVectors(fieldInfo, mergeState);
}
}
finish();
}
private void mergeVectors(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException {
if (mergeState.infoStream.isEnabled("VV")) {
mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
}
List<VectorValuesSub> subs = new ArrayList<>();
int dimension = -1;
VectorValues.ScoreFunction scoreFunction = null;
int nonEmptySegmentIndex = 0;
for (int i = 0; i < mergeState.vectorReaders.length; i++) {
VectorReader vectorReader = mergeState.vectorReaders[i];
if (vectorReader != null) {
if (mergeFieldInfo != null && mergeFieldInfo.hasVectorValues()) {
int segmentDimension = mergeFieldInfo.getVectorDimension();
VectorValues.ScoreFunction segmentScoreFunction = mergeFieldInfo.getVectorScoreFunction();
if (dimension == -1) {
dimension = segmentDimension;
scoreFunction = mergeFieldInfo.getVectorScoreFunction();
} else if (dimension != segmentDimension) {
throw new IllegalStateException("Varying dimensions for vector-valued field " + mergeFieldInfo.name
+ ": " + dimension + "!=" + segmentDimension);
} else if (scoreFunction != segmentScoreFunction) {
throw new IllegalStateException("Varying score functions for vector-valued field " + mergeFieldInfo.name
+ ": " + scoreFunction + "!=" + segmentScoreFunction);
}
VectorValues values = vectorReader.getVectorValues(mergeFieldInfo.name);
if (values != null) {
subs.add(new VectorValuesSub(nonEmptySegmentIndex++, mergeState.docMaps[i], values));
}
}
}
}
// Create a new VectorValues by iterating over the sub vectors, mapping the resulting
// docids using docMaps in the mergeState.
if (subs.size() > 0) {
writeField(mergeFieldInfo, new VectorValuesMerger(subs, mergeState));
}
if (mergeState.infoStream.isEnabled("VV")) {
mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
}
}
/** Tracks state of one sub-reader that we are merging */
private static class VectorValuesSub extends DocIDMerger.Sub {
final MergeState.DocMap docMap;
final VectorValues values;
final int segmentIndex;
int count;
VectorValuesSub(int segmentIndex, MergeState.DocMap docMap, VectorValues values) {
super(docMap);
this.values = values;
this.segmentIndex = segmentIndex;
this.docMap = docMap;
assert values.docID() == -1;
}
@Override
public int nextDoc() throws IOException {
int docId = values.nextDoc();
if (docId != NO_MORE_DOCS) {
// Note: this does count deleted docs since they are present in the to-be-merged segment
++count;
}
return docId;
}
}
/**
* View over multiple VectorValues supporting iterator-style access via DocIdMerger. Maintains a reverse ordinal
* mapping for documents having values in order to support random access by dense ordinal.
*/
private static class VectorValuesMerger extends VectorValues {
private final List<VectorValuesSub> subs;
private final DocIDMerger<VectorValuesSub> docIdMerger;
private final int[] ordBase;
private final int cost;
private final int size;
private int docId;
private VectorValuesSub current;
// For each doc with a vector, record its ord in the segments being merged. This enables random access into the
// unmerged segments using the ords from the merged segment.
private int[] ordMap;
private int ord;
VectorValuesMerger(List<VectorValuesSub> subs, MergeState mergeState) throws IOException {
this.subs = subs;
docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
int totalCost = 0, totalSize = 0;
for (VectorValuesSub sub : subs) {
totalCost += sub.values.cost();
totalSize += sub.values.size();
}
cost = totalCost;
size = totalSize;
ordMap = new int[size];
ordBase = new int[subs.size()];
int lastBase = 0;
for (int k = 0; k < subs.size(); k++) {
int size = subs.get(k).values.size();
ordBase[k] = lastBase;
lastBase += size;
}
docId = -1;
}
@Override
public int docID() {
return docId;
}
@Override
public int nextDoc() throws IOException {
current = docIdMerger.next();
if (current == null) {
docId = NO_MORE_DOCS;
} else {
docId = current.mappedDocID;
ordMap[ord++] = ordBase[current.segmentIndex] + current.count - 1;
}
return docId;
}
@Override
public float[] vectorValue() throws IOException {
return current.values.vectorValue();
}
@Override
public BytesRef binaryValue() throws IOException {
return current.values.binaryValue();
}
@Override
public RandomAccess randomAccess() {
return new MergerRandomAccess();
}
@Override
public int advance(int target) {
throw new UnsupportedOperationException();
}
@Override
public int size() {
return size;
}
@Override
public long cost() {
return cost;
}
@Override
public int dimension() {
return subs.get(0).values.dimension();
}
@Override
public VectorValues.ScoreFunction scoreFunction() {
return subs.get(0).values.scoreFunction();
}
class MergerRandomAccess implements VectorValues.RandomAccess {
private final List<RandomAccess> raSubs;
MergerRandomAccess() {
raSubs = new ArrayList<>(subs.size());
for (VectorValuesSub sub : subs) {
raSubs.add(sub.values.randomAccess());
}
}
@Override
public int size() {
return size;
}
@Override
public int dimension() {
return VectorValuesMerger.this.dimension();
}
@Override
public ScoreFunction scoreFunction() {
return VectorValuesMerger.this.scoreFunction();
}
@Override
public float[] vectorValue(int target) throws IOException {
int unmappedOrd = ordMap[target];
int segmentOrd = Arrays.binarySearch(ordBase, unmappedOrd);
if (segmentOrd < 0) {
// get the index of the greatest lower bound
segmentOrd = -2 - segmentOrd;
}
while(segmentOrd < ordBase.length - 1 && ordBase[segmentOrd + 1] == ordBase[segmentOrd]) {
// forward over empty segments which will share the same ordBase
segmentOrd++;
}
return raSubs.get(segmentOrd).vectorValue(unmappedOrd - ordBase[segmentOrd]);
}
@Override
public BytesRef binaryValue(int targetOrd) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public TopDocs search(float[] target, int k, int fanout) throws IOException {
throw new UnsupportedOperationException();
}
}
}
}
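To make the ordinal arithmetic in MergerRandomAccess concrete, here is a worked case with invented sizes. Suppose three sub-readers contribute 3, 0 and 4 vectors, so the constructor computes ordBase = {0, 3, 3}:

vectorValue(4): binarySearch(ordBase, 4) returns -4 (insertion point 3), so
segmentOrd = -2 - (-4) = 2, and the local ordinal is 4 - ordBase[2] = 1,
i.e. the second vector of the third sub-reader.

vectorValue(3): binarySearch may land on the duplicate base at index 1; the
while loop then forwards over the empty sub-reader (ordBase[2] == ordBase[1])
to segmentOrd = 2, giving local ordinal 3 - ordBase[2] = 0.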

Lucene50FieldInfosFormat.java

@ -31,6 +31,7 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
@ -148,7 +149,8 @@ public final class Lucene50FieldInfosFormat extends FieldInfosFormat {
lastAttributes = attributes;
try {
infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads,
-indexOptions, docValuesType, dvGen, attributes, 0, 0, 0, false);
+indexOptions, docValuesType, dvGen, attributes, 0, 0, 0,
+0, VectorValues.ScoreFunction.NONE, false);
} catch (IllegalStateException e) {
throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e);
}

lucene50/package-info.java

@ -17,7 +17,7 @@
/**
* Components from the Lucene 5.0 index format
- * See {@link org.apache.lucene.codecs.lucene80} for an overview
+ * See {@link org.apache.lucene.codecs.lucene90} for an overview
* of the index format.
*/
package org.apache.lucene.codecs.lucene50;

Lucene60FieldInfosFormat.java

@ -31,6 +31,7 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
@ -164,7 +165,8 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat {
try {
infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads,
indexOptions, docValuesType, dvGen, attributes,
-pointDataDimensionCount, pointIndexDimensionCount, pointNumBytes, isSoftDeletesField);
+pointDataDimensionCount, pointIndexDimensionCount, pointNumBytes,
+0, VectorValues.ScoreFunction.NONE, isSoftDeletesField);
} catch (IllegalStateException e) {
throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e);
}

lucene60/package-info.java

@ -16,7 +16,7 @@
*/
/**
- * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene86}
+ * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene90}
* for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene60;

lucene80/package-info.java

@ -17,7 +17,7 @@
/**
* Components from the Lucene 8.0 index format
- * See {@link org.apache.lucene.codecs.lucene84} for an overview
+ * See {@link org.apache.lucene.codecs.lucene90} for an overview
* of the index format.
*/
package org.apache.lucene.codecs.lucene80;

lucene84/package-info.java

@ -16,7 +16,7 @@
*/
/**
- * Components from the Lucene 8.4 index format. See {@link org.apache.lucene.codecs.lucene86}
+ * Components from the Lucene 8.4 index format. See {@link org.apache.lucene.codecs.lucene90}
* for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene84;

lucene86/package-info.java

@ -16,401 +16,7 @@
*/
/**
* Lucene 8.6 file format.
*
* <h2>Apache Lucene - Index File Formats</h2>
* <div>
* <ul>
* <li><a href="#Introduction">Introduction</a></li>
* <li><a href="#Definitions">Definitions</a>
* <ul>
* <li><a href="#Inverted_Indexing">Inverted Indexing</a></li>
* <li><a href="#Types_of_Fields">Types of Fields</a></li>
* <li><a href="#Segments">Segments</a></li>
* <li><a href="#Document_Numbers">Document Numbers</a></li>
* </ul>
* </li>
* <li><a href="#Overview">Index Structure Overview</a></li>
* <li><a href="#File_Naming">File Naming</a></li>
* <li><a href="#file-names">Summary of File Extensions</a>
* <ul>
* <li><a href="#Lock_File">Lock File</a></li>
* <li><a href="#History">History</a></li>
* <li><a href="#Limitations">Limitations</a></li>
* </ul>
* </li>
* </ul>
* </div>
* <a id="Introduction"></a>
* <h3>Introduction</h3>
* <div>
* <p>This document defines the index file formats used in this version of Lucene.
* If you are using a different version of Lucene, please consult the copy of
* <code>docs/</code> that was distributed with
* the version you are using.</p>
* <p>This document attempts to provide a high-level definition of the Apache
* Lucene file formats.</p>
* </div>
* <a id="Definitions"></a>
* <h3>Definitions</h3>
* <div>
* <p>The fundamental concepts in Lucene are index, document, field and term.</p>
* <p>An index contains a sequence of documents.</p>
* <ul>
* <li>A document is a sequence of fields.</li>
* <li>A field is a named sequence of terms.</li>
* <li>A term is a sequence of bytes.</li>
* </ul>
* <p>The same sequence of bytes in two different fields is considered a different
* term. Thus terms are represented as a pair: the string naming the field, and the
* bytes within the field.</p>
* <a id="Inverted_Indexing"></a>
* <h4>Inverted Indexing</h4>
* <p>The index stores statistics about terms in order to make term-based search
* more efficient. Lucene's index falls into the family of indexes known as an
* <i>inverted index.</i> This is because it can list, for a term, the documents
* that contain it. This is the inverse of the natural relationship, in which
* documents list terms.</p>
* <a id="Types_of_Fields"></a>
* <h4>Types of Fields</h4>
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored
* in the index literally, in a non-inverted manner. Fields that are inverted are
* called <i>indexed</i>. A field may be both stored and indexed.</p>
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the
* text of a field may be used literally as a term to be indexed. Most fields are
* tokenized, but sometimes it is useful for certain identifier fields to be
* indexed literally.</p>
* <p>See the {@link org.apache.lucene.document.Field Field}
* java docs for more information on Fields.</p>
* <a id="Segments"></a>
* <h4>Segments</h4>
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>.
* Each segment is a fully independent index, which could be searched separately.
* Indexes evolve by:</p>
* <ol>
* <li>Creating new segments for newly added documents.</li>
* <li>Merging existing segments.</li>
* </ol>
* <p>Searches may involve multiple segments and/or multiple indexes, each index
* potentially composed of a set of segments.</p>
* <a id="Document_Numbers"></a>
* <h4>Document Numbers</h4>
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>.
* The first document added to an index is numbered zero, and each subsequent
* document added gets a number one greater than the previous.</p>
* <p>Note that a document's number may change, so caution should be taken when
* storing these numbers outside of Lucene. In particular, numbers may change in
* the following situations:</p>
* <ul>
* <li>
* <p>The numbers stored in each segment are unique only within the segment, and
* must be converted before they can be used in a larger context. The standard
* technique is to allocate each segment a range of values, based on the range of
* numbers used in that segment. To convert a document number from a segment to an
* external value, the segment's <i>base</i> document number is added. To convert
* an external value back to a segment-specific value, the segment is identified
* by the range that the external value is in, and the segment's base value is
* subtracted. For example two five document segments might be combined, so that
* the first segment has a base value of zero, and the second of five. Document
* three from the second segment would have an external value of eight.</p>
* </li>
* <li>
* <p>When documents are deleted, gaps are created in the numbering. These are
* eventually removed as the index evolves through merging. Deleted documents are
* dropped when segments are merged. A freshly-merged segment thus has no gaps in
* its numbering.</p>
* </li>
* </ul>
* </div>
* <a id="Overview"></a>
* <h3>Index Structure Overview</h3>
* <div>
* <p>Each segment index maintains the following:</p>
* <ul>
* <li>
* {@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment info}.
* This contains metadata about a segment, such as the number of documents,
* what files it uses, and information about how the segment is sorted
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Field names}.
* This contains the set of field names used in the index.
* </li>
* <li>
* Stored Field values.
* This contains, for each document, a list of attribute-value pairs, where the attributes
* are field names. These are used to store auxiliary information about the document, such as
* its title, url, or an identifier to access a database. The set of stored fields are what is
* returned for each hit when searching. This is keyed by document number.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term dictionary}.
* A dictionary containing all of the terms used in all of the
* indexed fields of all of the documents. The dictionary also contains the number
* of documents which contain the term, and pointers to the term's frequency and
* proximity data.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Frequency data}.
* For each term in the dictionary, the numbers of all the
* documents that contain that term, and the frequency of the term in that
* document, unless frequencies are omitted ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Proximity data}.
* For each term in the dictionary, the positions that the
* term occurs in each document. Note that this will not exist if all fields in
* all documents omit position data.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Normalization factors}.
* For each field in each document, a value is stored
* that is multiplied into the score for hits on that field.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}.
* For each field in each document, the term vector (sometimes
* called document vector) may be stored. A term vector consists of term text and
* term frequency. To add Term Vectors to your index see the
* {@link org.apache.lucene.document.Field Field} constructors
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-document values}.
* Like stored values, these are also keyed by document
* number, but are generally intended to be loaded into main memory for fast
* access. Whereas stored values are generally intended for summary results from
* searches, per-document values are useful for things like scoring factors.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
* An optional file indicating which documents are live.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}.
* Optional pair of files, recording dimensionally indexed fields, to enable fast
* numeric range filtering and large numeric values like BigInteger and BigDecimal (1D)
* and geographic shape intersection (2D, 3D).
* </li>
* </ul>
* <p>Details on each of these are provided in their linked pages.</p>
* </div>
* <a id="File_Naming"></a>
* <h3>File Naming</h3>
* <div>
* <p>All files belonging to a segment have the same name with varying extensions.
* The extensions correspond to the different file formats described below. When
* using the Compound File format (default for small segments) these files (except
* for the Segment info file, the Lock file, and Deleted documents file) are collapsed
* into a single .cfs file (see below for details)</p>
* <p>Typically, all segments in an index are stored in a single directory,
* although this is not required.</p>
* <p>File names are never re-used. That is, when any file is saved
* to the Directory it is given a never before used filename. This is achieved
* using a simple generations approach. For example, the first segments file is
* segments_1, then segments_2, etc. The generation is a sequential long integer
* represented in alpha-numeric (base 36) form.</p>
* </div>
* <a id="file-names"></a>
* <h3>Summary of File Extensions</h3>
* <div>
* <p>The following table summarizes the names and extensions of the files in
* Lucene:</p>
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
* <caption>lucene filenames by extension</caption>
* <tr>
* <th>Name</th>
* <th>Extension</th>
* <th>Brief Description</th>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
* <td>segments_N</td>
* <td>Stores information about a commit point</td>
* </tr>
* <tr>
* <td><a href="#Lock_File">Lock File</a></td>
* <td>write.lock</td>
* <td>The Write lock prevents multiple IndexWriters from writing to the same
* file.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment Info}</td>
* <td>.si</td>
* <td>Stores metadata about a segment</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File}</td>
* <td>.cfs, .cfe</td>
* <td>An optional "virtual" file consisting of all the other index files for
* systems that frequently run out of file handles.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Fields}</td>
* <td>.fnm</td>
* <td>Stores information about the fields</td>
* </tr>
* <tr>
* <td>Field Index</td>
* <td>.fdx</td>
* <td>Contains pointers to field data</td>
* </tr>
* <tr>
* <td>Field Data</td>
* <td>.fdt</td>
* <td>The stored fields for documents</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Dictionary}</td>
* <td>.tim</td>
* <td>The term dictionary, stores term info</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Index}</td>
* <td>.tip</td>
* <td>The index into the Term Dictionary</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Frequencies}</td>
* <td>.doc</td>
* <td>Contains the list of docs which contain each term along with frequency</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Positions}</td>
* <td>.pos</td>
* <td>Stores position information about where a term occurs in the index</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Payloads}</td>
* <td>.pay</td>
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Norms}</td>
* <td>.nvd, .nvm</td>
* <td>Encodes length and boost factors for docs and fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-Document Values}</td>
* <td>.dvd, .dvm</td>
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index}</td>
* <td>.tvx</td>
* <td>Stores offset into the document data file</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data}</td>
* <td>.tvd</td>
* <td>Contains term vector data.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}</td>
* <td>.liv</td>
* <td>Info about what documents are live</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}</td>
* <td>.dii, .dim</td>
* <td>Holds indexed points, if any</td>
* </tr>
* </table>
* </div>
* <a id="Lock_File"></a>
* <h3>Lock File</h3>
* The write lock, which is stored in the index directory by default, is named
* "write.lock". If the lock directory is different from the index directory then
* the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
* derived from the full path to the index directory. When this file is present, a
* writer is currently modifying the index (adding or removing documents). This
* lock file ensures that only one writer is modifying the index at a time.
* <a id="History"></a>
* <h3>History</h3>
* <p>Compatibility notes are provided in this document, describing how file
* formats have changed from prior versions:</p>
* <ul>
* <li>In version 2.1, the file format was changed to allow lock-less commits (i.e.,
* no more commit lock). The change is fully backwards compatible: you can open a
* pre-2.1 index for searching or adding/deleting of docs. When the new segments
* file is saved (committed), it will be written in the new file format (meaning
* no specific "upgrade" process is needed). But note that once a commit has
* occurred, pre-2.1 Lucene will not be able to read the index.</li>
* <li>In version 2.3, the file format was changed to allow segments to share a
* single set of doc store (vectors &amp; stored fields) files. This allows for
* faster indexing in certain cases. The change is fully backwards compatible (in
* the same way as the lock-less commits change in 2.1).</li>
* <li>In version 2.4, Strings are now written as a true UTF-8 byte sequence, not
* Java's modified UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">
* LUCENE-510</a> for details.</li>
* <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData
* may be passed to IndexWriter's commit methods (and later retrieved), which is
* recorded in the segments_N file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">
* LUCENE-1382</a> for details. Also,
* diagnostics were added to each segment written recording details about why it
* was written (due to flush, merge; which OS/JRE was used; etc.). See issue
* <a href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.</li>
* <li>In version 3.0, compressed fields are no longer written to the index (they
* can still be read, but on merge the new segment will write them, uncompressed).
* See issue <a href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a>
* for details.</li>
* <li>In version 3.1, segments record the code version that created them. See
* <a href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
* Additionally segments track explicitly whether or not they have term vectors.
* See <a href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a>
* for details.</li>
* <li>In version 3.2, numeric fields are written natively to the stored fields
* file; previously they were stored in text format only.</li>
* <li>In version 3.4, fields can omit position data while still indexing term
* frequencies.</li>
* <li>In version 4.0, the format of the inverted index became extensible via
* the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
* ({@code DocValues}) was introduced. Normalization factors need no longer be a
* single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
* Terms need not be unicode strings, they can be any byte sequence. Term offsets
* can optionally be indexed into the postings lists. Payloads can be stored in the
* term vectors.</li>
* <li>In version 4.1, the format of the postings list changed to use either
* FOR compression or variable-byte encoding, depending upon the frequency
* of the term. Terms appearing only once are now inlined directly into
* the term dictionary. Stored fields are compressed by default.</li>
* <li>In version 4.2, term vectors are compressed by default. DocValues has
* a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
* on multi-valued fields.</li>
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.</li>
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
* allow updating NumericDocValues fields.</li>
* <li>In version 4.8, checksum footers were added to the end of each index file
* for improved data integrity. Specifically, the last 8 bytes of every index file
* contain the zlib-crc32 checksum of the file.</li>
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
* that is suitable for faceting/sorting/analytics.</li>
* <li>In version 5.4, DocValues have been improved to store more information on disk:
* addresses for binary fields and ord indexes for multi-valued fields.</li>
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.</li>
* <li>In version 6.2, a new Segment info format reads/writes the index sort, to support index sorting.</li>
* <li>In version 7.0, DocValues have been improved to better support sparse doc values
* thanks to an iterator API.</li>
* <li>In version 8.0, postings have been enhanced to record, for each block of
* doc ids, the (term freq, normalization factor) pairs that may trigger the
* maximum score of the block. This information is recorded alongside skip data
* in order to be able to skip blocks of doc ids if they may not produce high
* enough scores.
* Additionally, doc values and norms have been extended with jump-tables to make access O(1)
* instead of O(n), where n is the number of elements to skip when advancing in the data.</li>
* <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
* performant encoding that is vectorized.</li>
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to
* allow user-defined sorts to be used.</li>
* </ul>
* <a id="Limitations"></a>
* <h3>Limitations</h3>
* <div>
* <p>Lucene uses a Java <code>int</code> to refer to
* document numbers, and the index file format uses an <code>Int32</code>
* on-disk to store document numbers. This is a limitation
* of both the index file format and the current implementation. Eventually these
* should be replaced with either <code>UInt64</code> values, or
* better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.</p>
* </div>
* Components from the Lucene 8.6 index format. See {@link org.apache.lucene.codecs.lucene90}
* for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene86;
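
The base-36 generation naming described under File Naming can be made concrete with a short sketch. This is illustrative only (the class name is hypothetical); Lucene's real helper lives in org.apache.lucene.index.IndexFileNames.

// Illustrative sketch, not part of this change: how a sequential generation
// maps to the segments_N names described in the File Naming section.
class GenerationNameSketch {
  static String segmentsFileName(long generation) {
    // base 36 ("alpha-numeric"): generation 10 -> "segments_a", 36 -> "segments_10"
    return "segments_" + Long.toString(generation, Character.MAX_RADIX); // MAX_RADIX == 36
  }
}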

View File

@ -16,401 +16,7 @@
*/
/**
* Lucene 8.7 file format.
*
* <h2>Apache Lucene - Index File Formats</h2>
* <div>
* <ul>
* <li><a href="#Introduction">Introduction</a></li>
* <li><a href="#Definitions">Definitions</a>
* <ul>
* <li><a href="#Inverted_Indexing">Inverted Indexing</a></li>
* <li><a href="#Types_of_Fields">Types of Fields</a></li>
* <li><a href="#Segments">Segments</a></li>
* <li><a href="#Document_Numbers">Document Numbers</a></li>
* </ul>
* </li>
* <li><a href="#Overview">Index Structure Overview</a></li>
* <li><a href="#File_Naming">File Naming</a></li>
* <li><a href="#file-names">Summary of File Extensions</a>
* <ul>
* <li><a href="#Lock_File">Lock File</a></li>
* <li><a href="#History">History</a></li>
* <li><a href="#Limitations">Limitations</a></li>
* </ul>
* </li>
* </ul>
* </div>
* <a id="Introduction"></a>
* <h3>Introduction</h3>
* <div>
* <p>This document defines the index file formats used in this version of Lucene.
* If you are using a different version of Lucene, please consult the copy of
* <code>docs/</code> that was distributed with
* the version you are using.</p>
* <p>This document attempts to provide a high-level definition of the Apache
* Lucene file formats.</p>
* </div>
* <a id="Definitions"></a>
* <h3>Definitions</h3>
* <div>
* <p>The fundamental concepts in Lucene are index, document, field and term.</p>
* <p>An index contains a sequence of documents.</p>
* <ul>
* <li>A document is a sequence of fields.</li>
* <li>A field is a named sequence of terms.</li>
* <li>A term is a sequence of bytes.</li>
* </ul>
* <p>The same sequence of bytes in two different fields is considered a different
* term. Thus terms are represented as a pair: the string naming the field, and the
* bytes within the field.</p>
* <a id="Inverted_Indexing"></a>
* <h4>Inverted Indexing</h4>
* <p>The index stores statistics about terms in order to make term-based search
* more efficient. Lucene's index falls into the family of indexes known as an
* <i>inverted index.</i> This is because it can list, for a term, the documents
* that contain it. This is the inverse of the natural relationship, in which
* documents list terms.</p>
* <a id="Types_of_Fields"></a>
* <h4>Types of Fields</h4>
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored
* in the index literally, in a non-inverted manner. Fields that are inverted are
* called <i>indexed</i>. A field may be both stored and indexed.</p>
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the
* text of a field may be used literally as a term to be indexed. Most fields are
* tokenized, but sometimes it is useful for certain identifier fields to be
* indexed literally.</p>
* <p>See the {@link org.apache.lucene.document.Field Field}
* java docs for more information on Fields.</p>
* <a id="Segments"></a>
* <h4>Segments</h4>
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>.
* Each segment is a fully independent index, which could be searched separately.
* Indexes evolve by:</p>
* <ol>
* <li>Creating new segments for newly added documents.</li>
* <li>Merging existing segments.</li>
* </ol>
* <p>Searches may involve multiple segments and/or multiple indexes, each index
* potentially composed of a set of segments.</p>
* <a id="Document_Numbers"></a>
* <h4>Document Numbers</h4>
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>.
* The first document added to an index is numbered zero, and each subsequent
* document added gets a number one greater than the previous.</p>
* <p>Note that a document's number may change, so caution should be taken when
* storing these numbers outside of Lucene. In particular, numbers may change in
* the following situations:</p>
* <ul>
* <li>
* <p>The numbers stored in each segment are unique only within the segment, and
* must be converted before they can be used in a larger context. The standard
* technique is to allocate each segment a range of values, based on the range of
* numbers used in that segment. To convert a document number from a segment to an
* external value, the segment's <i>base</i> document number is added. To convert
* an external value back to a segment-specific value, the segment is identified
* by the range that the external value is in, and the segment's base value is
* subtracted. For example, two five-document segments might be combined, so that
* the first segment has a base value of zero, and the second of five. Document
* three from the second segment would have an external value of eight.</p>
* </li>
* <li>
* <p>When documents are deleted, gaps are created in the numbering. These are
* eventually removed as the index evolves through merging. Deleted documents are
* dropped when segments are merged. A freshly-merged segment thus has no gaps in
* its numbering.</p>
* </li>
* </ul>
* </div>
* <a id="Overview"></a>
* <h3>Index Structure Overview</h3>
* <div>
* <p>Each segment index maintains the following:</p>
* <ul>
* <li>
* {@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment info}.
* This contains metadata about a segment, such as the number of documents,
* what files it uses, and information about how the segment is sorted
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Field names}.
* This contains the set of field names used in the index.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Stored Field values}.
* This contains, for each document, a list of attribute-value pairs, where the attributes
* are field names. These are used to store auxiliary information about the document, such as
* its title, url, or an identifier to access a database. The set of stored fields are what is
* returned for each hit when searching. This is keyed by document number.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term dictionary}.
* A dictionary containing all of the terms used in all of the
* indexed fields of all of the documents. The dictionary also contains the number
* of documents which contain the term, and pointers to the term's frequency and
* proximity data.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Frequency data}.
* For each term in the dictionary, the numbers of all the
* documents that contain that term, and the frequency of the term in that
* document, unless frequencies are omitted ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Proximity data}.
* For each term in the dictionary, the positions that the
* term occurs in each document. Note that this will not exist if all fields in
* all documents omit position data.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Normalization factors}.
* For each field in each document, a value is stored
* that is multiplied into the score for hits on that field.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}.
* For each field in each document, the term vector (sometimes
* called document vector) may be stored. A term vector consists of term text and
* term frequency. To add Term Vectors to your index see the
* {@link org.apache.lucene.document.Field Field} constructors
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-document values}.
* Like stored values, these are also keyed by document
* number, but are generally intended to be loaded into main memory for fast
* access. Whereas stored values are generally intended for summary results from
* searches, per-document values are useful for things like scoring factors.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
* An optional file indicating which documents are live.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}.
* Optional pair of files, recording dimensionally indexed fields, to enable fast
* numeric range filtering and large numeric values like BigInteger and BigDecimal (1D)
* and geographic shape intersection (2D, 3D).
* </li>
* </ul>
* <p>Details on each of these are provided in their linked pages.</p>
* </div>
* <a id="File_Naming"></a>
* <h3>File Naming</h3>
* <div>
* <p>All files belonging to a segment have the same name with varying extensions.
* The extensions correspond to the different file formats described below. When
* using the Compound File format (default for small segments) these files (except
* for the Segment info file, the Lock file, and Deleted documents file) are collapsed
* into a single .cfs file (see below for details).</p>
* <p>Typically, all segments in an index are stored in a single directory,
* although this is not required.</p>
* <p>File names are never re-used. That is, when any file is saved
* to the Directory it is given a never before used filename. This is achieved
* using a simple generations approach. For example, the first segments file is
* segments_1, then segments_2, etc. The generation is a sequential long integer
* represented in alpha-numeric (base 36) form.</p>
* </div>
* <a id="file-names"></a>
* <h3>Summary of File Extensions</h3>
* <div>
* <p>The following table summarizes the names and extensions of the files in
* Lucene:</p>
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
* <caption>lucene filenames by extension</caption>
* <tr>
* <th>Name</th>
* <th>Extension</th>
* <th>Brief Description</th>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
* <td>segments_N</td>
* <td>Stores information about a commit point</td>
* </tr>
* <tr>
* <td><a href="#Lock_File">Lock File</a></td>
* <td>write.lock</td>
* <td>The Write lock prevents multiple IndexWriters from writing to the same
* file.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment Info}</td>
* <td>.si</td>
* <td>Stores metadata about a segment</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File}</td>
* <td>.cfs, .cfe</td>
* <td>An optional "virtual" file consisting of all the other index files for
* systems that frequently run out of file handles.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Fields}</td>
* <td>.fnm</td>
* <td>Stores information about the fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Index}</td>
* <td>.fdx</td>
* <td>Contains pointers to field data</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Data}</td>
* <td>.fdt</td>
* <td>The stored fields for documents</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Dictionary}</td>
* <td>.tim</td>
* <td>The term dictionary, stores term info</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Index}</td>
* <td>.tip</td>
* <td>The index into the Term Dictionary</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Frequencies}</td>
* <td>.doc</td>
* <td>Contains the list of docs which contain each term along with frequency</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Positions}</td>
* <td>.pos</td>
* <td>Stores position information about where a term occurs in the index</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Payloads}</td>
* <td>.pay</td>
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Norms}</td>
* <td>.nvd, .nvm</td>
* <td>Encodes length and boost factors for docs and fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-Document Values}</td>
* <td>.dvd, .dvm</td>
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index}</td>
* <td>.tvx</td>
* <td>Stores offset into the document data file</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data}</td>
* <td>.tvd</td>
* <td>Contains term vector data.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}</td>
* <td>.liv</td>
* <td>Info about what documents are live</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}</td>
* <td>.dii, .dim</td>
* <td>Holds indexed points, if any</td>
* </tr>
* </table>
* </div>
* <a id="Lock_File"></a>
* <h3>Lock File</h3>
* The write lock, which is stored in the index directory by default, is named
* "write.lock". If the lock directory is different from the index directory then
* the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
* derived from the full path to the index directory. When this file is present, a
* writer is currently modifying the index (adding or removing documents). This
* lock file ensures that only one writer is modifying the index at a time.
* <a id="History"></a>
* <h3>History</h3>
* <p>Compatibility notes are provided in this document, describing how file
* formats have changed from prior versions:</p>
* <ul>
* <li>In version 2.1, the file format was changed to allow lock-less commits (i.e.,
* no more commit lock). The change is fully backwards compatible: you can open a
* pre-2.1 index for searching or adding/deleting of docs. When the new segments
* file is saved (committed), it will be written in the new file format (meaning
* no specific "upgrade" process is needed). But note that once a commit has
* occurred, pre-2.1 Lucene will not be able to read the index.</li>
* <li>In version 2.3, the file format was changed to allow segments to share a
* single set of doc store (vectors &amp; stored fields) files. This allows for
* faster indexing in certain cases. The change is fully backwards compatible (in
* the same way as the lock-less commits change in 2.1).</li>
* <li>In version 2.4, Strings are now written as a true UTF-8 byte sequence, not
* Java's modified UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">
* LUCENE-510</a> for details.</li>
* <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData
* may be passed to IndexWriter's commit methods (and later retrieved), which is
* recorded in the segments_N file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">
* LUCENE-1382</a> for details. Also,
* diagnostics were added to each segment written recording details about why it
* was written (due to flush, merge; which OS/JRE was used; etc.). See issue
* <a href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.</li>
* <li>In version 3.0, compressed fields are no longer written to the index (they
* can still be read, but on merge the new segment will write them, uncompressed).
* See issue <a href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a>
* for details.</li>
* <li>In version 3.1, segments record the code version that created them. See
* <a href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
* Additionally segments track explicitly whether or not they have term vectors.
* See <a href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a>
* for details.</li>
* <li>In version 3.2, numeric fields are written natively to the stored fields
* file; previously they were stored in text format only.</li>
* <li>In version 3.4, fields can omit position data while still indexing term
* frequencies.</li>
* <li>In version 4.0, the format of the inverted index became extensible via
* the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
* ({@code DocValues}) was introduced. Normalization factors need no longer be a
* single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
* Terms need not be unicode strings, they can be any byte sequence. Term offsets
* can optionally be indexed into the postings lists. Payloads can be stored in the
* term vectors.</li>
* <li>In version 4.1, the format of the postings list changed to use either
* FOR compression or variable-byte encoding, depending upon the frequency
* of the term. Terms appearing only once are now inlined directly into
* the term dictionary. Stored fields are compressed by default.</li>
* <li>In version 4.2, term vectors are compressed by default. DocValues has
* a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
* on multi-valued fields.</li>
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.</li>
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
* allow updating NumericDocValues fields.</li>
* <li>In version 4.8, checksum footers were added to the end of each index file
* for improved data integrity. Specifically, the last 8 bytes of every index file
* contain the zlib-crc32 checksum of the file.</li>
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
* that is suitable for faceting/sorting/analytics.</li>
* <li>In version 5.4, DocValues have been improved to store more information on disk:
* addresses for binary fields and ord indexes for multi-valued fields.</li>
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.</li>
* <li>In version 6.2, a new Segment info format reads/writes the index sort, to support index sorting.</li>
* <li>In version 7.0, DocValues have been improved to better support sparse doc values
* thanks to an iterator API.</li>
* <li>In version 8.0, postings have been enhanced to record, for each block of
* doc ids, the (term freq, normalization factor) pairs that may trigger the
* maximum score of the block. This information is recorded alongside skip data
* in order to be able to skip blocks of doc ids if they may not produce high
* enough scores.
* Additionally, doc values and norms have been extended with jump-tables to make access O(1)
* instead of O(n), where n is the number of elements to skip when advancing in the data.</li>
* <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
* performant encoding that is vectorized.</li>
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to
* allow user-defined sorts to be used.</li>
* </ul>
* <a id="Limitations"></a>
* <h3>Limitations</h3>
* <div>
* <p>Lucene uses a Java <code>int</code> to refer to
* document numbers, and the index file format uses an <code>Int32</code>
* on-disk to store document numbers. This is a limitation
* of both the index file format and the current implementation. Eventually these
* should be replaced with either <code>UInt64</code> values, or
* better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.</p>
* </div>
* Components from the Lucene 8.7 index format. See {@link org.apache.lucene.codecs.lucene90}
* for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene87;

View File

@ -0,0 +1,183 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene90;
import java.util.Objects;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat;
import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
/**
* Implements the Lucene 9.0 index format
* <p>
* If you want to reuse functionality of this codec in another codec, extend
* {@link FilterCodec}.
*
* @see org.apache.lucene.codecs.lucene90 package documentation for file format details.
*
* @lucene.experimental
*/
public class Lucene90Codec extends Codec {
private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene90FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene86SegmentInfoFormat();
private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();
private final PostingsFormat defaultFormat;
private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return Lucene90Codec.this.getPostingsFormatForField(field);
}
};
private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
@Override
public DocValuesFormat getDocValuesFormatForField(String field) {
return Lucene90Codec.this.getDocValuesFormatForField(field);
}
};
private final VectorFormat vectorFormat = new Lucene90VectorFormat();
private final StoredFieldsFormat storedFieldsFormat;
/**
* Instantiates a new codec.
*/
public Lucene90Codec() {
this(Lucene87StoredFieldsFormat.Mode.BEST_SPEED);
}
/**
* Instantiates a new codec, specifying the stored fields compression
* mode to use.
* @param mode stored fields compression mode to use for newly
* flushed/merged segments.
*/
public Lucene90Codec(Lucene87StoredFieldsFormat.Mode mode) {
super("Lucene90");
this.storedFieldsFormat = new Lucene87StoredFieldsFormat(Objects.requireNonNull(mode));
this.defaultFormat = new Lucene84PostingsFormat();
}
@Override
public final StoredFieldsFormat storedFieldsFormat() {
return storedFieldsFormat;
}
@Override
public final TermVectorsFormat termVectorsFormat() {
return vectorsFormat;
}
@Override
public final PostingsFormat postingsFormat() {
return postingsFormat;
}
@Override
public final FieldInfosFormat fieldInfosFormat() {
return fieldInfosFormat;
}
@Override
public final SegmentInfoFormat segmentInfoFormat() {
return segmentInfosFormat;
}
@Override
public final LiveDocsFormat liveDocsFormat() {
return liveDocsFormat;
}
@Override
public final CompoundFormat compoundFormat() {
return compoundFormat;
}
@Override
public final PointsFormat pointsFormat() {
return new Lucene86PointsFormat();
}
@Override
public final VectorFormat vectorFormat() {
return vectorFormat;
}
/** Returns the postings format that should be used for writing
* new segments of <code>field</code>.
*
* The default implementation always returns "Lucene84".
* <p>
* <b>WARNING:</b> if you subclass, you are responsible for index
* backwards compatibility: future versions of Lucene are only
* guaranteed to be able to read the default implementation.
*/
public PostingsFormat getPostingsFormatForField(String field) {
return defaultFormat;
}
/** Returns the docvalues format that should be used for writing
* new segments of <code>field</code>.
*
* The default implementation always returns "Lucene80".
* <p>
* <b>WARNING:</b> if you subclass, you are responsible for index
* backwards compatibility: future versions of Lucene are only
* guaranteed to be able to read the default implementation.
*/
public DocValuesFormat getDocValuesFormatForField(String field) {
return defaultDVFormat;
}
@Override
public final DocValuesFormat docValuesFormat() {
return docValuesFormat;
}
private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene80");
private final NormsFormat normsFormat = new Lucene80NormsFormat();
@Override
public final NormsFormat normsFormat() {
return normsFormat;
}
}
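
For orientation, a hedged usage sketch: wiring the new codec into an IndexWriter and indexing one dense vector. The VectorField document class is added elsewhere in this commit; the constructor arguments shown here (name, float[] vector, score function) are assumed for illustration, as is the index path.

import java.nio.file.Paths;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.VectorField; // added in this commit; signature assumed
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class VectorIndexingSketch {
  public static void main(String[] args) throws Exception {
    IndexWriterConfig iwc = new IndexWriterConfig();
    iwc.setCodec(new Lucene90Codec()); // BEST_SPEED stored fields by default
    try (Directory dir = FSDirectory.open(Paths.get("index"));
         IndexWriter writer = new IndexWriter(dir, iwc)) {
      Document doc = new Document();
      // a 3-dimensional dense float vector, scored by dot product
      doc.add(new VectorField("embedding", new float[] {0.1f, 0.2f, 0.3f},
          VectorValues.ScoreFunction.DOT_PRODUCT));
      writer.addDocument(doc);
    }
  }
}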

View File

@ -0,0 +1,339 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene90;
import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
/**
* Lucene 9.0 Field Infos format.
* <p>Field names are stored in the field info file, with suffix <code>.fnm</code>.
* <p>FieldInfos (.fnm) --&gt; Header,FieldsCount, &lt;FieldName,FieldNumber,
* FieldBits,IndexOptions,DocValuesBits,DocValuesGen,Attributes,DimensionCount,DimensionNumBytes,
* VectorDimension,VectorDistFunction&gt; <sup>FieldsCount</sup>,Footer
* <p>Data types:
* <ul>
* <li>Header --&gt; {@link CodecUtil#checkIndexHeader IndexHeader}</li>
* <li>FieldsCount --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>FieldName --&gt; {@link DataOutput#writeString String}</li>
* <li>FieldBits, IndexOptions, DocValuesBits, VectorDistFunction --&gt; {@link DataOutput#writeByte Byte}</li>
* <li>FieldNumber, DimensionCount, DimensionNumBytes, VectorDimension --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>Attributes --&gt; {@link DataOutput#writeMapOfStrings Map&lt;String,String&gt;}</li>
* <li>DocValuesGen --&gt; {@link DataOutput#writeLong(long) Int64}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
* Field Descriptions:
* <ul>
* <li>FieldsCount: the number of fields in this file.</li>
* <li>FieldName: name of the field as a UTF-8 String.</li>
* <li>FieldNumber: the field's number. Note that unlike previous versions of
* Lucene, fields are numbered explicitly rather than implicitly by their
* order in the file.</li>
* <li>FieldBits: a byte containing field options.
* <ul>
* <li>The low order bit (0x1) is one for fields that have term vectors
* stored, and zero for fields without term vectors.</li>
* <li>If the second lowest order-bit is set (0x2), norms are omitted for the
* indexed field.</li>
* <li>If the third lowest-order bit is set (0x4), payloads are stored for the
* indexed field.</li>
* </ul>
* </li>
* <li>IndexOptions: a byte containing index options.
* <ul>
* <li>0: not indexed</li>
* <li>1: indexed as DOCS_ONLY</li>
* <li>2: indexed as DOCS_AND_FREQS</li>
* <li>3: indexed as DOCS_AND_FREQS_AND_POSITIONS</li>
* <li>4: indexed as DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS</li>
* </ul>
* </li>
* <li>DocValuesBits: a byte recording the field's per-document value type,
* decoded as follows:
* <ul>
* <li>0: no DocValues for this field.</li>
* <li>1: NumericDocValues. ({@link DocValuesType#NUMERIC})</li>
* <li>2: BinaryDocValues. ({@code DocValuesType#BINARY})</li>
* <li>3: SortedDocValues. ({@code DocValuesType#SORTED})</li>
* <li>4: SortedSetDocValues. ({@code DocValuesType#SORTED_SET})</li>
* <li>5: SortedNumericDocValues. ({@code DocValuesType#SORTED_NUMERIC})</li>
* </ul>
* </li>
* <li>DocValuesGen is the generation count of the field's DocValues. If this is -1,
* there are no DocValues updates to that field. Anything above zero means there
* are updates stored by {@link DocValuesFormat}.</li>
* <li>Attributes: a key-value map of codec-private attributes.</li>
* <li>PointDimensionCount, PointNumBytes: these are non-zero only if the field is
* indexed as points, e.g. using {@link org.apache.lucene.document.LongPoint}</li>
* <li>VectorDimension: non-zero if the field is indexed as vectors.</li>
* <li>VectorDistFunction: a byte identifying the distance function used for similarity calculation.
* <ul>
* <li>0: no distance function is defined for this field.</li>
* <li>1: EUCLIDEAN distance. ({@link org.apache.lucene.index.VectorValues.ScoreFunction#EUCLIDEAN})</li>
* <li>2: DOT_PRODUCT score. ({@link org.apache.lucene.index.VectorValues.ScoreFunction#DOT_PRODUCT})</li>
* </ul>
* </li>
* </ul>
*
* @lucene.experimental
*/
public final class Lucene90FieldInfosFormat extends FieldInfosFormat {
/** Sole constructor. */
public Lucene90FieldInfosFormat() {
}
@Override
public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, EXTENSION);
try (ChecksumIndexInput input = directory.openChecksumInput(fileName, context)) {
Throwable priorE = null;
FieldInfo infos[] = null;
try {
int version = CodecUtil.checkIndexHeader(input,
Lucene90FieldInfosFormat.CODEC_NAME,
Lucene90FieldInfosFormat.FORMAT_START,
Lucene90FieldInfosFormat.FORMAT_CURRENT,
segmentInfo.getId(), segmentSuffix);
final int size = input.readVInt(); //read in the size
infos = new FieldInfo[size];
// previous field's attribute map, we share when possible:
Map<String,String> lastAttributes = Collections.emptyMap();
for (int i = 0; i < size; i++) {
String name = input.readString();
final int fieldNumber = input.readVInt();
if (fieldNumber < 0) {
throw new CorruptIndexException("invalid field number for field: " + name + ", fieldNumber=" + fieldNumber, input);
}
byte bits = input.readByte();
boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0;
boolean omitNorms = (bits & OMIT_NORMS) != 0;
boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0;
final IndexOptions indexOptions = getIndexOptions(input, input.readByte());
// DV Types are packed in one byte
final DocValuesType docValuesType = getDocValuesType(input, input.readByte());
final long dvGen = input.readLong();
Map<String,String> attributes = input.readMapOfStrings();
// just use the last field's map if it's the same
if (attributes.equals(lastAttributes)) {
attributes = lastAttributes;
}
lastAttributes = attributes;
int pointDataDimensionCount = input.readVInt();
int pointNumBytes;
int pointIndexDimensionCount = pointDataDimensionCount;
if (pointDataDimensionCount != 0) {
if (version >= Lucene90FieldInfosFormat.FORMAT_SELECTIVE_INDEXING) {
pointIndexDimensionCount = input.readVInt();
}
pointNumBytes = input.readVInt();
} else {
pointNumBytes = 0;
}
final int vectorDimension = input.readVInt();
final VectorValues.ScoreFunction vectorDistFunc = getDistFunc(input, input.readByte());
try {
infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads,
indexOptions, docValuesType, dvGen, attributes,
pointDataDimensionCount, pointIndexDimensionCount, pointNumBytes, vectorDimension, vectorDistFunc, isSoftDeletesField);
infos[i].checkConsistency();
} catch (IllegalStateException e) {
throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e);
}
}
} catch (Throwable exception) {
priorE = exception;
} finally {
CodecUtil.checkFooter(input, priorE);
}
return new FieldInfos(infos);
}
}
static {
// We "mirror" DocValues enum values with the constants below; let's try to ensure if we add a new DocValuesType while this format is
// still used for writing, we remember to fix this encoding:
assert DocValuesType.values().length == 6;
}
private static byte docValuesByte(DocValuesType type) {
switch(type) {
case NONE:
return 0;
case NUMERIC:
return 1;
case BINARY:
return 2;
case SORTED:
return 3;
case SORTED_SET:
return 4;
case SORTED_NUMERIC:
return 5;
default:
// BUG
throw new AssertionError("unhandled DocValuesType: " + type);
}
}
private static DocValuesType getDocValuesType(IndexInput input, byte b) throws IOException {
switch(b) {
case 0:
return DocValuesType.NONE;
case 1:
return DocValuesType.NUMERIC;
case 2:
return DocValuesType.BINARY;
case 3:
return DocValuesType.SORTED;
case 4:
return DocValuesType.SORTED_SET;
case 5:
return DocValuesType.SORTED_NUMERIC;
default:
throw new CorruptIndexException("invalid docvalues byte: " + b, input);
}
}
private static VectorValues.ScoreFunction getDistFunc(IndexInput input, byte b) throws IOException {
if (b < 0 || b >= VectorValues.ScoreFunction.values().length) {
throw new CorruptIndexException("invalid distance function: " + b, input);
}
return VectorValues.ScoreFunction.values()[b];
}
static {
// We "mirror" IndexOptions enum values with the constants below; let's try to ensure if we add a new IndexOption while this format is
// still used for writing, we remember to fix this encoding:
assert IndexOptions.values().length == 5;
}
private static byte indexOptionsByte(IndexOptions indexOptions) {
switch (indexOptions) {
case NONE:
return 0;
case DOCS:
return 1;
case DOCS_AND_FREQS:
return 2;
case DOCS_AND_FREQS_AND_POSITIONS:
return 3;
case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
return 4;
default:
// BUG:
throw new AssertionError("unhandled IndexOptions: " + indexOptions);
}
}
private static IndexOptions getIndexOptions(IndexInput input, byte b) throws IOException {
switch (b) {
case 0:
return IndexOptions.NONE;
case 1:
return IndexOptions.DOCS;
case 2:
return IndexOptions.DOCS_AND_FREQS;
case 3:
return IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
case 4:
return IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
default:
// BUG
throw new CorruptIndexException("invalid IndexOptions byte: " + b, input);
}
}
@Override
public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, EXTENSION);
try (IndexOutput output = directory.createOutput(fileName, context)) {
CodecUtil.writeIndexHeader(output, Lucene90FieldInfosFormat.CODEC_NAME, Lucene90FieldInfosFormat.FORMAT_CURRENT, segmentInfo.getId(), segmentSuffix);
output.writeVInt(infos.size());
for (FieldInfo fi : infos) {
fi.checkConsistency();
output.writeString(fi.name);
output.writeVInt(fi.number);
byte bits = 0x0;
if (fi.hasVectors()) bits |= STORE_TERMVECTOR;
if (fi.omitsNorms()) bits |= OMIT_NORMS;
if (fi.hasPayloads()) bits |= STORE_PAYLOADS;
if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD;
output.writeByte(bits);
output.writeByte(indexOptionsByte(fi.getIndexOptions()));
// write the DV type as a single byte
output.writeByte(docValuesByte(fi.getDocValuesType()));
output.writeLong(fi.getDocValuesGen());
output.writeMapOfStrings(fi.attributes());
output.writeVInt(fi.getPointDimensionCount());
if (fi.getPointDimensionCount() != 0) {
output.writeVInt(fi.getPointIndexDimensionCount());
output.writeVInt(fi.getPointNumBytes());
}
output.writeVInt(fi.getVectorDimension());
output.writeByte((byte) fi.getVectorScoreFunction().ordinal());
}
CodecUtil.writeFooter(output);
}
}
/** Extension of field infos */
static final String EXTENSION = "fnm";
// Codec header
static final String CODEC_NAME = "Lucene90FieldInfos";
static final int FORMAT_START = 0;
static final int FORMAT_SOFT_DELETES = 1;
static final int FORMAT_SELECTIVE_INDEXING = 2;
static final int FORMAT_CURRENT = FORMAT_SELECTIVE_INDEXING;
// Field flags
static final byte STORE_TERMVECTOR = 0x1;
static final byte OMIT_NORMS = 0x2;
static final byte STORE_PAYLOADS = 0x4;
static final byte SOFT_DELETES_FIELD = 0x8;
}
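
To make the FieldBits layout documented above concrete, a small decoding sketch; the flag values mirror the constants at the bottom of this class, but the helper itself is illustrative, not real API.

// Illustrative decoder for the FieldBits byte; the values mirror
// STORE_TERMVECTOR, OMIT_NORMS, STORE_PAYLOADS and SOFT_DELETES_FIELD above.
final class FieldBitsSketch {
  static String describe(byte bits) {
    StringBuilder sb = new StringBuilder();
    if ((bits & 0x1) != 0) sb.append("term-vectors ");  // STORE_TERMVECTOR
    if ((bits & 0x2) != 0) sb.append("omit-norms ");    // OMIT_NORMS
    if ((bits & 0x4) != 0) sb.append("payloads ");      // STORE_PAYLOADS
    if ((bits & 0x8) != 0) sb.append("soft-deletes ");  // SOFT_DELETES_FIELD
    return sb.toString().trim();
  }
}
// e.g. describe((byte) (0x1 | 0x8)) returns "term-vectors soft-deletes"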

View File

@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene90;
import java.io.IOException;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.VectorWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
/**
* Lucene 9.0 vector format, which encodes dense numeric vector values.
* TODO: add support for approximate KNN search.
*/
public final class Lucene90VectorFormat extends VectorFormat {
static final String META_CODEC_NAME = "Lucene90VectorFormatMeta";
static final String VECTOR_DATA_CODEC_NAME = "Lucene90VectorFormatData";
static final String META_EXTENSION = "vem";
static final String VECTOR_DATA_EXTENSION = "vec";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
/** Sole constructor */
public Lucene90VectorFormat() {
}
@Override
public VectorWriter fieldsWriter(SegmentWriteState state) throws IOException {
return new Lucene90VectorWriter(state);
}
@Override
public VectorReader fieldsReader(SegmentReadState state) throws IOException {
return new Lucene90VectorReader(state);
}
}
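
The format creates two files per segment, named with the same IndexFileNames helper the reader and writer use. A sketch for a segment named "_0" with an empty suffix; literal extensions are used here because the constants above are package-private.

import org.apache.lucene.index.IndexFileNames;

class VectorFileNamesSketch {
  static final String META = IndexFileNames.segmentFileName("_0", "", "vem"); // "_0.vem"
  static final String DATA = IndexFileNames.segmentFileName("_0", "", "vec"); // "_0.vec"
}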

View File

@ -0,0 +1,345 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene90;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.FloatBuffer;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Reads vectors from the index segments.
* @lucene.experimental
*/
public final class Lucene90VectorReader extends VectorReader {
private final FieldInfos fieldInfos;
private final Map<String, FieldEntry> fields = new HashMap<>();
private final IndexInput vectorData;
private final int maxDoc;
Lucene90VectorReader(SegmentReadState state) throws IOException {
this.fieldInfos = state.fieldInfos;
this.maxDoc = state.segmentInfo.maxDoc();
String metaFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene90VectorFormat.META_EXTENSION);
int versionMeta = -1;
try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName, state.context)) {
Throwable priorE = null;
try {
versionMeta = CodecUtil.checkIndexHeader(meta,
Lucene90VectorFormat.META_CODEC_NAME,
Lucene90VectorFormat.VERSION_START,
Lucene90VectorFormat.VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);
readFields(meta, state.fieldInfos);
} catch (Throwable exception) {
priorE = exception;
} finally {
CodecUtil.checkFooter(meta, priorE);
}
}
boolean success = false;
String vectorDataFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene90VectorFormat.VECTOR_DATA_EXTENSION);
this.vectorData = state.directory.openInput(vectorDataFileName, state.context);
try {
int versionVectorData = CodecUtil.checkIndexHeader(vectorData,
Lucene90VectorFormat.VECTOR_DATA_CODEC_NAME,
Lucene90VectorFormat.VERSION_START,
Lucene90VectorFormat.VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);
if (versionMeta != versionVectorData) {
throw new CorruptIndexException("Format versions mismatch: meta=" + versionMeta + ", vector data=" + versionVectorData, vectorData);
}
CodecUtil.retrieveChecksum(vectorData);
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(this.vectorData);
}
}
}
private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOException {
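// the meta file holds one entry per vector field, terminated by a -1 field-number sentinel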
for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) {
FieldInfo info = infos.fieldInfo(fieldNumber);
if (info == null) {
throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
}
int scoreFunctionId = meta.readInt();
if (scoreFunctionId < 0 || scoreFunctionId >= VectorValues.ScoreFunction.values().length) {
throw new CorruptIndexException("Invalid score function id: " + scoreFunctionId, meta);
}
VectorValues.ScoreFunction scoreFunction = VectorValues.ScoreFunction.values()[scoreFunctionId];
long vectorDataOffset = meta.readVLong();
long vectorDataLength = meta.readVLong();
int dimension = meta.readInt();
int size = meta.readInt();
int[] ordToDoc = new int[size];
for (int i = 0; i < size; i++) {
int doc = meta.readVInt();
ordToDoc[i] = doc;
}
FieldEntry fieldEntry = new FieldEntry(dimension, scoreFunction, maxDoc, vectorDataOffset, vectorDataLength,
ordToDoc);
fields.put(info.name, fieldEntry);
}
}
@Override
public long ramBytesUsed() {
long totalBytes = RamUsageEstimator.shallowSizeOfInstance(Lucene90VectorReader.class);
totalBytes += RamUsageEstimator.sizeOfMap(fields, RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class));
for (FieldEntry entry : fields.values()) {
totalBytes += RamUsageEstimator.sizeOf(entry.ordToDoc);
}
return totalBytes;
}
@Override
public void checkIntegrity() throws IOException {
CodecUtil.checksumEntireFile(vectorData);
}
@Override
public VectorValues getVectorValues(String field) throws IOException {
FieldInfo info = fieldInfos.fieldInfo(field);
if (info == null) {
return null;
}
int dimension = info.getVectorDimension();
if (dimension == 0) {
return VectorValues.EMPTY;
}
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
// There is a FieldInfo, but no vectors. Should we have deleted the FieldInfo?
return null;
}
if (dimension != fieldEntry.dimension) {
throw new IllegalStateException("Inconsistent vector dimension for field=\"" + field + "\"; " + dimension + " != " + fieldEntry.dimension);
}
long numBytes = (long) fieldEntry.size() * dimension * Float.BYTES;
if (numBytes != fieldEntry.vectorDataLength) {
throw new IllegalStateException("Vector data length " + fieldEntry.vectorDataLength +
" not matching size=" + fieldEntry.size() + " * dim=" + dimension + " * 4 = " +
numBytes);
}
IndexInput bytesSlice = vectorData.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength);
return new OffHeapVectorValues(fieldEntry, bytesSlice);
}
@Override
public void close() throws IOException {
vectorData.close();
}
private static class FieldEntry {
final int dimension;
final VectorValues.ScoreFunction scoreFunction;
final int maxDoc;
final long vectorDataOffset;
final long vectorDataLength;
final int[] ordToDoc;
FieldEntry(int dimension, VectorValues.ScoreFunction scoreFunction, int maxDoc,
long vectorDataOffset, long vectorDataLength, int[] ordToDoc) {
this.dimension = dimension;
this.scoreFunction = scoreFunction;
this.maxDoc = maxDoc;
this.vectorDataOffset = vectorDataOffset;
this.vectorDataLength = vectorDataLength;
this.ordToDoc = ordToDoc;
}
int size() {
return ordToDoc.length;
}
}
/** Reads vector values from the index input, supporting both iteration and random access. */
private final static class OffHeapVectorValues extends VectorValues {
final FieldEntry fieldEntry;
final IndexInput dataIn;
final BytesRef binaryValue;
final ByteBuffer byteBuffer;
final FloatBuffer floatBuffer;
final int byteSize;
final float[] value;
int ord = -1;
int doc = -1;
OffHeapVectorValues(FieldEntry fieldEntry, IndexInput dataIn) {
this.fieldEntry = fieldEntry;
this.dataIn = dataIn;
byteSize = Float.BYTES * fieldEntry.dimension;
byteBuffer = ByteBuffer.allocate(byteSize);
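// float view over byteBuffer's backing array: readBytes fills the bytes, the view decodes them without copying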
floatBuffer = byteBuffer.asFloatBuffer();
value = new float[fieldEntry.dimension];
binaryValue = new BytesRef(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
}
@Override
public int dimension() {
return fieldEntry.dimension;
}
@Override
public int size() {
return fieldEntry.size();
}
@Override
public ScoreFunction scoreFunction() {
return fieldEntry.scoreFunction;
}
@Override
public float[] vectorValue() throws IOException {
binaryValue();
floatBuffer.position(0);
floatBuffer.get(value, 0, fieldEntry.dimension);
return value;
}
@Override
public BytesRef binaryValue() throws IOException {
dataIn.seek((long) ord * byteSize); // long multiplication avoids int overflow for large segments
dataIn.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
return binaryValue;
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() {
if (++ord >= size()) {
doc = NO_MORE_DOCS;
} else {
doc = fieldEntry.ordToDoc[ord];
}
return doc;
}
@Override
public int advance(int target) throws IOException {
// We could do better by log-binary search in ordToDoc, but this is never used
return slowAdvance(target);
}
@Override
public long cost() {
return fieldEntry.size();
}
@Override
public RandomAccess randomAccess() {
return new OffHeapRandomAccess(dataIn.clone());
}
class OffHeapRandomAccess implements VectorValues.RandomAccess {
final IndexInput dataIn;
final BytesRef binaryValue;
final ByteBuffer byteBuffer;
final FloatBuffer floatBuffer;
final int byteSize;
final float[] value;
OffHeapRandomAccess(IndexInput dataIn) {
this.dataIn = dataIn;
byteSize = Float.BYTES * dimension();
byteBuffer = ByteBuffer.allocate(byteSize);
floatBuffer = byteBuffer.asFloatBuffer();
value = new float[dimension()];
binaryValue = new BytesRef(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
}
@Override
public int size() {
return fieldEntry.size();
}
@Override
public int dimension() {
return fieldEntry.dimension;
}
@Override
public VectorValues.ScoreFunction scoreFunction() {
return fieldEntry.scoreFunction;
}
@Override
public float[] vectorValue(int targetOrd) throws IOException {
readValue(targetOrd);
floatBuffer.position(0);
floatBuffer.get(value);
return value;
}
@Override
public BytesRef binaryValue(int targetOrd) throws IOException {
readValue(targetOrd);
return binaryValue;
}
private void readValue(int targetOrd) throws IOException {
long offset = (long) targetOrd * byteSize; // long multiplication avoids int overflow
dataIn.seek(offset);
dataIn.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
}
@Override
public TopDocs search(float[] vector, int topK, int fanout) throws IOException {
throw new UnsupportedOperationException();
}
}
}
}
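
A hedged consumption sketch: scanning every vector this reader exposes through the VectorValues iterator. The LeafReader.getVectorValues accessor is assumed from elsewhere in this commit, and the class name is hypothetical.

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;

final class VectorScanSketch {
  static void scan(Directory dir, String field) throws IOException {
    try (IndexReader reader = DirectoryReader.open(dir)) {
      for (LeafReaderContext ctx : reader.leaves()) {
        VectorValues vectors = ctx.reader().getVectorValues(field); // accessor assumed from this commit
        if (vectors == null) {
          continue; // no vectors for this field in this segment
        }
        for (int doc = vectors.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = vectors.nextDoc()) {
          float[] v = vectors.vectorValue(); // valid only until the next iterator call
        }
      }
    }
  }
}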

View File

@ -0,0 +1,127 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene90;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.VectorWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
/**
* Writes vector values to index segments. Writing the knn graph is not yet implemented (see the TODO in writeField).
* @lucene.experimental
*/
public final class Lucene90VectorWriter extends VectorWriter {
private final IndexOutput meta, vectorData;
private boolean finished;
Lucene90VectorWriter(SegmentWriteState state) throws IOException {
assert state.fieldInfos.hasVectorValues();
String metaFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene90VectorFormat.META_EXTENSION);
meta = state.directory.createOutput(metaFileName, state.context);
String vectorDataFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene90VectorFormat.VECTOR_DATA_EXTENSION);
vectorData = state.directory.createOutput(vectorDataFileName, state.context);
try {
CodecUtil.writeIndexHeader(meta,
Lucene90VectorFormat.META_CODEC_NAME,
Lucene90VectorFormat.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(vectorData,
Lucene90VectorFormat.VECTOR_DATA_CODEC_NAME,
Lucene90VectorFormat.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
} catch (IOException e) {
IOUtils.closeWhileHandlingException(this);
throw e;
}
}
@Override
public void writeField(FieldInfo fieldInfo, VectorValues vectors) throws IOException {
long vectorDataOffset = vectorData.getFilePointer();
// TODO - use a better data structure; a bitset? DocsWithFieldSet is package-private in o.a.l.index
List<Integer> docIds = new ArrayList<>();
int docV, ord = 0;
for (docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc(), ord++) {
writeVectorValue(vectors);
docIds.add(docV);
// TODO: write knn graph value
}
long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
if (vectorDataLength > 0) {
writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds);
}
}
private void writeVectorValue(VectorValues vectors) throws IOException {
// write vector value
BytesRef binaryValue = vectors.binaryValue();
assert binaryValue.length == vectors.dimension() * Float.BYTES;
vectorData.writeBytes(binaryValue.bytes, binaryValue.offset, binaryValue.length);
}
private void writeMeta(FieldInfo field, long vectorDataOffset, long vectorDataLength, List<Integer> docIds) throws IOException {
meta.writeInt(field.number);
meta.writeInt(field.getVectorScoreFunction().ordinal());
meta.writeVLong(vectorDataOffset);
meta.writeVLong(vectorDataLength);
meta.writeInt(field.getVectorDimension());
meta.writeInt(docIds.size());
for (Integer docId : docIds) {
// TODO: delta-encode, or write as bitset
meta.writeVInt(docId);
}
}
@Override
public void finish() throws IOException {
if (finished) {
throw new IllegalStateException("already finished");
}
finished = true;
if (meta != null) {
// write end of fields marker
meta.writeInt(-1);
CodecUtil.writeFooter(meta);
}
if (vectorData != null) {
CodecUtil.writeFooter(vectorData);
}
}
@Override
public void close() throws IOException {
IOUtils.close(meta, vectorData);
}
}
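For reference, a sketch of the per-field meta record that writeMeta emits (and that the reader's FieldEntry parses back), inferred from the writer above; the field list is terminated by a field number of -1 ahead of the codec footer:

// int    fieldNumber       - FieldInfo.number
// int    scoreFunction     - ordinal of VectorValues.ScoreFunction
// vlong  vectorDataOffset  - start of this field's vectors in the .vec file
// vlong  vectorDataLength  - length of this field's vector data, in bytes
// int    dimension         - number of float components per vector
// int    size              - number of documents with a vector value
// vint   docIds[size]      - one docID per vector ordinal, in increasing order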

View File

@ -0,0 +1,429 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Lucene 9.0 file format.
*
* <h2>Apache Lucene - Index File Formats</h2>
* <div>
* <ul>
* <li><a href="#Introduction">Introduction</a></li>
* <li><a href="#Definitions">Definitions</a>
* <ul>
* <li><a href="#Inverted_Indexing">Inverted Indexing</a></li>
* <li><a href="#Types_of_Fields">Types of Fields</a></li>
* <li><a href="#Segments">Segments</a></li>
* <li><a href="#Document_Numbers">Document Numbers</a></li>
* </ul>
* </li>
* <li><a href="#Overview">Index Structure Overview</a></li>
* <li><a href="#File_Naming">File Naming</a></li>
* <li><a href="#file-names">Summary of File Extensions</a>
* <ul>
* <li><a href="#Lock_File">Lock File</a></li>
* <li><a href="#History">History</a></li>
* <li><a href="#Limitations">Limitations</a></li>
* </ul>
* </li>
* </ul>
* </div>
* <a id="Introduction"></a>
* <h3>Introduction</h3>
* <div>
* <p>This document defines the index file formats used in this version of Lucene.
* If you are using a different version of Lucene, please consult the copy of
* <code>docs/</code> that was distributed with
* the version you are using.</p>
* <p>This document attempts to provide a high-level definition of the Apache
* Lucene file formats.</p>
* </div>
* <a id="Definitions"></a>
* <h3>Definitions</h3>
* <div>
* <p>The fundamental concepts in Lucene are index, document, field and term.</p>
* <p>An index contains a sequence of documents.</p>
* <ul>
* <li>A document is a sequence of fields.</li>
* <li>A field is a named sequence of terms.</li>
* <li>A term is a sequence of bytes.</li>
* </ul>
* <p>The same sequence of bytes in two different fields is considered a different
* term. Thus terms are represented as a pair: the string naming the field, and the
* bytes within the field.</p>
* <a id="Inverted_Indexing"></a>
* <h4>Inverted Indexing</h4>
* <p>Lucene's index stores terms and statistics about those terms in order to make
* term-based search more efficient. Lucene's terms index falls into the family of indexes known as
* an <i>inverted index.</i> This is because it can list, for a term, the documents that contain
* it. This is the inverse of the natural relationship, in which documents list terms.</p>
* <a id="Types_of_Fields"></a>
* <h4>Types of Fields</h4>
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored
* in the index literally, in a non-inverted manner. Fields that are inverted are
* called <i>indexed</i>. A field may be both stored and indexed.</p>
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the
* text of a field may be used literally as a term to be indexed. Most fields are
* tokenized, but sometimes it is useful for certain identifier fields to be
* indexed literally.</p>
* <p>See the {@link org.apache.lucene.document.Field Field}
* java docs for more information on Fields.</p>
* <a id="Segments"></a>
* <h4>Segments</h4>
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>.
* Each segment is a fully independent index, which could be searched separately.
* Indexes evolve by:</p>
* <ol>
* <li>Creating new segments for newly added documents.</li>
* <li>Merging existing segments.</li>
* </ol>
* <p>Searches may involve multiple segments and/or multiple indexes, each index
* potentially composed of a set of segments.</p>
* <a id="Document_Numbers"></a>
* <h4>Document Numbers</h4>
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>.
* The first document added to an index is numbered zero, and each subsequent
* document added gets a number one greater than the previous.</p>
* <p>Note that a document's number may change, so caution should be taken when
* storing these numbers outside of Lucene. In particular, numbers may change in
* the following situations:</p>
* <ul>
* <li>
* <p>The numbers stored in each segment are unique only within the segment, and
* must be converted before they can be used in a larger context. The standard
* technique is to allocate each segment a range of values, based on the range of
* numbers used in that segment. To convert a document number from a segment to an
* external value, the segment's <i>base</i> document number is added. To convert
* an external value back to a segment-specific value, the segment is identified
* by the range that the external value is in, and the segment's base value is
* subtracted. For example two five document segments might be combined, so that
* the first segment has a base value of zero, and the second of five. Document
* three from the second segment would have an external value of eight.</p>
* </li>
* <li>
* <p>When documents are deleted, gaps are created in the numbering. These are
* eventually removed as the index evolves through merging. Deleted documents are
* dropped when segments are merged. A freshly-merged segment thus has no gaps in
* its numbering.</p>
* </li>
* </ul>
* </div>
* <a id="Overview"></a>
* <h3>Index Structure Overview</h3>
* <div>
* <p>Each segment index maintains the following:</p>
* <ul>
* <li>
* {@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment info}.
* This contains metadata about a segment, such as the number of documents,
* what files it uses, and information about how the segment is sorted
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat Field names}.
* This contains metadata about the set of named fields used in the index.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Stored Field values}.
* This contains, for each document, a list of attribute-value pairs, where the attributes
* are field names. These are used to store auxiliary information about the document, such as
* its title, url, or an identifier to access a database. The set of stored fields is what is
* returned for each hit when searching. This is keyed by document number.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term dictionary}.
* A dictionary containing all of the terms used in all of the
* indexed fields of all of the documents. The dictionary also contains the number
* of documents which contain the term, and pointers to the term's frequency and
* proximity data.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Frequency data}.
* For each term in the dictionary, the numbers of all the
* documents that contain that term, and the frequency of the term in that
* document, unless frequencies are omitted ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Proximity data}.
* For each term in the dictionary, the positions that the
* term occurs in each document. Note that this will not exist if all fields in
* all documents omit position data.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Normalization factors}.
* For each field in each document, a value is stored
* that is multiplied into the score for hits on that field.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}.
* For each field in each document, the term vector (sometimes
* called document vector) may be stored. A term vector consists of term text and
* term frequency. To add Term Vectors to your index see the
* {@link org.apache.lucene.document.Field Field} constructors
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-document values}.
* Like stored values, these are also keyed by document
* number, but are generally intended to be loaded into main memory for fast
* access. Whereas stored values are generally intended for summary results from
* searches, per-document values are useful for things like scoring factors.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
* An optional file indicating which documents are live.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}.
* Optional pair of files, recording dimensionally indexed fields, to enable fast
* numeric range filtering and large numeric values like BigInteger and BigDecimal (1D)
* and geographic shape intersection (2D, 3D).
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene90.Lucene90VectorFormat Vector values}.
* The vector format stores numeric vectors in a format optimized for random access and computation,
* supporting high-dimensional nearest-neighbor search.
* </li>
* </ul>
* <p>Details on each of these are provided in their linked pages.</p>
* </div>
* <a id="File_Naming"></a>
* <h3>File Naming</h3>
* <div>
* <p>All files belonging to a segment have the same name with varying extensions.
* The extensions correspond to the different file formats described below. When
* using the Compound File format (default for small segments) these files (except
* for the Segment info file, the Lock file, and Deleted documents file) are collapsed
* into a single .cfs file (see below for details)</p>
* <p>Typically, all segments in an index are stored in a single directory,
* although this is not required.</p>
* <p>File names are never re-used. That is, when any file is saved
* to the Directory it is given a never before used filename. This is achieved
* using a simple generations approach. For example, the first segments file is
* segments_1, then segments_2, etc. The generation is a sequential long integer
* represented in alpha-numeric (base 36) form.</p>
* </div>
* <a id="file-names"></a>
* <h3>Summary of File Extensions</h3>
* <div>
* <p>The following table summarizes the names and extensions of the files in
* Lucene:</p>
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
* <caption>lucene filenames by extension</caption>
* <tr>
* <th>Name</th>
* <th>Extension</th>
* <th>Brief Description</th>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
* <td>segments_N</td>
* <td>Stores information about a commit point</td>
* </tr>
* <tr>
* <td><a href="#Lock_File">Lock File</a></td>
* <td>write.lock</td>
* <td>The Write lock prevents multiple IndexWriters from writing to the same
* file.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment Info}</td>
* <td>.si</td>
* <td>Stores metadata about a segment</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File}</td>
* <td>.cfs, .cfe</td>
* <td>An optional "virtual" file consisting of all the other index files for
* systems that frequently run out of file handles.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Fields}</td>
* <td>.fnm</td>
* <td>Stores information about the fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Index}</td>
* <td>.fdx</td>
* <td>Contains pointers to field data</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Data}</td>
* <td>.fdt</td>
* <td>The stored fields for documents</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Dictionary}</td>
* <td>.tim</td>
* <td>The term dictionary, stores term info</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Index}</td>
* <td>.tip</td>
* <td>The index into the Term Dictionary</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Frequencies}</td>
* <td>.doc</td>
* <td>Contains the list of docs which contain each term along with frequency</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Positions}</td>
* <td>.pos</td>
* <td>Stores position information about where a term occurs in the index</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Payloads}</td>
* <td>.pay</td>
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Norms}</td>
* <td>.nvd, .nvm</td>
* <td>Encodes length and boost factors for docs and fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-Document Values}</td>
* <td>.dvd, .dvm</td>
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index}</td>
* <td>.tvx</td>
* <td>Stores offset into the document data file</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data}</td>
* <td>.tvd</td>
* <td>Contains term vector data.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}</td>
* <td>.liv</td>
* <td>Info about what documents are live</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}</td>
* <td>.dii, .dim</td>
* <td>Holds indexed points</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90VectorFormat Vector values}</td>
* <td>.vec, .vem</td>
* <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data, and
* <code>.vem</code> the vector metadata</td>
* </tr>
* </table>
* </div>
* <a id="Lock_File"></a>
* <h3>Lock File</h3>
* The write lock, which is stored in the index directory by default, is named
* "write.lock". If the lock directory is different from the index directory then
* the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
* derived from the full path to the index directory. When this file is present, a
* writer is currently modifying the index (adding or removing documents). This
* lock file ensures that only one writer is modifying the index at a time.
* <a id="History"></a>
* <h3>History</h3>
* <p>Compatibility notes are provided in this document, describing how file
* formats have changed from prior versions:</p>
* <ul>
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie,
* no more commit lock). The change is fully backwards compatible: you can open a
* pre-2.1 index for searching or adding/deleting of docs. When the new segments
* file is saved (committed), it will be written in the new file format (meaning
* no specific "upgrade" process is needed). But note that once a commit has
* occurred, pre-2.1 Lucene will not be able to read the index.</li>
* <li>In version 2.3, the file format was changed to allow segments to share a
* single set of doc store (vectors &amp; stored fields) files. This allows for
* faster indexing in certain cases. The change is fully backwards compatible (in
* the same way as the lock-less commits change in 2.1).</li>
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not
* Java's modified UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">
* LUCENE-510</a> for details.</li>
* <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData
* may be passed to IndexWriter's commit methods (and later retrieved), which is
* recorded in the segments_N file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">
* LUCENE-1382</a> for details. Also,
* diagnostics were added to each segment written recording details about why it
* was written (due to flush, merge; which OS/JRE was used; etc.). See issue
* <a href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.</li>
* <li>In version 3.0, compressed fields are no longer written to the index (they
* can still be read, but on merge the new segment will write them, uncompressed).
* See issue <a href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a>
* for details.</li>
* <li>In version 3.1, segments records the code version that created them. See
* <a href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
* Additionally segments track explicitly whether or not they have term vectors.
* See <a href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a>
* for details.</li>
* <li>In version 3.2, numeric fields are written natively to the stored fields
* file; previously they were stored in text format only.</li>
* <li>In version 3.4, fields can omit position data while still indexing term
* frequencies.</li>
* <li>In version 4.0, the format of the inverted index became extensible via
* the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
* ({@code DocValues}) was introduced. Normalization factors need no longer be a
* single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
* Terms need not be unicode strings, they can be any byte sequence. Term offsets
* can optionally be indexed into the postings lists. Payloads can be stored in the
* term vectors.</li>
* <li>In version 4.1, the format of the postings list changed to use either
* of FOR compression or variable-byte encoding, depending upon the frequency
* of the term. Terms appearing only once were changed to inline directly into
* the term dictionary. Stored fields are compressed by default. </li>
* <li>In version 4.2, term vectors are compressed by default. DocValues has
* a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
* on multi-valued fields.</li>
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.</li>
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
* allow updating NumericDocValues fields.</li>
* <li>In version 4.8, checksum footers were added to the end of each index file
* for improved data integrity. Specifically, the last 8 bytes of every index file
* contain the zlib-crc32 checksum of the file.</li>
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
* that is suitable for faceting/sorting/analytics.</li>
* <li>In version 5.4, DocValues have been improved to store more information on disk:
* addresses for binary fields and ord indexes for multi-valued fields.</li>
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.</li>
* <li>In version 6.2, a new Segment info format reads/writes the index sort, to support index sorting.</li>
* <li>In version 7.0, DocValues have been improved to better support sparse doc values
* thanks to an iterator API.</li>
* <li>In version 8.0, postings have been enhanced to record, for each block of
* doc ids, the (term freq, normalization factor) pairs that may trigger the
* maximum score of the block. This information is recorded alongside skip data
* in order to be able to skip blocks of doc ids if they may not produce high
* enough scores.
* Additionally doc values and norms have been extended with jump-tables to make access O(1)
* instead of O(n), where n is the number of elements to skip when advancing in the data.</li>
* <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
* performant encoding that is vectorized.</li>
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to
* allow user-defined sorts to be used.</li>
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
* smaller stored fields.</li>
* <li>In version 9.0, vector-valued fields were added.</li>
* </ul>
* <a id="Limitations"></a>
* <h3>Limitations</h3>
* <div>
* <p>Lucene uses a Java <code>int</code> to refer to
* document numbers, and the index file format uses an <code>Int32</code>
* on-disk to store document numbers. This is a limitation
* of both the index file format and the current implementation. Eventually these
* should be replaced with either <code>UInt64</code> values, or
* better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.</p>
* </div>
*/
package org.apache.lucene.codecs.lucene90;
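A tiny sketch of the segment-base arithmetic described under Document Numbers above; segmentBases is an illustrative per-segment array of base docIDs, e.g. {0, 5} for two five-document segments, under which local doc 3 of the second segment maps to external number 8:

// Convert between segment-local and index-wide ("external") doc numbers.
static int toExternal(int[] segmentBases, int segmentOrd, int localDoc) {
  return segmentBases[segmentOrd] + localDoc;
}

static int toLocal(int[] segmentBases, int segmentOrd, int externalDoc) {
  return externalDoc - segmentBases[segmentOrd];
}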

View File

@ -25,6 +25,7 @@ import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.VectorValues;
/**
* Describes the properties of a field.
@ -44,6 +45,8 @@ public class FieldType implements IndexableFieldType {
private int dimensionCount;
private int indexDimensionCount;
private int dimensionNumBytes;
private int vectorDimension;
private VectorValues.ScoreFunction vectorScoreFunction = VectorValues.ScoreFunction.NONE;
private Map<String, String> attributes;
/**
@ -62,6 +65,8 @@ public class FieldType implements IndexableFieldType {
this.dimensionCount = ref.pointDimensionCount();
this.indexDimensionCount = ref.pointIndexDimensionCount();
this.dimensionNumBytes = ref.pointNumBytes();
this.vectorDimension = ref.vectorDimension();
this.vectorScoreFunction = ref.vectorScoreFunction();
if (ref.getAttributes() != null) {
this.attributes = new HashMap<>(ref.getAttributes());
}
@ -295,6 +300,7 @@ public class FieldType implements IndexableFieldType {
* Enables points indexing with selectable dimension indexing.
*/
public void setDimensions(int dimensionCount, int indexDimensionCount, int dimensionNumBytes) {
checkIfFrozen();
if (dimensionCount < 0) {
throw new IllegalArgumentException("dimensionCount must be >= 0; got " + dimensionCount);
}
@ -351,6 +357,28 @@ public class FieldType implements IndexableFieldType {
return dimensionNumBytes;
}
void setVectorDimensionsAndScoreFunction(int numDimensions, VectorValues.ScoreFunction distFunc) {
checkIfFrozen();
if (numDimensions <= 0) {
throw new IllegalArgumentException("vector numDimensions must be > 0; got " + numDimensions);
}
if (numDimensions > VectorValues.MAX_DIMENSIONS) {
throw new IllegalArgumentException("vector numDimensions must be <= VectorValues.MAX_DIMENSIONS (=" + VectorValues.MAX_DIMENSIONS + "); got " + numDimensions);
}
this.vectorDimension = numDimensions;
this.vectorScoreFunction = distFunc;
}
@Override
public int vectorDimension() {
return vectorDimension;
}
@Override
public VectorValues.ScoreFunction vectorScoreFunction() {
return vectorScoreFunction;
}
/**
* Puts an attribute value.
* <p>

View File

@ -0,0 +1,99 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document;
import org.apache.lucene.index.VectorValues;
/** A field that contains a single floating-point numeric vector (or none) for each document.
* Vectors are dense - that is, every dimension of a vector contains an explicit value, stored
* packed into an array (of type float[]) whose length is the vector dimension. Values can be
* retrieved using {@link VectorValues}, which is a forward-only docID-based iterator and also
* offers random access by dense ordinal (not docID). A {@link VectorValues.ScoreFunction} may be
* used to compare vectors at query time (for example as part of result ranking). A VectorField may
* be associated with a score function that defines the metric used for nearest-neighbor search
* among vectors of that field, but at the moment this association is purely nominal: it is intended
* for future use by the to-be-implemented nearest neighbors search.
*/
public class VectorField extends Field {
private static FieldType getType(float[] v, VectorValues.ScoreFunction scoreFunction) {
if (v == null) {
throw new IllegalArgumentException("vector value must not be null");
}
int dimension = v.length;
if (dimension == 0) {
throw new IllegalArgumentException("cannot index an empty vector");
}
if (dimension > VectorValues.MAX_DIMENSIONS) {
throw new IllegalArgumentException("cannot index vectors with dimension greater than " + VectorValues.MAX_DIMENSIONS);
}
if (scoreFunction == null) {
throw new IllegalArgumentException("score function must not be null");
}
FieldType type = new FieldType();
type.setVectorDimensionsAndScoreFunction(dimension, scoreFunction);
type.freeze();
return type;
}
/** Creates a numeric vector field. Fields are single-valued: each document has either one value
* or no value. Vectors of a single field share the same dimension and score function.
*
* @param name field name
* @param vector the vector value
* @param scoreFunction a function defining vector proximity.
* @throws IllegalArgumentException if any parameter is null, or the vector is empty or has dimension &gt; 1024.
*/
public VectorField(String name, float[] vector, VectorValues.ScoreFunction scoreFunction) {
super(name, getType(vector, scoreFunction));
fieldsData = vector;
}
/** Creates a numeric vector field with the default EUCLIDEAN (L2) score function. Fields are
* single-valued: each document has either one value or no value. Vectors of a single field share
* the same dimension and score function.
*
* @param name field name
* @param vector the vector value
* @throws IllegalArgumentException if any parameter is null, or the vector is empty or has dimension &gt; 1024.
*/
public VectorField(String name, float[] vector) {
this(name, vector, VectorValues.ScoreFunction.EUCLIDEAN);
}
/**
* Return the vector value of this field
*/
public float[] vectorValue() {
return (float[]) fieldsData;
}
/**
* Set the vector value of this field
* @param value the value to set; must not be null, and length must match the field type
*/
public void setVectorValue(float[] value) {
if (value == null) {
throw new IllegalArgumentException("value must not be null");
}
if (value.length != type.vectorDimension()) {
throw new IllegalArgumentException("value length " + value.length + " must match field dimension " + type.vectorDimension());
}
fieldsData = value;
}
}
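A minimal end-to-end sketch of indexing and reading this field; the directory, field name "v" and vector values are illustrative, and it assumes the codec in use supports vectors (i.e. its vectorFormat() is not VectorFormat.EMPTY, as with the new Lucene90 codec):

try (Directory dir = new ByteBuffersDirectory();
     IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
  Document doc = new Document();
  doc.add(new VectorField("v", new float[] {0.1f, 0.2f})); // default EUCLIDEAN score function
  writer.addDocument(doc);
  writer.commit();
  try (DirectoryReader reader = DirectoryReader.open(dir)) {
    VectorValues vectors = reader.leaves().get(0).reader().getVectorValues("v");
    // iterate with nextDoc()/vectorValue() as shown earlier
  }
}

All vectors of one field must share the same dimension and score function; adding a VectorField for "v" with a different dimension or score function fails with IllegalArgumentException (the consistency checks live in FieldInfo and FieldInfos, below).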

View File

@ -220,6 +220,9 @@ public final class CheckIndex implements Closeable {
/** Status of index sort */
public IndexSortStatus indexSortStatus;
/** Status of vectors */
public VectorValuesStatus vectorValuesStatus;
}
/**
@ -374,7 +377,25 @@ public final class CheckIndex implements Closeable {
/** Total number of fields with points. */
public int totalValueFields;
/** Exception thrown during doc values test (null on success) */
/** Exception thrown during point values test (null on success) */
public Throwable error = null;
}
/**
* Status from testing VectorValues
*/
public static final class VectorValuesStatus {
VectorValuesStatus() {
}
/** Total number of vector values tested. */
public long totalVectorValues;
/** Total number of fields with vectors. */
public int totalVectorFields;
/** Exception thrown during vector values test (null on success) */
public Throwable error = null;
}
@ -731,6 +752,9 @@ public final class CheckIndex implements Closeable {
// Test PointValues
segInfoStat.pointsStatus = testPoints(reader, infoStream, failFast);
// Test VectorValues
segInfoStat.vectorValuesStatus = testVectors(reader, infoStream, failFast);
// Test index sort
segInfoStat.indexSortStatus = testSort(reader, indexSort, infoStream, failFast);
@ -1955,6 +1979,65 @@ public final class CheckIndex implements Closeable {
return status;
}
/**
* Test the vectors index
* @lucene.experimental
*/
public static Status.VectorValuesStatus testVectors(CodecReader reader, PrintStream infoStream, boolean failFast) throws IOException {
if (infoStream != null) {
infoStream.print(" test: vectors..............");
}
long startNS = System.nanoTime();
FieldInfos fieldInfos = reader.getFieldInfos();
Status.VectorValuesStatus status = new Status.VectorValuesStatus();
try {
if (fieldInfos.hasVectorValues()) {
for (FieldInfo fieldInfo : fieldInfos) {
if (fieldInfo.hasVectorValues()) {
int dimension = fieldInfo.getVectorDimension();
if (dimension <= 0) {
throw new RuntimeException("Field \"" + fieldInfo.name + "\" has vector values but dimension is " + dimension);
}
VectorValues values = reader.getVectorValues(fieldInfo.name);
if (values == null) {
continue;
}
status.totalVectorFields++;
int docCount = 0;
while (values.nextDoc() != NO_MORE_DOCS) {
int valueLength = values.vectorValue().length;
if (valueLength != dimension) {
throw new RuntimeException("Field \"" + fieldInfo.name + "\" has a value whose dimension=" + valueLength + " not matching the field's dimension=" + dimension);
}
++docCount;
}
if (docCount != values.size()) {
throw new RuntimeException("Field \"" + fieldInfo.name + "\" has size=" + values.size() + " but when iterated, returns " + docCount + " docs with values");
}
status.totalVectorValues += docCount;
}
}
}
msg(infoStream, String.format(Locale.ROOT, "OK [%d fields, %d vectors] [took %.3f sec]", status.totalVectorFields, status.totalVectorValues, nsToSec(System.nanoTime()-startNS)));
} catch (Throwable e) {
if (failFast) {
throw IOUtils.rethrowAlways(e);
}
msg(infoStream, "ERROR: " + e);
status.error = e;
if (infoStream != null) {
e.printStackTrace(infoStream);
}
}
return status;
}
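// A hedged sketch of driving the new vectors test through the public CheckIndex
// API; "dir" here is an illustrative open Directory, not part of this class:
//
//   try (CheckIndex checker = new CheckIndex(dir)) {
//     checker.setInfoStream(System.out);
//     CheckIndex.Status status = checker.checkIndex();
//     for (CheckIndex.Status.SegmentInfoStatus segment : status.segmentInfos) {
//       CheckIndex.Status.VectorValuesStatus vectors = segment.vectorValuesStatus;
//       if (vectors != null && vectors.error != null) {
//         // the vectors test failed for this segment; see vectors.error
//       }
//     }
//   }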
/** Walks the entire N-dimensional points space, verifying that all points fall within the last cell's boundaries.
*
* @lucene.internal */

View File

@ -26,6 +26,7 @@ import java.util.Objects;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
@ -77,6 +78,12 @@ public abstract class CodecReader extends LeafReader implements Accountable {
* @lucene.internal
*/
public abstract PointsReader getPointsReader();
/**
* Expert: retrieve underlying VectorReader
* @lucene.internal
*/
public abstract VectorReader getVectorReader();
@Override
public final void document(int docID, StoredFieldVisitor visitor) throws IOException {
@ -202,6 +209,18 @@ public abstract class CodecReader extends LeafReader implements Accountable {
return getPointsReader().getValues(field);
}
@Override
public final VectorValues getVectorValues(String field) throws IOException {
ensureOpen();
FieldInfo fi = getFieldInfos().fieldInfo(field);
if (fi == null || fi.getVectorDimension() == 0) {
// Field does not exist or does not index vectors
return null;
}
return getVectorReader().getVectorValues(field);
}
@Override
protected void doClose() throws IOException {
}

View File

@ -47,6 +47,11 @@ abstract class DocValuesLeafReader extends LeafReader {
throw new UnsupportedOperationException();
}
@Override
public final VectorValues getVectorValues(String field) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public final void checkIntegrity() throws IOException {
throw new UnsupportedOperationException();

View File

@ -54,6 +54,9 @@ public final class FieldInfo {
private int pointIndexDimensionCount;
private int pointNumBytes;
private int vectorDimension; // if it is a positive value, it means this field indexes vectors
private VectorValues.ScoreFunction vectorScoreFunction = VectorValues.ScoreFunction.NONE;
// whether this field is used as the soft-deletes field
private final boolean softDeletesField;
@ -64,7 +67,8 @@ public final class FieldInfo {
*/
public FieldInfo(String name, int number, boolean storeTermVector, boolean omitNorms, boolean storePayloads,
IndexOptions indexOptions, DocValuesType docValues, long dvGen, Map<String,String> attributes,
int pointDimensionCount, int pointIndexDimensionCount, int pointNumBytes, boolean softDeletesField) {
int pointDimensionCount, int pointIndexDimensionCount, int pointNumBytes,
int vectorDimension, VectorValues.ScoreFunction vectorScoreFunction, boolean softDeletesField) {
this.name = Objects.requireNonNull(name);
this.number = number;
this.docValuesType = Objects.requireNonNull(docValues, "DocValuesType must not be null (field: \"" + name + "\")");
@ -83,6 +87,8 @@ public final class FieldInfo {
this.pointDimensionCount = pointDimensionCount;
this.pointIndexDimensionCount = pointIndexDimensionCount;
this.pointNumBytes = pointNumBytes;
this.vectorDimension = vectorDimension;
this.vectorScoreFunction = vectorScoreFunction;
this.softDeletesField = softDeletesField;
this.checkConsistency();
}
@ -137,6 +143,14 @@ public final class FieldInfo {
throw new IllegalStateException("field '" + name + "' cannot have a docvalues update generation without having docvalues");
}
if (vectorDimension < 0) {
throw new IllegalStateException("vectorDimension must be >=0; got " + vectorDimension);
}
if (vectorDimension == 0 && vectorScoreFunction != VectorValues.ScoreFunction.NONE) {
throw new IllegalStateException("vector score function must be NONE when dimension = 0; got " + vectorScoreFunction);
}
return true;
}
@ -232,6 +246,40 @@ public final class FieldInfo {
return pointNumBytes;
}
/** Record that this field is indexed with vectors, with the specified number of dimensions and score function */
public void setVectorDimensionAndScoreFunction(int dimension, VectorValues.ScoreFunction scoreFunction) {
if (dimension < 0) {
throw new IllegalArgumentException("vector dimension must be >= 0; got " + dimension);
}
if (dimension > VectorValues.MAX_DIMENSIONS) {
throw new IllegalArgumentException("vector dimension must be <= VectorValues.MAX_DIMENSIONS (=" + VectorValues.MAX_DIMENSIONS + "); got " + dimension);
}
if (dimension == 0 && scoreFunction != VectorValues.ScoreFunction.NONE) {
throw new IllegalArgumentException("vector score function must be NONE when the vector dimension = 0; got " + scoreFunction);
}
if (vectorDimension != 0 && vectorDimension != dimension) {
throw new IllegalArgumentException("cannot change vector dimension from " + vectorDimension + " to " + dimension + " for field=\"" + name + "\"");
}
if (vectorScoreFunction != VectorValues.ScoreFunction.NONE && vectorScoreFunction != scoreFunction) {
throw new IllegalArgumentException("cannot change vector score function from " + vectorScoreFunction + " to " + scoreFunction + " for field=\"" + name + "\"");
}
this.vectorDimension = dimension;
this.vectorScoreFunction = scoreFunction;
assert checkConsistency();
}
/** Returns the number of dimensions of the vector value */
public int getVectorDimension() {
return vectorDimension;
}
/** Returns {@link org.apache.lucene.index.VectorValues.ScoreFunction} for the field */
public VectorValues.ScoreFunction getVectorScoreFunction() {
return vectorScoreFunction;
}
/** Record that this field is indexed with docvalues, with the specified type */
public void setDocValuesType(DocValuesType type) {
if (type == null) {
@ -336,6 +384,13 @@ public final class FieldInfo {
public boolean hasVectors() {
return storeTermVector;
}
/**
* Returns whether any (numeric) vector values exist for this field
*/
public boolean hasVectorValues() {
return vectorDimension > 0;
}
/**
* Get a codec attribute value, or null if it does not exist

View File

@ -48,6 +48,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
private final boolean hasNorms;
private final boolean hasDocValues;
private final boolean hasPointValues;
private final boolean hasVectorValues;
private final String softDeletesField;
// used only by fieldInfo(int)
@ -68,6 +69,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
boolean hasNorms = false;
boolean hasDocValues = false;
boolean hasPointValues = false;
boolean hasVectorValues = false;
String softDeletesField = null;
int size = 0; // number of elements in byNumberTemp, number of used array slots
@ -99,6 +101,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
hasDocValues |= info.getDocValuesType() != DocValuesType.NONE;
hasPayloads |= info.hasPayloads();
hasPointValues |= (info.getPointDimensionCount() != 0);
hasVectorValues |= (info.getVectorDimension() != 0);
if (info.isSoftDeletesField()) {
if (softDeletesField != null && softDeletesField.equals(info.name) == false) {
throw new IllegalArgumentException("multiple soft-deletes fields [" + info.name + ", " + softDeletesField + "]");
@ -115,6 +118,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
this.hasNorms = hasNorms;
this.hasDocValues = hasDocValues;
this.hasPointValues = hasPointValues;
this.hasVectorValues = hasVectorValues;
this.softDeletesField = softDeletesField;
List<FieldInfo> valuesTemp = new ArrayList<>();
@ -204,6 +208,11 @@ public class FieldInfos implements Iterable<FieldInfo> {
return hasPointValues;
}
/** Returns true if any fields have VectorValues */
public boolean hasVectorValues() {
return hasVectorValues;
}
/** Returns the soft-deletes field name if exists; otherwise returns null */
public String getSoftDeletesField() {
return softDeletesField;
@ -261,6 +270,16 @@ public class FieldInfos implements Iterable<FieldInfo> {
this.dimensionNumBytes = dimensionNumBytes;
}
}
static final class FieldVectorProperties {
final int numDimensions;
final VectorValues.ScoreFunction scoreFunction;
FieldVectorProperties(int numDimensions, VectorValues.ScoreFunction scoreFunction) {
this.numDimensions = numDimensions;
this.scoreFunction = scoreFunction;
}
}
static final class FieldNumbers {
@ -274,6 +293,8 @@ public class FieldInfos implements Iterable<FieldInfo> {
private final Map<String,FieldDimensions> dimensions;
private final Map<String,FieldVectorProperties> vectorProps;
// TODO: we should similarly catch an attempt to turn
// norms back on after they were already committed; today
// we silently discard the norm but this is badly trappy
@ -288,6 +309,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
this.indexOptions = new HashMap<>();
this.docValuesType = new HashMap<>();
this.dimensions = new HashMap<>();
this.vectorProps = new HashMap<>();
this.softDeletesFieldName = softDeletesFieldName;
}
@ -297,7 +319,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
* number assigned if possible otherwise the first unassigned field number
* is used as the field number.
*/
synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int indexDimensionCount, int dimensionNumBytes, boolean isSoftDeletesField) {
synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int indexDimensionCount, int dimensionNumBytes, int vectorDimension, VectorValues.ScoreFunction scoreFunction, boolean isSoftDeletesField) {
if (indexOptions != IndexOptions.NONE) {
IndexOptions currentOpts = this.indexOptions.get(fieldName);
if (currentOpts == null) {
@ -330,6 +352,19 @@ public class FieldInfos implements Iterable<FieldInfo> {
dimensions.put(fieldName, new FieldDimensions(dimensionCount, indexDimensionCount, dimensionNumBytes));
}
}
if (vectorDimension != 0) {
FieldVectorProperties props = vectorProps.get(fieldName);
if (props != null) {
if (props.numDimensions != vectorDimension) {
throw new IllegalArgumentException("cannot change vector dimension from " + props.numDimensions + " to " + vectorDimension + " for field=\"" + fieldName + "\"");
}
if (props.scoreFunction != scoreFunction) {
throw new IllegalArgumentException("cannot change vector score function from " + props.scoreFunction + " to " + scoreFunction + " for field=\"" + fieldName + "\"");
}
} else {
vectorProps.put(fieldName, new FieldVectorProperties(vectorDimension, scoreFunction));
}
}
Integer fieldNumber = nameToNumber.get(fieldName);
if (fieldNumber == null) {
final Integer preferredBoxed = Integer.valueOf(preferredFieldNumber);
@ -408,6 +443,24 @@ public class FieldInfos implements Iterable<FieldInfo> {
}
}
synchronized void verifyConsistentVectorProperties(Integer number, String name, int numDimensions, VectorValues.ScoreFunction scoreFunction) {
if (name.equals(numberToName.get(number)) == false) {
throw new IllegalArgumentException("field number " + number + " is already mapped to field name \"" + numberToName.get(number) + "\", not \"" + name + "\"");
}
if (number.equals(nameToNumber.get(name)) == false) {
throw new IllegalArgumentException("field name \"" + name + "\" is already mapped to field number \"" + nameToNumber.get(name) + "\", not \"" + number + "\"");
}
FieldVectorProperties props = vectorProps.get(name);
if (props != null) {
if (props.numDimensions != numDimensions) {
throw new IllegalArgumentException("cannot change vector dimension from " + props.numDimensions + " to " + numDimensions + " for field=\"" + name + "\"");
}
if (props.scoreFunction != scoreFunction) {
throw new IllegalArgumentException("cannot change vector score function from " + props.scoreFunction + " to " + scoreFunction + " for field=\"" + name + "\"");
}
}
}
/**
* Returns true if the {@code fieldName} exists in the map and is of the
* same {@code dvType}.
@ -456,6 +509,17 @@ public class FieldInfos implements Iterable<FieldInfo> {
verifyConsistentDimensions(number, name, dimensionCount, indexDimensionCount, dimensionNumBytes);
dimensions.put(name, new FieldDimensions(dimensionCount, indexDimensionCount, dimensionNumBytes));
}
synchronized void setVectorDimensionsAndScoreFunction(int number, String name, int numDimensions, VectorValues.ScoreFunction scoreFunction) {
if (numDimensions <= 0) {
throw new IllegalArgumentException("vector numDimensions must be > 0; got " + numDimensions);
}
if (numDimensions > VectorValues.MAX_DIMENSIONS) {
throw new IllegalArgumentException("vector numDimensions must be <= VectorValues.MAX_DIMENSIONS (=" + VectorValues.MAX_DIMENSIONS + "); got " + numDimensions);
}
verifyConsistentVectorProperties(number, name, numDimensions, scoreFunction);
vectorProps.put(name, new FieldVectorProperties(numDimensions, scoreFunction));
}
}
static final class Builder {
@ -489,8 +553,8 @@ public class FieldInfos implements Iterable<FieldInfo> {
// before then we'll get the same name and number,
// else we'll allocate a new one:
final boolean isSoftDeletesField = name.equals(globalFieldNumbers.softDeletesFieldName);
final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0, 0, isSoftDeletesField);
fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0, 0, isSoftDeletesField);
final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, isSoftDeletesField);
fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, isSoftDeletesField);
assert !byName.containsKey(fi.name);
globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, DocValuesType.NONE);
byName.put(fi.name, fi);
@ -505,6 +569,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
DocValuesType docValues, long dvGen,
Map<String, String> attributes,
int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes,
int vectorDimension, VectorValues.ScoreFunction vectorScoreFunction,
boolean isSoftDeletesField) {
assert assertNotFinished();
if (docValues == null) {
@ -522,8 +587,8 @@ public class FieldInfos implements Iterable<FieldInfo> {
// number for this field. If the field was seen
// before then we'll get the same name and number,
// else we'll allocate a new one:
final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField);
fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField);
final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dataDimensionCount, indexDimensionCount, dimensionNumBytes, vectorDimension, vectorScoreFunction, isSoftDeletesField);
fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes, vectorDimension, vectorScoreFunction, isSoftDeletesField);
assert !byName.containsKey(fi.name);
globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, fi.getDocValuesType());
byName.put(fi.name, fi);
@ -558,6 +623,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
fi.getIndexOptions(), fi.getDocValuesType(), dvGen,
fi.attributes(),
fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
fi.getVectorDimension(), fi.getVectorScoreFunction(),
fi.isSoftDeletesField());
}

View File

@ -23,6 +23,7 @@ import java.util.Objects;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
@ -101,6 +102,11 @@ public abstract class FilterCodecReader extends CodecReader {
return in.getPointsReader();
}
@Override
public VectorReader getVectorReader() {
return in.getVectorReader();
}
@Override
public int numDocs() {
return in.numDocs();

View File

@ -330,6 +330,11 @@ public abstract class FilterLeafReader extends LeafReader {
return in.getPointValues(field);
}
@Override
public VectorValues getVectorValues(String field) throws IOException {
return in.getVectorValues(field);
}
@Override
public Fields getTermVectors(int docID)
throws IOException {

View File

@ -1184,7 +1184,8 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable,
for(SegmentCommitInfo info : segmentInfos) {
FieldInfos fis = readFieldInfos(info);
for(FieldInfo fi : fis) {
map.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField());
map.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField());
}
}
@ -1921,7 +1922,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable,
if (globalFieldNumberMap.contains(f.name(), dvType) == false) {
// if this field doesn't exists we try to add it. if it exists and the DV type doesn't match we
// get a consistent error message as if you try to do that during an indexing operation.
globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0, 0, f.name().equals(config.softDeletesField));
globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, f.name().equals(config.softDeletesField));
assert globalFieldNumberMap.contains(f.name(), dvType);
}
if (config.getIndexSortFields().contains(f.name())) {
@ -2966,7 +2967,9 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable,
FieldInfos fis = readFieldInfos(info);
for(FieldInfo fi : fis) {
// This will throw exceptions if any of the incoming fields have an illegal schema change:
globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField());
globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(),
fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField());
}
infos.add(copySegmentAsIs(info, newSegName, context));
}

View File

@ -114,6 +114,16 @@ public interface IndexableFieldType {
*/
public int pointNumBytes();
/**
* The number of dimensions of the field's vector value
*/
public int vectorDimension();
/**
* The {@link org.apache.lucene.index.VectorValues.ScoreFunction} of the field's vector value
*/
public VectorValues.ScoreFunction vectorScoreFunction();
/**
* Attributes for the field type.
*

View File

@ -32,12 +32,15 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.VectorWriter;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PointsWriter;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.VectorField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
@ -180,7 +183,6 @@ final class IndexingChain implements Accountable {
public FieldInfos getFieldInfos() {
return fieldInfos.finish();
}
};
}
@ -230,6 +232,12 @@ final class IndexingChain implements Accountable {
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write points");
}
t0 = System.nanoTime();
writeVectors(state, sortMap);
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write vectors");
}
// it's possible all docs hit non-aborting exceptions...
t0 = System.nanoTime();
@ -374,6 +382,50 @@ final class IndexingChain implements Accountable {
}
}
/** Writes all buffered vectors. */
private void writeVectors(SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
VectorWriter vectorWriter = null;
boolean success = false;
try {
for (int i = 0; i < fieldHash.length; i++) {
PerField perField = fieldHash[i];
while (perField != null) {
if (perField.vectorValuesWriter != null) {
if (perField.fieldInfo.getVectorDimension() == 0) {
// BUG
throw new AssertionError("segment=" + state.segmentInfo + ": field=\"" + perField.fieldInfo.name + "\" has no vectors but wrote them");
}
if (vectorWriter == null) {
// lazy init
VectorFormat fmt = state.segmentInfo.getCodec().vectorFormat();
if (fmt == null) {
throw new IllegalStateException("field=\"" + perField.fieldInfo.name + "\" was indexed as vectors but codec does not support vectors");
}
vectorWriter = fmt.fieldsWriter(state);
}
perField.vectorValuesWriter.flush(sortMap, vectorWriter);
perField.vectorValuesWriter = null;
} else if (perField.fieldInfo.getVectorDimension() != 0) {
// BUG
throw new AssertionError("segment=" + state.segmentInfo + ": field=\"" + perField.fieldInfo.name + "\" has vectors but did not write them");
}
perField = perField.next;
}
}
if (vectorWriter != null) {
vectorWriter.finish();
}
success = true;
} finally {
if (success) {
IOUtils.close(vectorWriter);
} else {
IOUtils.closeWhileHandlingException(vectorWriter);
}
}
}
private void writeNorms(SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
boolean success = false;
NormsConsumer normsConsumer = null;
@ -562,6 +614,12 @@ final class IndexingChain implements Accountable {
}
indexPoint(docID, fp, field);
}
if (fieldType.vectorDimension() != 0) {
if (fp == null) {
fp = getOrAddField(fieldName, fieldType, false);
}
indexVector(docID, fp, field);
}
return fieldCount;
}
@ -722,6 +780,24 @@ final class IndexingChain implements Accountable {
}
}
/** Called from processDocument to index one field's vector value */
private void indexVector(int docID, PerField fp, IndexableField field) {
int dimension = field.fieldType().vectorDimension();
VectorValues.ScoreFunction scoreFunction = field.fieldType().vectorScoreFunction();
// Record the dimension and score function for this field; this setter will throw an
// IllegalArgumentException if they were already set to something different:
if (fp.fieldInfo.getVectorDimension() == 0) {
fieldInfos.globalFieldNumbers.setVectorDimensionsAndScoreFunction(fp.fieldInfo.number, fp.fieldInfo.name, dimension, scoreFunction);
}
fp.fieldInfo.setVectorDimensionAndScoreFunction(dimension, scoreFunction);
if (fp.vectorValuesWriter == null) {
fp.vectorValuesWriter = new VectorValuesWriter(fp.fieldInfo, bytesUsed);
}
fp.vectorValuesWriter.addValue(docID, ((VectorField) field).vectorValue());
}
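/* Hedged illustration (not part of this commit's diff): a document carrying
 *   doc.add(new VectorField("f", new float[]{1f, 2f}, VectorValues.ScoreFunction.EUCLIDEAN));
 * reaches indexVector above with dimension=2; the first value seen for the field
 * records its schema in FieldInfos, and the value itself is buffered in the
 * field's VectorValuesWriter until flush. */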
/** Returns a previously created {@link PerField}, or null
* if this field name wasn't seen yet. */
private PerField getPerField(String name) {
@ -820,6 +896,9 @@ final class IndexingChain implements Accountable {
// Non-null if this field ever had points in this segment:
PointValuesWriter pointValuesWriter;
// Non-null if this field ever had vector values in this segment:
VectorValuesWriter vectorValuesWriter;
/** We use this to know when a PerField is seen for the
* first time in the current document. */
long fieldGen = -1;

View File

@ -203,6 +203,10 @@ public abstract class LeafReader extends IndexReader {
* used by a single thread. */
public abstract NumericDocValues getNormValues(String field) throws IOException;
/** Returns {@link VectorValues} for this field, or null if no {@link VectorValues} were indexed.
* The returned instance should only be used by a single thread. */
public abstract VectorValues getVectorValues(String field) throws IOException;
/**
* Get the {@link FieldInfos} describing all fields in
* this reader.

View File

@ -194,6 +194,11 @@ class MergeReaderWrapper extends LeafReader {
return in.getPointValues(fieldName);
}
@Override
public VectorValues getVectorValues(String fieldName) throws IOException {
return in.getVectorValues(fieldName);
}
@Override
public int numDocs() {
return in.numDocs();

View File

@ -24,6 +24,7 @@ import java.util.Locale;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
@ -77,6 +78,9 @@ public class MergeState {
/** Point readers to merge */
public final PointsReader[] pointsReaders;
/** Vector readers to merge */
public final VectorReader[] vectorReaders;
/** Max docs per reader */
public final int[] maxDocs;
@ -103,6 +107,7 @@ public class MergeState {
termVectorsReaders = new TermVectorsReader[numReaders];
docValuesProducers = new DocValuesProducer[numReaders];
pointsReaders = new PointsReader[numReaders];
vectorReaders = new VectorReader[numReaders];
fieldInfos = new FieldInfos[numReaders];
liveDocs = new Bits[numReaders];
@ -139,6 +144,12 @@ public class MergeState {
if (pointsReaders[i] != null) {
pointsReaders[i] = pointsReaders[i].getMergeInstance();
}
vectorReaders[i] = reader.getVectorReader();
if (vectorReaders[i] != null) {
vectorReaders[i] = vectorReaders[i].getMergeInstance();
}
numDocs += reader.numDocs();
}

View File

@ -369,6 +369,13 @@ public class ParallelLeafReader extends LeafReader {
return reader == null ? null : reader.getPointValues(fieldName);
}
@Override
public VectorValues getVectorValues(String fieldName) throws IOException {
ensureOpen();
LeafReader reader = fieldToReader.get(fieldName);
return reader == null ? null : reader.getVectorValues(fieldName);
}
@Override
public void checkIntegrity() throws IOException {
ensureOpen();

View File

@ -657,7 +657,8 @@ final class ReadersAndUpdates {
private FieldInfo cloneFieldInfo(FieldInfo fi, int fieldNumber) {
return new FieldInfo(fi.name, fieldNumber, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(),
fi.getIndexOptions(), fi.getDocValuesType(), fi.getDocValuesGen(), new HashMap<>(fi.attributes()),
fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField());
fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField());
}
private SegmentReader createNewReaderWithLatestLiveDocs(SegmentReader reader) throws IOException {

View File

@ -30,6 +30,7 @@ import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundDirectory;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.PostingsFormat;
@ -61,6 +62,7 @@ final class SegmentCoreReaders {
final StoredFieldsReader fieldsReaderOrig;
final TermVectorsReader termVectorsReaderOrig;
final PointsReader pointsReader;
final VectorReader vectorReader;
final CompoundDirectory cfsReader;
final String segment;
/**
@ -137,6 +139,13 @@ final class SegmentCoreReaders {
} else {
pointsReader = null;
}
if (coreFieldInfos.hasVectorValues()) {
vectorReader = codec.vectorFormat().fieldsReader(segmentReadState);
} else {
vectorReader = null;
}
success = true;
} catch (EOFException | FileNotFoundException e) {
throw new CorruptIndexException("Problem reading index from " + dir, dir.toString(), e);
@ -168,7 +177,7 @@ final class SegmentCoreReaders {
if (ref.decrementAndGet() == 0) {
try (Closeable finalizer = this::notifyCoreClosedListeners) {
IOUtils.close(termVectorsLocal, fieldsReaderLocal, fields, termVectorsReaderOrig, fieldsReaderOrig,
cfsReader, normsProducer, pointsReader);
cfsReader, normsProducer, pointsReader, vectorReader);
}
}
}

View File

@ -23,6 +23,7 @@ import java.util.List;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.VectorWriter;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsWriter;
@ -98,15 +99,8 @@ final class SegmentMerger {
throw new IllegalStateException("Merge would result in 0 document segment");
}
mergeFieldInfos();
long t0 = 0;
if (mergeState.infoStream.isEnabled("SM")) {
t0 = System.nanoTime();
}
int numMerged = mergeFields();
if (mergeState.infoStream.isEnabled("SM")) {
long t1 = System.nanoTime();
mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge stored fields [" + numMerged + " docs]");
}
int numMerged = mergeWithLogging(this::mergeFields, "stored fields");
assert numMerged == mergeState.segmentInfo.maxDoc(): "numMerged=" + numMerged + " vs mergeState.segmentInfo.maxDoc()=" + mergeState.segmentInfo.maxDoc();
final SegmentWriteState segmentWriteState = new SegmentWriteState(mergeState.infoStream, directory, mergeState.segmentInfo,
@ -115,77 +109,29 @@ final class SegmentMerger {
IOContext.READ, segmentWriteState.segmentSuffix);
if (mergeState.mergeFieldInfos.hasNorms()) {
if (mergeState.infoStream.isEnabled("SM")) {
t0 = System.nanoTime();
}
mergeNorms(segmentWriteState);
if (mergeState.infoStream.isEnabled("SM")) {
long t1 = System.nanoTime();
mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge norms [" + numMerged + " docs]");
}
mergeWithLogging(() -> mergeNorms(segmentWriteState), "norms", numMerged);
}
if (mergeState.infoStream.isEnabled("SM")) {
t0 = System.nanoTime();
}
try (NormsProducer norms = mergeState.mergeFieldInfos.hasNorms()
? codec.normsFormat().normsProducer(segmentReadState)
: null) {
NormsProducer normsMergeInstance = null;
if (norms != null) {
// Use the merge instance in order to reuse the same IndexInput for all terms
normsMergeInstance = norms.getMergeInstance();
}
mergeTerms(segmentWriteState, normsMergeInstance);
}
if (mergeState.infoStream.isEnabled("SM")) {
long t1 = System.nanoTime();
mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge postings [" + numMerged + " docs]");
}
mergeWithLogging(() -> mergeTerms(segmentWriteState, segmentReadState), "postings", numMerged);
if (mergeState.infoStream.isEnabled("SM")) {
t0 = System.nanoTime();
}
if (mergeState.mergeFieldInfos.hasDocValues()) {
mergeDocValues(segmentWriteState);
}
if (mergeState.infoStream.isEnabled("SM")) {
long t1 = System.nanoTime();
mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge doc values [" + numMerged + " docs]");
mergeWithLogging(() -> mergeDocValues(segmentWriteState), "doc values", numMerged);
}
if (mergeState.infoStream.isEnabled("SM")) {
t0 = System.nanoTime();
}
if (mergeState.mergeFieldInfos.hasPointValues()) {
mergePoints(segmentWriteState);
mergeWithLogging(() -> mergePoints(segmentWriteState), "points", numMerged);
}
if (mergeState.infoStream.isEnabled("SM")) {
long t1 = System.nanoTime();
mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge points [" + numMerged + " docs]");
if (mergeState.mergeFieldInfos.hasVectorValues()) {
mergeWithLogging(() -> mergeVectorValues(segmentWriteState), "numeric vectors", numMerged);
}
if (mergeState.mergeFieldInfos.hasVectors()) {
if (mergeState.infoStream.isEnabled("SM")) {
t0 = System.nanoTime();
}
numMerged = mergeVectors();
if (mergeState.infoStream.isEnabled("SM")) {
long t1 = System.nanoTime();
mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge vectors [" + numMerged + " docs]");
}
assert numMerged == mergeState.segmentInfo.maxDoc();
mergeWithLogging(this::mergeTermVectors, "term vectors");
}
// write the merged infos
if (mergeState.infoStream.isEnabled("SM")) {
t0 = System.nanoTime();
}
codec.fieldInfosFormat().write(directory, mergeState.segmentInfo, "", mergeState.mergeFieldInfos, context);
if (mergeState.infoStream.isEnabled("SM")) {
long t1 = System.nanoTime();
mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to write field infos [" + numMerged + " docs]");
}
mergeWithLogging(() -> codec.fieldInfosFormat().write(directory, mergeState.segmentInfo, "", mergeState.mergeFieldInfos, context), "field infos", numMerged);
return mergeState;
}
@ -207,7 +153,22 @@ final class SegmentMerger {
consumer.merge(mergeState);
}
}
private void mergeTerms(SegmentWriteState segmentWriteState, SegmentReadState segmentReadState) throws IOException {
try (NormsProducer norms = mergeState.mergeFieldInfos.hasNorms()
? codec.normsFormat().normsProducer(segmentReadState)
: null) {
NormsProducer normsMergeInstance = null;
if (norms != null) {
// Use the merge instance in order to reuse the same IndexInput for all terms
normsMergeInstance = norms.getMergeInstance();
}
try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState)) {
consumer.merge(mergeState, normsMergeInstance);
}
}
}
public void mergeFieldInfos() {
for (FieldInfos readerFieldInfos : mergeState.fieldInfos) {
for (FieldInfo fi : readerFieldInfos) {
@ -233,15 +194,51 @@ final class SegmentMerger {
* Merge the TermVectors from each of the segments into the new one.
* @throws IOException if there is a low-level IO error
*/
private int mergeVectors() throws IOException {
private int mergeTermVectors() throws IOException {
try (TermVectorsWriter termVectorsWriter = codec.termVectorsFormat().vectorsWriter(directory, mergeState.segmentInfo, context)) {
return termVectorsWriter.merge(mergeState);
int numMerged = termVectorsWriter.merge(mergeState);
assert numMerged == mergeState.segmentInfo.maxDoc();
return numMerged;
}
}
private void mergeTerms(SegmentWriteState segmentWriteState, NormsProducer norms) throws IOException {
try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState)) {
consumer.merge(mergeState, norms);
private void mergeVectorValues(SegmentWriteState segmentWriteState) throws IOException {
try (VectorWriter writer = codec.vectorFormat().fieldsWriter(segmentWriteState)) {
writer.merge(mergeState);
}
}
private interface Merger {
int merge() throws IOException;
}
private interface VoidMerger {
void merge() throws IOException;
}
private int mergeWithLogging(Merger merger, String formatName) throws IOException {
long t0 = 0;
if (mergeState.infoStream.isEnabled("SM")) {
t0 = System.nanoTime();
}
int numMerged = merger.merge();
if (mergeState.infoStream.isEnabled("SM")) {
long t1 = System.nanoTime();
mergeState.infoStream.message("SM", ((t1 - t0) / 1000000) + " msec to merge vector values [" + numMerged + " docs]");
}
return numMerged;
}
private void mergeWithLogging(VoidMerger merger, String formatName, int numMerged) throws IOException {
long t0 = 0;
if (mergeState.infoStream.isEnabled("SM")) {
t0 = System.nanoTime();
}
merger.merge();
if (mergeState.infoStream.isEnabled("SM")) {
long t1 = System.nanoTime();
mergeState.infoStream.message("SM", ((t1 - t0) / 1000000) + " msec to merge vector values [" + numMerged + " docs]");
}
}
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
@ -259,6 +260,11 @@ public final class SegmentReader extends CodecReader {
return docValuesProducer;
}
@Override
public VectorReader getVectorReader() {
return core.vectorReader;
}
@Override
public FieldsProducer getPostingsReader() {
ensureOpen();

View File

@ -24,6 +24,7 @@ import java.util.Iterator;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
@ -78,6 +79,12 @@ public final class SlowCodecReaderWrapper {
return readerToDocValuesProducer(reader);
}
@Override
public VectorReader getVectorReader() {
reader.ensureOpen();
return readerToVectorReader(reader);
}
@Override
public FieldsProducer getPostingsReader() {
reader.ensureOpen();
@ -160,6 +167,29 @@ public final class SlowCodecReaderWrapper {
};
}
private static VectorReader readerToVectorReader(LeafReader reader) {
return new VectorReader() {
@Override
public VectorValues getVectorValues(String field) throws IOException {
return reader.getVectorValues(field);
}
@Override
public void checkIntegrity() {
// We already checkIntegrity the entire reader up front
}
@Override
public void close() {
}
@Override
public long ramBytesUsed() {
return 0L;
}
};
}
private static NormsProducer readerToNormsProducer(final LeafReader reader) {
return new NormsProducer() {

View File

@ -0,0 +1,285 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef;
/**
* This class provides access to per-document floating point vector values indexed as {@link
* org.apache.lucene.document.VectorField}.
*
* @lucene.experimental
*/
public abstract class VectorValues extends DocIdSetIterator {
/** The maximum length of a vector */
public static final int MAX_DIMENSIONS = 1024;
/** Sole constructor */
protected VectorValues() {}
/**
* Return the dimension of the vectors
*/
public abstract int dimension();
/**
* TODO: should we use cost() for this? We rely on its always being exactly the number
* of documents having a value for this field, which is not guaranteed by the cost() contract,
* but in all the implementations so far they are the same.
* @return the number of vectors returned by this iterator
*/
public abstract int size();
/**
* Return the score function used to compare these vectors
*/
public abstract ScoreFunction scoreFunction();
/**
* Return the vector value for the current document ID.
* It is illegal to call this method when the iterator is not positioned: before advancing, or after failing to advance.
* The returned array may be shared across calls, re-used, and modified as the iterator advances.
* @return the vector value
*/
public abstract float[] vectorValue() throws IOException;
/**
* Return the binary encoded vector value for the current document ID. These are the bytes
* corresponding to the float array returned by {@link #vectorValue}. It is illegal to call this
* method when the iterator is not positioned: before advancing, or after failing to advance. The
* returned storage may be shared across calls, re-used and modified as the iterator advances.
* @return the binary value
*/
public BytesRef binaryValue() throws IOException {
throw new UnsupportedOperationException();
}
/**
* Return a random access interface over this iterator's vectors. Calling the RandomAccess methods will
* have no effect on the progress of the iteration or the values returned by this iterator. Successive calls
* will retrieve independent copies that do not overwrite each other's returned values.
*/
public abstract RandomAccess randomAccess();
/**
* Provides random access to vectors by dense ordinal.
*
* @lucene.experimental
*/
public interface RandomAccess {
/**
* Return the number of vector values
*/
int size();
/**
* Return the dimension of the returned vector values
*/
int dimension();
/**
* Return the score function used to compare these vectors
*/
ScoreFunction scoreFunction();
/**
* Return the vector value indexed at the given ordinal. The provided floating point array may
* be shared and overwritten by subsequent calls to this method and {@link #binaryValue(int)}.
* @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}.
*/
float[] vectorValue(int targetOrd) throws IOException;
/**
* Return the vector indexed at the given ordinal value as an array of bytes in a BytesRef;
* these are the bytes corresponding to the float array in IEEE 754 standard encoding, encoded
* using little-endian byte order. The provided bytes may be shared and overwritten by subsequent
* calls to this method and {@link #vectorValue(int)}.
* @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}.
*/
BytesRef binaryValue(int targetOrd) throws IOException;
/**
* Return the dense ordinal of the document if it has a vector. This ordinal ranges from 0 to
* one less than the number of documents having a vector in this iterator, and it is guaranteed to increase with increasing docid.
* @param docId the document whose ordinal is returned
* @return the ordinal of the given document, or -1 if the document has no vector value
*/
//int ordinal(int docId);
/**
* Return the k nearest neighbor documents as determined by comparison of their vector values
* for this field, to the given vector, by the field's score function. If the score function is
* reversed, lower values indicate nearer vectors, otherwise higher scores indicate nearer
* vectors. Unlike relevance scores, vector scores may be negative.
* @param target the vector-valued query
* @param k the number of docs to return
* @param fanout control the accuracy/speed tradeoff - larger values give better recall at higher cost
* @return the k nearest neighbor documents, along with their (scoreFunction-specific) scores.
*/
TopDocs search(float[] target, int k, int fanout) throws IOException;
}
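/* Hedged usage sketch for the search method above (illustrative only): given a
 * RandomAccess ra over a field whose score function supports search,
 *   TopDocs neighbors = ra.search(queryVector, 10, 50);
 * would return the 10 nearest documents, with fanout=50 trading extra work for
 * better recall. The buffered writer-side implementations in this commit throw
 * UnsupportedOperationException for search. */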
/**
* Score function. This is used during indexing and searching of the vectors to determine the nearest neighbors.
* Score values may be negative. By default high scores indicate nearer documents, unless the function is reversed.
*/
public enum ScoreFunction {
/** No distance function is used. Note: {@link VectorValues.RandomAccess#search(float[], int, int)}
* is not supported for fields specifying this score function. */
NONE,
/** Euclidean distance */
EUCLIDEAN(true) {
@Override
public float score(float[] v1, float[] v2) {
assert v1.length == v2.length;
float squareSum = 0.0f;
int dim = v1.length;
for (int i = 0; i < dim; i++) {
float diff = v1[i] - v2[i];
squareSum += diff * diff;
}
return squareSum;
}
},
/** dot product - note, may be negative; larger values are better */
DOT_PRODUCT() {
@Override
public float score(float[] a, float[] b) {
float res = 0f;
/*
* If the vector length is 8 or more, use an unrolled dot product to accelerate the
* calculation; the loop below first handles the length % 8 remainder elements.
*/
int i;
for (i = 0; i < a.length % 8; i++) {
res += b[i] * a[i];
}
if (a.length < 8) {
return res;
}
float s0 = 0f;
float s1 = 0f;
float s2 = 0f;
float s3 = 0f;
float s4 = 0f;
float s5 = 0f;
float s6 = 0f;
float s7 = 0f;
for (; i + 7 < a.length; i += 8) {
s0 += b[i] * a[i];
s1 += b[i + 1] * a[i + 1];
s2 += b[i + 2] * a[i + 2];
s3 += b[i + 3] * a[i + 3];
s4 += b[i + 4] * a[i + 4];
s5 += b[i + 5] * a[i + 5];
s6 += b[i + 6] * a[i + 6];
s7 += b[i + 7] * a[i + 7];
}
res += s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7;
return res;
}
};
/** If reversed, smaller values are better */
public final boolean reversed;
ScoreFunction(boolean reversed) {
this.reversed = reversed;
}
ScoreFunction() {
this(false);
}
/**
* Calculates the score between the two specified vectors.
*/
public float score(float[] v1, float[] v2) {
throw new UnsupportedOperationException();
}
}
/**
* Calculates a similarity score between the two vectors with the specified function.
*/
public static float compare(float[] v1, float[] v2, ScoreFunction scoreFunction) {
assert v1.length == v2.length : "attempt to compare vectors of lengths: " + v1.length + " " + v2.length;
return scoreFunction.score(v1, v2);
}
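/* Hedged worked example (illustrative only):
 *   compare(new float[] {1, 0}, new float[] {0, 1}, ScoreFunction.EUCLIDEAN) == 2.0f
 * EUCLIDEAN returns the squared distance and is reversed, so smaller scores mean
 * nearer vectors; DOT_PRODUCT is not reversed, so larger scores mean nearer. */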
/**
* Represents the lack of vector values. It is returned by providers that do not
* support VectorValues.
*/
public static final VectorValues EMPTY = new VectorValues() {
@Override
public int size() {
return 0;
}
@Override
public int dimension() {
return 0;
}
@Override
public ScoreFunction scoreFunction() {
return ScoreFunction.NONE;
}
@Override
public float[] vectorValue() {
throw new IllegalStateException("Attempt to get vectors from EMPTY values (which was not advanced)");
}
@Override
public RandomAccess randomAccess() {
throw new UnsupportedOperationException();
}
@Override
public int docID() {
throw new IllegalStateException("VectorValues is EMPTY, and not positioned on a doc");
}
@Override
public int nextDoc() {
return NO_MORE_DOCS;
}
@Override
public int advance(int target) {
return NO_MORE_DOCS;
}
@Override
public long cost() {
return 0;
}
};
}
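A minimal consumer sketch (not part of this commit; the reader "leaf" and field name "vector" are assumed for illustration) drains the iterator above like any other DocIdSetIterator:
VectorValues values = leaf.getVectorValues("vector");
if (values != null) {
  for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
    float[] v = values.vectorValue(); // shared buffer; copy before retaining
  }
}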

View File

@ -0,0 +1,322 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.VectorWriter;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.RamUsageEstimator;
/** Buffers up pending vector value(s) per doc, then flushes when segment flushes. */
class VectorValuesWriter {
private final FieldInfo fieldInfo;
private final Counter iwBytesUsed;
private final List<float[]> vectors = new ArrayList<>();
private final DocsWithFieldSet docsWithField;
private int lastDocID = -1;
private long bytesUsed;
VectorValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
this.fieldInfo = fieldInfo;
this.iwBytesUsed = iwBytesUsed;
this.docsWithField = new DocsWithFieldSet();
this.bytesUsed = docsWithField.ramBytesUsed();
if (iwBytesUsed != null) {
iwBytesUsed.addAndGet(bytesUsed);
}
}
/**
* Adds a value for the given document. Only a single value may be added.
* @param docID the value is added to this document
* @param vectorValue the value to add
* @throws IllegalArgumentException if a value has already been added to the given document
*/
public void addValue(int docID, float[] vectorValue) {
if (docID == lastDocID) {
throw new IllegalArgumentException("VectorValuesField \"" + fieldInfo.name + "\" appears more than once in this document (only one value is allowed per field)");
}
if (vectorValue.length != fieldInfo.getVectorDimension()) {
throw new IllegalArgumentException("Attempt to index a vector of dimension " + vectorValue.length +
" but \"" + fieldInfo.name + "\" has dimension " + fieldInfo.getVectorDimension());
}
assert docID > lastDocID;
docsWithField.add(docID);
vectors.add(ArrayUtil.copyOfSubArray(vectorValue, 0, vectorValue.length));
updateBytesUsed();
lastDocID = docID;
}
private void updateBytesUsed() {
final long newBytesUsed = docsWithField.ramBytesUsed()
+ vectors.size() * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER)
+ vectors.size() * vectors.get(0).length * Float.BYTES;
if (iwBytesUsed != null) {
iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
}
bytesUsed = newBytesUsed;
}
/**
* Flush this field's values to storage, sorting the values in accordance with sortMap
* @param sortMap specifies the order of documents being flushed, or null if they are to be flushed in docid order
* @param vectorWriter the Codec's vector writer that handles the actual encoding and I/O
* @throws IOException if there is an error writing the field and its values
*/
public void flush(Sorter.DocMap sortMap, VectorWriter vectorWriter) throws IOException {
VectorValues vectorValues = new BufferedVectorValues(docsWithField, vectors, fieldInfo.getVectorDimension(), fieldInfo.getVectorScoreFunction());
if (sortMap != null) {
vectorWriter.writeField(fieldInfo, new SortingVectorValues(vectorValues, sortMap));
} else {
vectorWriter.writeField(fieldInfo, vectorValues);
}
}
private static class SortingVectorValues extends VectorValues {
private final VectorValues delegate;
private final VectorValues.RandomAccess randomAccess;
private final int[] docIdOffsets;
private final int[] ordMap;
private int docId = -1;
SortingVectorValues(VectorValues delegate, Sorter.DocMap sortMap) throws IOException {
this.delegate = delegate;
randomAccess = delegate.randomAccess();
docIdOffsets = new int[sortMap.size()];
int offset = 1; // 0 means no vector for this (field, document)
int docID;
while ((docID = delegate.nextDoc()) != NO_MORE_DOCS) {
int newDocID = sortMap.oldToNew(docID);
docIdOffsets[newDocID] = offset++;
}
// set up ordMap to map from new dense ordinal to old dense ordinal
ordMap = new int[offset - 1];
int ord = 0;
for (int docIdOffset : docIdOffsets) {
if (docIdOffset != 0) {
ordMap[ord++] = docIdOffset - 1;
}
}
assert ord == ordMap.length;
}
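/* Hedged worked example of the mapping built above (illustrative only): if old
 * docs 0 and 2 have vectors and sortMap sends old 0 -> new 2 and old 2 -> new 0,
 * then docIdOffsets = [2, 0, 1] (new docid -> 1 + old dense ord; 0 = no vector)
 * and ordMap = [1, 0] (new dense ord -> old dense ord), so the sorted view's
 * ordinal 0 resolves to old doc 2's vector. */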
@Override
public int docID() {
return docId;
}
@Override
public int nextDoc() throws IOException {
while (docId < docIdOffsets.length - 1) {
++docId;
if (docIdOffsets[docId] != 0) {
return docId;
}
}
docId = NO_MORE_DOCS;
return docId;
}
@Override
public BytesRef binaryValue() throws IOException {
return randomAccess.binaryValue(docIdOffsets[docId] - 1);
}
@Override
public float[] vectorValue() {
throw new UnsupportedOperationException();
}
@Override
public int dimension() {
return delegate.dimension();
}
@Override
public int size() {
return delegate.size();
}
@Override
public ScoreFunction scoreFunction() {
return delegate.scoreFunction();
}
@Override
public int advance(int target) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public long cost() {
return size();
}
@Override
public RandomAccess randomAccess() {
RandomAccess ra = delegate.randomAccess();
return new RandomAccess() {
@Override
public int size() {
return delegate.size();
}
@Override
public int dimension() {
return delegate.dimension();
}
@Override
public ScoreFunction scoreFunction() {
return delegate.scoreFunction();
}
@Override
public float[] vectorValue(int targetOrd) throws IOException {
return ra.vectorValue(ordMap[targetOrd]);
}
@Override
public BytesRef binaryValue(int targetOrd) {
throw new UnsupportedOperationException();
}
@Override
public TopDocs search(float[] target, int k, int fanout) {
throw new UnsupportedOperationException();
}
};
}
}
private static class BufferedVectorValues extends VectorValues implements VectorValues.RandomAccess {
final DocsWithFieldSet docsWithField;
// These are always the vectors of a VectorValuesWriter, which are copied when added to it
final List<float[]> vectors;
final VectorValues.ScoreFunction scoreFunction;
final int dimension;
final ByteBuffer buffer;
final BytesRef binaryValue;
final ByteBuffer raBuffer;
final BytesRef raBinaryValue;
DocIdSetIterator docsWithFieldIter;
int ord = -1;
BufferedVectorValues(DocsWithFieldSet docsWithField, List<float[]> vectors, int dimension, VectorValues.ScoreFunction scoreFunction) {
this.docsWithField = docsWithField;
this.vectors = vectors;
this.dimension = dimension;
this.scoreFunction = scoreFunction;
buffer = ByteBuffer.allocate(dimension * Float.BYTES);
binaryValue = new BytesRef(buffer.array());
raBuffer = ByteBuffer.allocate(dimension * Float.BYTES);
raBinaryValue = new BytesRef(raBuffer.array());
docsWithFieldIter = docsWithField.iterator();
}
@Override
public RandomAccess randomAccess() {
return this;
}
@Override
public int dimension() {
return dimension;
}
@Override
public int size() {
return vectors.size();
}
@Override
public VectorValues.ScoreFunction scoreFunction() {
return scoreFunction;
}
@Override
public BytesRef binaryValue() {
buffer.asFloatBuffer().put(vectorValue());
return binaryValue;
}
@Override
public BytesRef binaryValue(int targetOrd) {
raBuffer.asFloatBuffer().put(vectors.get(targetOrd));
return raBinaryValue;
}
@Override
public float[] vectorValue() {
return vectors.get(ord);
}
@Override
public float[] vectorValue(int targetOrd) {
return vectors.get(targetOrd);
}
@Override
public int docID() {
return docsWithFieldIter.docID();
}
@Override
public int nextDoc() throws IOException {
int docID = docsWithFieldIter.nextDoc();
if (docID != NO_MORE_DOCS) {
++ord;
}
return docID;
}
@Override
public int advance(int target) {
throw new UnsupportedOperationException();
}
@Override
public long cost() {
return docsWithFieldIter.cost();
}
@Override
public TopDocs search(float[] target, int k, int fanout) throws IOException {
throw new UnsupportedOperationException();
}
}
}

View File

@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.lucene.codecs.lucene87.Lucene87Codec
org.apache.lucene.codecs.lucene90.Lucene90Codec

View File

@ -19,6 +19,7 @@ package org.apache.lucene.codecs.lucene87;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.BaseStoredFieldsFormatTestCase;
@ -32,7 +33,7 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks;
public class TestLucene87StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
@Override
protected Codec getCodec() {
return new Lucene87Codec(Mode.BEST_COMPRESSION);
return new Lucene90Codec(Mode.BEST_COMPRESSION);
}
/**
@ -43,7 +44,7 @@ public class TestLucene87StoredFieldsFormatHighCompression extends BaseStoredFie
Directory dir = newDirectory();
for (int i = 0; i < 10; i++) {
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setCodec(new Lucene87Codec(RandomPicks.randomFrom(random(), Mode.values())));
iwc.setCodec(new Lucene90Codec(RandomPicks.randomFrom(random(), Mode.values())));
IndexWriter iw = new IndexWriter(dir, iwc); // use the config carrying the randomized codec
Document doc = new Document();
doc.add(new StoredField("field1", "value1"));
@ -70,7 +71,7 @@ public class TestLucene87StoredFieldsFormatHighCompression extends BaseStoredFie
public void testInvalidOptions() {
expectThrows(NullPointerException.class, () -> {
new Lucene87Codec(null);
new Lucene90Codec(null);
});
expectThrows(NullPointerException.class, () -> {

View File

@ -106,6 +106,16 @@ public class TestIndexableField extends LuceneTestCase {
return 0;
}
@Override
public int vectorDimension() {
return 0;
}
@Override
public VectorValues.ScoreFunction vectorScoreFunction() {
return VectorValues.ScoreFunction.NONE;
}
@Override
public Map<String, String> getAttributes() {
return null;

View File

@ -37,6 +37,8 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
import static org.apache.lucene.index.VectorValues.ScoreFunction.NONE;
public class TestPendingSoftDeletes extends TestPendingDeletes {
@Override
@ -164,7 +166,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes {
deletes.onNewReader(segmentReader, commitInfo);
reader.close();
writer.close();
FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0, 0, true);
FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0, 0, 0, NONE, true);
List<Integer> docsDeleted = Arrays.asList(1, 3, 7, 8, DocIdSetIterator.NO_MORE_DOCS);
List<DocValuesFieldUpdates> updates = Arrays.asList(singleUpdate(docsDeleted, 10, true));
for (DocValuesFieldUpdates update : updates) {
@ -185,7 +187,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes {
docsDeleted = Arrays.asList(1, 2, DocIdSetIterator.NO_MORE_DOCS);
updates = Arrays.asList(singleUpdate(docsDeleted, 10, true));
fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0, 0, true);
fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0, 0, 0, NONE, true);
for (DocValuesFieldUpdates update : updates) {
deletes.onDocValuesUpdate(fieldInfo, update.iterator());
}
@ -228,7 +230,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes {
SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo();
PendingDeletes deletes = newPendingDeletes(segmentInfo);
deletes.onNewReader(segmentReader, segmentInfo);
FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, true);
FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, 0, NONE, true);
List<Integer> docsDeleted = Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS);
List<DocValuesFieldUpdates> updates = Arrays.asList(singleUpdate(docsDeleted, 3, true));
for (DocValuesFieldUpdates update : updates) {
@ -276,7 +278,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes {
SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo();
PendingDeletes deletes = newPendingDeletes(segmentInfo);
deletes.onNewReader(segmentReader, segmentInfo);
FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, true);
FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, 0, NONE, true);
List<DocValuesFieldUpdates> updates = Arrays.asList(singleUpdate(Arrays.asList(0, 1, DocIdSetIterator.NO_MORE_DOCS), 3, false));
for (DocValuesFieldUpdates update : updates) {
deletes.onDocValuesUpdate(fieldInfo, update.iterator());
@ -295,7 +297,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes {
assertEquals(0, deletes.numPendingDeletes());
segmentInfo.advanceDocValuesGen();
fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, true);
fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, 0, NONE, true);
updates = Arrays.asList(singleUpdate(Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS), 3, true));
for (DocValuesFieldUpdates update : updates) {
deletes.onDocValuesUpdate(fieldInfo, update.iterator());

View File

@ -106,6 +106,9 @@ public class TestSegmentToThreadMapping extends LuceneTestCase {
return null;
}
@Override
public VectorValues getVectorValues(String field) { return null; }
@Override
protected void doClose() {
}

View File

@ -0,0 +1,722 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.VectorField;
import org.apache.lucene.index.VectorValues.ScoreFunction;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
/** Test Indexing/IndexWriter with vectors */
public class TestVectorValues extends LuceneTestCase {
private IndexWriterConfig createIndexWriterConfig() {
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setCodec(Codec.forName("Lucene90"));
return iwc;
}
// Suddenly add vectors to an existing field:
public void testUpgradeFieldToVectors() throws Exception {
try (Directory dir = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(newStringField("f", "foo", Store.NO));
w.addDocument(doc);
}
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
}
}
}
public void testFieldConstructor() {
float[] v = new float[1];
VectorField field = new VectorField("f", v);
assertEquals(1, field.fieldType().vectorDimension());
assertEquals(ScoreFunction.EUCLIDEAN, field.fieldType().vectorScoreFunction());
assertSame(v, field.vectorValue());
}
public void testFieldConstructorExceptions() {
expectThrows(IllegalArgumentException.class, () -> new VectorField(null, new float[1]));
expectThrows(IllegalArgumentException.class, () -> new VectorField("f", null));
expectThrows(IllegalArgumentException.class, () -> new VectorField("f", new float[1], null));
expectThrows(IllegalArgumentException.class, () -> new VectorField("f", new float[0]));
expectThrows(IllegalArgumentException.class, () -> new VectorField("f", new float[VectorValues.MAX_DIMENSIONS + 1]));
}
public void testFieldSetValue() {
VectorField field = new VectorField("f", new float[1]);
float[] v1 = new float[1];
field.setVectorValue(v1);
assertSame(v1, field.vectorValue());
expectThrows(IllegalArgumentException.class, () -> field.setVectorValue(new float[2]));
expectThrows(IllegalArgumentException.class, () -> field.setVectorValue(null));
}
// Illegal schema change tests:
public void testIllegalDimChangeTwoDocs() throws Exception {
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
if (random().nextBoolean()) {
// sometimes test with two segments
w.commit();
}
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[3], ScoreFunction.DOT_PRODUCT));
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
() -> w.addDocument(doc2));
assertEquals("cannot change vector dimension from 4 to 3 for field=\"f\"", expected.getMessage());
}
}
public void testIllegalScoreFunctionChange() throws Exception {
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
if (random().nextBoolean()) {
// sometimes test with two segments
w.commit();
}
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN));
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
() -> w.addDocument(doc2));
assertEquals("cannot change vector score function from DOT_PRODUCT to EUCLIDEAN for field=\"f\"", expected.getMessage());
}
}
public void testIllegalDimChangeTwoWriters() throws Exception {
try (Directory dir = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[1], ScoreFunction.DOT_PRODUCT));
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
() -> w2.addDocument(doc2));
assertEquals("cannot change vector dimension from 4 to 1 for field=\"f\"", expected.getMessage());
}
}
}
public void testIllegalScoreFunctionChangeTwoWriters() throws Exception {
try (Directory dir = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN));
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
() -> w2.addDocument(doc2));
assertEquals("cannot change vector score function from DOT_PRODUCT to EUCLIDEAN for field=\"f\"", expected.getMessage());
}
}
}
public void testAddIndexesDirectory0() throws Exception {
String fieldName = "field";
Document doc = new Document();
doc.add(new VectorField(fieldName, new float[4], ScoreFunction.DOT_PRODUCT));
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
w2.addIndexes(new Directory[]{dir});
try (IndexReader reader = w2.getReader()) {
LeafReader r = reader.leaves().get(0).reader();
VectorValues vectorValues = r.getVectorValues(fieldName);
assertEquals(0, vectorValues.nextDoc());
assertEquals(0, vectorValues.vectorValue()[0], 0);
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
}
}
}
}
public void testAddIndexesDirectory1() throws Exception {
String fieldName = "field";
Document doc = new Document();
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
w.addDocument(doc);
}
doc.add(new VectorField(fieldName, new float[4], ScoreFunction.DOT_PRODUCT));
try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
w2.addDocument(doc);
w2.addIndexes(new Directory[]{dir});
try (IndexReader reader = w2.getReader()) {
LeafReader r = reader.leaves().get(0).reader();
VectorValues vectorValues = r.getVectorValues(fieldName);
assertEquals(0, vectorValues.nextDoc());
assertEquals(0, vectorValues.vectorValue()[0], 0);
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
}
}
}
}
public void testAddIndexesDirectory01() throws Exception {
String fieldName = "field";
float[] vector = new float[1];
Document doc = new Document();
doc.add(new VectorField(fieldName, vector, ScoreFunction.DOT_PRODUCT));
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
vector[0] = 1;
w2.addDocument(doc);
w2.addIndexes(new Directory[]{dir});
w2.forceMerge(1);
try (IndexReader reader = w2.getReader()) {
LeafReader r = reader.leaves().get(0).reader();
VectorValues vectorValues = r.getVectorValues(fieldName);
assertEquals(0, vectorValues.nextDoc());
assertEquals(1, vectorValues.vectorValue()[0], 0);
assertEquals(1, vectorValues.nextDoc());
assertEquals(0, vectorValues.vectorValue()[0], 0);
}
}
}
}
public void testIllegalDimChangeViaAddIndexesDirectory() throws Exception {
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[5], ScoreFunction.DOT_PRODUCT));
w2.addDocument(doc);
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
() -> w2.addIndexes(new Directory[]{dir}));
assertEquals("cannot change vector dimension from 5 to 4 for field=\"f\"", expected.getMessage());
}
}
}
public void testIllegalScoreFunctionChangeViaAddIndexesDirectory() throws Exception {
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN));
w2.addDocument(doc);
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
() -> w2.addIndexes(dir));
assertEquals("cannot change vector score function from EUCLIDEAN to DOT_PRODUCT for field=\"f\"", expected.getMessage());
}
}
}
public void testIllegalDimChangeViaAddIndexesCodecReader() throws Exception {
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[5], ScoreFunction.DOT_PRODUCT));
w2.addDocument(doc);
try (DirectoryReader r = DirectoryReader.open(dir)) {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
() -> w2.addIndexes(new CodecReader[]{(CodecReader) getOnlyLeafReader(r)}));
assertEquals("cannot change vector dimension from 5 to 4 for field=\"f\"", expected.getMessage());
}
}
}
}
public void testIllegalScoreFunctionChangeViaAddIndexesCodecReader() throws Exception {
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN));
w2.addDocument(doc);
try (DirectoryReader r = DirectoryReader.open(dir)) {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
() -> w2.addIndexes(new CodecReader[]{(CodecReader) getOnlyLeafReader(r)}));
assertEquals("cannot change vector score function from EUCLIDEAN to DOT_PRODUCT for field=\"f\"", expected.getMessage());
}
}
}
}
public void testIllegalDimChangeViaAddIndexesSlowCodecReader() throws Exception {
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[5], ScoreFunction.DOT_PRODUCT));
w2.addDocument(doc);
try (DirectoryReader r = DirectoryReader.open(dir)) {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
() -> TestUtil.addIndexesSlowly(w2, r));
assertEquals("cannot change vector dimension from 5 to 4 for field=\"f\"", expected.getMessage());
}
}
}
}
public void testIllegalScoreFunctionChangeViaAddIndexesSlowCodecReader() throws Exception {
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN));
w2.addDocument(doc);
try (DirectoryReader r = DirectoryReader.open(dir)) {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
() -> TestUtil.addIndexesSlowly(w2, r));
assertEquals("cannot change vector score function from EUCLIDEAN to DOT_PRODUCT for field=\"f\"", expected.getMessage());
}
}
}
}
public void testIllegalMultipleValues() throws Exception {
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
() -> w.addDocument(doc));
assertEquals("VectorValuesField \"f\" appears more than once in this document (only one value is allowed per field)",
expected.getMessage());
}
}
public void testIllegalDimensionTooLarge() throws Exception {
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
expectThrows(IllegalArgumentException.class,
() -> doc.add(new VectorField("f", new float[VectorValues.MAX_DIMENSIONS + 1], ScoreFunction.DOT_PRODUCT)));
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[1], ScoreFunction.EUCLIDEAN));
w.addDocument(doc2);
}
}
public void testIllegalEmptyVector() throws Exception {
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
Exception e = expectThrows(IllegalArgumentException.class,
() -> doc.add(new VectorField("f", new float[0], ScoreFunction.NONE)));
assertEquals("cannot index an empty vector", e.getMessage());
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[1], ScoreFunction.NONE));
w.addDocument(doc2);
}
}
// Write vectors, one segment with default codec, another with SimpleText, then forceMerge
public void testDifferentCodecs1() throws Exception {
try (Directory dir = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
}
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setCodec(Codec.forName("SimpleText"));
try (IndexWriter w = new IndexWriter(dir, iwc)) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
w.forceMerge(1);
}
}
}
// Write vectors, one segment with SimpleText, another with default codec, then forceMerge
public void testDifferentCodecs2() throws Exception {
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setCodec(Codec.forName("SimpleText"));
try (Directory dir = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, iwc)) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
w.forceMerge(1);
}
}
}
public void testInvalidVectorFieldUsage() {
VectorField field = new VectorField("field", new float[2], ScoreFunction.NONE);
expectThrows(IllegalArgumentException.class, () -> field.setIntValue(14));
expectThrows(IllegalArgumentException.class, () -> field.setVectorValue(new float[1]));
assertNull(field.numericValue());
}
public void testDeleteAllVectorDocs() throws Exception {
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new StringField("id", "0", Store.NO));
doc.add(new VectorField("v", new float[]{2, 3, 5}, ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
w.addDocument(new Document());
w.commit();
try (DirectoryReader r = w.getReader()) {
assertNotNull(r.leaves().get(0).reader().getVectorValues("v"));
}
w.deleteDocuments(new Term("id", "0"));
w.forceMerge(1);
try (DirectoryReader r = w.getReader()) {
assertNull(r.leaves().get(0).reader().getVectorValues("v"));
}
}
}
public void testVectorFieldMissingFromOneSegment() throws Exception {
try (Directory dir = FSDirectory.open(createTempDir());
IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new StringField("id", "0", Store.NO));
doc.add(new VectorField("v0", new float[]{2, 3, 5}, ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
w.commit();
doc = new Document();
doc.add(new VectorField("v1", new float[]{2, 3, 5}, ScoreFunction.DOT_PRODUCT));
w.addDocument(doc);
w.forceMerge(1);
}
}
public void testSparseVectors() throws Exception {
int numDocs = atLeast(1000);
int numFields = TestUtil.nextInt(random(), 1, 10);
int[] fieldDocCounts = new int[numFields];
float[] fieldTotals = new float[numFields];
int[] fieldDims = new int[numFields];
ScoreFunction[] fieldScoreFunctions = new ScoreFunction[numFields];
for (int i = 0; i < numFields; i++) {
fieldDims[i] = random().nextInt(20) + 1;
fieldScoreFunctions[i] = ScoreFunction.values()[random().nextInt(ScoreFunction.values().length)];
}
try (Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir, createIndexWriterConfig())) {
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
for (int field = 0; field < numFields; field++) {
String fieldName = "int" + field;
if (random().nextInt(100) == 17) {
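// only ~1% of docs get a vector for this field, keeping the field sparse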
float[] v = randomVector(fieldDims[field]);
doc.add(new VectorField(fieldName, v, fieldScoreFunctions[field]));
fieldDocCounts[field]++;
fieldTotals[field] += v[0];
}
}
w.addDocument(doc);
}
try (IndexReader r = w.getReader()) {
for (int field = 0; field < numFields; field++) {
int docCount = 0;
float checksum = 0;
String fieldName = "int" + field;
for (LeafReaderContext ctx : r.leaves()) {
VectorValues vectors = ctx.reader().getVectorValues(fieldName);
if (vectors != null) {
docCount += vectors.size();
while (vectors.nextDoc() != NO_MORE_DOCS) {
checksum += vectors.vectorValue()[0];
}
}
}
assertEquals(fieldDocCounts[field], docCount);
assertEquals(fieldTotals[field], checksum, 1e-5);
}
}
}
}
public void testIndexedValueNotAliased() throws Exception {
// We copy indexed values (as for BinaryDocValues) so the input float[] can be reused across
// calls to IndexWriter.addDocument.
String fieldName = "field";
float[] v = { 0 };
try (Directory dir = newDirectory();
IndexWriter iw = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc1 = new Document();
doc1.add(new VectorField(fieldName, v, VectorValues.ScoreFunction.EUCLIDEAN));
v[0] = 1;
Document doc2 = new Document();
doc2.add(new VectorField(fieldName, v, VectorValues.ScoreFunction.EUCLIDEAN));
iw.addDocument(doc1);
iw.addDocument(doc2);
v[0] = 2;
Document doc3 = new Document();
doc3.add(new VectorField(fieldName, v, VectorValues.ScoreFunction.EUCLIDEAN));
iw.addDocument(doc3);
try (IndexReader reader = iw.getReader()) {
LeafReader r = reader.leaves().get(0).reader();
VectorValues vectorValues = r.getVectorValues(fieldName);
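// values are captured at addDocument time, not at field construction:
// docs 1 and 2 both indexed 1, and doc 3 indexed 2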
vectorValues.nextDoc();
assertEquals(1, vectorValues.vectorValue()[0], 0);
vectorValues.nextDoc();
assertEquals(1, vectorValues.vectorValue()[0], 0);
vectorValues.nextDoc();
assertEquals(2, vectorValues.vectorValue()[0], 0);
}
}
}
public void testSortedIndex() throws Exception {
IndexWriterConfig iwc = createIndexWriterConfig();
iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT)));
String fieldName = "field";
try (Directory dir = newDirectory();
IndexWriter iw = new IndexWriter(dir, iwc)) {
add(iw, fieldName, 1, 1, new float[]{1});
add(iw, fieldName, 4, 4, new float[]{4});
add(iw, fieldName, 3, 3, null);
add(iw, fieldName, 2, 2, new float[]{2});
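// with the index sorted by sortkey, vectors come back in sortkey order (1, 2, 4);
// the doc with a null vector is skipped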
try (IndexReader reader = iw.getReader()) {
LeafReader leaf = reader.leaves().get(0).reader();
VectorValues vectorValues = leaf.getVectorValues(fieldName);
assertEquals(1, vectorValues.dimension());
assertEquals(3, vectorValues.size());
assertEquals("1", leaf.document(vectorValues.nextDoc()).get("id"));
assertEquals(1f, vectorValues.vectorValue()[0], 0);
assertEquals("2", leaf.document(vectorValues.nextDoc()).get("id"));
assertEquals(2f, vectorValues.vectorValue()[0], 0);
assertEquals("4", leaf.document(vectorValues.nextDoc()).get("id"));
assertEquals(4f, vectorValues.vectorValue()[0], 0);
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
VectorValues.RandomAccess ra = vectorValues.randomAccess();
assertEquals(1f, ra.vectorValue(0)[0], 0);
assertEquals(2f, ra.vectorValue(1)[0], 0);
assertEquals(4f, ra.vectorValue(2)[0], 0);
}
}
}
/**
* Index random vectors, sometimes skipping documents, sometimes deleting a document,
* sometimes merging, sometimes sorting the index,
* and verify that the expected values can be read back consistently.
*/
public void testRandom() throws Exception {
IndexWriterConfig iwc = createIndexWriterConfig();
if (random().nextBoolean()) {
iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT)));
}
String fieldName = "field";
try (Directory dir = newDirectory();
IndexWriter iw = new IndexWriter(dir, iwc)) {
int numDoc = atLeast(100);
int dimension = atLeast(10);
float[] scratch = new float[dimension];
int numValues = 0;
float[][] values = new float[numDoc][];
for (int i = 0; i < numDoc; i++) {
if (random().nextInt(7) != 3) {
// usually index a vector value for a doc
values[i] = randomVector(dimension);
++numValues;
}
if (random().nextBoolean() && values[i] != null) {
// sometimes use a shared scratch array
System.arraycopy(values[i], 0, scratch, 0, scratch.length);
add(iw, fieldName, i, scratch);
} else {
add(iw, fieldName, i, values[i]);
}
if (random().nextInt(10) == 2) {
// sometimes delete a random document
int idToDelete = random().nextInt(i + 1);
iw.deleteDocuments(new Term("id", Integer.toString(idToDelete)));
// and remember that it was deleted
if (values[idToDelete] != null) {
values[idToDelete] = null;
--numValues;
}
}
if (random().nextInt(10) == 3) {
iw.commit();
}
}
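// merge down to a single segment so deletions are applied and all vectors are rewritten through the merge path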
iw.forceMerge(1);
try (IndexReader reader = iw.getReader()) {
int valueCount = 0, totalSize = 0;
for (LeafReaderContext ctx : reader.leaves()) {
VectorValues vectorValues = ctx.reader().getVectorValues(fieldName);
if (vectorValues == null) {
continue;
}
totalSize += vectorValues.size();
int docId;
while ((docId = vectorValues.nextDoc()) != NO_MORE_DOCS) {
float[] v = vectorValues.vectorValue();
assertEquals(dimension, v.length);
String idString = ctx.reader().document(docId).getField("id").stringValue();
int id = Integer.parseInt(idString);
assertArrayEquals(idString, values[id], v, 0);
++valueCount;
}
}
assertEquals(numValues, valueCount);
assertEquals(numValues, totalSize);
}
}
}
private void add(IndexWriter iw, String field, int id, float[] vector) throws IOException {
add(iw, field, id, random().nextInt(100), vector);
}
private void add(IndexWriter iw, String field, int id, int sortkey, float[] vector) throws IOException {
Document doc = new Document();
if (vector != null) {
doc.add(new VectorField(field, vector));
}
doc.add(new NumericDocValuesField("sortkey", sortkey));
doc.add(new StringField("id", Integer.toString(id), Field.Store.YES));
iw.addDocument(doc);
}
private float[] randomVector(int dim) {
float[] v = new float[dim];
for (int i = 0; i < dim; i++) {
v[i] = random().nextFloat();
}
return v;
}
public void testCheckIndexIncludesVectors() throws Exception {
try (Directory dir = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("v1", randomVector(3), ScoreFunction.NONE));
w.addDocument(doc);
doc.add(new VectorField("v2", randomVector(3), ScoreFunction.NONE));
w.addDocument(doc);
}
ByteArrayOutputStream output = new ByteArrayOutputStream();
CheckIndex.Status status = TestUtil.checkIndex(dir, false, true, output);
assertEquals(1, status.segmentInfos.size());
CheckIndex.Status.SegmentInfoStatus segStatus = status.segmentInfos.get(0);
// 3 vector values were indexed in total:
assertEquals(3, segStatus.vectorValuesStatus.totalVectorValues);
// ... across 2 fields:
assertEquals(2, segStatus.vectorValuesStatus.totalVectorFields);
// Make sure CheckIndex in fact declares that it is testing vectors!
assertTrue(output.toString(IOUtils.UTF_8).contains("test: vectors..."));
}
}
public void testScoreFunctionIdentifiers() throws Exception {
// make sure we don't accidentally mess up score function identifiers by re-ordering the enum constants
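// the ordinal acts as the function's identifier, so the enum order must remain stable across releases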
assertEquals(0, ScoreFunction.NONE.ordinal());
assertEquals(1, ScoreFunction.EUCLIDEAN.ordinal());
assertEquals(2, ScoreFunction.DOT_PRODUCT.ordinal());
assertEquals(3, ScoreFunction.values().length);
}
}

View File

@ -854,7 +854,7 @@ public class TestFSTs extends LuceneTestCase {
System.out.println("TEST: cycle=" + cycle);
}
RandomIndexWriter w = new RandomIndexWriter(random(), dir,
newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
Document doc = new Document();
Field idField = newStringField("id", "", Field.Store.NO);
doc.add(idField);

View File

@ -35,6 +35,7 @@ import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Version;
@ -81,7 +82,7 @@ public class TermVectorLeafReader extends LeafReader {
}
FieldInfo fieldInfo = new FieldInfo(field, 0,
true, true, terms.hasPayloads(),
indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0, 0, false);
indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, false);
fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo});
}
@ -139,6 +140,11 @@ public class TermVectorLeafReader extends LeafReader {
return null;
}
@Override
public VectorValues getVectorValues(String fieldName) {
return null;
}
@Override
public void checkIntegrity() throws IOException {
}

View File

@ -501,7 +501,8 @@ public class MemoryIndex {
IndexOptions indexOptions = storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
return new FieldInfo(fieldName, ord, fieldType.storeTermVectors(), fieldType.omitNorms(), storePayloads,
indexOptions, fieldType.docValuesType(), -1, Collections.emptyMap(),
fieldType.pointDimensionCount(), fieldType.pointIndexDimensionCount(), fieldType.pointNumBytes(), false);
fieldType.pointDimensionCount(), fieldType.pointIndexDimensionCount(), fieldType.pointNumBytes(),
fieldType.vectorDimension(), fieldType.vectorScoreFunction(), false);
}
private void storePointValues(Info info, BytesRef pointValue) {
@ -521,6 +522,7 @@ public class MemoryIndex {
info.fieldInfo.name, info.fieldInfo.number, info.fieldInfo.hasVectors(), info.fieldInfo.hasPayloads(),
info.fieldInfo.hasPayloads(), info.fieldInfo.getIndexOptions(), docValuesType, -1, info.fieldInfo.attributes(),
info.fieldInfo.getPointDimensionCount(), info.fieldInfo.getPointIndexDimensionCount(), info.fieldInfo.getPointNumBytes(),
info.fieldInfo.getVectorDimension(), info.fieldInfo.getVectorScoreFunction(),
info.fieldInfo.isSoftDeletesField()
);
} else if (existingDocValuesType != docValuesType) {
@ -1241,6 +1243,11 @@ public class MemoryIndex {
return new MemoryIndexPointValues(info);
}
@Override
public VectorValues getVectorValues(String fieldName) {
return VectorValues.EMPTY;
}
@Override
public void checkIntegrity() throws IOException {
// no-op

View File

@ -39,7 +39,7 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene87.Lucene87Codec;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntPoint;
@ -887,7 +887,7 @@ public class TestSuggestField extends LuceneTestCase {
static IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, final Set<String> suggestFields) {
IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer);
iwc.setMergePolicy(newLogMergePolicy());
Codec filterCodec = new Lucene87Codec() {
Codec filterCodec = new Lucene90Codec() {
CompletionPostingsFormat.FSTLoadMode fstLoadMode =
RandomPicks.randomFrom(random(), CompletionPostingsFormat.FSTLoadMode.values());
PostingsFormat postingsFormat = new Completion84PostingsFormat(fstLoadMode);

View File

@ -354,7 +354,8 @@ abstract class BaseIndexFileFormatTestCase extends LuceneTestCase {
FieldInfo proto = oneDocReader.getFieldInfos().fieldInfo("field");
FieldInfo field = new FieldInfo(proto.name, proto.number, proto.hasVectors(), proto.omitsNorms(), proto.hasPayloads(),
proto.getIndexOptions(), proto.getDocValuesType(), proto.getDocValuesGen(), new HashMap<>(),
proto.getPointDimensionCount(), proto.getPointIndexDimensionCount(), proto.getPointNumBytes(), proto.isSoftDeletesField());
proto.getPointDimensionCount(), proto.getPointIndexDimensionCount(), proto.getPointNumBytes(),
proto.getVectorDimension(), proto.getVectorScoreFunction(), proto.isSoftDeletesField());
FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] { field } );

View File

@ -80,6 +80,8 @@ public class MismatchedLeafReader extends FilterLeafReader {
oldInfo.getPointDimensionCount(), // data dimension count
oldInfo.getPointIndexDimensionCount(), // index dimension count
oldInfo.getPointNumBytes(), // dimension numBytes
oldInfo.getVectorDimension(), // number of dimensions of the field's vector
oldInfo.getVectorScoreFunction(), // distance function for calculating similarity of the field's vector
oldInfo.isSoftDeletesField()); // used as soft-deletes field
shuffled.set(i, newInfo);
}

View File

@ -130,7 +130,7 @@ public class RandomPostingsTester {
fieldInfoArray[fieldUpto] = new FieldInfo(field, fieldUpto, false, false, true,
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
DocValuesType.NONE, -1, new HashMap<>(),
0, 0, 0, false);
0, 0, 0, 0, VectorValues.ScoreFunction.NONE, false);
fieldUpto++;
SortedMap<BytesRef,SeedAndOrd> postings = new TreeMap<>();
@ -651,7 +651,7 @@ public class RandomPostingsTester {
DocValuesType.NONE,
-1,
new HashMap<>(),
0, 0, 0, false);
0, 0, 0, 0, VectorValues.ScoreFunction.NONE, false);
}
FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray);

View File

@ -35,6 +35,7 @@ import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
@ -207,6 +208,11 @@ public class QueryUtils {
return null;
}
@Override
public VectorValues getVectorValues(String field) throws IOException {
return null;
}
@Override
public FieldInfos getFieldInfos() {
return FieldInfos.EMPTY;

View File

@ -34,7 +34,7 @@ import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec;
import org.apache.lucene.codecs.compressing.CompressingCodec;
import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat;
import org.apache.lucene.codecs.lucene87.Lucene87Codec;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.index.RandomCodec;
@ -187,8 +187,8 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
codec = new AssertingCodec();
} else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
codec = CompressingCodec.randomInstance(random);
} else if ("Lucene87".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene87"))) {
codec = new Lucene87Codec(RandomPicks.randomFrom(random, Lucene87StoredFieldsFormat.Mode.values())
} else if ("Lucene90".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene90"))) {
codec = new Lucene90Codec(RandomPicks.randomFrom(random, Lucene87StoredFieldsFormat.Mode.values())
);
} else if (!"random".equals(TEST_CODEC)) {
codec = Codec.forName(TEST_CODEC);

View File

@ -46,6 +46,8 @@ import java.util.regex.PatternSyntaxException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
@ -54,7 +56,7 @@ import org.apache.lucene.codecs.blockterms.LuceneFixedGap;
import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.codecs.lucene87.Lucene87Codec;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
import org.apache.lucene.document.BinaryDocValuesField;
@ -101,9 +103,6 @@ import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.NoLockFactory;
import org.junit.Assert;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
/**
* General utility methods for Lucene unit tests.
*/
@ -919,7 +918,7 @@ public final class TestUtil {
* This may be different than {@link Codec#getDefault()} because that is randomized.
*/
public static Codec getDefaultCodec() {
return new Lucene87Codec();
return new Lucene90Codec();
}
/**

View File

@ -276,6 +276,12 @@ public final class SlowCompositeReaderWrapper extends LeafReader {
return null; // because not supported. Throw UOE?
}
@Override
public VectorValues getVectorValues(String field) {
ensureOpen();
return null; // because not supported. Throw UOE?
}
@Override
public FieldInfos getFieldInfos() {
return fieldInfos;

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.SortField;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.SimpleOrderedMap;
@ -447,6 +448,16 @@ public final class SchemaField extends FieldProperties implements IndexableField
return 0;
}
@Override
public int vectorDimension() {
return 0;
}
@Override
public VectorValues.ScoreFunction vectorScoreFunction() {
return VectorValues.ScoreFunction.NONE;
}
@Override
public Map<String, String> getAttributes() {
return null;

View File

@ -493,6 +493,8 @@ public class CollapsingQParserPlugin extends QParserPlugin {
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
fieldInfo.getVectorDimension(),
fieldInfo.getVectorScoreFunction(),
fieldInfo.isSoftDeletesField());
newInfos.add(f);
} else {

View File

@ -67,7 +67,8 @@ public class Insanity {
if (fi.name.equals(insaneField)) {
filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(),
fi.hasPayloads(), fi.getIndexOptions(), DocValuesType.NONE, -1, Collections.emptyMap(),
fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()));
fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField()));
} else {
filteredInfos.add(fi);
}

View File

@ -284,7 +284,8 @@ public class UninvertingReader extends FilterLeafReader {
wrap = true;
newFieldInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(),
fi.hasPayloads(), fi.getIndexOptions(), type, fi.getDocValuesGen(), fi.attributes(),
fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()));
fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField()));
} else {
newFieldInfos.add(fi);
}

View File

@ -37,6 +37,7 @@ import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TotalHits;
@ -343,6 +344,11 @@ public class TestDocSet extends SolrTestCase {
return null;
}
@Override
public VectorValues getVectorValues(String field) {
return null;
}
@Override
protected void doClose() {
}