LUCENE-9705: Create Lucene90SegmentInfoFormat (#30)

The existing Lucene86SegmentInfoFormat is moved to backwards-codecs.
This commit is contained in:
Ignacio Vera 2021-03-30 10:04:17 +02:00 committed by GitHub
parent c11a01ab61
commit 00e57f8c8a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 348 additions and 17 deletions

View File

@ -39,7 +39,6 @@ import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@ -116,7 +115,7 @@ public class Lucene86Codec extends Codec {
}
@Override
public final SegmentInfoFormat segmentInfoFormat() {
public SegmentInfoFormat segmentInfoFormat() {
return segmentInfosFormat;
}

View File

@ -0,0 +1,178 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene86;
import java.io.IOException;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.SortFieldProvider;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Version;
/**
* Lucene 8.6 Segment info format.
*
* <p>Files:
*
* <ul>
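* <li><code>.si</code>: Header, SegVersion, HasSegMinVersion, SegMinVersion, SegSize,
* IsCompoundFile, Diagnostics, Files, Attributes, IndexSort, Footer
* </ul>
*
* Data types:
*
* <ul>
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}
* <li>SegSize --&gt; {@link DataOutput#writeInt Int32}
* <li>SegVersion --&gt; three {@link DataOutput#writeInt Int32} values: major, minor, bugfix
* <li>HasSegMinVersion --&gt; {@link DataOutput#writeByte Int8}; 1 if SegMinVersion follows, 0 if
* it is absent
* <li>SegMinVersion --&gt; three {@link DataOutput#writeInt Int32} values: major, minor, bugfix;
* present only when HasSegMinVersion is 1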
* <li>Files --&gt; {@link DataOutput#writeSetOfStrings Set&lt;String&gt;}
* <li>Diagnostics,Attributes --&gt; {@link DataOutput#writeMapOfStrings Map&lt;String,String&gt;}
* <li>IsCompoundFile --&gt; {@link DataOutput#writeByte Int8}
* <li>IndexSort --&gt; {@link DataOutput#writeVInt Int32} count, followed by {@code count}
* SortField
* <li>SortField --&gt; {@link DataOutput#writeString String} sort class, followed by a per-sort
* bytestream (see {@link SortFieldProvider#readSortField(DataInput)})
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}
* </ul>
*
* Field Descriptions:
*
* <ul>
* <li>SegVersion is the code version that created the segment.
* <li>SegMinVersion is the minimum code version that contributed documents to the segment.
* <li>SegSize is the number of documents contained in the segment index.
* <li>IsCompoundFile records whether the segment is written as a compound file or not. If this is
* -1, the segment is not a compound file. If it is 1, the segment is a compound file.
* <li>The Diagnostics Map is privately written by {@link IndexWriter}, as a debugging aid, for
* each segment it creates. It includes metadata like the current Lucene version, OS, Java
* version, why the segment was created (merge, flush, addIndexes), etc.
* <li>Files is a list of files referred to by this segment.
* </ul>
*
* @see SegmentInfos
* @lucene.experimental
*/
public class Lucene86SegmentInfoFormat extends SegmentInfoFormat {

  /** File extension used to store {@link SegmentInfo}. */
  public static final String SI_EXTENSION = "si";

  static final String CODEC_NAME = "Lucene86SegmentInfo";
  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;

  /** Sole constructor. */
  public Lucene86SegmentInfoFormat() {}

  /**
   * Reads the {@code .si} file of the given segment from {@code dir}.
   *
   * <p>The index header is validated against {@link #CODEC_NAME}, the supported version range and
   * {@code segmentID} before the body is parsed.
   *
   * @param dir directory holding the segment's files
   * @param segment segment name, used to derive the {@code .si} file name
   * @param segmentID expected segment id, checked against the file's index header
   * @param context IO context used to open the file
   * @return the parsed segment info
   * @throws IOException if the file cannot be read or its content is invalid
   */
  @Override
  public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context)
      throws IOException {
    final String siFileName = IndexFileNames.segmentFileName(segment, "", SI_EXTENSION);
    try (ChecksumIndexInput in = dir.openChecksumInput(siFileName, context)) {
      SegmentInfo parsed = null;
      Throwable failure = null;
      try {
        CodecUtil.checkIndexHeader(in, CODEC_NAME, VERSION_START, VERSION_CURRENT, segmentID, "");
        parsed = parseSegmentInfo(dir, in, segment, segmentID);
      } catch (Throwable t) {
        // Not rethrown here: the failure is handed to checkFooter in the finally block.
        failure = t;
      } finally {
        CodecUtil.checkFooter(in, failure);
      }
      return parsed;
    }
  }

  /** Reads a version stored as three consecutive ints: major, minor, bugfix. */
  private static Version readVersion(DataInput input) throws IOException {
    final int major = input.readInt();
    final int minor = input.readInt();
    final int bugfix = input.readInt();
    return Version.fromBits(major, minor, bugfix);
  }

  /**
   * Decodes the body of the {@code .si} file (everything between header and footer) in the order
   * it was written: version, optional min version, doc count, compound-file flag, diagnostics,
   * files, attributes and index sort.
   */
  private SegmentInfo parseSegmentInfo(
      Directory dir, DataInput input, String segment, byte[] segmentID) throws IOException {
    final Version version = readVersion(input);

    // One byte tells whether a min version follows: 0 = absent, 1 = present.
    final byte minVersionFlag = input.readByte();
    final Version minVersion;
    if (minVersionFlag == 0) {
      minVersion = null;
    } else if (minVersionFlag == 1) {
      minVersion = readVersion(input);
    } else {
      throw new CorruptIndexException("Illegal boolean value " + minVersionFlag, input);
    }

    final int maxDoc = input.readInt();
    if (maxDoc < 0) {
      throw new CorruptIndexException("invalid docCount: " + maxDoc, input);
    }

    final boolean compound = input.readByte() == SegmentInfo.YES;
    final Map<String, String> diagnostics = input.readMapOfStrings();
    final Set<String> files = input.readSetOfStrings();
    final Map<String, String> attributes = input.readMapOfStrings();

    final int sortFieldCount = input.readVInt();
    if (sortFieldCount < 0) {
      throw new CorruptIndexException(
          "invalid index sort field count: " + sortFieldCount, input);
    }
    final Sort indexSort;
    if (sortFieldCount == 0) {
      // No sort fields recorded: the segment has no index sort.
      indexSort = null;
    } else {
      final SortField[] fields = new SortField[sortFieldCount];
      for (int idx = 0; idx < fields.length; idx++) {
        // Each field is prefixed by the name of the provider that knows how to decode it.
        final String providerName = input.readString();
        fields[idx] = SortFieldProvider.forName(providerName).readSortField(input);
      }
      indexSort = new Sort(fields);
    }

    // The seventh constructor argument is passed as null here.
    final SegmentInfo info =
        new SegmentInfo(
            dir,
            version,
            minVersion,
            segment,
            maxDoc,
            compound,
            null,
            diagnostics,
            segmentID,
            attributes,
            indexSort);
    info.setFiles(files);
    return info;
  }

  /**
   * Always fails: this format is read-only.
   *
   * @throws UnsupportedOperationException on every call
   */
  @Override
  public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException {
    throw new UnsupportedOperationException("Old formats can't be used for writing");
  }
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.backward_codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.backward_codecs.lucene80.Lucene80NormsFormat;
import org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.backward_codecs.lucene86.Lucene86SegmentInfoFormat;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
@ -39,7 +40,6 @@ import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@ -138,7 +138,7 @@ public class Lucene87Codec extends Codec {
}
@Override
public final SegmentInfoFormat segmentInfoFormat() {
public SegmentInfoFormat segmentInfoFormat() {
return segmentInfosFormat;
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.backward_codecs.lucene84.Lucene84RWPostingsFormat;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@ -52,6 +53,11 @@ public class Lucene86RWCodec extends Lucene86Codec {
storedFieldsFormat = new Lucene50RWStoredFieldsFormat(mode);
}
/**
 * Overrides the segment info format with {@link Lucene86RWSegmentInfoFormat}.
 *
 * @return a new {@link Lucene86RWSegmentInfoFormat} instance on every call
 */
@Override
public SegmentInfoFormat segmentInfoFormat() {
return new Lucene86RWSegmentInfoFormat();
}
@Override
public StoredFieldsFormat storedFieldsFormat() {
return storedFieldsFormat;

View File

@ -0,0 +1,106 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene86;
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexSorter;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SortFieldProvider;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Version;
/** Writable version of Lucene86SegmentInfoFormat for testing */
public class Lucene86RWSegmentInfoFormat extends Lucene86SegmentInfoFormat {

  /** Sole constructor. */
  public Lucene86RWSegmentInfoFormat() {}

  /**
   * Writes the {@code .si} file for {@code si} into {@code dir}: index header, segment info body,
   * then footer. The file name is also registered on {@code si}.
   *
   * @param dir directory to create the file in
   * @param si segment info to serialize
   * @param ioContext IO context used to create the output
   * @throws IOException if the file cannot be created or written
   * @throws IllegalArgumentException if the segment's major version is below 7, if one of its
   *     files does not belong to the segment, or if an index sort field cannot be serialized
   */
  @Override
  public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException {
    final String siFileName = IndexFileNames.segmentFileName(si.name, "", SI_EXTENSION);
    try (IndexOutput out = dir.createOutput(siFileName, ioContext)) {
      // Register the file only after it has been created successfully, otherwise an
      // IFD assert can trip.
      si.addFile(siFileName);
      CodecUtil.writeIndexHeader(out, CODEC_NAME, VERSION_CURRENT, si.getId(), "");
      writeSegmentInfo(out, si);
      CodecUtil.writeFooter(out);
    }
  }

  /** Writes a version as three consecutive ints: major, minor, bugfix. */
  private static void writeVersion(DataOutput output, Version version) throws IOException {
    output.writeInt(version.major);
    output.writeInt(version.minor);
    output.writeInt(version.bugfix);
  }

  /** Serializes the body of the {@code .si} file (everything between header and footer). */
  private void writeSegmentInfo(DataOutput output, SegmentInfo si) throws IOException {
    final Version created = si.getVersion();
    if (created.major < 7) {
      throw new IllegalArgumentException(
          "invalid major version: should be >= 7 but got: " + created.major + " segment=" + si);
    }
    // The Lucene version that created this segment, since 3.1
    writeVersion(output, created);

    // The min Lucene version that contributed docs to the segment, since 7.0;
    // a leading byte marks whether it is present (1) or absent (0).
    final Version minVersion = si.getMinVersion();
    if (minVersion == null) {
      output.writeByte((byte) 0);
    } else {
      output.writeByte((byte) 1);
      writeVersion(output, minVersion);
    }

    assert created.prerelease == 0;
    output.writeInt(si.maxDoc());

    final byte compoundFlag = (byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO);
    output.writeByte(compoundFlag);
    output.writeMapOfStrings(si.getDiagnostics());

    // Every file must carry this segment's name; reject the whole set otherwise.
    final Set<String> files = si.files();
    for (String fileName : files) {
      final String owner = IndexFileNames.parseSegmentName(fileName);
      if (!owner.equals(si.name)) {
        throw new IllegalArgumentException(
            "invalid files: expected segment=" + si.name + ", got=" + files);
      }
    }
    output.writeSetOfStrings(files);
    output.writeMapOfStrings(si.getAttributes());

    writeIndexSort(output, si.getIndexSort());
  }

  /**
   * Writes the number of sort fields followed by each field, prefixed with the name of the
   * provider used to serialize it. A {@code null} sort is written as a count of zero.
   */
  private static void writeIndexSort(DataOutput output, Sort indexSort) throws IOException {
    if (indexSort == null) {
      output.writeVInt(0);
      return;
    }
    final SortField[] fields = indexSort.getSort();
    output.writeVInt(fields.length);
    for (SortField field : fields) {
      final IndexSorter sorter = field.getIndexSorter();
      if (sorter == null) {
        throw new IllegalArgumentException("cannot serialize SortField " + field);
      }
      output.writeString(sorter.getProviderName());
      SortFieldProvider.write(field, output);
    }
  }
}

View File

@ -15,22 +15,22 @@
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene86;
package org.apache.lucene.backward_codecs.lucene86;
import org.apache.lucene.backward_codecs.lucene87.Lucene87RWCodec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseSegmentInfoFormatTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
public class TestLucene86SegmentInfoFormat extends BaseSegmentInfoFormatTestCase {
@Override
protected Version[] getVersions() {
return new Version[] {Version.LATEST};
return new Version[] {Version.LUCENE_8_8_1};
}
@Override
protected Codec getCodec() {
return TestUtil.getDefaultCodec();
return new Lucene87RWCodec();
}
}

View File

@ -20,9 +20,11 @@ import org.apache.lucene.backward_codecs.lucene50.Lucene50RWCompoundFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50RWTermVectorsFormat;
import org.apache.lucene.backward_codecs.lucene80.Lucene80RWNormsFormat;
import org.apache.lucene.backward_codecs.lucene84.Lucene84RWPostingsFormat;
import org.apache.lucene.backward_codecs.lucene86.Lucene86RWSegmentInfoFormat;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@ -49,6 +51,11 @@ public class Lucene87RWCodec extends Lucene87Codec {
this.mode = mode;
}
/**
 * Overrides the segment info format with {@link Lucene86RWSegmentInfoFormat}.
 *
 * @return a new {@link Lucene86RWSegmentInfoFormat} instance on every call
 */
@Override
public SegmentInfoFormat segmentInfoFormat() {
return new Lucene86RWSegmentInfoFormat();
}
@Override
public final CompoundFormat compoundFormat() {
return new Lucene50RWCompoundFormat();

View File

@ -31,7 +31,6 @@ import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@ -65,7 +64,7 @@ public class Lucene90Codec extends Codec {
private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene90FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene86SegmentInfoFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene90SegmentInfoFormat();
private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat();
private final CompoundFormat compoundFormat = new Lucene90CompoundFormat();
private final PostingsFormat defaultFormat;

View File

@ -15,7 +15,7 @@
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene86;
package org.apache.lucene.codecs.lucene90;
import java.io.IOException;
import java.util.Map;
@ -40,7 +40,7 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Version;
/**
* Lucene 8.6 Segment info format.
* Lucene 9.0 Segment info format.
*
* <p>Files:
*
@ -83,17 +83,17 @@ import org.apache.lucene.util.Version;
* @see SegmentInfos
* @lucene.experimental
*/
public class Lucene86SegmentInfoFormat extends SegmentInfoFormat {
public class Lucene90SegmentInfoFormat extends SegmentInfoFormat {
/** File extension used to store {@link SegmentInfo}. */
public static final String SI_EXTENSION = "si";
static final String CODEC_NAME = "Lucene86SegmentInfo";
static final String CODEC_NAME = "Lucene90SegmentInfo";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
/** Sole constructor. */
public Lucene86SegmentInfoFormat() {}
public Lucene90SegmentInfoFormat() {}
@Override
public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context)

View File

@ -141,7 +141,7 @@
* <p>Each segment index maintains the following:
*
* <ul>
* <li>{@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment info}. This
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. This
* contains metadata about a segment, such as the number of documents, what files it uses, and
* information about how the segment is sorted
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat Field names}. This
@ -229,7 +229,7 @@
* file.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment Info}</td>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}</td>
* <td>.si</td>
* <td>Stores metadata about a segment</td>
* </tr>

View File

@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene90;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseSegmentInfoFormatTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
/**
 * Supplies the default codec and {@link Version#LATEST} to the inherited {@link
 * BaseSegmentInfoFormatTestCase}.
 */
public class TestLucene90SegmentInfoFormat extends BaseSegmentInfoFormatTestCase {

  /** Returns the versions to test with: only {@link Version#LATEST}. */
  @Override
  protected Version[] getVersions() {
    final Version[] versions = {Version.LATEST};
    return versions;
  }

  /** Returns the codec under test, obtained from {@link TestUtil#getDefaultCodec()}. */
  @Override
  protected Codec getCodec() {
    final Codec defaultCodec = TestUtil.getDefaultCodec();
    return defaultCodec;
  }
}