LUCENE-9705: Create Lucene90LiveDocsFormat (#2274)

For now this is just a copy of Lucene50LiveDocsFormat. The existing
Lucene50LiveDocsFormat was moved to backwards-codecs.
This commit is contained in:
Julie Tibshirani 2021-02-04 10:43:16 -08:00 committed by GitHub
parent 7fd64aabcc
commit f0a2f1fe03
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 234 additions and 13 deletions

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene50;
package org.apache.lucene.backward_codecs.lucene50;
import java.io.IOException;
import java.util.Collection;
@ -107,6 +107,11 @@ public final class Lucene50LiveDocsFormat extends LiveDocsFormat {
return new FixedBitSet(data, length);
}
/**
* Note: although this format is only used on older versions, we need to keep the write logic in
* addition to the read logic. When we delete documents that live in an older segment, we write to
* the live docs for that segment.
*/
@Override
public void writeLiveDocs(
Bits bits, Directory dir, SegmentCommitInfo info, int newDelCount, IOContext context)

View File

@ -16,6 +16,7 @@
*/
package org.apache.lucene.backward_codecs.lucene70;
import org.apache.lucene.backward_codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat;
@ -34,7 +35,6 @@ import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;

View File

@ -16,6 +16,7 @@
*/
package org.apache.lucene.backward_codecs.lucene80;
import org.apache.lucene.backward_codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60PointsFormat;
@ -33,7 +34,6 @@ import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;

View File

@ -17,6 +17,7 @@
package org.apache.lucene.backward_codecs.lucene84;
import java.util.Objects;
import org.apache.lucene.backward_codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat;
@ -36,7 +37,6 @@ import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;

View File

@ -18,6 +18,7 @@
package org.apache.lucene.backward_codecs.lucene86;
import java.util.Objects;
import org.apache.lucene.backward_codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.codecs.Codec;
@ -34,7 +35,6 @@ import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;

View File

@ -18,6 +18,7 @@
package org.apache.lucene.backward_codecs.lucene87;
import java.util.Objects;
import org.apache.lucene.backward_codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
@ -33,7 +34,6 @@ import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;

View File

@ -14,16 +14,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene50;
package org.apache.lucene.backward_codecs.lucene50;
import org.apache.lucene.backward_codecs.lucene86.Lucene86RWCodec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseLiveDocsFormatTestCase;
import org.apache.lucene.util.TestUtil;
/**
 * Runs the inherited {@link BaseLiveDocsFormatTestCase} tests against the codec supplied by
 * {@link #getCodec()}.
 */
// NOTE(review): the two consecutive return statements below look like the removed and the added
// line of a diff hunk shown without +/- markers (the package line above is doubled the same way);
// presumably only `new Lucene86RWCodec()` survives in the committed file -- confirm.
public class TestLucene50LiveDocsFormat extends BaseLiveDocsFormatTestCase {
@Override
protected Codec getCodec() {
return TestUtil.getDefaultCodec();
return new Lucene86RWCodec();
}
}

View File

@ -1835,6 +1835,29 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
dir.close();
}
/**
 * Opens the unzipped {@code dvUpdatesIndex} with an {@link IndexWriter}, deletes the documents
 * matching the term {@code id:1}, force-merges to a single segment and checks that {@code maxDoc}
 * dropped by exactly one.
 *
 * <p>The delete is committed before the merge only on some runs (chosen at random).
 */
public void testDeletes() throws Exception {
  Path oldIndexDir = createTempDir("dvupdates");
  TestUtil.unzip(getDataInputStream(dvUpdatesIndex), oldIndexDir);

  // try-with-resources closes the writer first and the directory second, the same order as the
  // explicit close() calls it replaces, and also runs when an assertion or an I/O call fails.
  try (Directory dir = newFSDirectory(oldIndexDir)) {
    verifyUsesDefaultCodec(dir, dvUpdatesIndex);

    IndexWriterConfig conf = new IndexWriterConfig(new MockAnalyzer(random()));
    try (IndexWriter writer = new IndexWriter(dir, conf)) {
      int maxDoc = writer.getDocStats().maxDoc;
      writer.deleteDocuments(new Term("id", "1"));
      if (random().nextBoolean()) {
        writer.commit();
      }

      writer.forceMerge(1);
      writer.commit();
      assertEquals(maxDoc - 1, writer.getDocStats().maxDoc);
    }
  }
}
public void testSoftDeletes() throws Exception {
Path oldIndexDir = createTempDir("dvupdates");
TestUtil.unzip(getDataInputStream(dvUpdatesIndex), oldIndexDir);

View File

@ -31,7 +31,6 @@ import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;
@ -73,7 +72,7 @@ public class Lucene90Codec extends Codec {
private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene90FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene86SegmentInfoFormat();
private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat();
private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();
private final PostingsFormat defaultFormat;

View File

@ -0,0 +1,165 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene90;
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
/**
 * Lucene 9.0 live docs format
 *
 * <p>The .liv file is optional, and only exists when a segment contains deletions.
 *
 * <p>Although per-segment, this file is maintained exterior to compound segment files.
 *
 * <p>Deletions (.liv) --&gt; IndexHeader,Bits,Footer
 *
 * <ul>
 *   <li>IndexHeader --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}; the deletion
 *       generation, rendered in radix {@link Character#MAX_RADIX}, is passed as the header suffix
 *       rather than written as a separate field
 *   <li>Bits --&gt; &lt;{@link DataOutput#writeLong Int64}&gt; <sup>LongCount</sup>, one bit per
 *       document where a set bit marks a live document; LongCount is {@link
 *       FixedBitSet#bits2words} of the number of documents
 *   <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}
 * </ul>
 */
public final class Lucene90LiveDocsFormat extends LiveDocsFormat {

  /** extension of live docs */
  private static final String EXTENSION = "liv";

  /** codec of live docs */
  private static final String CODEC_NAME = "Lucene90LiveDocs";

  /** supported version range */
  private static final int VERSION_START = 0;

  private static final int VERSION_CURRENT = VERSION_START;

  /** Sole constructor. */
  public Lucene90LiveDocsFormat() {}

  /**
   * Reads the live docs file of the segment's current deletion generation.
   *
   * @param dir directory holding the .liv file
   * @param info segment whose deletion generation, id, maxDoc and deletion count are consulted
   * @param context I/O context used to open the file
   * @return a read-only view with one bit per document; a set bit means the document is live
   * @throws CorruptIndexException if the number of cleared bits differs from {@code
   *     info.getDelCount()}
   * @throws IOException if the header, body or footer cannot be read or verified
   */
  @Override
  public Bits readLiveDocs(Directory dir, SegmentCommitInfo info, IOContext context)
      throws IOException {
    long gen = info.getDelGen();
    String name = IndexFileNames.fileNameFromGeneration(info.info.name, EXTENSION, gen);
    final int length = info.info.maxDoc();
    try (ChecksumIndexInput input = dir.openChecksumInput(name, context)) {
      Throwable priorE = null;
      try {
        CodecUtil.checkIndexHeader(
            input,
            CODEC_NAME,
            VERSION_START,
            VERSION_CURRENT,
            info.info.getId(),
            Long.toString(gen, Character.MAX_RADIX));
        FixedBitSet fbs = readFixedBitSet(input, length);
        final int deleted = fbs.length() - fbs.cardinality();
        if (deleted != info.getDelCount()) {
          throw new CorruptIndexException(
              "bits.deleted=" + deleted + " info.delcount=" + info.getDelCount(), input);
        }
        return fbs.asReadOnlyBits();
      } catch (Throwable exception) {
        // Remember the failure so the footer check below can take it into account.
        priorE = exception;
      } finally {
        CodecUtil.checkFooter(input, priorE);
      }
    }
    // Reached only if checkFooter returned normally after a failure was recorded above, which is
    // not expected to happen.
    throw new AssertionError();
  }

  /**
   * Reads {@code FixedBitSet.bits2words(length)} longs from {@code input} and wraps them in a
   * {@link FixedBitSet} of {@code length} bits.
   */
  private static FixedBitSet readFixedBitSet(IndexInput input, int length) throws IOException {
    long[] data = new long[FixedBitSet.bits2words(length)];
    for (int i = 0; i < data.length; i++) {
      data[i] = input.readLong();
    }
    return new FixedBitSet(data, length);
  }

  /**
   * Writes {@code bits} to the live docs file of the segment's next deletion generation.
   *
   * <p>The deletion count is validated only after the output has been written and closed.
   *
   * @param bits live docs to persist; a set bit means the document is live
   * @param dir directory to create the .liv file in
   * @param info segment whose next deletion generation, id and current deletion count are used
   * @param newDelCount number of deletions added on top of {@code info.getDelCount()}
   * @param context I/O context used to create the file
   * @throws CorruptIndexException if the number of cleared bits written differs from {@code
   *     info.getDelCount() + newDelCount}
   * @throws IOException if the file cannot be written
   */
  @Override
  public void writeLiveDocs(
      Bits bits, Directory dir, SegmentCommitInfo info, int newDelCount, IOContext context)
      throws IOException {
    long gen = info.getNextDelGen();
    String name = IndexFileNames.fileNameFromGeneration(info.info.name, EXTENSION, gen);
    int delCount;
    try (IndexOutput output = dir.createOutput(name, context)) {
      CodecUtil.writeIndexHeader(
          output,
          CODEC_NAME,
          VERSION_CURRENT,
          info.info.getId(),
          Long.toString(gen, Character.MAX_RADIX));
      delCount = writeBits(output, bits);
      CodecUtil.writeFooter(output);
    }
    if (delCount != info.getDelCount() + newDelCount) {
      throw new CorruptIndexException(
          "bits.deleted="
              + delCount
              + " info.delcount="
              + info.getDelCount()
              + " newdelcount="
              + newDelCount,
          name);
    }
  }

  /**
   * Packs {@code bits} into 64-bit words, writes each word with {@link IndexOutput#writeLong} and
   * returns how many bits were clear (i.e. how many documents are deleted).
   */
  private static int writeBits(IndexOutput output, Bits bits) throws IOException {
    int delCount = 0;
    final int longCount = FixedBitSet.bits2words(bits.length());
    for (int i = 0; i < longCount; ++i) {
      long currentBits = 0;
      // Word i covers documents [64 * i, 64 * i + 63], clipped to the last valid index.
      for (int j = i << 6, end = Math.min(j + 63, bits.length() - 1); j <= end; ++j) {
        if (bits.get(j)) {
          // For a long left operand Java uses only the low six bits of the shift distance, so
          // this sets bit (j mod 64) of the current word.
          currentBits |= 1L << j;
        } else {
          delCount += 1;
        }
      }
      output.writeLong(currentBits);
    }
    return delCount;
  }

  /**
   * Adds the live docs file name of the segment's current deletion generation to {@code files},
   * but only when the segment has deletions.
   */
  @Override
  public void files(SegmentCommitInfo info, Collection<String> files) throws IOException {
    if (info.hasDeletions()) {
      files.add(IndexFileNames.fileNameFromGeneration(info.info.name, EXTENSION, info.getDelGen()));
    }
  }
}

View File

@ -174,7 +174,7 @@
* loaded into main memory for fast access. Whereas stored values are generally intended for
* summary results from searches, per-document values are useful for things like scoring
* factors.
* <li>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}. An
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
* optional file indicating which documents are live.
* <li>{@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}. Optional pair
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
@ -300,7 +300,7 @@
* <td>Contains term vector data.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}</td>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
* <td>.liv</td>
* <td>Info about what documents are live</td>
* </tr>

View File

@ -0,0 +1,29 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene50;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseLiveDocsFormatTestCase;
import org.apache.lucene.util.TestUtil;
/**
 * Runs the inherited {@link BaseLiveDocsFormatTestCase} tests against the codec returned by
 * {@link TestUtil#getDefaultCodec()}.
 */
public class TestLucene90LiveDocsFormat extends BaseLiveDocsFormatTestCase {

  /** Supplies the codec under test: whatever {@link TestUtil#getDefaultCodec()} returns. */
  @Override
  protected Codec getCodec() {
    final Codec codecUnderTest = TestUtil.getDefaultCodec();
    return codecUnderTest;
  }
}