LUCENE-5969, LUCENE-5412: add more infos/metadata safety

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1627941 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-09-27 09:57:35 +00:00
commit 250b215c65
65 changed files with 1094 additions and 741 deletions

View File

@ -29,6 +29,7 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.FieldInfo.DocValuesType;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
@ -49,8 +50,8 @@ class Lucene40FieldInfosReader extends FieldInfosReader {
}
@Override
public FieldInfos read(Directory directory, String segmentName, String segmentSuffix, IOContext iocontext) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentName, "", Lucene40FieldInfosFormat.FIELD_INFOS_EXTENSION);
public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext iocontext) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, "", Lucene40FieldInfosFormat.FIELD_INFOS_EXTENSION);
IndexInput input = directory.openInput(fileName, iocontext);
boolean success = false;

View File

@ -102,8 +102,12 @@ public class Lucene40LiveDocsFormat extends LiveDocsFormat {
public void writeLiveDocs(MutableBits bits, Directory dir, SegmentCommitInfo info, int newDelCount, IOContext context) throws IOException {
String filename = IndexFileNames.fileNameFromGeneration(info.info.name, DELETES_EXTENSION, info.getNextDelGen());
final BitVector liveDocs = (BitVector) bits;
assert liveDocs.count() == info.info.getDocCount() - info.getDelCount() - newDelCount;
assert liveDocs.length() == info.info.getDocCount();
if (liveDocs.length() != info.info.getDocCount()) {
throw new CorruptIndexException("liveDocs.length()=" + liveDocs.length() + "info.docCount=" + info.info.getDocCount(), filename);
}
if (liveDocs.count() != info.info.getDocCount() - info.getDelCount() - newDelCount) {
throw new CorruptIndexException("liveDocs.count()=" + liveDocs.count() + " info.docCount=" + info.info.getDocCount() + " info.getDelCount()=" + info.getDelCount() + " newDelCount=" + newDelCount, filename);
}
liveDocs.write(dir, filename, context);
}

View File

@ -72,7 +72,6 @@ import org.apache.lucene.store.DataOutput; // javadocs
@Deprecated
public class Lucene40SegmentInfoFormat extends SegmentInfoFormat {
private final SegmentInfoReader reader = new Lucene40SegmentInfoReader();
private final SegmentInfoWriter writer = new Lucene40SegmentInfoWriter();
/** Sole constructor. */
public Lucene40SegmentInfoFormat() {
@ -83,11 +82,9 @@ public class Lucene40SegmentInfoFormat extends SegmentInfoFormat {
return reader;
}
// we must unfortunately support write, to allow addIndexes to write a new .si with rewritten filenames:
// see LUCENE-5377
@Override
public SegmentInfoWriter getSegmentInfoWriter() {
return writer;
throw new UnsupportedOperationException("this codec can only be used for reading");
}
/** File extension used to store {@link SegmentInfo}. */

View File

@ -73,7 +73,7 @@ public class Lucene40SegmentInfoReader extends SegmentInfoReader {
CodecUtil.checkEOF(input);
final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics);
final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, null);
si.setFiles(files);
success = true;

View File

@ -29,6 +29,7 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.FieldInfo.DocValuesType;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
@ -49,8 +50,8 @@ final class Lucene42FieldInfosReader extends FieldInfosReader {
}
@Override
public FieldInfos read(Directory directory, String segmentName, String segmentSuffix, IOContext iocontext) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentName, "", Lucene42FieldInfosFormat.EXTENSION);
public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext iocontext) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, "", Lucene42FieldInfosFormat.EXTENSION);
IndexInput input = directory.openInput(fileName, iocontext);
boolean success = false;

View File

@ -100,7 +100,7 @@ public class Lucene46Codec extends Codec {
}
@Override
public final SegmentInfoFormat segmentInfoFormat() {
public SegmentInfoFormat segmentInfoFormat() {
return segmentInfosFormat;
}

View File

@ -29,6 +29,7 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.FieldInfo.DocValuesType;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -47,8 +48,8 @@ final class Lucene46FieldInfosReader extends FieldInfosReader {
}
@Override
public FieldInfos read(Directory directory, String segmentName, String segmentSuffix, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentName, segmentSuffix, Lucene46FieldInfosFormat.EXTENSION);
public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene46FieldInfosFormat.EXTENSION);
try (ChecksumIndexInput input = directory.openChecksumInput(fileName, context)) {
int codecVersion = CodecUtil.checkHeader(input, Lucene46FieldInfosFormat.CODEC_NAME,
Lucene46FieldInfosFormat.FORMAT_START,

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -43,8 +44,8 @@ final class Lucene46FieldInfosWriter extends FieldInfosWriter {
}
@Override
public void write(Directory directory, String segmentName, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentName, segmentSuffix, Lucene46FieldInfosFormat.EXTENSION);
public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene46FieldInfosFormat.EXTENSION);
try (IndexOutput output = directory.createOutput(fileName, context)) {
CodecUtil.writeHeader(output, Lucene46FieldInfosFormat.CODEC_NAME, Lucene46FieldInfosFormat.FORMAT_CURRENT);
output.writeVInt(infos.size());

View File

@ -31,7 +31,7 @@ import org.apache.lucene.store.DataOutput; // javadocs
* <p>
* Files:
* <ul>
* <li><tt>.si</tt>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Id, Footer
* <li><tt>.si</tt>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Footer
* </ul>
* </p>
* Data types:
@ -44,7 +44,6 @@ import org.apache.lucene.store.DataOutput; // javadocs
* <li>Diagnostics --&gt; {@link DataOutput#writeStringStringMap Map&lt;String,String&gt;}</li>
* <li>IsCompoundFile --&gt; {@link DataOutput#writeByte Int8}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* <li>Id --&gt; {@link DataOutput#writeString String}</li>
* </ul>
* </p>
* Field Descriptions:
@ -68,7 +67,6 @@ import org.apache.lucene.store.DataOutput; // javadocs
*/
public class Lucene46SegmentInfoFormat extends SegmentInfoFormat {
private final SegmentInfoReader reader = new Lucene46SegmentInfoReader();
private final SegmentInfoWriter writer = new Lucene46SegmentInfoWriter();
/** Sole constructor. */
public Lucene46SegmentInfoFormat() {
@ -81,7 +79,7 @@ public class Lucene46SegmentInfoFormat extends SegmentInfoFormat {
@Override
public SegmentInfoWriter getSegmentInfoWriter() {
return writer;
throw new UnsupportedOperationException("this codec can only be used for reading");
}
/** File extension used to store {@link SegmentInfo}. */
@ -89,6 +87,5 @@ public class Lucene46SegmentInfoFormat extends SegmentInfoFormat {
static final String CODEC_NAME = "Lucene46SegmentInfo";
static final int VERSION_START = 0;
static final int VERSION_CHECKSUM = 1;
static final int VERSION_ID = 2;
static final int VERSION_CURRENT = VERSION_ID;
static final int VERSION_CURRENT = VERSION_CHECKSUM;
}

View File

@ -65,13 +65,6 @@ public class Lucene46SegmentInfoReader extends SegmentInfoReader {
final boolean isCompoundFile = input.readByte() == SegmentInfo.YES;
final Map<String,String> diagnostics = input.readStringStringMap();
final Set<String> files = input.readStringSet();
String id;
if (codecVersion >= Lucene46SegmentInfoFormat.VERSION_ID) {
id = input.readString();
} else {
id = null;
}
if (codecVersion >= Lucene46SegmentInfoFormat.VERSION_CHECKSUM) {
CodecUtil.checkFooter(input);
@ -79,7 +72,7 @@ public class Lucene46SegmentInfoReader extends SegmentInfoReader {
CodecUtil.checkEOF(input);
}
final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, id);
final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, null);
si.setFiles(files);
return si;

View File

@ -99,7 +99,7 @@ public class Lucene49Codec extends Codec {
}
@Override
public final SegmentInfoFormat segmentInfoFormat() {
public SegmentInfoFormat segmentInfoFormat() {
return segmentInfosFormat;
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
@ -45,11 +46,11 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
}
@Override
public void write(Directory directory, String segmentName, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
if (!segmentSuffix.isEmpty()) {
throw new UnsupportedOperationException("4.0 does not support fieldinfo updates");
}
final String fileName = IndexFileNames.segmentFileName(segmentName, "", Lucene40FieldInfosFormat.FIELD_INFOS_EXTENSION);
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, "", Lucene40FieldInfosFormat.FIELD_INFOS_EXTENSION);
IndexOutput output = directory.createOutput(fileName, context);
boolean success = false;
try {

View File

@ -7,9 +7,9 @@ import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.util.LuceneTestCase;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -74,4 +74,11 @@ public final class Lucene40RWCodec extends Lucene40Codec {
public PostingsFormat getPostingsFormatForField(String field) {
return postings;
}
private static final SegmentInfoFormat segmentInfos = new Lucene40RWSegmentInfoFormat();
@Override
public SegmentInfoFormat segmentInfoFormat() {
return segmentInfos;
}
}

View File

@ -0,0 +1,29 @@
package org.apache.lucene.codecs.lucene40;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.SegmentInfoWriter;
/** read-write version of 4.0 segmentinfos for testing */
public class Lucene40RWSegmentInfoFormat extends Lucene40SegmentInfoFormat {
@Override
public SegmentInfoWriter getSegmentInfoWriter() {
return new Lucene40SegmentInfoWriter();
}
}

View File

@ -6,12 +6,14 @@ import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40FieldInfosFormat;
import org.apache.lucene.codecs.lucene40.Lucene40FieldInfosWriter;
import org.apache.lucene.codecs.lucene40.Lucene40RWDocValuesFormat;
import org.apache.lucene.codecs.lucene40.Lucene40RWNormsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40RWSegmentInfoFormat;
import org.apache.lucene.codecs.lucene40.Lucene40RWTermVectorsFormat;
import org.apache.lucene.util.LuceneTestCase;
@ -73,4 +75,11 @@ public class Lucene41RWCodec extends Lucene41Codec {
public TermVectorsFormat termVectorsFormat() {
return vectors;
}
private static final SegmentInfoFormat segmentInfos = new Lucene40RWSegmentInfoFormat();
@Override
public SegmentInfoFormat segmentInfoFormat() {
return segmentInfos;
}
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
@ -45,11 +46,11 @@ public final class Lucene42FieldInfosWriter extends FieldInfosWriter {
}
@Override
public void write(Directory directory, String segmentName, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
if (!segmentSuffix.isEmpty()) {
throw new UnsupportedOperationException("4.2 does not support fieldinfo updates");
}
final String fileName = IndexFileNames.segmentFileName(segmentName, "", Lucene42FieldInfosFormat.EXTENSION);
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, "", Lucene42FieldInfosFormat.EXTENSION);
IndexOutput output = directory.createOutput(fileName, context);
boolean success = false;
try {

View File

@ -23,6 +23,8 @@ import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.lucene40.Lucene40RWSegmentInfoFormat;
import org.apache.lucene.util.LuceneTestCase;
/**
@ -55,4 +57,11 @@ public class Lucene42RWCodec extends Lucene42Codec {
public FieldInfosFormat fieldInfosFormat() {
return fieldInfosFormat;
}
private static final SegmentInfoFormat segmentInfos = new Lucene40RWSegmentInfoFormat();
@Override
public SegmentInfoFormat segmentInfoFormat() {
return segmentInfos;
}
}

View File

@ -23,6 +23,8 @@ import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.lucene40.Lucene40RWSegmentInfoFormat;
import org.apache.lucene.codecs.lucene42.Lucene42FieldInfosFormat;
import org.apache.lucene.codecs.lucene42.Lucene42FieldInfosWriter;
import org.apache.lucene.codecs.lucene42.Lucene42RWNormsFormat;
@ -58,4 +60,11 @@ public class Lucene45RWCodec extends Lucene45Codec {
public NormsFormat normsFormat() {
return norms;
}
private static final SegmentInfoFormat segmentInfos = new Lucene40RWSegmentInfoFormat();
@Override
public SegmentInfoFormat segmentInfoFormat() {
return segmentInfos;
}
}

View File

@ -19,6 +19,8 @@ package org.apache.lucene.codecs.lucene46;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.SegmentInfoWriter;
import org.apache.lucene.codecs.lucene42.Lucene42RWNormsFormat;
import org.apache.lucene.codecs.lucene45.Lucene45RWDocValuesFormat;
@ -41,4 +43,11 @@ public class Lucene46RWCodec extends Lucene46Codec {
public NormsFormat normsFormat() {
return norms;
}
private static final SegmentInfoFormat segmentInfos = new Lucene46RWSegmentInfoFormat();
@Override
public SegmentInfoFormat segmentInfoFormat() {
return segmentInfos;
}
}

View File

@ -0,0 +1,28 @@
package org.apache.lucene.codecs.lucene46;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.SegmentInfoWriter;
/** read-write version of 4.6 segmentinfos for testing */
public class Lucene46RWSegmentInfoFormat extends Lucene46SegmentInfoFormat {
@Override
public SegmentInfoWriter getSegmentInfoWriter() {
return new Lucene46SegmentInfoWriter();
}
}

View File

@ -64,7 +64,6 @@ public class Lucene46SegmentInfoWriter extends SegmentInfoWriter {
output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO));
output.writeStringStringMap(si.getDiagnostics());
output.writeStringSet(si.files());
output.writeString(si.getId());
CodecUtil.writeFooter(output);
success = true;
} finally {

View File

@ -19,6 +19,8 @@ package org.apache.lucene.codecs.lucene49;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.lucene46.Lucene46RWSegmentInfoFormat;
/**
* Read-write version of {@link Lucene49Codec} for testing.
@ -39,4 +41,11 @@ public class Lucene49RWCodec extends Lucene49Codec {
public NormsFormat normsFormat() {
return norms;
}
private static final SegmentInfoFormat segmentInfos = new Lucene46RWSegmentInfoFormat();
@Override
public SegmentInfoFormat segmentInfoFormat() {
return segmentInfos;
}
}

View File

@ -29,6 +29,7 @@ import org.apache.lucene.index.FieldInfo.DocValuesType;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -47,8 +48,8 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextFieldInfosWriter.*;
public class SimpleTextFieldInfosReader extends FieldInfosReader {
@Override
public FieldInfos read(Directory directory, String segmentName, String segmentSuffix, IOContext iocontext) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentName, segmentSuffix, FIELD_INFOS_EXTENSION);
public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext iocontext) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, FIELD_INFOS_EXTENSION);
ChecksumIndexInput input = directory.openChecksumInput(fileName, iocontext);
BytesRefBuilder scratch = new BytesRefBuilder();

View File

@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInfo.DocValuesType;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
@ -61,8 +62,8 @@ public class SimpleTextFieldInfosWriter extends FieldInfosWriter {
final static BytesRef ATT_VALUE = new BytesRef(" value ");
@Override
public void write(Directory directory, String segmentName, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentName, segmentSuffix, FIELD_INFOS_EXTENSION);
public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, FIELD_INFOS_EXTENSION);
IndexOutput out = directory.createOutput(fileName, context);
BytesRefBuilder scratch = new BytesRefBuilder();
boolean success = false;

View File

@ -20,6 +20,7 @@ package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
@ -109,7 +110,7 @@ public class SimpleTextSegmentInfoReader extends SegmentInfoReader {
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SI_ID);
final String id = readString(SI_ID.length, scratch);
final byte[] id = Arrays.copyOfRange(scratch.bytes(), SI_ID.length, scratch.length());
SimpleTextUtil.checkFooter(input);

View File

@ -107,7 +107,7 @@ public class SimpleTextSegmentInfoWriter extends SegmentInfoWriter {
}
SimpleTextUtil.write(output, SI_ID);
SimpleTextUtil.write(output, si.getId(), scratch);
SimpleTextUtil.write(output, new BytesRef(si.getId()));
SimpleTextUtil.writeNewline(output);
SimpleTextUtil.writeChecksum(output, scratch);

View File

@ -19,6 +19,7 @@ package org.apache.lucene.codecs;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFormatTooNewException;
@ -31,6 +32,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
/**
* Utility class for reading and writing versioned headers.
@ -80,8 +82,7 @@ public final class CodecUtil {
* @throws IOException If there is an I/O error writing to the underlying medium.
* @throws IllegalArgumentException If the codec name is not simple ASCII, or is more than 127 characters in length
*/
public static void writeHeader(DataOutput out, String codec, int version)
throws IOException {
public static void writeHeader(DataOutput out, String codec, int version) throws IOException {
BytesRef bytes = new BytesRef(codec);
if (bytes.length != codec.length() || bytes.length >= 128) {
throw new IllegalArgumentException("codec must be simple ASCII, less than 128 characters in length [got " + codec + "]");
@ -90,6 +91,40 @@ public final class CodecUtil {
out.writeString(codec);
out.writeInt(version);
}
/**
* Writes a codec header for a per-segment file, which records a string to
* identify the file, a version number, and the unique ID of the segment.
* This header can be parsed and validated with
* {@link #checkSegmentHeader(DataInput, String, int, int, byte[]) checkSegmentHeader()}.
* <p>
* CodecSegmentHeader --&gt; CodecHeader,SegmentID
* <ul>
* <li>CodecHeader --&gt; {@link #writeHeader}
* <li>SegmentID --&gt; {@link DataOutput#writeByte byte}<sup>16</sup>.
* Unique identifier for the segment.
* </ul>
* <p>
* Note that the length of a segment header depends only upon the
* name of the codec, so this length can be computed at any time
* with {@link #segmentHeaderLength(String)}.
*
* @param out Output stream
* @param codec String to identify this file. It should be simple ASCII,
* less than 128 characters in length.
* @param segmentID Unique identifier for the segment
* @param version Version number
* @throws IOException If there is an I/O error writing to the underlying medium.
* @throws IllegalArgumentException If the codec name is not simple ASCII, or
* is more than 127 characters in length, or if segmentID is invalid.
*/
public static void writeSegmentHeader(DataOutput out, String codec, int version, byte[] segmentID) throws IOException {
if (segmentID.length != StringHelper.ID_LENGTH) {
throw new IllegalArgumentException("Invalid id: " + StringHelper.idToString(segmentID));
}
writeHeader(out, codec, version);
out.writeBytes(segmentID, 0, segmentID.length);
}
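A minimal sketch of the writing side (illustrative only: the codec name, version constant, fileName, and si are assumptions for this example, not part of this commit):

try (IndexOutput out = dir.createOutput(fileName, context)) {
  // Stamp the file with the owning segment's 16-byte ID so a later reader
  // can detect a file that was copied in from a different segment.
  CodecUtil.writeSegmentHeader(out, "MyFormat", VERSION_CURRENT, si.getId());
  // ... write the per-segment payload here ...
  CodecUtil.writeFooter(out);
}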
/**
* Computes the length of a codec header.
@ -101,6 +136,17 @@ public final class CodecUtil {
public static int headerLength(String codec) {
return 9+codec.length();
}
/**
* Computes the length of a segment header.
*
* @param codec Codec name.
* @return length of the entire segment header.
* @see #writeSegmentHeader(DataOutput, String, int, byte[])
*/
public static int segmentHeaderLength(String codec) {
return headerLength(codec) + StringHelper.ID_LENGTH;
}
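As a worked example (assuming StringHelper.ID_LENGTH is the 16-byte segment ID written above): for the 18-character codec name "Lucene50FieldInfos", headerLength is 4 (magic) + 1 (name length) + 18 (name) + 4 (version) = 27 bytes, so segmentHeaderLength returns 27 + 16 = 43 bytes.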
/**
* Reads and validates a header previously written with
@ -129,9 +175,7 @@ public final class CodecUtil {
* @throws IOException If there is an I/O error reading from the underlying medium.
* @see #writeHeader(DataOutput, String, int)
*/
public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion)
throws IOException {
public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion) throws IOException {
// Safety to guard against reading a bogus string:
final int actualHeader = in.readInt();
if (actualHeader != CODEC_MAGIC) {
@ -161,6 +205,48 @@ public final class CodecUtil {
return actualVersion;
}
/**
* Reads and validates a header previously written with
* {@link #writeSegmentHeader(DataOutput, String, int, byte[])}.
* <p>
* When reading a file, supply the expected <code>codec</code>,
* expected version range (<code>minVersion to maxVersion</code>),
* and segment ID.
*
* @param in Input stream, positioned at the point where the
* header was previously written. Typically this is located
* at the beginning of the file.
* @param codec The expected codec name.
* @param minVersion The minimum supported expected version number.
* @param maxVersion The maximum supported expected version number.
* @param segmentID The expected segment this file belongs to.
* @return The actual version found, when a valid header is found
* that matches <code>codec</code>, with an actual version
* where <code>minVersion &lt;= actual &lt;= maxVersion</code>,
* and matching <code>segmentID</code>.
* Otherwise an exception is thrown.
* @throws CorruptIndexException If the first four bytes are not
* {@link #CODEC_MAGIC}, or if the actual codec found is
* not <code>codec</code>, or if the <code>segmentID</code>
* does not match.
* @throws IndexFormatTooOldException If the actual version is less
* than <code>minVersion</code>.
* @throws IndexFormatTooNewException If the actual version is greater
* than <code>maxVersion</code>.
* @throws IOException If there is an I/O error reading from the underlying medium.
* @see #writeSegmentHeader(DataOutput, String, int, byte[])
*/
public static int checkSegmentHeader(DataInput in, String codec, int minVersion, int maxVersion, byte[] segmentID) throws IOException {
int version = checkHeader(in, codec, minVersion, maxVersion);
byte id[] = new byte[StringHelper.ID_LENGTH];
in.readBytes(id, 0, id.length);
if (!Arrays.equals(id, segmentID)) {
throw new CorruptIndexException("file mismatch, expected segment id=" + StringHelper.idToString(segmentID)
+ ", got=" + StringHelper.idToString(id), in);
}
return version;
}
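The matching read side, as a sketch under the same assumptions as the writer example above; checkSegmentHeader re-validates magic, codec name, version range, and segment ID before any payload is trusted:

try (ChecksumIndexInput in = dir.openChecksumInput(fileName, context)) {
  int version = CodecUtil.checkSegmentHeader(in, "MyFormat",
      VERSION_START, VERSION_CURRENT, si.getId());
  // ... read the per-segment payload, branching on version if needed ...
  CodecUtil.checkFooter(in); // also verifies the file checksum
}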
/**
* Writes a codec footer, which records both a checksum
* algorithm ID and a checksum. This footer can

View File

@ -20,6 +20,7 @@ package org.apache.lucene.codecs;
import java.io.IOException;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -35,5 +36,5 @@ public abstract class FieldInfosReader {
/** Read the {@link FieldInfos} previously written with {@link
* FieldInfosWriter}. */
public abstract FieldInfos read(Directory directory, String segmentName, String segmentSuffix, IOContext iocontext) throws IOException;
public abstract FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext iocontext) throws IOException;
}

View File

@ -20,6 +20,7 @@ package org.apache.lucene.codecs;
import java.io.IOException;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -35,5 +36,5 @@ public abstract class FieldInfosWriter {
/** Writes the provided {@link FieldInfos} to the
* directory. */
public abstract void write(Directory directory, String segmentName, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException;
public abstract void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException;
}

View File

@ -1,25 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Lucene 4.0 file format.
</body>
</html>

View File

@ -21,384 +21,5 @@
</head>
<body>
Lucene 4.10 file format.
<h1>Apache Lucene - Index File Formats</h1>
<div>
<ul>
<li><a href="#Introduction">Introduction</a></li>
<li><a href="#Definitions">Definitions</a>
<ul>
<li><a href="#Inverted_Indexing">Inverted Indexing</a></li>
<li><a href="#Types_of_Fields">Types of Fields</a></li>
<li><a href="#Segments">Segments</a></li>
<li><a href="#Document_Numbers">Document Numbers</a></li>
</ul>
</li>
<li><a href="#Overview">Index Structure Overview</a></li>
<li><a href="#File_Naming">File Naming</a></li>
<li><a href="#file-names">Summary of File Extensions</a></li>
<ul>
<li><a href="#Lock_File">Lock File</a></li>
<li><a href="#History">History</a></li>
<li><a href="#Limitations">Limitations</a></li>
</ul>
</ul>
</div>
<a name="Introduction"></a>
<h2>Introduction</h2>
<div>
<p>This document defines the index file formats used in this version of Lucene.
If you are using a different version of Lucene, please consult the copy of
<code>docs/</code> that was distributed with
the version you are using.</p>
<p>Apache Lucene is written in Java, but several efforts are underway to write
<a href="http://wiki.apache.org/lucene-java/LuceneImplementations">versions of
Lucene in other programming languages</a>. If these versions are to remain
compatible with Apache Lucene, then a language-independent definition of the
Lucene index format is required. This document thus attempts to provide a
complete and independent definition of the Apache Lucene file formats.</p>
<p>As Lucene evolves, this document should evolve. Versions of Lucene in
different programming languages should endeavor to agree on file formats, and
generate new versions of this document.</p>
</div>
<a name="Definitions" id="Definitions"></a>
<h2>Definitions</h2>
<div>
<p>The fundamental concepts in Lucene are index, document, field and term.</p>
<p>An index contains a sequence of documents.</p>
<ul>
<li>A document is a sequence of fields.</li>
<li>A field is a named sequence of terms.</li>
<li>A term is a sequence of bytes.</li>
</ul>
<p>The same sequence of bytes in two different fields is considered a different
term. Thus terms are represented as a pair: the string naming the field, and the
bytes within the field.</p>
<a name="Inverted_Indexing"></a>
<h3>Inverted Indexing</h3>
<p>The index stores statistics about terms in order to make term-based search
more efficient. Lucene's index falls into the family of indexes known as an
<i>inverted index.</i> This is because it can list, for a term, the documents
that contain it. This is the inverse of the natural relationship, in which
documents list terms.</p>
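<p>For example (illustrative): if documents 0 and 2 contain the term <code>apple</code>,
the inverted index stores the entry <code>apple &rarr; {0, 2}</code>, rather than each
document storing its own term list.</p>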
<a name="Types_of_Fields"></a>
<h3>Types of Fields</h3>
<p>In Lucene, fields may be <i>stored</i>, in which case their text is stored
in the index literally, in a non-inverted manner. Fields that are inverted are
called <i>indexed</i>. A field may be both stored and indexed.</p>
<p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the
text of a field may be used literally as a term to be indexed. Most fields are
tokenized, but sometimes it is useful for certain identifier fields to be
indexed literally.</p>
<p>See the {@link org.apache.lucene.document.Field Field}
java docs for more information on Fields.</p>
<a name="Segments" id="Segments"></a>
<h3>Segments</h3>
<p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>.
Each segment is a fully independent index, which could be searched separately.
Indexes evolve by:</p>
<ol>
<li>Creating new segments for newly added documents.</li>
<li>Merging existing segments.</li>
</ol>
<p>Searches may involve multiple segments and/or multiple indexes, each index
potentially composed of a set of segments.</p>
<a name="Document_Numbers"></a>
<h3>Document Numbers</h3>
<p>Internally, Lucene refers to documents by an integer <i>document number</i>.
The first document added to an index is numbered zero, and each subsequent
document added gets a number one greater than the previous.</p>
<p>Note that a document's number may change, so caution should be taken when
storing these numbers outside of Lucene. In particular, numbers may change in
the following situations:</p>
<ul>
<li>
<p>The numbers stored in each segment are unique only within the segment, and
must be converted before they can be used in a larger context. The standard
technique is to allocate each segment a range of values, based on the range of
numbers used in that segment. To convert a document number from a segment to an
external value, the segment's <i>base</i> document number is added. To convert
an external value back to a segment-specific value, the segment is identified
by the range that the external value is in, and the segment's base value is
subtracted. For example, two five-document segments might be combined, so that
the first segment has a base value of zero, and the second of five. Document
three from the second segment would have an external value of eight.</p>
</li>
<li>
<p>When documents are deleted, gaps are created in the numbering. These are
eventually removed as the index evolves through merging. Deleted documents are
dropped when segments are merged. A freshly-merged segment thus has no gaps in
its numbering.</p>
</li>
</ul>
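<p>A minimal sketch of the two conversions (illustrative only; <code>segmentBases</code>
and <code>findSegment</code> are assumptions for this example, not Lucene APIs):</p>
<pre>
// segmentBases[i] = sum of the doc counts of segments 0..i-1
int external = segmentBases[seg] + docID;        // segment-local to external
int seg2 = findSegment(external, segmentBases);  // locate the containing range
int docID2 = external - segmentBases[seg2];      // external back to segment-local
</pre>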
</div>
<a name="Overview" id="Overview"></a>
<h2>Index Structure Overview</h2>
<div>
<p>Each segment index maintains the following:</p>
<ul>
<li>
{@link org.apache.lucene.codecs.lucene46.Lucene46SegmentInfoFormat Segment info}.
This contains metadata about a segment, such as the number of documents,
and what files it uses.
</li>
<li>
{@link org.apache.lucene.codecs.lucene46.Lucene46FieldInfosFormat Field names}.
This contains the set of field names used in the index.
</li>
<li>
{@link org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat Stored Field values}.
This contains, for each document, a list of attribute-value pairs, where the attributes
are field names. These are used to store auxiliary information about the document, such as
its title, url, or an identifier to access a database. The set of stored fields is what is
returned for each hit when searching. This is keyed by document number.
</li>
<li>
{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term dictionary}.
A dictionary containing all of the terms used in all of the
indexed fields of all of the documents. The dictionary also contains the number
of documents which contain the term, and pointers to the term's frequency and
proximity data.
</li>
<li>
{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Frequency data}.
For each term in the dictionary, the numbers of all the
documents that contain that term, and the frequency of the term in that
document, unless frequencies are omitted (IndexOptions.DOCS_ONLY)
</li>
<li>
{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Proximity data}.
For each term in the dictionary, the positions that the
term occurs in each document. Note that this will not exist if all fields in
all documents omit position data.
</li>
<li>
{@link org.apache.lucene.codecs.lucene49.Lucene49NormsFormat Normalization factors}.
For each field in each document, a value is stored
that is multiplied into the score for hits on that field.
</li>
<li>
{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vectors}.
For each field in each document, the term vector (sometimes
called document vector) may be stored. A term vector consists of term text and
term frequency. To add Term Vectors to your index see the
{@link org.apache.lucene.document.Field Field} constructors
</li>
<li>
{@link org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat Per-document values}.
Like stored values, these are also keyed by document
number, but are generally intended to be loaded into main memory for fast
access. Whereas stored values are generally intended for summary results from
searches, per-document values are useful for things like scoring factors.
</li>
<li>
{@link org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat Deleted documents}.
An optional file indicating which documents are deleted.
</li>
</ul>
<p>Details on each of these are provided in their linked pages.</p>
</div>
<a name="File_Naming"></a>
<h2>File Naming</h2>
<div>
<p>All files belonging to a segment have the same name with varying extensions.
The extensions correspond to the different file formats described below. When
using the Compound File format (default in 1.4 and greater) these files (except
for the Segment info file, the Lock file, and the Deleted documents file) are collapsed
into a single .cfs file (see below for details)</p>
<p>Typically, all segments in an index are stored in a single directory,
although this is not required.</p>
<p>As of version 2.1 (lock-less commits), file names are never re-used.
That is, when any file is saved
to the Directory it is given a never before used filename. This is achieved
using a simple generations approach. For example, the first segments file is
segments_1, then segments_2, etc. The generation is a sequential long integer
represented in alpha-numeric (base 36) form.</p>
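<p>For example, generation 124 is 3&times;36 + 16, written <code>3g</code> in base 36,
so that segments file would be named <code>segments_3g</code>.</p>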
</div>
<a name="file-names" id="file-names"></a>
<h2>Summary of File Extensions</h2>
<div>
<p>The following table summarizes the names and extensions of the files in
Lucene:</p>
<table cellspacing="1" cellpadding="4">
<tr>
<th>Name</th>
<th>Extension</th>
<th>Brief Description</th>
</tr>
<tr>
<td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
<td>segments_N</td>
<td>Stores information about a commit point</td>
</tr>
<tr>
<td><a href="#Lock_File">Lock File</a></td>
<td>write.lock</td>
<td>The Write lock prevents multiple IndexWriters from writing to the same
file.</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene46.Lucene46SegmentInfoFormat Segment Info}</td>
<td>.si</td>
<td>Stores metadata about a segment</td>
</tr>
<tr>
<td>{@link org.apache.lucene.store.CompoundFileDirectory Compound File}</td>
<td>.cfs, .cfe</td>
<td>An optional "virtual" file consisting of all the other index files for
systems that frequently run out of file handles.</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene46.Lucene46FieldInfosFormat Fields}</td>
<td>.fnm</td>
<td>Stores information about the fields</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat Field Index}</td>
<td>.fdx</td>
<td>Contains pointers to field data</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat Field Data}</td>
<td>.fdt</td>
<td>The stored fields for documents</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Dictionary}</td>
<td>.tim</td>
<td>The term dictionary, stores term info</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Index}</td>
<td>.tip</td>
<td>The index into the Term Dictionary</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Frequencies}</td>
<td>.doc</td>
<td>Contains the list of docs which contain each term along with frequency</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Positions}</td>
<td>.pos</td>
<td>Stores position information about where a term occurs in the index</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Payloads}</td>
<td>.pay</td>
<td>Stores additional per-position metadata information such as character offsets and user payloads</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene49.Lucene49NormsFormat Norms}</td>
<td>.nvd, .nvm</td>
<td>Encodes length and boost factors for docs and fields</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat Per-Document Values}</td>
<td>.dvd, .dvm</td>
<td>Encodes additional scoring factors or other per-document information.</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Index}</td>
<td>.tvx</td>
<td>Stores offset into the document data file</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Documents}</td>
<td>.tvd</td>
<td>Contains information about each document that has term vectors</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Fields}</td>
<td>.tvf</td>
<td>The field level info about term vectors</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat Deleted Documents}</td>
<td>.del</td>
<td>Info about which documents are deleted</td>
</tr>
</table>
</div>
<a name="Lock_File" id="Lock_File"></a>
<h2>Lock File</h2>
<p>The write lock, which is stored in the index directory by default, is named
"write.lock". If the lock directory is different from the index directory then
the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
derived from the full path to the index directory. When this file is present, a
writer is currently modifying the index (adding or removing documents). This
lock file ensures that only one writer is modifying the index at a time.</p>
<a name="History"></a>
<h2>History</h2>
<p>Compatibility notes are provided in this document, describing how file
formats have changed from prior versions:</p>
<ul>
<li>In version 2.1, the file format was changed to allow lock-less commits (ie,
no more commit lock). The change is fully backwards compatible: you can open a
pre-2.1 index for searching or adding/deleting of docs. When the new segments
file is saved (committed), it will be written in the new file format (meaning
no specific "upgrade" process is needed). But note that once a commit has
occurred, pre-2.1 Lucene will not be able to read the index.</li>
<li>In version 2.3, the file format was changed to allow segments to share a
single set of doc store (vectors &amp; stored fields) files. This allows for
faster indexing in certain cases. The change is fully backwards compatible (in
the same way as the lock-less commits change in 2.1).</li>
<li>In version 2.4, Strings are now written as a true UTF-8 byte sequence, not
Java's modified UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">
LUCENE-510</a> for details.</li>
<li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData
may be passed to IndexWriter's commit methods (and later retrieved), which is
recorded in the segments_N file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">
LUCENE-1382</a> for details. Also,
diagnostics were added to each segment written recording details about why it
was written (due to flush, merge; which OS/JRE was used; etc.). See issue
<a href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.</li>
<li>In version 3.0, compressed fields are no longer written to the index (they
can still be read, but on merge the new segment will write them, uncompressed).
See issue <a href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a>
for details.</li>
<li>In version 3.1, segments records the code version that created them. See
<a href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
Additionally segments track explicitly whether or not they have term vectors.
See <a href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a>
for details.</li>
<li>In version 3.2, numeric fields are written natively to the stored fields
file; previously they were stored in text format only.</li>
<li>In version 3.4, fields can omit position data while still indexing term
frequencies.</li>
<li>In version 4.0, the format of the inverted index became extensible via
the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
({@code DocValues}) was introduced. Normalization factors need no longer be a
single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
Terms need not be unicode strings, they can be any byte sequence. Term offsets
can optionally be indexed into the postings lists. Payloads can be stored in the
term vectors.</li>
<li>In version 4.1, the format of the postings list changed to use either
FOR compression or variable-byte encoding, depending upon the frequency
of the term. Terms appearing only once were changed to inline directly into
the term dictionary. Stored fields are compressed by default. </li>
<li>In version 4.2, term vectors are compressed by default. DocValues has
a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
on multi-valued fields.</li>
<li>In version 4.5, DocValues were extended to explicitly represent missing values.</li>
<li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
allow updating NumericDocValues fields.</li>
<li>In version 4.8, checksum footers were added to the end of each index file
for improved data integrity. Specifically, the last 8 bytes of every index file
contain the zlib-crc32 checksum of the file.</li>
<li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
that is suitable for faceting/sorting/analytics.
</li>
</ul>
<a name="Limitations" id="Limitations"></a>
<h2>Limitations</h2>
<div>
<p>Lucene uses a Java <code>int</code> to refer to
document numbers, and the index file format uses an <code>Int32</code>
on-disk to store document numbers. This is a limitation
of both the index file format and the current implementation. Eventually these
should be replaced with either <code>UInt64</code> values, or
better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.</p>
</div>
</body>
</html>

View File

@ -1,25 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Lucene 4.6 file format.
</body>
</html>

View File

@ -27,7 +27,6 @@ import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat;
import org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat;
import org.apache.lucene.codecs.lucene49.Lucene49NormsFormat;
@ -49,7 +48,7 @@ public class Lucene50Codec extends Codec {
private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
private final LiveDocsFormat liveDocsFormat = new Lucene40LiveDocsFormat();
private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
@Override

View File

@ -25,6 +25,7 @@ import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FieldInfosReader;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.index.FieldInfo.DocValuesType;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.DataOutput;
/**
@ -35,7 +36,7 @@ import org.apache.lucene.store.DataOutput;
* FieldBits,DocValuesBits,DocValuesGen,Attributes&gt; <sup>FieldsCount</sup>,Footer</p>
* <p>Data types:
* <ul>
* <li>Header --&gt; {@link CodecUtil#checkHeader CodecHeader}</li>
* <li>Header --&gt; {@link CodecUtil#checkSegmentHeader SegmentHeader}</li>
* <li>FieldsCount --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>FieldName --&gt; {@link DataOutput#writeString String}</li>
* <li>FieldBits, DocValuesBits --&gt; {@link DataOutput#writeByte Byte}</li>

View File

@ -29,6 +29,7 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.FieldInfo.DocValuesType;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -47,15 +48,16 @@ final class Lucene50FieldInfosReader extends FieldInfosReader {
}
@Override
public FieldInfos read(Directory directory, String segmentName, String segmentSuffix, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentName, segmentSuffix, Lucene50FieldInfosFormat.EXTENSION);
public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene50FieldInfosFormat.EXTENSION);
try (ChecksumIndexInput input = directory.openChecksumInput(fileName, context)) {
Throwable priorE = null;
FieldInfo infos[] = null;
try {
CodecUtil.checkHeader(input, Lucene50FieldInfosFormat.CODEC_NAME,
CodecUtil.checkSegmentHeader(input, Lucene50FieldInfosFormat.CODEC_NAME,
Lucene50FieldInfosFormat.FORMAT_START,
Lucene50FieldInfosFormat.FORMAT_CURRENT);
Lucene50FieldInfosFormat.FORMAT_CURRENT,
segmentInfo.getId());
final int size = input.readVInt(); //read in the size
infos = new FieldInfo[size];

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -43,10 +44,10 @@ final class Lucene50FieldInfosWriter extends FieldInfosWriter {
}
@Override
public void write(Directory directory, String segmentName, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentName, segmentSuffix, Lucene50FieldInfosFormat.EXTENSION);
public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene50FieldInfosFormat.EXTENSION);
try (IndexOutput output = directory.createOutput(fileName, context)) {
CodecUtil.writeHeader(output, Lucene50FieldInfosFormat.CODEC_NAME, Lucene50FieldInfosFormat.FORMAT_CURRENT);
CodecUtil.writeSegmentHeader(output, Lucene50FieldInfosFormat.CODEC_NAME, Lucene50FieldInfosFormat.FORMAT_CURRENT, segmentInfo.getId());
output.writeVInt(infos.size());
for (FieldInfo fi : infos) {
fi.checkConsistency();

View File

@ -0,0 +1,138 @@
package org.apache.lucene.codecs.lucene50;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.MutableBits;
/**
* Lucene 5.0 live docs format
* <p>The .liv file is optional, and only exists when a segment contains
* deletions.</p>
* <p>Although per-segment, this file is maintained exterior to compound segment
* files.</p>
* <p>Deletions (.liv) --&gt; SegmentHeader, Generation, Bits</p>
* <ul>
* <li>SegmentHeader --&gt; {@link CodecUtil#writeSegmentHeader SegmentHeader}</li>
* <li>Generation --&gt; {@link DataOutput#writeLong Int64}</li>
* <li>Bits --&gt; &lt;{@link DataOutput#writeLong Int64}&gt; <sup>LongCount</sup></li>
* </ul>
*/
public final class Lucene50LiveDocsFormat extends LiveDocsFormat {
/** Sole constructor. */
public Lucene50LiveDocsFormat() {
}
/** extension of live docs */
private static final String EXTENSION = "liv";
/** codec of live docs */
private static final String CODEC_NAME = "Lucene50LiveDocs";
/** supported version range */
private static final int VERSION_START = 0;
private static final int VERSION_CURRENT = VERSION_START;
@Override
public MutableBits newLiveDocs(int size) throws IOException {
FixedBitSet bits = new FixedBitSet(size);
bits.set(0, size);
return bits;
}
@Override
public MutableBits newLiveDocs(Bits existing) throws IOException {
FixedBitSet fbs = (FixedBitSet) existing;
return fbs.clone();
}
@Override
public Bits readLiveDocs(Directory dir, SegmentCommitInfo info, IOContext context) throws IOException {
long gen = info.getDelGen();
String name = IndexFileNames.fileNameFromGeneration(info.info.name, EXTENSION, gen);
final int length = info.info.getDocCount();
try (ChecksumIndexInput input = dir.openChecksumInput(name, context)) {
Throwable priorE = null;
try {
CodecUtil.checkSegmentHeader(input, CODEC_NAME, VERSION_START, VERSION_CURRENT, info.info.getId());
long filegen = input.readLong();
if (gen != filegen) {
throw new CorruptIndexException("file mismatch, expected generation=" + gen + ", got=" + filegen, input);
}
long data[] = new long[FixedBitSet.bits2words(length)];
for (int i = 0; i < data.length; i++) {
data[i] = input.readLong();
}
FixedBitSet fbs = new FixedBitSet(data, length);
if (fbs.length() - fbs.cardinality() != info.getDelCount()) {
throw new CorruptIndexException("bits.deleted=" + (fbs.length() - fbs.cardinality()) +
" info.delcount=" + info.getDelCount(), input);
}
return fbs;
} catch (Throwable exception) {
priorE = exception;
} finally {
CodecUtil.checkFooter(input, priorE);
}
}
throw new AssertionError(); // unreachable: the try block either returned or checkFooter rethrew
}
@Override
public void writeLiveDocs(MutableBits bits, Directory dir, SegmentCommitInfo info, int newDelCount, IOContext context) throws IOException {
long gen = info.getNextDelGen();
String name = IndexFileNames.fileNameFromGeneration(info.info.name, EXTENSION, gen);
FixedBitSet fbs = (FixedBitSet) bits;
if (fbs.length() - fbs.cardinality() != info.getDelCount() + newDelCount) {
throw new CorruptIndexException("bits.deleted=" + (fbs.length() - fbs.cardinality()) +
" info.delcount=" + info.getDelCount() + " newdelcount=" + newDelCount, name);
}
long data[] = fbs.getBits();
try (IndexOutput output = dir.createOutput(name, context)) {
CodecUtil.writeSegmentHeader(output, CODEC_NAME, VERSION_CURRENT, info.info.getId());
output.writeLong(gen);
for (int i = 0; i < data.length; i++) {
output.writeLong(data[i]);
}
CodecUtil.writeFooter(output);
}
}
@Override
public void files(SegmentCommitInfo info, Collection<String> files) throws IOException {
if (info.hasDeletions()) {
files.add(IndexFileNames.fileNameFromGeneration(info.info.name, EXTENSION, info.getDelGen()));
}
}
}

View File

@ -30,6 +30,7 @@ import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
/**
@ -69,7 +70,8 @@ public class Lucene50SegmentInfoReader extends SegmentInfoReader {
final Map<String,String> diagnostics = input.readStringStringMap();
final Set<String> files = input.readStringSet();
String id = input.readString();
byte[] id = new byte[StringHelper.ID_LENGTH];
input.readBytes(id, 0, id.length);
si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, id);
si.setFiles(files);

View File

@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene50;
*/
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.SegmentInfoWriter;
@ -28,6 +29,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
/**
@ -48,10 +50,8 @@ public class Lucene50SegmentInfoWriter extends SegmentInfoWriter {
final String fileName = IndexFileNames.segmentFileName(si.name, "", Lucene50SegmentInfoFormat.SI_EXTENSION);
si.addFile(fileName);
final IndexOutput output = dir.createOutput(fileName, ioContext);
boolean success = false;
try {
try (IndexOutput output = dir.createOutput(fileName, ioContext)) {
CodecUtil.writeHeader(output, Lucene50SegmentInfoFormat.CODEC_NAME, Lucene50SegmentInfoFormat.VERSION_CURRENT);
Version version = si.getVersion();
if (version.major < 5) {
@ -63,17 +63,24 @@ public class Lucene50SegmentInfoWriter extends SegmentInfoWriter {
output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO));
output.writeStringStringMap(si.getDiagnostics());
output.writeStringSet(si.files());
output.writeString(si.getId());
Set<String> files = si.files();
for (String file : files) {
if (!IndexFileNames.parseSegmentName(file).equals(si.name)) {
throw new IllegalArgumentException("invalid files: expected segment=" + si.name + ", got=" + files);
}
}
output.writeStringSet(files);
byte[] id = si.getId();
if (id.length != StringHelper.ID_LENGTH) {
throw new IllegalArgumentException("invalid id, got=" + StringHelper.idToString(id));
}
output.writeBytes(id, 0, id.length);
CodecUtil.writeFooter(output);
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(output);
// TODO: are we doing this outside of the tracking wrapper? why must SIWriter cleanup like this?
IOUtils.deleteFilesIgnoringExceptions(si.dir, fileName);
} else {
output.close();
}
}
}

View File

@ -21,5 +21,384 @@
</head>
<body>
Lucene 5.0 file format.
<h1>Apache Lucene - Index File Formats</h1>
<div>
<ul>
<li><a href="#Introduction">Introduction</a></li>
<li><a href="#Definitions">Definitions</a>
<ul>
<li><a href="#Inverted_Indexing">Inverted Indexing</a></li>
<li><a href="#Types_of_Fields">Types of Fields</a></li>
<li><a href="#Segments">Segments</a></li>
<li><a href="#Document_Numbers">Document Numbers</a></li>
</ul>
</li>
<li><a href="#Overview">Index Structure Overview</a></li>
<li><a href="#File_Naming">File Naming</a></li>
<li><a href="#file-names">Summary of File Extensions</a></li>
<ul>
<li><a href="#Lock_File">Lock File</a></li>
<li><a href="#History">History</a></li>
<li><a href="#Limitations">Limitations</a></li>
</ul>
</ul>
</div>
<a name="Introduction"></a>
<h2>Introduction</h2>
<div>
<p>This document defines the index file formats used in this version of Lucene.
If you are using a different version of Lucene, please consult the copy of
<code>docs/</code> that was distributed with
the version you are using.</p>
<p>Apache Lucene is written in Java, but several efforts are underway to write
<a href="http://wiki.apache.org/lucene-java/LuceneImplementations">versions of
Lucene in other programming languages</a>. If these versions are to remain
compatible with Apache Lucene, then a language-independent definition of the
Lucene index format is required. This document thus attempts to provide a
complete and independent definition of the Apache Lucene file formats.</p>
<p>As Lucene evolves, this document should evolve. Versions of Lucene in
different programming languages should endeavor to agree on file formats, and
generate new versions of this document.</p>
</div>
<a name="Definitions" id="Definitions"></a>
<h2>Definitions</h2>
<div>
<p>The fundamental concepts in Lucene are index, document, field and term.</p>
<p>An index contains a sequence of documents.</p>
<ul>
<li>A document is a sequence of fields.</li>
<li>A field is a named sequence of terms.</li>
<li>A term is a sequence of bytes.</li>
</ul>
<p>The same sequence of bytes in two different fields is considered a different
term. Thus terms are represented as a pair: the string naming the field, and the
bytes within the field.</p>
<a name="Inverted_Indexing"></a>
<h3>Inverted Indexing</h3>
<p>The index stores statistics about terms in order to make term-based search
more efficient. Lucene's index falls into the family of indexes known as an
<i>inverted index.</i> This is because it can list, for a term, the documents
that contain it. This is the inverse of the natural relationship, in which
documents list terms.</p>
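<p>As a toy sketch of the inversion (document names and terms here are
illustrative only, and plain Java collections stand in for Lucene's actual
postings structures):</p>
<pre>
// Forward relation: doc 0 -&gt; {fast, fox}, doc 1 -&gt; {fast, dog}
String[][] docs = { {"fast", "fox"}, {"fast", "dog"} };
Map&lt;String, List&lt;Integer&gt;&gt; inverted = new HashMap&lt;&gt;();
for (int docId = 0; docId &lt; docs.length; docId++) {
  for (String term : docs[docId]) {
    List&lt;Integer&gt; postings = inverted.get(term);
    if (postings == null) {
      postings = new ArrayList&lt;&gt;();
      inverted.put(term, postings);
    }
    postings.add(docId);
  }
}
// inverted: {fast=[0, 1], fox=[0], dog=[1]}
</pre>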
<a name="Types_of_Fields"></a>
<h3>Types of Fields</h3>
<p>In Lucene, fields may be <i>stored</i>, in which case their text is stored
in the index literally, in a non-inverted manner. Fields that are inverted are
called <i>indexed</i>. A field may be both stored and indexed.</p>
<p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the
text of a field may be used literally as a term to be indexed. Most fields are
tokenized, but sometimes it is useful for certain identifier fields to be
indexed literally.</p>
<p>See the {@link org.apache.lucene.document.Field Field}
java docs for more information on Fields.</p>
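<p>For instance, a minimal sketch using the Field subclasses described in those
javadocs (field names and values are illustrative):</p>
<pre>
Document doc = new Document();
// Tokenized body text, also stored so it can be returned with hits:
doc.add(new TextField("body", "the quick brown fox", Field.Store.YES));
// Identifier indexed literally as a single term (not tokenized):
doc.add(new StringField("id", "doc-42", Field.Store.YES));
</pre>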
<a name="Segments" id="Segments"></a>
<h3>Segments</h3>
<p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>.
Each segment is a fully independent index, which could be searched separately.
Indexes evolve by:</p>
<ol>
<li>Creating new segments for newly added documents.</li>
<li>Merging existing segments.</li>
</ol>
<p>Searches may involve multiple segments and/or multiple indexes, each index
potentially composed of a set of segments.</p>
<a name="Document_Numbers"></a>
<h3>Document Numbers</h3>
<p>Internally, Lucene refers to documents by an integer <i>document number</i>.
The first document added to an index is numbered zero, and each subsequent
document added gets a number one greater than the previous.</p>
<p>Note that a document's number may change, so caution should be taken when
storing these numbers outside of Lucene. In particular, numbers may change in
the following situations:</p>
<ul>
<li>
<p>The numbers stored in each segment are unique only within the segment, and
must be converted before they can be used in a larger context. The standard
technique is to allocate each segment a range of values, based on the range of
numbers used in that segment. To convert a document number from a segment to an
external value, the segment's <i>base</i> document number is added. To convert
an external value back to a segment-specific value, the segment is identified
by the range that the external value is in, and the segment's base value is
subtracted. For example, two five-document segments might be combined, so that
the first segment has a base value of zero, and the second of five. Document
three from the second segment would then have an external value of eight (see the sketch after this list).</p>
</li>
<li>
<p>When documents are deleted, gaps are created in the numbering. These are
eventually removed as the index evolves through merging. Deleted documents are
dropped when segments are merged. A freshly-merged segment thus has no gaps in
its numbering.</p>
</li>
</ul>
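<p>A minimal sketch of the base-offset arithmetic described above, using the
two five-document segments from the example (plain Java, not Lucene API):</p>
<pre>
int[] bases = {0, 5};            // per-segment base document numbers
int externalDoc = bases[1] + 3;  // doc 3 of the second segment -&gt; 8
// Converting back: find the segment whose range contains the external value,
// then subtract its base:
int segment = externalDoc &lt; bases[1] ? 0 : 1;
int localDoc = externalDoc - bases[segment];  // -&gt; 3
</pre>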
</div>
<a name="Overview" id="Overview"></a>
<h2>Index Structure Overview</h2>
<div>
<p>Each segment index maintains the following:</p>
<ul>
<li>
{@link org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat Segment info}.
This contains metadata about a segment, such as the number of documents
and what files it uses.
</li>
<li>
{@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Field names}.
This contains the set of field names used in the index.
</li>
<li>
{@link org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat Stored Field values}.
This contains, for each document, a list of attribute-value pairs, where the attributes
are field names. These are used to store auxiliary information about the document, such as
its title, url, or an identifier to access a database. The set of stored fields are what is
returned for each hit when searching. This is keyed by document number.
</li>
<li>
{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term dictionary}.
A dictionary containing all of the terms used in all of the
indexed fields of all of the documents. The dictionary also contains the number
of documents which contain the term, and pointers to the term's frequency and
proximity data.
</li>
<li>
{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Frequency data}.
For each term in the dictionary, the numbers of all the
documents that contain that term, and the frequency of the term in that
document, unless frequencies are omitted (IndexOptions.DOCS_ONLY).
</li>
<li>
{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Proximity data}.
For each term in the dictionary, the positions that the
term occurs in each document. Note that this will not exist if all fields in
all documents omit position data.
</li>
<li>
{@link org.apache.lucene.codecs.lucene49.Lucene49NormsFormat Normalization factors}.
For each field in each document, a value is stored
that is multiplied into the score for hits on that field.
</li>
<li>
{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vectors}.
For each field in each document, the term vector (sometimes
called document vector) may be stored. A term vector consists of term text and
term frequency. To add Term Vectors to your index, see the
{@link org.apache.lucene.document.Field Field} constructors.
</li>
<li>
{@link org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat Per-document values}.
Like stored values, these are also keyed by document
number, but are generally intended to be loaded into main memory for fast
access. Whereas stored values are generally intended for summary results from
searches, per-document values are useful for things like scoring factors.
</li>
<li>
{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
An optional file indicating which documents are live.
</li>
</ul>
<p>Details on each of these are provided in their linked pages.</p>
</div>
<a name="File_Naming"></a>
<h2>File Naming</h2>
<div>
<p>All files belonging to a segment have the same name with varying extensions.
The extensions correspond to the different file formats described below. When
using the Compound File format (default in 1.4 and greater) these files (except
for the Segment info file, the Lock file, and Deleted documents file) are collapsed
into a single .cfs file (see below for details).</p>
<p>Typically, all segments in an index are stored in a single directory,
although this is not required.</p>
<p>As of version 2.1 (lock-less commits), file names are never re-used.
That is, when any file is saved
to the Directory it is given a never before used filename. This is achieved
using a simple generations approach. For example, the first segments file is
segments_1, then segments_2, etc. The generation is a sequential long integer
represented in alpha-numeric (base 36) form.</p>
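<p>A small sketch of the generation encoding (Character.MAX_RADIX is 36; the
real per-segment helper is IndexFileNames.fileNameFromGeneration):</p>
<pre>
long gen = 11;
String segmentsFile = "segments_" + Long.toString(gen, Character.MAX_RADIX);
// -&gt; "segments_b"
</pre>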
</div>
<a name="file-names" id="file-names"></a>
<h2>Summary of File Extensions</h2>
<div>
<p>The following table summarizes the names and extensions of the files in
Lucene:</p>
<table cellspacing="1" cellpadding="4">
<tr>
<th>Name</th>
<th>Extension</th>
<th>Brief Description</th>
</tr>
<tr>
<td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
<td>segments_N</td>
<td>Stores information about a commit point</td>
</tr>
<tr>
<td><a href="#Lock_File">Lock File</a></td>
<td>write.lock</td>
<td>The write lock prevents multiple IndexWriters from writing to the same
index.</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat Segment Info}</td>
<td>.si</td>
<td>Stores metadata about a segment</td>
</tr>
<tr>
<td>{@link org.apache.lucene.store.CompoundFileDirectory Compound File}</td>
<td>.cfs, .cfe</td>
<td>An optional "virtual" file consisting of all the other index files for
systems that frequently run out of file handles.</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Fields}</td>
<td>.fnm</td>
<td>Stores information about the fields</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat Field Index}</td>
<td>.fdx</td>
<td>Contains pointers to field data</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat Field Data}</td>
<td>.fdt</td>
<td>The stored fields for documents</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Dictionary}</td>
<td>.tim</td>
<td>The term dictionary, stores term info</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Index}</td>
<td>.tip</td>
<td>The index into the Term Dictionary</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Frequencies}</td>
<td>.doc</td>
<td>Contains the list of docs which contain each term along with frequency</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Positions}</td>
<td>.pos</td>
<td>Stores position information about where a term occurs in the index</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Payloads}</td>
<td>.pay</td>
<td>Stores additional per-position metadata information such as character offsets and user payloads</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene49.Lucene49NormsFormat Norms}</td>
<td>.nvd, .nvm</td>
<td>Encodes length and boost factors for docs and fields</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat Per-Document Values}</td>
<td>.dvd, .dvm</td>
<td>Encodes additional scoring factors or other per-document information.</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Index}</td>
<td>.tvx</td>
<td>Stores offset into the document data file</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Documents}</td>
<td>.tvd</td>
<td>Contains information about each document that has term vectors</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Fields}</td>
<td>.tvf</td>
<td>The field level info about term vectors</td>
</tr>
<tr>
<td>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}</td>
<td>.liv</td>
<td>Info about which documents are live</td>
</tr>
</table>
</div>
<a name="Lock_File" id="Lock_File"></a>
<h2>Lock File</h2>
<p>The write lock, which is stored in the index directory by default, is named
"write.lock". If the lock directory is different from the index directory then
the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
derived from the full path to the index directory. When this file is present, a
writer is currently modifying the index (adding or removing documents). This
lock file ensures that only one writer is modifying the index at a time.</p>
<a name="History"></a>
<h2>History</h2>
<p>Compatibility notes are provided in this document, describing how file
formats have changed from prior versions:</p>
<ul>
<li>In version 2.1, the file format was changed to allow lock-less commits (ie,
no more commit lock). The change is fully backwards compatible: you can open a
pre-2.1 index for searching or adding/deleting of docs. When the new segments
file is saved (committed), it will be written in the new file format (meaning
no specific "upgrade" process is needed). But note that once a commit has
occurred, pre-2.1 Lucene will not be able to read the index.</li>
<li>In version 2.3, the file format was changed to allow segments to share a
single set of doc store (vectors &amp; stored fields) files. This allows for
faster indexing in certain cases. The change is fully backwards compatible (in
the same way as the lock-less commits change in 2.1).</li>
<li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not
Java's modified UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">
LUCENE-510</a> for details.</li>
<li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData
may be passed to IndexWriter's commit methods (and later retrieved), which is
recorded in the segments_N file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">
LUCENE-1382</a> for details. Also,
diagnostics were added to each segment written recording details about why it
was written (due to flush, merge; which OS/JRE was used; etc.). See issue
<a href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.</li>
<li>In version 3.0, compressed fields are no longer written to the index (they
can still be read, but on merge the new segment will write them, uncompressed).
See issue <a href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a>
for details.</li>
<li>In version 3.1, segments records the code version that created them. See
<a href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
Additionally segments track explicitly whether or not they have term vectors.
See <a href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a>
for details.</li>
<li>In version 3.2, numeric fields are written natively to the stored fields
file; previously they were stored in text format only.</li>
<li>In version 3.4, fields can omit position data while still indexing term
frequencies.</li>
<li>In version 4.0, the format of the inverted index became extensible via
the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
({@code DocValues}) was introduced. Normalization factors need no longer be a
single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
Terms need not be unicode strings, they can be any byte sequence. Term offsets
can optionally be indexed into the postings lists. Payloads can be stored in the
term vectors.</li>
<li>In version 4.1, the format of the postings list changed to use either
FOR compression or variable-byte encoding, depending upon the frequency
of the term. Terms appearing only once are inlined directly into
the term dictionary. Stored fields are compressed by default.</li>
<li>In version 4.2, term vectors are compressed by default. DocValues has
a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
on multi-valued fields.</li>
<li>In version 4.5, DocValues were extended to explicitly represent missing values.</li>
<li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
allow updating NumericDocValues fields.</li>
<li>In version 4.8, checksum footers were added to the end of each index file
for improved data integrity. Specifically, the last 8 bytes of every index file
contain the zlib-crc32 checksum of the file (a small verification sketch follows this list).</li>
<li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
that is suitable for faceting/sorting/analytics.
</li>
</ul>
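<p>A minimal sketch of verifying a 4.8+ checksum footer (the file name is
hypothetical; CodecUtil.checksumEntireFile is the real helper):</p>
<pre>
try (IndexInput in = dir.openInput("_0.si", IOContext.READONCE)) {
  // Reads the whole file and validates the trailing zlib-crc32 footer,
  // throwing CorruptIndexException on mismatch:
  long checksum = CodecUtil.checksumEntireFile(in);
}
</pre>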
<a name="Limitations" id="Limitations"></a>
<h2>Limitations</h2>
<div>
<p>Lucene uses a Java <code>int</code> to refer to
document numbers, and the index file format uses an <code>Int32</code>
on-disk to store document numbers. This is a limitation
of both the index file format and the current implementation. Eventually these
should be replaced with either <code>UInt64</code> values, or
better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.</p>
</div>
</body>
</html>

View File

@ -47,6 +47,7 @@ import org.apache.lucene.util.CommandLineUtil;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
@ -514,7 +515,7 @@ public class CheckIndex {
}
msg(infoStream, "Segments file=" + segmentsFileName + " numSegments=" + numSegments
+ " " + versionString + " id=" + sis.getId() + " format=" + sFormat + userDataString);
+ " " + versionString + " id=" + StringHelper.idToString(sis.getId()) + " format=" + sFormat + userDataString);
if (onlySegments != null) {
result.partial = true;
@ -565,7 +566,7 @@ public class CheckIndex {
try {
msg(infoStream, " version=" + (version == null ? "3.0" : version));
msg(infoStream, " id=" + info.info.getId());
msg(infoStream, " id=" + StringHelper.idToString(info.info.getId()));
final Codec codec = info.info.getCodec();
msg(infoStream, " codec=" + codec);
segInfoStat.codec = codec;

View File

@ -119,7 +119,7 @@ final class DefaultIndexingChain extends DocConsumer {
// FreqProxTermsWriter does this with
// FieldInfo.storePayload.
FieldInfosWriter infosWriter = docWriter.codec.fieldInfosFormat().getFieldInfosWriter();
infosWriter.write(state.directory, state.segmentInfo.name, "", state.fieldInfos, IOContext.DEFAULT);
infosWriter.write(state.directory, state.segmentInfo, "", state.fieldInfos, IOContext.DEFAULT);
}
/** Writes all buffered doc values (called from {@link #flush}). */

View File

@ -2591,67 +2591,33 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
}
/** Copies the segment files as-is into the IndexWriter's directory. */
private SegmentCommitInfo copySegmentAsIs(SegmentCommitInfo info, String segName, IOContext context)
throws IOException {
// note: we don't really need this fis (it's copied), but we load it up
// so we don't pass a null value to the si writer
FieldInfos fis = SegmentReader.readFieldInfos(info);
private SegmentCommitInfo copySegmentAsIs(SegmentCommitInfo info, String segName, IOContext context) throws IOException {
//System.out.println("copy seg=" + info.info.name + " version=" + info.info.getVersion());
// Same SI as before but we change directory and name
SegmentInfo newInfo = new SegmentInfo(directory, info.info.getVersion(), segName, info.info.getDocCount(),
info.info.getUseCompoundFile(), info.info.getCodec(),
info.info.getDiagnostics(), StringHelper.randomId());
SegmentCommitInfo newInfoPerCommit = new SegmentCommitInfo(newInfo,
info.getDelCount(), info.getDelGen(), info.getFieldInfosGen(),
info.getDocValuesGen());
Set<String> segFiles = new HashSet<>();
// Build up new segment's file names. Must do this
// before writing SegmentInfo:
for (String file: info.files()) {
final String newFileName;
newFileName = segName + IndexFileNames.stripSegmentName(file);
segFiles.add(newFileName);
}
newInfo.setFiles(segFiles);
// We must rewrite the SI file because it references segment name in its list of files, etc
TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(directory);
info.info.getDiagnostics(), info.info.getId());
SegmentCommitInfo newInfoPerCommit = new SegmentCommitInfo(newInfo, info.getDelCount(), info.getDelGen(),
info.getFieldInfosGen(), info.getDocValuesGen());
newInfo.setFiles(info.files());
boolean success = false;
try {
newInfo.getCodec().segmentInfoFormat().getSegmentInfoWriter().write(trackingDir, newInfo, fis, context);
final Collection<String> siFiles = trackingDir.getCreatedFiles();
// Copy the segment's files
for (String file: info.files()) {
final String newFileName = newInfo.namedForThisSegment(file);
final String newFileName = segName + IndexFileNames.stripSegmentName(file);
if (siFiles.contains(newFileName)) {
// We already rewrote this above
continue;
}
assert !slowFileExists(directory, newFileName): "file \"" + newFileName + "\" already exists; siFiles=" + siFiles;
assert !slowFileExists(directory, newFileName): "file \"" + newFileName + "\" already exists; newInfo.files=" + newInfo.files();
info.info.dir.copy(directory, file, newFileName, context);
}
success = true;
} finally {
if (!success) {
for(String file : newInfo.files()) {
try {
directory.deleteFile(file);
} catch (Throwable t) {
}
}
IOUtils.deleteFilesIgnoringExceptions(directory, newInfo.files().toArray(new String[0]));
}
}

View File

@ -451,7 +451,7 @@ class ReadersAndUpdates {
final IOContext infosContext = new IOContext(new FlushInfo(info.info.getDocCount(), estInfosSize));
// separately also track which files were created for this gen
final TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(dir);
infosFormat.getFieldInfosWriter().write(trackingDir, info.info.name, segmentSuffix, fieldInfos, infosContext);
infosFormat.getFieldInfosWriter().write(trackingDir, info.info, segmentSuffix, fieldInfos, infosContext);
info.advanceFieldInfosGen();
return trackingDir.getCreatedFiles();
}

View File

@ -109,7 +109,14 @@ public class SegmentCommitInfo {
@Deprecated
public void setGenUpdatesFiles(Map<Long,Set<String>> genUpdatesFiles) {
this.genUpdatesFiles.clear();
this.genUpdatesFiles.putAll(genUpdatesFiles);
for (Map.Entry<Long,Set<String>> kv : genUpdatesFiles.entrySet()) {
// rename the set
Set<String> set = new HashSet<>();
for (String file : kv.getValue()) {
set.add(info.namedForThisSegment(file));
}
this.genUpdatesFiles.put(kv.getKey(), set);
}
}
/** Returns the per-field DocValues updates files. */
@ -120,7 +127,14 @@ public class SegmentCommitInfo {
/** Sets the DocValues updates file names, per field number. Does not deep clone the map. */
public void setDocValuesUpdatesFiles(Map<Integer,Set<String>> dvUpdatesFiles) {
this.dvUpdatesFiles.clear();
this.dvUpdatesFiles.putAll(dvUpdatesFiles);
for (Map.Entry<Integer,Set<String>> kv : dvUpdatesFiles.entrySet()) {
// rename the set
Set<String> set = new HashSet<>();
for (String file : kv.getValue()) {
set.add(info.namedForThisSegment(file));
}
this.dvUpdatesFiles.put(kv.getKey(), set);
}
}
/** Returns the FieldInfos file names. */
@ -131,7 +145,9 @@ public class SegmentCommitInfo {
/** Sets the FieldInfos file names. */
public void setFieldInfosFiles(Set<String> fieldInfosFiles) {
this.fieldInfosFiles.clear();
this.fieldInfosFiles.addAll(fieldInfosFiles);
for (String file : fieldInfosFiles) {
this.fieldInfosFiles.add(info.namedForThisSegment(file));
}
}
/** Called when we succeed in writing deletes */

View File

@ -18,8 +18,10 @@ package org.apache.lucene.index;
*/
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
@ -58,7 +60,7 @@ public final class SegmentInfo {
private boolean isCompoundFile;
/** Id that uniquely identifies this segment. */
private final String id;
private final byte[] id;
private Codec codec;
@ -81,15 +83,6 @@ public final class SegmentInfo {
return diagnostics;
}
/**
* Construct a new complete SegmentInfo instance from
* input, with a newly generated random id.
*/
public SegmentInfo(Directory dir, Version version, String name, int docCount,
boolean isCompoundFile, Codec codec, Map<String,String> diagnostics) {
this(dir, version, name, docCount, isCompoundFile, codec, diagnostics, null);
}
/**
* Construct a new complete SegmentInfo instance from input.
* <p>Note: this is public only to allow access from
@ -97,7 +90,7 @@ public final class SegmentInfo {
*/
public SegmentInfo(Directory dir, Version version, String name, int docCount,
boolean isCompoundFile, Codec codec, Map<String,String> diagnostics,
String id) {
byte[] id) {
assert !(dir instanceof TrackingDirectoryWrapper);
this.dir = dir;
this.version = version;
@ -107,6 +100,9 @@ public final class SegmentInfo {
this.codec = codec;
this.diagnostics = diagnostics;
this.id = id;
if (id != null && id.length != StringHelper.ID_LENGTH) {
throw new IllegalArgumentException("invalid id: " + Arrays.toString(id));
}
}
/**
@ -226,30 +222,32 @@ public final class SegmentInfo {
}
/** Return the id that uniquely identifies this segment. */
public String getId() {
return id;
public byte[] getId() {
return id == null ? null : id.clone();
}
private Set<String> setFiles;
/** Sets the files written for this segment. */
public void setFiles(Set<String> files) {
checkFileNames(files);
setFiles = files;
public void setFiles(Collection<String> files) {
setFiles = new HashSet<>();
addFiles(files);
}
/** Add these files to the set of files written for this
* segment. */
public void addFiles(Collection<String> files) {
checkFileNames(files);
setFiles.addAll(files);
for (String f : files) {
setFiles.add(namedForThisSegment(f));
}
}
/** Add this file to the set of files written for this
* segment. */
public void addFile(String file) {
checkFileNames(Collections.singleton(file));
setFiles.add(file);
setFiles.add(namedForThisSegment(file));
}
private void checkFileNames(Collection<String> files) {
@ -261,5 +259,12 @@ public final class SegmentInfo {
}
}
}
/**
* Strips any segment name from the file, naming it for this segment.
* This is needed because segment names can change, e.g. by addIndexes(Dir):
* with a segment named "_5", namedForThisSegment("_0.fnm") returns "_5.fnm".
*/
String namedForThisSegment(String file) {
return name + IndexFileNames.stripSegmentName(file);
}
}

View File

@ -125,8 +125,8 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
/** The file format version for the segments_N codec header, since 4.9+ */
public static final int VERSION_49 = 3;
/** The file format version for the segments_N codec header, since 4.11+ */
public static final int VERSION_411 = 4;
/** The file format version for the segments_N codec header, since 5.0+ */
public static final int VERSION_50 = 4;
/** Used to name new segments. */
// TODO: should this be a long ...?
@ -151,8 +151,8 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
*/
private static PrintStream infoStream = null;
/** Id for this commit; only written starting with Lucene 4.11 */
private String id;
/** Id for this commit; only written starting with Lucene 5.0 */
private byte[] id;
/** Sole constructor. Typically you call this and then
* use {@link #read(Directory) or
@ -262,10 +262,10 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
nextGeneration);
}
/** Since Lucene 4.11, every commit (segments_N) writes a unique id. This will
* return that id, or null if this commit was pre-4.11. */
public String getId() {
return id;
/** Since Lucene 5.0, every commit (segments_N) writes a unique id. This will
* return that id, or null if this commit was pre-5.0. */
public byte[] getId() {
return id == null ? null : id.clone();
}
/**
@ -296,7 +296,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
throw new IndexFormatTooOldException(input, magic, CodecUtil.CODEC_MAGIC, CodecUtil.CODEC_MAGIC);
}
// 4.0+
int format = CodecUtil.checkHeaderNoMagic(input, "segments", VERSION_40, VERSION_411);
int format = CodecUtil.checkHeaderNoMagic(input, "segments", VERSION_40, VERSION_50);
version = input.readLong();
counter = input.readInt();
int numSegments = input.readInt();
@ -361,8 +361,9 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
add(siPerCommit);
}
userData = input.readStringStringMap();
if (format >= VERSION_411) {
id = input.readString();
if (format >= VERSION_50) {
id = new byte[StringHelper.ID_LENGTH];
input.readBytes(id, 0, id.length);
}
if (format >= VERSION_48) {
@ -425,7 +426,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
try {
segnOutput = directory.createOutput(segmentFileName, IOContext.DEFAULT);
CodecUtil.writeHeader(segnOutput, "segments", VERSION_411);
CodecUtil.writeHeader(segnOutput, "segments", VERSION_50);
segnOutput.writeLong(version);
segnOutput.writeInt(counter); // write counter
segnOutput.writeInt(size()); // write infos
@ -451,7 +452,8 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
assert si.dir == directory;
}
segnOutput.writeStringStringMap(userData);
segnOutput.writeString(StringHelper.randomId());
byte[] id = StringHelper.randomId();
segnOutput.writeBytes(id, 0, id.length);
CodecUtil.writeFooter(segnOutput);
segnOutput.close();
directory.sync(Collections.singleton(segmentFileName));

View File

@ -146,7 +146,7 @@ final class SegmentMerger {
// write the merged infos
FieldInfosWriter fieldInfosWriter = codec.fieldInfosFormat().getFieldInfosWriter();
fieldInfosWriter.write(directory, mergeState.segmentInfo.name, "", mergeState.fieldInfos, context);
fieldInfosWriter.write(directory, mergeState.segmentInfo, "", mergeState.fieldInfos, context);
return mergeState;
}

View File

@ -217,7 +217,7 @@ public final class SegmentReader extends LeafReader implements Accountable {
final String segmentSuffix = info.getFieldInfosGen() == -1 ? "" : Long.toString(info.getFieldInfosGen(), Character.MAX_RADIX);
Codec codec = info.info.getCodec();
FieldInfosFormat fisFormat = codec.fieldInfosFormat();
return fisFormat.getFieldInfosReader().read(dir, info.info.name, segmentSuffix, IOContext.READONCE);
return fisFormat.getFieldInfosReader().read(dir, info.info, segmentSuffix, IOContext.READONCE);
} finally {
if (closeDir) {
dir.close();

View File

@ -31,7 +31,7 @@ import org.apache.lucene.search.DocIdSetIterator;
*
* @lucene.internal
*/
public final class FixedBitSet extends DocIdSet implements Bits {
public final class FixedBitSet extends DocIdSet implements MutableBits {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FixedBitSet.class);

View File

@ -232,21 +232,21 @@ public abstract class StringHelper {
// Holds 128 bit unsigned value:
private static BigInteger nextId;
private static final BigInteger idMask;
private static final BigInteger mask128;
private static final Object idLock = new Object();
private static final String idPad = "00000000000000000000000000000000";
static {
byte[] maskBytes = new byte[16];
Arrays.fill(maskBytes, (byte) 0xff);
idMask = new BigInteger(maskBytes);
// 128 bit unsigned mask
byte[] maskBytes128 = new byte[16];
Arrays.fill(maskBytes128, (byte) 0xff);
mask128 = new BigInteger(1, maskBytes128);
String prop = System.getProperty("tests.seed");
// State for xorshift128:
long x0;
long x1;
long seed;
if (prop != null) {
// So if there is a test failure that somehow relied on this id,
// we remain reproducible based on the test seed:
@ -280,17 +280,25 @@ public abstract class StringHelper {
s1 ^= s1 << 23; // a
x1 = s1 ^ s0 ^ (s1 >>> 17) ^ (s0 >>> 26); // b, c
}
// 64-bit unsigned mask
byte[] maskBytes64 = new byte[8];
Arrays.fill(maskBytes64, (byte) 0xff);
BigInteger mask64 = new BigInteger(1, maskBytes64);
// First make unsigned versions of x0, x1:
BigInteger unsignedX0 = new BigInteger(1, BigInteger.valueOf(x0).toByteArray());
BigInteger unsignedX1 = new BigInteger(1, BigInteger.valueOf(x1).toByteArray());
BigInteger unsignedX0 = BigInteger.valueOf(x0).and(mask64);
BigInteger unsignedX1 = BigInteger.valueOf(x1).and(mask64);
// Concatenate bits of x0 and x1, as an unsigned 128-bit integer:
nextId = unsignedX0.shiftLeft(64).or(unsignedX1);
}
/** length in bytes of an ID */
public static final int ID_LENGTH = 16;
/** Generates a non-cryptographic globally unique id. */
public static String randomId() {
public static byte[] randomId() {
// NOTE: we don't use Java's UUID.randomUUID() implementation here because:
//
@ -306,15 +314,42 @@ public abstract class StringHelper {
// what impact that has on the period, whereas the simple ++ (mod 2^128)
// we use here is guaranteed to have the full period.
String id;
byte bits[];
synchronized(idLock) {
id = nextId.toString(16);
nextId = nextId.add(BigInteger.ONE).and(idMask);
bits = nextId.toByteArray();
nextId = nextId.add(BigInteger.ONE).and(mask128);
}
// toByteArray() always returns a sign bit, so it may require an extra byte (always zero)
if (bits.length > ID_LENGTH) {
assert bits.length == ID_LENGTH + 1;
assert bits[0] == 0;
return Arrays.copyOfRange(bits, 1, bits.length);
} else {
byte[] result = new byte[ID_LENGTH];
System.arraycopy(bits, 0, result, result.length - bits.length, bits.length);
return result;
}
}
/**
* Helper method to render an ID as a string, for debugging
* <p>
* Returns the string {@code (null)} if the id is null.
* Otherwise, returns a string representation for debugging.
* Never throws an exception. The returned string may
* indicate if the id is definitely invalid.
*/
public static String idToString(byte id[]) {
if (id == null) {
return "(null)";
} else {
StringBuilder sb = new StringBuilder();
sb.append(new BigInteger(1, id).toString(Character.MAX_RADIX));
if (id.length != ID_LENGTH) {
sb.append(" (INVALID FORMAT)");
}
return sb.toString();
}
assert id.length() <= 32: "id=" + id;
id = idPad.substring(id.length()) + id;
return id;
}
}
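A quick usage sketch of the byte[]-based id helpers above (illustrative only):

byte[] id = StringHelper.randomId();            // always ID_LENGTH (16) bytes
assert id.length == StringHelper.ID_LENGTH;
String pretty = StringHelper.idToString(id);    // base-36 rendering for logs
assert StringHelper.idToString(null).equals("(null)"); // null-safe for debugging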

View File

@ -42,6 +42,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
import org.junit.BeforeClass;
@ -248,7 +249,7 @@ public class TestCodecs extends LuceneTestCase {
final Directory dir = newDirectory();
this.write(fieldInfos, dir, fields);
Codec codec = Codec.getDefault();
final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, SEGMENT, 10000, false, codec, null);
final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, SEGMENT, 10000, false, codec, null, StringHelper.randomId());
final FieldsProducer reader = codec.postingsFormat().fieldsProducer(new SegmentReadState(dir, si, fieldInfos, newIOContext(random())));
@ -305,7 +306,7 @@ public class TestCodecs extends LuceneTestCase {
this.write(fieldInfos, dir, fields);
Codec codec = Codec.getDefault();
final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, SEGMENT, 10000, false, codec, null);
final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, SEGMENT, 10000, false, codec, null, StringHelper.randomId());
if (VERBOSE) {
System.out.println("TEST: now read postings");
@ -800,7 +801,7 @@ public class TestCodecs extends LuceneTestCase {
private void write(final FieldInfos fieldInfos, final Directory dir, final FieldData[] fields) throws Throwable {
final Codec codec = Codec.getDefault();
final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, SEGMENT, 10000, false, codec, null);
final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, SEGMENT, 10000, false, codec, null, StringHelper.randomId());
final SegmentWriteState state = new SegmentWriteState(InfoStream.getDefault(), dir, si, fieldInfos, null, newIOContext(random()));
Arrays.sort(fields);

View File

@ -43,6 +43,7 @@ import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
@ -218,7 +219,7 @@ public class TestDoc extends LuceneTestCase {
final Codec codec = Codec.getDefault();
TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(si1.info.dir);
final SegmentInfo si = new SegmentInfo(si1.info.dir, Version.LATEST, merged, -1, false, codec, null);
final SegmentInfo si = new SegmentInfo(si1.info.dir, Version.LATEST, merged, -1, false, codec, null, StringHelper.randomId());
SegmentMerger merger = new SegmentMerger(Arrays.<LeafReader>asList(r1, r2),
si, InfoStream.getDefault(), trackingDir,
@ -226,21 +227,18 @@ public class TestDoc extends LuceneTestCase {
MergeState mergeState = merger.merge();
r1.close();
r2.close();
final SegmentInfo info = new SegmentInfo(si1.info.dir, Version.LATEST, merged,
si1.info.getDocCount() + si2.info.getDocCount(),
false, codec, null);
info.setFiles(new HashSet<>(trackingDir.getCreatedFiles()));
r2.close();
si.setFiles(new HashSet<>(trackingDir.getCreatedFiles()));
if (useCompoundFile) {
Collection<String> filesToDelete = IndexWriter.createCompoundFile(InfoStream.getDefault(), dir, MergeState.CheckAbort.NONE, info, newIOContext(random()));
info.setUseCompoundFile(true);
Collection<String> filesToDelete = IndexWriter.createCompoundFile(InfoStream.getDefault(), dir, MergeState.CheckAbort.NONE, si, newIOContext(random()));
si.setUseCompoundFile(true);
for (final String fileToDelete : filesToDelete) {
si1.info.dir.deleteFile(fileToDelete);
}
}
return new SegmentCommitInfo(info, 0, -1L, -1L, -1L);
return new SegmentCommitInfo(si, 0, -1L, -1L, -1L);
}

View File

@ -1,115 +0,0 @@
package org.apache.lucene.index;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FieldInfosReader;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.LuceneTestCase;
//import org.cnlp.utils.properties.ResourceBundleHelper;
public class TestFieldInfos extends LuceneTestCase {
private Document testDoc = new Document();
@Override
public void setUp() throws Exception {
super.setUp();
DocHelper.setupDoc(testDoc);
}
public FieldInfos createAndWriteFieldInfos(Directory dir, String filename) throws IOException{
//Positive test of FieldInfos
assertTrue(testDoc != null);
FieldInfos.Builder builder = new FieldInfos.Builder();
for (IndexableField field : testDoc.getFields()) {
builder.addOrUpdate(field.name(), field.fieldType());
}
FieldInfos fieldInfos = builder.finish();
//Since the complement is stored as well in the fields map
assertTrue(fieldInfos.size() == DocHelper.all.size()); //this is all b/c we are using the no-arg constructor
IndexOutput output = dir.createOutput(filename, newIOContext(random()));
assertTrue(output != null);
//Use a RAMOutputStream
FieldInfosWriter writer = Codec.getDefault().fieldInfosFormat().getFieldInfosWriter();
writer.write(dir, filename, "", fieldInfos, IOContext.DEFAULT);
output.close();
return fieldInfos;
}
public FieldInfos readFieldInfos(Directory dir, String filename) throws IOException {
FieldInfosReader reader = Codec.getDefault().fieldInfosFormat().getFieldInfosReader();
return reader.read(dir, filename, "", IOContext.DEFAULT);
}
public void test() throws IOException {
String name = "testFile";
Directory dir = newDirectory();
FieldInfos fieldInfos = createAndWriteFieldInfos(dir, name);
FieldInfos readIn = readFieldInfos(dir, name);
assertTrue(fieldInfos.size() == readIn.size());
FieldInfo info = readIn.fieldInfo("textField1");
assertTrue(info != null);
assertTrue(info.hasVectors() == false);
assertTrue(info.omitsNorms() == false);
info = readIn.fieldInfo("textField2");
assertTrue(info != null);
assertTrue(info.omitsNorms() == false);
info = readIn.fieldInfo("textField3");
assertTrue(info != null);
assertTrue(info.hasVectors() == false);
assertTrue(info.omitsNorms() == true);
info = readIn.fieldInfo("omitNorms");
assertTrue(info != null);
assertTrue(info.hasVectors() == false);
assertTrue(info.omitsNorms() == true);
dir.close();
}
public void testReadOnly() throws IOException {
String name = "testFile";
Directory dir = newDirectory();
FieldInfos fieldInfos = createAndWriteFieldInfos(dir, name);
FieldInfos readOnly = readFieldInfos(dir, name);
assertReadOnly(readOnly, fieldInfos);
dir.close();
}
private void assertReadOnly(FieldInfos readOnly, FieldInfos modifiable) {
assertEquals(modifiable.size(), readOnly.size());
// assert we can iterate
for (FieldInfo fi : readOnly) {
assertEquals(fi.name, modifiable.fieldInfo(fi.number).name);
}
}
}

View File

@ -96,7 +96,7 @@ public class TestIndexFileDeleter extends LuceneTestCase {
*/
// TODO: fix this test better
String ext = Codec.getDefault().getName().equals("SimpleText") ? ".liv" : ".del";
String ext = ".liv";
// Create a bogus separate del file for a
// segment that already has a separate del file:

View File

@ -2767,11 +2767,13 @@ public class TestIndexWriter extends LuceneTestCase {
SegmentInfos sis = new SegmentInfos();
sis.read(d);
String id1 = sis.getId();
byte[] id1 = sis.getId();
assertNotNull(id1);
assertEquals(StringHelper.ID_LENGTH, id1.length);
String id2 = sis.info(0).info.getId();
byte[] id2 = sis.info(0).info.getId();
assertNotNull(id2);
assertEquals(StringHelper.ID_LENGTH, id2.length);
// Make sure CheckIndex includes id output:
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
@ -2784,14 +2786,14 @@ public class TestIndexWriter extends LuceneTestCase {
assertTrue(s, indexStatus != null && indexStatus.clean);
// Commit id is always stored:
assertTrue("missing id=" + id1 + " in:\n" + s, s.contains("id=" + id1));
assertTrue("missing id=" + StringHelper.idToString(id1) + " in:\n" + s, s.contains("id=" + StringHelper.idToString(id1)));
assertTrue("missing id=" + id2 + " in:\n" + s, s.contains("id=" + id2));
assertTrue("missing id=" + StringHelper.idToString(id1) + " in:\n" + s, s.contains("id=" + StringHelper.idToString(id1)));
d.close();
Set<String> ids = new HashSet<>();
for(int i=0;i<100000;i++) {
String id = StringHelper.randomId();
String id = StringHelper.idToString(StringHelper.randomId());
assertFalse("id=" + id + " i=" + i, ids.contains(id));
ids.add(id);
}

View File

@ -28,6 +28,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
@ -78,7 +79,7 @@ public class TestSegmentMerger extends LuceneTestCase {
public void testMerge() throws IOException {
final Codec codec = Codec.getDefault();
final SegmentInfo si = new SegmentInfo(mergedDir, Version.LATEST, mergedSegment, -1, false, codec, null);
final SegmentInfo si = new SegmentInfo(mergedDir, Version.LATEST, mergedSegment, -1, false, codec, null, StringHelper.randomId());
SegmentMerger merger = new SegmentMerger(Arrays.<LeafReader>asList(reader1, reader2),
si, InfoStream.getDefault(), mergedDir,
@ -88,8 +89,7 @@ public class TestSegmentMerger extends LuceneTestCase {
assertTrue(docsMerged == 2);
//Should be able to open a new SegmentReader against the new directory
SegmentReader mergedReader = new SegmentReader(new SegmentCommitInfo(
new SegmentInfo(mergedDir, Version.LATEST, mergedSegment, docsMerged,
false, codec, null),
mergeState.segmentInfo,
0, -1L, -1L, -1L),
newIOContext(random()));
assertTrue(mergedReader != null);

View File

@ -28,6 +28,7 @@ import java.util.List;
import java.util.Locale;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.StringHelper;
/**
* Command-line tool that enables listing segments in an
@ -137,7 +138,7 @@ public class IndexSplitter {
SegmentInfo info = infoPerCommit.info;
// Same info just changing the dir:
SegmentInfo newInfo = new SegmentInfo(destFSDir, info.getVersion(), info.name, info.getDocCount(),
info.getUseCompoundFile(), info.getCodec(), info.getDiagnostics());
info.getUseCompoundFile(), info.getCodec(), info.getDiagnostics(), info.getId());
destInfos.add(new SegmentCommitInfo(newInfo, infoPerCommit.getDelCount(),
infoPerCommit.getDelGen(), infoPerCommit.getFieldInfosGen(),
infoPerCommit.getDocValuesGen()));

View File

@ -19,6 +19,7 @@ package org.apache.lucene.codecs.asserting;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
@ -49,7 +50,7 @@ public class AssertingCodec extends FilterCodec {
private final TermVectorsFormat vectors = new AssertingTermVectorsFormat();
private final StoredFieldsFormat storedFields = new AssertingStoredFieldsFormat();
private final NormsFormat norms = new AssertingNormsFormat();
private final LiveDocsFormat liveDocs = new AssertingLiveDocsFormat();
private final PostingsFormat defaultFormat = new AssertingPostingsFormat();
private final DocValuesFormat defaultDVFormat = new AssertingDocValuesFormat();
@ -82,6 +83,11 @@ public class AssertingCodec extends FilterCodec {
return norms;
}
@Override
public LiveDocsFormat liveDocsFormat() {
return liveDocs;
}
@Override
public String toString() {
return "Asserting(" + delegate + ")";

View File

@ -0,0 +1,137 @@
package org.apache.lucene.codecs.asserting;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.MutableBits;
import org.apache.lucene.util.TestUtil;
/**
* Just like the default live docs format but with additional asserts.
*/
public class AssertingLiveDocsFormat extends LiveDocsFormat {
private final LiveDocsFormat in = TestUtil.getDefaultCodec().liveDocsFormat();
@Override
public MutableBits newLiveDocs(int size) throws IOException {
assert size >= 0;
MutableBits raw = in.newLiveDocs(size);
assert raw != null;
assert raw.length() == size;
for (int i = 0; i < raw.length(); i++) {
assert raw.get(i);
}
return new AssertingMutableBits(raw);
}
@Override
public MutableBits newLiveDocs(Bits existing) throws IOException {
assert existing instanceof AssertingBits;
Bits rawExisting = ((AssertingBits)existing).in;
MutableBits raw = in.newLiveDocs(rawExisting);
assert raw != null;
assert raw.length() == rawExisting.length();
for (int i = 0; i < raw.length(); i++) {
assert rawExisting.get(i) == raw.get(i);
}
return new AssertingMutableBits(raw);
}
@Override
public Bits readLiveDocs(Directory dir, SegmentCommitInfo info, IOContext context) throws IOException {
Bits raw = in.readLiveDocs(dir, info, context);
assert raw != null;
check(raw, info.info.getDocCount(), info.getDelCount());
return new AssertingBits(raw);
}
@Override
public void writeLiveDocs(MutableBits bits, Directory dir, SegmentCommitInfo info, int newDelCount, IOContext context) throws IOException {
assert bits instanceof AssertingMutableBits;
MutableBits raw = (MutableBits) ((AssertingMutableBits)bits).in;
check(raw, info.info.getDocCount(), info.getDelCount() + newDelCount);
in.writeLiveDocs(raw, dir, info, newDelCount, context);
}
private void check(Bits bits, int expectedLength, int expectedDeleteCount) {
assert bits.length() == expectedLength;
int deletedCount = 0;
for (int i = 0; i < bits.length(); i++) {
if (!bits.get(i)) {
deletedCount++;
}
}
assert deletedCount == expectedDeleteCount;
}
@Override
public void files(SegmentCommitInfo info, Collection<String> files) throws IOException {
in.files(info, files);
}
@Override
public String toString() {
return "Asserting(" + in + ")";
}
static class AssertingBits implements Bits {
final Bits in;
AssertingBits(Bits in) {
this.in = in;
assert in.length() >= 0;
}
@Override
public boolean get(int index) {
assert index >= 0;
assert index < in.length();
return in.get(index);
}
@Override
public int length() {
return in.length();
}
@Override
public String toString() {
return "Asserting(" + in + ")";
}
}
static class AssertingMutableBits extends AssertingBits implements MutableBits {
AssertingMutableBits(MutableBits in) {
super(in);
}
@Override
public void clear(int index) {
assert index >= 0;
assert index < in.length();
((MutableBits)in).clear(index);
}
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FieldInfosReader;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -59,11 +60,11 @@ class CrankyFieldInfosFormat extends FieldInfosFormat {
}
@Override
public void write(Directory directory, String segmentName, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
if (random.nextInt(100) == 0) {
throw new IOException("Fake IOException from FieldInfosWriter.write()");
}
delegate.write(directory, segmentName, segmentSuffix, infos, context);
delegate.write(directory, segmentInfo, segmentSuffix, infos, context);
}
}
}

View File

@ -57,6 +57,7 @@ import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.RamUsageTester;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.Version;
@ -675,7 +676,7 @@ public abstract class BasePostingsFormatTestCase extends BaseIndexFileFormatTest
// randomly index at lower IndexOption
private FieldsProducer buildIndex(Directory dir, IndexOptions maxAllowed, boolean allowPayloads, boolean alwaysTestMax) throws IOException {
Codec codec = getCodec();
SegmentInfo segmentInfo = new SegmentInfo(dir, Version.LATEST, "_0", maxDoc, false, codec, null);
SegmentInfo segmentInfo = new SegmentInfo(dir, Version.LATEST, "_0", maxDoc, false, codec, null, StringHelper.randomId());
int maxIndexOption = Arrays.asList(IndexOptions.values()).indexOf(maxAllowed);
if (VERBOSE) {