mirror of https://github.com/apache/lucene.git
LUCENE-6504: implement norms with random access API
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1685007 13f79535-47bb-0310-9956-ffa450edef68
parent c68b04d90c
commit 50fc8ad31d
@@ -72,6 +72,9 @@ New Features
 
 * LUCENE-6549: Add preload option to MMapDirectory. (Robert Muir)
 
+* LUCENE-6504: Add Lucene53Codec, with norms implemented directly
+  via the Directory's RandomAccessInput api. (Robert Muir)
+
 API Changes
 
 * LUCENE-6508: Simplify Lock api, there is now just
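The CHANGES entry above is the whole point of the commit: norms become addressable through the Directory's RandomAccessInput instead of an iterator-style decode. A minimal sketch of that access pattern, with a hypothetical file name and offset (the real Lucene53NormsProducer records the slice offset and value width in the .nvm metadata and supports wider values than a single byte):

```java
import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;

class NormsAccessSketch {
  // Sketch only: assumes one norm byte per document, stored contiguously
  // in the .nvd file starting at sliceOffset.
  static byte readNorm(Directory dir, long sliceOffset, int maxDoc, int docID) throws IOException {
    try (IndexInput data = dir.openInput("_0.nvd", IOContext.DEFAULT)) {
      RandomAccessInput slice = data.randomAccessSlice(sliceOffset, maxDoc);
      return slice.readByte(docID); // positional read; no per-document iteration state
    }
  }
}
```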
@@ -42,8 +42,9 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
  * {@link FilterCodec}.
  *
  * @see org.apache.lucene.codecs.lucene50 package documentation for file format details.
- * @lucene.experimental
+ * @deprecated Only for reading old 5.0-5.2 segments
  */
+@Deprecated
 public class Lucene50Codec extends Codec {
   private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
   private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
@@ -157,7 +158,7 @@ public class Lucene50Codec extends Codec {
   private final NormsFormat normsFormat = new Lucene50NormsFormat();
 
   @Override
-  public final NormsFormat normsFormat() {
+  public NormsFormat normsFormat() {
     return normsFormat;
   }
 }
@@ -0,0 +1,62 @@
+package org.apache.lucene.codecs.lucene50;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.NormsConsumer;
+import org.apache.lucene.codecs.NormsFormat;
+import org.apache.lucene.codecs.NormsProducer;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+
+/**
+ * Lucene 5.0 Score normalization format.
+ * @deprecated Only for reading old 5.0-5.2 segments
+ */
+@Deprecated
+class Lucene50NormsFormat extends NormsFormat {
+
+  /** Sole Constructor */
+  public Lucene50NormsFormat() {}
+
+  @Override
+  public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
+    throw new UnsupportedOperationException("this codec can only be used for reading");
+  }
+
+  @Override
+  public NormsProducer normsProducer(SegmentReadState state) throws IOException {
+    return new Lucene50NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
+  }
+
+  static final String DATA_CODEC = "Lucene50NormsData";
+  static final String DATA_EXTENSION = "nvd";
+  static final String METADATA_CODEC = "Lucene50NormsMetadata";
+  static final String METADATA_EXTENSION = "nvm";
+  static final int VERSION_START = 0;
+  static final int VERSION_CURRENT = VERSION_START;
+
+  static final byte DELTA_COMPRESSED = 0;
+  static final byte TABLE_COMPRESSED = 1;
+  static final byte CONST_COMPRESSED = 2;
+  static final byte UNCOMPRESSED = 3;
+  static final byte INDIRECT = 4;
+  static final byte PATCHED_BITSET = 5;
+  static final byte PATCHED_TABLE = 6;
+}
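Worth noting about this new, pared-down file: only the read path survives here. normsConsumer fails fast, and write support for these encodings reappears further down in the test-only Lucene50RWNormsFormat, whose normsConsumer re-enables Lucene50NormsConsumer so old-format segments can still be generated in tests.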
@@ -17,13 +17,13 @@ package org.apache.lucene.codecs.lucene50;
  * limitations under the License.
  */
 
-import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.CONST_COMPRESSED;
-import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.DELTA_COMPRESSED;
-import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.INDIRECT;
-import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.PATCHED_BITSET;
-import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.PATCHED_TABLE;
-import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.TABLE_COMPRESSED;
-import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.UNCOMPRESSED;
+import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.CONST_COMPRESSED;
+import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.DELTA_COMPRESSED;
+import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.INDIRECT;
+import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.PATCHED_BITSET;
+import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.PATCHED_TABLE;
+import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.TABLE_COMPRESSED;
+import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.UNCOMPRESSED;
 import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.VERSION_CURRENT;
 import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.VERSION_START;
 
@@ -58,8 +58,10 @@ import org.apache.lucene.util.packed.PackedInts;
 
 /**
  * Reader for {@link Lucene50NormsFormat}
+ * @deprecated Only for reading old 5.0-5.2 segments
  */
-class Lucene50NormsProducer extends NormsProducer {
+@Deprecated
+final class Lucene50NormsProducer extends NormsProducer {
   // metadata maps (just file pointers and minimal stuff)
   private final Map<String,NormsEntry> norms = new HashMap<>();
   private final IndexInput data;
@@ -0,0 +1,25 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<head>
+   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+</head>
+<body>
+Lucene 5.0 file format.
+</body>
+</html>
@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+org.apache.lucene.codecs.lucene50.Lucene50Codec
@@ -39,15 +39,10 @@ import static org.apache.lucene.codecs.lucene50.Lucene50NormsFormat.VERSION_CURRENT;
 
 /**
  * Writer for {@link Lucene50NormsFormat}
+ * @deprecated Only for testing old 5.0-5.2 segments
  */
-class Lucene50NormsConsumer extends NormsConsumer {
-  static final byte DELTA_COMPRESSED = 0;
-  static final byte TABLE_COMPRESSED = 1;
-  static final byte CONST_COMPRESSED = 2;
-  static final byte UNCOMPRESSED = 3;
-  static final byte INDIRECT = 4;
-  static final byte PATCHED_BITSET = 5;
-  static final byte PATCHED_TABLE = 6;
+@Deprecated
+final class Lucene50NormsConsumer extends NormsConsumer {
   static final int BLOCK_SIZE = 1 << 14;
 
   // threshold for indirect encoding, computed as 1 - 1/log2(maxint)
@@ -181,13 +176,13 @@ class Lucene50NormsConsumer extends NormsConsumer {
 
   private void addConstant(byte constant) throws IOException {
     meta.writeVInt(0);
-    meta.writeByte(CONST_COMPRESSED);
+    meta.writeByte(Lucene50NormsFormat.CONST_COMPRESSED);
     meta.writeLong(constant);
   }
 
   private void addUncompressed(Iterable<Number> values, int count) throws IOException {
     meta.writeVInt(count);
-    meta.writeByte(UNCOMPRESSED); // uncompressed byte[]
+    meta.writeByte(Lucene50NormsFormat.UNCOMPRESSED); // uncompressed byte[]
     meta.writeLong(data.getFilePointer());
     for (Number nv : values) {
       data.writeByte(nv.byteValue());
@@ -196,7 +191,7 @@ class Lucene50NormsConsumer extends NormsConsumer {
 
   private void addTableCompressed(Iterable<Number> values, FormatAndBits compression, int count, NormMap uniqueValues) throws IOException {
     meta.writeVInt(count);
-    meta.writeByte(TABLE_COMPRESSED); // table-compressed
+    meta.writeByte(Lucene50NormsFormat.TABLE_COMPRESSED); // table-compressed
     meta.writeLong(data.getFilePointer());
 
     writeTable(values, compression, count, uniqueValues, uniqueValues.size);
@@ -226,7 +221,7 @@ class Lucene50NormsConsumer extends NormsConsumer {
 
   private void addDeltaCompressed(Iterable<Number> values, int count) throws IOException {
     meta.writeVInt(count);
-    meta.writeByte(DELTA_COMPRESSED); // delta-compressed
+    meta.writeByte(Lucene50NormsFormat.DELTA_COMPRESSED); // delta-compressed
    meta.writeLong(data.getFilePointer());
     data.writeVInt(PackedInts.VERSION_CURRENT);
     data.writeVInt(BLOCK_SIZE);
@@ -245,7 +240,7 @@ class Lucene50NormsConsumer extends NormsConsumer {
     int commonCount = uniqueValues.freqs[0];
 
     meta.writeVInt(count - commonCount);
-    meta.writeByte(PATCHED_BITSET);
+    meta.writeByte(Lucene50NormsFormat.PATCHED_BITSET);
     meta.writeLong(data.getFilePointer());
 
     // write docs with value
@@ -268,7 +263,7 @@ class Lucene50NormsConsumer extends NormsConsumer {
   // the exceptions should not be accessed very often, since the values are uncommon
   private void addPatchedTable(FieldInfo field, final Iterable<Number> values, final int numCommonValues, int commonValuesCount, int count, final NormMap uniqueValues) throws IOException {
     meta.writeVInt(count);
-    meta.writeByte(PATCHED_TABLE);
+    meta.writeByte(Lucene50NormsFormat.PATCHED_TABLE);
     meta.writeLong(data.getFilePointer());
 
     assert numCommonValues == 3 || numCommonValues == 15;
@@ -287,7 +282,7 @@ class Lucene50NormsConsumer extends NormsConsumer {
     int commonCount = uniqueValues.freqs[minOrd];
 
     meta.writeVInt(count - commonCount);
-    meta.writeByte(INDIRECT);
+    meta.writeByte(Lucene50NormsFormat.INDIRECT);
     meta.writeLong(data.getFilePointer());
 
     // write docs with value
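All six call sites above share one metadata-entry shape, which is also why the type bytes could move off the consumer: an entry is fully described by the format class. Schematically (a sketch of the recurring pattern, not a real Lucene API; the real methods also write nested entries for the INDIRECT exception paths):

```java
import java.io.IOException;
import org.apache.lucene.store.IndexOutput;

class EntrySketch {
  // The per-entry pattern repeated by the addXxx methods above:
  // a vInt (value or exception count), the encoding-type byte
  // (one of the Lucene50NormsFormat constants 0..6), and the
  // current file pointer into the .nvd data file.
  static void writeEntry(IndexOutput meta, int count, byte type, long dataFP) throws IOException {
    meta.writeVInt(count);
    meta.writeByte(type);
    meta.writeLong(dataFP);
  }
}
```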
@@ -0,0 +1,34 @@
+package org.apache.lucene.codecs.lucene50;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.codecs.NormsFormat;
+
+/**
+ * Codec for testing 5.0 index format
+ * @deprecated Only for testing old 5.0-5.2 segments
+ */
+@Deprecated
+final class Lucene50RWCodec extends Lucene50Codec {
+  private final NormsFormat normsFormat = new Lucene50RWNormsFormat();
+
+  @Override
+  public NormsFormat normsFormat() {
+    return normsFormat;
+  }
+}
@@ -0,0 +1,36 @@
+package org.apache.lucene.codecs.lucene50;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.NormsConsumer;
+import org.apache.lucene.index.SegmentWriteState;
+
+/**
+ * Read-write version of 5.0 norms format for testing
+ * @deprecated for test purposes only
+ */
+@Deprecated
+final class Lucene50RWNormsFormat extends Lucene50NormsFormat {
+
+  @Override
+  public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
+    return new Lucene50NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
+  }
+}
@@ -28,10 +28,10 @@ import org.apache.lucene.index.BaseNormsFormatTestCase;
 import org.apache.lucene.util.TestUtil;
 
 /**
- * Tests Lucene49NormsFormat
+ * Tests Lucene50NormsFormat
  */
 public class TestLucene50NormsFormat extends BaseNormsFormatTestCase {
-  private final Codec codec = TestUtil.getDefaultCodec();
+  private final Codec codec = new Lucene50RWCodec();
 
   @Override
   protected Codec getCodec() {
@@ -29,7 +29,7 @@ import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene50.Lucene50Codec;
+import org.apache.lucene.codecs.lucene53.Lucene53Codec;
 import org.apache.lucene.index.ConcurrentMergeScheduler;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexDeletionPolicy;
@@ -139,7 +139,7 @@ public class CreateIndexTask extends PerfTask {
     if (defaultCodec == null && postingsFormat != null) {
       try {
         final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat);
-        iwConf.setCodec(new Lucene50Codec(){
+        iwConf.setCodec(new Lucene53Codec() {
           @Override
           public PostingsFormat getPostingsFormatForField(String field) {
             return postingsFormatChosen;
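Outside the benchmark module, the same pattern applies anywhere a per-field postings format is needed. A minimal usage sketch against the 5.3 API (hypothetical helper, not part of the benchmark code):

```java
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene53.Lucene53Codec;
import org.apache.lucene.index.IndexWriterConfig;

class CodecConfigSketch {
  static IndexWriterConfig newConfig(String formatName) {
    final PostingsFormat chosen = PostingsFormat.forName(formatName);
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    iwc.setCodec(new Lucene53Codec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        return chosen; // same format for every field in this sketch
      }
    });
    return iwc;
  }
}
```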
@@ -57,7 +57,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
     }
 
     // TODO: should we use this, or maybe a system property is better?
-    static Codec defaultCodec = LOADER.lookup("Lucene50");
+    static Codec defaultCodec = LOADER.lookup("Lucene53");
   }
 
   private final String name;
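Together with the services-file change shown earlier, this lookup is what flips the default: Codec resolves names through SPI. A quick check of the wiring (a sketch assuming lucene-core 5.3 on the classpath):

```java
import org.apache.lucene.codecs.Codec;

class DefaultCodecCheck {
  public static void main(String[] args) {
    // Resolves via META-INF/services/org.apache.lucene.codecs.Codec
    Codec codec = Codec.forName("Lucene53");
    System.out.println(codec.getName());              // Lucene53
    System.out.println(Codec.getDefault().getName()); // Lucene53 after this change
  }
}
```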
@@ -53,7 +53,6 @@ public abstract class MultiLevelSkipListReader implements Closeable {
   private int numberOfLevelsToBuffer = 1;
 
   private int docCount;
-  private boolean haveSkipped;
 
   /** skipStream for each level. */
   private IndexInput[] skipStream;
@@ -120,11 +119,6 @@ public abstract class MultiLevelSkipListReader implements Closeable {
    * greater than or equal to <i>target</i>. Returns the current doc count.
    */
   public int skipTo(int target) throws IOException {
-    if (!haveSkipped) {
-      // first time, load skip levels
-      loadSkipLevels();
-      haveSkipped = true;
-    }
 
     // walk up the levels until highest level is found that has a skip
     // for this target
@@ -196,7 +190,7 @@ public abstract class MultiLevelSkipListReader implements Closeable {
   }
 
   /** Initializes the reader, for reuse on a new term. */
-  public void init(long skipPointer, int df) {
+  public void init(long skipPointer, int df) throws IOException {
     this.skipPointer[0] = skipPointer;
     this.docCount = df;
     assert skipPointer >= 0 && skipPointer <= skipStream[0].length()
@@ -205,10 +199,10 @@ public abstract class MultiLevelSkipListReader implements Closeable {
     Arrays.fill(numSkipped, 0);
     Arrays.fill(childPointer, 0);
 
-    haveSkipped = false;
     for (int i = 1; i < numberOfSkipLevels; i++) {
       skipStream[i] = null;
     }
+    loadSkipLevels();
   }
 
   /** Loads the skip levels */
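The net effect of these three MultiLevelSkipListReader hunks: loadSkipLevels() used to run lazily on the first skipTo call, guarded by the haveSkipped flag; it now runs eagerly at the end of init(). That is why init() here (and the Lucene50SkipReader override below) grows a throws IOException clause, and why the flag and its per-call branch disappear.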
@@ -1,137 +0,0 @@
-package org.apache.lucene.codecs.lucene50;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.codecs.CodecUtil;
-import org.apache.lucene.codecs.NormsConsumer;
-import org.apache.lucene.codecs.NormsFormat;
-import org.apache.lucene.codecs.NormsProducer;
-import org.apache.lucene.index.SegmentReadState;
-import org.apache.lucene.index.SegmentWriteState;
-import org.apache.lucene.store.DataOutput;
-import org.apache.lucene.util.SmallFloat;
-import org.apache.lucene.util.packed.BlockPackedWriter;
-import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
-import org.apache.lucene.util.packed.PackedInts;
-
-/**
- * Lucene 5.0 Score normalization format.
- * <p>
- * Encodes normalization values with these strategies:
- * <ul>
- *   <li>Uncompressed: when values fit into a single byte and would require more than 4 bits
- *       per value, they are just encoded as an uncompressed byte array.
- *   <li>Constant: when there is only one value present for the entire field, no actual data
- *       is written: this constant is encoded in the metadata
- *   <li>Table-compressed: when the number of unique values is very small (< 64), and
- *       when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
- *       a lookup table is written instead. Each per-document entry is instead the ordinal
- *       to this table, and those ordinals are compressed with bitpacking ({@link PackedInts}).
- *   <li>Delta-compressed: per-document integers written as deltas from the minimum value,
- *       compressed with bitpacking. For more information, see {@link BlockPackedWriter}.
- *       This is only used when norms of larger than one byte are present.
- *   <li>Indirect: when norms are extremely sparse, missing values are omitted.
- *       Access to an individual value is slower, but missing norm values are never accessed
- *       by search code.
- *   <li>Patched bitset: when a single norm value dominates, a sparse bitset encodes docs
- *       with exceptions, so that access to the common value is still very fast. outliers
- *       fall through to an exception handling mechanism (Indirect or Constant).
- *   <li>Patched table: when a small number of norm values dominate, a table is used for the
- *       common values to allow fast access. less common values fall through to an exception
- *       handling mechanism (Indirect).
- * </ul>
- * <p>
- * Files:
- * <ol>
- *   <li><tt>.nvd</tt>: Norms data</li>
- *   <li><tt>.nvm</tt>: Norms metadata</li>
- * </ol>
- * <ol>
- *   <li><a name="nvm"></a>
- *   <p>The Norms metadata or .nvm file.</p>
- *   <p>For each norms field, this stores metadata, such as the offset into the
- *      Norms data (.nvd)</p>
- *   <p>Norms metadata (.dvm) --> Header,<Entry><sup>NumFields</sup>,Footer</p>
- *   <ul>
- *     <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
- *     <li>Entry --> FieldNumber,Type,Offset</li>
- *     <li>FieldNumber --> {@link DataOutput#writeVInt vInt}</li>
- *     <li>Type --> {@link DataOutput#writeByte Byte}</li>
- *     <li>Offset --> {@link DataOutput#writeLong Int64}</li>
- *     <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
- *   </ul>
- *   <p>FieldNumber of -1 indicates the end of metadata.</p>
- *   <p>Offset is the pointer to the start of the data in the norms data (.nvd), or the singleton value for Constant</p>
- *   <p>Type indicates how Numeric values will be compressed:
- *   <ul>
- *     <li>0 --> delta-compressed. For each block of 16k integers, every integer is delta-encoded
- *         from the minimum value within the block.
- *     <li>1 --> table-compressed. When the number of unique numeric values is small and it would save space,
- *         a lookup table of unique values is written, followed by the ordinal for each document.
- *     <li>2 --> constant. When there is a single value for the entire field.
- *     <li>3 --> uncompressed: Values written as a simple byte[].
- *     <li>4 --> indirect. Only documents with a value are written with monotonic compression. a nested
- *         entry for the same field will follow for the exception handler.
- *     <li>5 --> patched bitset. Encoded the same as indirect.
- *     <li>6 --> patched table. Documents with very common values are written with a lookup table.
- *         Other values are written using a nested indirect.
- *   </ul>
- *   <li><a name="nvd"></a>
- *   <p>The Norms data or .nvd file.</p>
- *   <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>
- *   <p>Norms data (.nvd) --> Header,<Uncompressed | TableCompressed | DeltaCompressed | MonotonicCompressed ><sup>NumFields</sup>,Footer</p>
- *   <ul>
- *     <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
- *     <li>Uncompressed --> {@link DataOutput#writeByte Byte}<sup>maxDoc</sup></li>
- *     <li>TableCompressed --> PackedIntsVersion,Table,BitPackedData</li>
- *     <li>Table --> TableSize, {@link DataOutput#writeLong int64}<sup>TableSize</sup></li>
- *     <li>BitpackedData --> {@link PackedInts}</li>
- *     <li>DeltaCompressed --> PackedIntsVersion,BlockSize,DeltaCompressedData</li>
- *     <li>DeltaCompressedData --> {@link BlockPackedWriter BlockPackedWriter(blockSize=16k)}</li>
- *     <li>MonotonicCompressed --> PackedIntsVersion,BlockSize,MonotonicCompressedData</li>
- *     <li>MonotonicCompressedData --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedWriter(blockSize=16k)}</li>
- *     <li>PackedIntsVersion,BlockSize,TableSize --> {@link DataOutput#writeVInt vInt}</li>
- *     <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
- *   </ul>
- * </ol>
- * @lucene.experimental
- */
-public class Lucene50NormsFormat extends NormsFormat {
-
-  /** Sole Constructor */
-  public Lucene50NormsFormat() {}
-
-  @Override
-  public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
-    return new Lucene50NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
-  }
-
-  @Override
-  public NormsProducer normsProducer(SegmentReadState state) throws IOException {
-    return new Lucene50NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
-  }
-
-  private static final String DATA_CODEC = "Lucene50NormsData";
-  private static final String DATA_EXTENSION = "nvd";
-  private static final String METADATA_CODEC = "Lucene50NormsMetadata";
-  private static final String METADATA_EXTENSION = "nvm";
-  static final int VERSION_START = 0;
-  static final int VERSION_CURRENT = VERSION_START;
-}
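The deleted javadoc above doubles as the decoding contract: each .nvm entry's Type byte tells the reader which decompression path to take. A hypothetical dispatch, paraphrasing that table (not the real Lucene50NormsProducer, which materializes a NormsEntry per field):

```java
import java.io.IOException;
import org.apache.lucene.index.CorruptIndexException;

class NormsTypeSketch {
  // Hypothetical dispatch on the Type byte (0..6) documented above.
  static String describe(byte type) throws IOException {
    switch (type) {
      case 0: return "delta-compressed blocks of 16k values";
      case 1: return "table-compressed: lookup table + bit-packed ordinals";
      case 2: return "constant: single value stored in the metadata Offset";
      case 3: return "uncompressed byte[] of maxDoc values";
      case 4: return "indirect: only documents with a value, monotonic-compressed";
      case 5: return "patched bitset: sparse exceptions over one common value";
      case 6: return "patched table: common values in a table, rest indirect";
      default: throw new CorruptIndexException("unknown norms type: " + type, "(sketch)");
    }
  }
}
```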
@@ -414,7 +414,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
         // Lazy init: first time this enum has ever been used for skipping
         skipper = new Lucene50SkipReader(docIn.clone(),
                                          MAX_SKIP_LEVELS,
-                                         BLOCK_SIZE,
                                          indexHasPos,
                                          indexHasOffsets,
                                          indexHasPayloads);
@@ -692,7 +691,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
         // Lazy init: first time this enum has ever been used for skipping
         skipper = new Lucene50SkipReader(docIn.clone(),
                                          MAX_SKIP_LEVELS,
-                                         BLOCK_SIZE,
                                          true,
                                          indexHasOffsets,
                                          indexHasPayloads);
@@ -1121,7 +1119,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
         // Lazy init: first time this enum has ever been used for skipping
         skipper = new Lucene50SkipReader(docIn.clone(),
                                          MAX_SKIP_LEVELS,
-                                         BLOCK_SIZE,
                                          true,
                                          indexHasOffsets,
                                          indexHasPayloads);
@@ -23,6 +23,8 @@ import java.util.Arrays;
 import org.apache.lucene.codecs.MultiLevelSkipListReader;
 import org.apache.lucene.store.IndexInput;
 
+import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.BLOCK_SIZE;
+
 /**
  * Implements the skip list reader for block postings format
  * that stores positions and payloads.
@@ -51,8 +53,6 @@ import org.apache.lucene.store.IndexInput;
  *
  */
 final class Lucene50SkipReader extends MultiLevelSkipListReader {
-  private final int blockSize;
-
   private long docPointer[];
   private long posPointer[];
   private long payPointer[];
@@ -65,9 +65,8 @@ final class Lucene50SkipReader extends MultiLevelSkipListReader {
   private long lastDocPointer;
   private int lastPosBufferUpto;
 
-  public Lucene50SkipReader(IndexInput skipStream, int maxSkipLevels, int blockSize, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
-    super(skipStream, maxSkipLevels, blockSize, 8);
-    this.blockSize = blockSize;
+  public Lucene50SkipReader(IndexInput skipStream, int maxSkipLevels, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
+    super(skipStream, maxSkipLevels, BLOCK_SIZE, 8);
     docPointer = new long[maxSkipLevels];
     if (hasPos) {
       posPointer = new long[maxSkipLevels];
@@ -97,10 +96,10 @@ final class Lucene50SkipReader extends MultiLevelSkipListReader {
    *
    */
   protected int trim(int df) {
-    return df % blockSize == 0? df - 1: df;
+    return df % BLOCK_SIZE == 0? df - 1: df;
   }
 
-  public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) {
+  public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) throws IOException {
     super.init(skipPointer, trim(df));
     lastDocPointer = docBasePointer;
     lastPosPointer = posBasePointer;
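A gloss on the trim() context line: skip entries are written every BLOCK_SIZE documents, so when df is an exact multiple of the block size the last skip point would coincide with the end of the postings and could never advance the reader; trimming df by one drops it. For example, with BLOCK_SIZE = 128 and df = 256, trim(256) returns 255, while trim(200) returns 200 unchanged. Hard-wiring BLOCK_SIZE here (and in the constructor) looks safe because, as the Lucene50PostingsReader hunks above show, every caller passed that constant anyway.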
@@ -53,9 +53,9 @@ import org.apache.lucene.util.packed.PackedInts;
  * These two options can be configured like this:
  * <pre class="prettyprint">
  *   // the default: for high performance
- *   indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_SPEED));
+ *   indexWriterConfig.setCodec(new Lucene53Codec(Mode.BEST_SPEED));
  *   // instead for higher performance (but slower):
- *   // indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_COMPRESSION));
+ *   // indexWriterConfig.setCodec(new Lucene53Codec(Mode.BEST_COMPRESSION));
  * </pre>
  * <p><b>File formats</b>
  * <p>Stored fields are represented by two files:
@@ -16,386 +16,8 @@
  */
 
 /**
- * Lucene 5.0 file format.
- *
- * <h1>Apache Lucene - Index File Formats</h1>
- * <div>
- * <ul>
- * <li><a href="#Introduction">Introduction</a></li>
- * <li><a href="#Definitions">Definitions</a>
- *   <ul>
- *   <li><a href="#Inverted_Indexing">Inverted Indexing</a></li>
- *   <li><a href="#Types_of_Fields">Types of Fields</a></li>
- *   <li><a href="#Segments">Segments</a></li>
- *   <li><a href="#Document_Numbers">Document Numbers</a></li>
- *   </ul>
- * </li>
- * <li><a href="#Overview">Index Structure Overview</a></li>
- * <li><a href="#File_Naming">File Naming</a></li>
- * <li><a href="#file-names">Summary of File Extensions</a>
- *   <ul>
- *   <li><a href="#Lock_File">Lock File</a></li>
- *   <li><a href="#History">History</a></li>
- *   <li><a href="#Limitations">Limitations</a></li>
- *   </ul>
- * </li>
- * </ul>
- * </div>
- * <a name="Introduction"></a>
- * <h2>Introduction</h2>
- * <div>
- * <p>This document defines the index file formats used in this version of Lucene.
- * If you are using a different version of Lucene, please consult the copy of
- * <code>docs/</code> that was distributed with
- * the version you are using.</p>
- * <p>Apache Lucene is written in Java, but several efforts are underway to write
- * <a href="http://wiki.apache.org/lucene-java/LuceneImplementations">versions of
- * Lucene in other programming languages</a>. If these versions are to remain
- * compatible with Apache Lucene, then a language-independent definition of the
- * Lucene index format is required. This document thus attempts to provide a
- * complete and independent definition of the Apache Lucene file formats.</p>
- * <p>As Lucene evolves, this document should evolve. Versions of Lucene in
- * different programming languages should endeavor to agree on file formats, and
- * generate new versions of this document.</p>
- * </div>
- * <a name="Definitions"></a>
- * <h2>Definitions</h2>
- * <div>
- * <p>The fundamental concepts in Lucene are index, document, field and term.</p>
- * <p>An index contains a sequence of documents.</p>
- * <ul>
- * <li>A document is a sequence of fields.</li>
- * <li>A field is a named sequence of terms.</li>
- * <li>A term is a sequence of bytes.</li>
- * </ul>
- * <p>The same sequence of bytes in two different fields is considered a different
- * term. Thus terms are represented as a pair: the string naming the field, and the
- * bytes within the field.</p>
- * <a name="Inverted_Indexing"></a>
- * <h3>Inverted Indexing</h3>
- * <p>The index stores statistics about terms in order to make term-based search
- * more efficient. Lucene's index falls into the family of indexes known as an
- * <i>inverted index.</i> This is because it can list, for a term, the documents
- * that contain it. This is the inverse of the natural relationship, in which
- * documents list terms.</p>
- * <a name="Types_of_Fields"></a>
- * <h3>Types of Fields</h3>
- * <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored
- * in the index literally, in a non-inverted manner. Fields that are inverted are
- * called <i>indexed</i>. A field may be both stored and indexed.</p>
- * <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the
- * text of a field may be used literally as a term to be indexed. Most fields are
- * tokenized, but sometimes it is useful for certain identifier fields to be
- * indexed literally.</p>
- * <p>See the {@link org.apache.lucene.document.Field Field}
- * java docs for more information on Fields.</p>
- * <a name="Segments"></a>
- * <h3>Segments</h3>
- * <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>.
- * Each segment is a fully independent index, which could be searched separately.
- * Indexes evolve by:</p>
- * <ol>
- * <li>Creating new segments for newly added documents.</li>
- * <li>Merging existing segments.</li>
- * </ol>
- * <p>Searches may involve multiple segments and/or multiple indexes, each index
- * potentially composed of a set of segments.</p>
- * <a name="Document_Numbers"></a>
- * <h3>Document Numbers</h3>
- * <p>Internally, Lucene refers to documents by an integer <i>document number</i>.
- * The first document added to an index is numbered zero, and each subsequent
- * document added gets a number one greater than the previous.</p>
- * <p>Note that a document's number may change, so caution should be taken when
- * storing these numbers outside of Lucene. In particular, numbers may change in
- * the following situations:</p>
- * <ul>
- * <li>
- * <p>The numbers stored in each segment are unique only within the segment, and
- * must be converted before they can be used in a larger context. The standard
- * technique is to allocate each segment a range of values, based on the range of
- * numbers used in that segment. To convert a document number from a segment to an
- * external value, the segment's <i>base</i> document number is added. To convert
- * an external value back to a segment-specific value, the segment is identified
- * by the range that the external value is in, and the segment's base value is
- * subtracted. For example two five document segments might be combined, so that
- * the first segment has a base value of zero, and the second of five. Document
- * three from the second segment would have an external value of eight.</p>
- * </li>
- * <li>
- * <p>When documents are deleted, gaps are created in the numbering. These are
- * eventually removed as the index evolves through merging. Deleted documents are
- * dropped when segments are merged. A freshly-merged segment thus has no gaps in
- * its numbering.</p>
- * </li>
- * </ul>
- * </div>
- * <a name="Overview"></a>
- * <h2>Index Structure Overview</h2>
- * <div>
- * <p>Each segment index maintains the following:</p>
- * <ul>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat Segment info}.
- * This contains metadata about a segment, such as the number of documents,
- * what files it uses,
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Field names}.
- * This contains the set of field names used in the index.
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Stored Field values}.
- * This contains, for each document, a list of attribute-value pairs, where the attributes
- * are field names. These are used to store auxiliary information about the document, such as
- * its title, url, or an identifier to access a database. The set of stored fields are what is
- * returned for each hit when searching. This is keyed by document number.
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term dictionary}.
- * A dictionary containing all of the terms used in all of the
- * indexed fields of all of the documents. The dictionary also contains the number
- * of documents which contain the term, and pointers to the term's frequency and
- * proximity data.
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Frequency data}.
- * For each term in the dictionary, the numbers of all the
- * documents that contain that term, and the frequency of the term in that
- * document, unless frequencies are omitted (IndexOptions.DOCS_ONLY)
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Proximity data}.
- * For each term in the dictionary, the positions that the
- * term occurs in each document. Note that this will not exist if all fields in
- * all documents omit position data.
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50NormsFormat Normalization factors}.
- * For each field in each document, a value is stored
- * that is multiplied into the score for hits on that field.
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}.
- * For each field in each document, the term vector (sometimes
- * called document vector) may be stored. A term vector consists of term text and
- * term frequency. To add Term Vectors to your index see the
- * {@link org.apache.lucene.document.Field Field} constructors
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat Per-document values}.
- * Like stored values, these are also keyed by document
- * number, but are generally intended to be loaded into main memory for fast
- * access. Whereas stored values are generally intended for summary results from
- * searches, per-document values are useful for things like scoring factors.
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
- * An optional file indicating which documents are live.
- * </li>
- * </ul>
- * <p>Details on each of these are provided in their linked pages.</p>
- * </div>
- * <a name="File_Naming"></a>
- * <h2>File Naming</h2>
- * <div>
- * <p>All files belonging to a segment have the same name with varying extensions.
- * The extensions correspond to the different file formats described below. When
- * using the Compound File format (default in 1.4 and greater) these files (except
- * for the Segment info file, the Lock file, and Deleted documents file) are collapsed
- * into a single .cfs file (see below for details)</p>
- * <p>Typically, all segments in an index are stored in a single directory,
- * although this is not required.</p>
- * <p>As of version 2.1 (lock-less commits), file names are never re-used.
- * That is, when any file is saved
- * to the Directory it is given a never before used filename. This is achieved
- * using a simple generations approach. For example, the first segments file is
- * segments_1, then segments_2, etc. The generation is a sequential long integer
- * represented in alpha-numeric (base 36) form.</p>
- * </div>
- * <a name="file-names"></a>
- * <h2>Summary of File Extensions</h2>
- * <div>
- * <p>The following table summarizes the names and extensions of the files in
- * Lucene:</p>
- * <table cellspacing="1" cellpadding="4" summary="lucene filenames by extension">
- * <tr>
- * <th>Name</th>
- * <th>Extension</th>
- * <th>Brief Description</th>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
- * <td>segments_N</td>
- * <td>Stores information about a commit point</td>
- * </tr>
- * <tr>
- * <td><a href="#Lock_File">Lock File</a></td>
- * <td>write.lock</td>
- * <td>The Write lock prevents multiple IndexWriters from writing to the same
- * file.</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat Segment Info}</td>
- * <td>.si</td>
- * <td>Stores metadata about a segment</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File}</td>
- * <td>.cfs, .cfe</td>
- * <td>An optional "virtual" file consisting of all the other index files for
- * systems that frequently run out of file handles.</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Fields}</td>
- * <td>.fnm</td>
- * <td>Stores information about the fields</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Index}</td>
- * <td>.fdx</td>
- * <td>Contains pointers to field data</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Data}</td>
- * <td>.fdt</td>
- * <td>The stored fields for documents</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Dictionary}</td>
- * <td>.tim</td>
- * <td>The term dictionary, stores term info</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Index}</td>
- * <td>.tip</td>
- * <td>The index into the Term Dictionary</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Frequencies}</td>
- * <td>.doc</td>
- * <td>Contains the list of docs which contain each term along with frequency</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Positions}</td>
- * <td>.pos</td>
- * <td>Stores position information about where a term occurs in the index</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Payloads}</td>
- * <td>.pay</td>
- * <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50NormsFormat Norms}</td>
- * <td>.nvd, .nvm</td>
- * <td>Encodes length and boost factors for docs and fields</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat Per-Document Values}</td>
- * <td>.dvd, .dvm</td>
- * <td>Encodes additional scoring factors or other per-document information.</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index}</td>
- * <td>.tvx</td>
- * <td>Stores offset into the document data file</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Documents}</td>
- * <td>.tvd</td>
- * <td>Contains information about each document that has term vectors</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Fields}</td>
- * <td>.tvf</td>
- * <td>The field level info about term vectors</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}</td>
- * <td>.liv</td>
- * <td>Info about what files are live</td>
- * </tr>
- * </table>
- * </div>
- * <a name="Lock_File"></a>
- * <h2>Lock File</h2>
- * The write lock, which is stored in the index directory by default, is named
- * "write.lock". If the lock directory is different from the index directory then
- * the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
- * derived from the full path to the index directory. When this file is present, a
- * writer is currently modifying the index (adding or removing documents). This
- * lock file ensures that only one writer is modifying the index at a time.
- * <a name="History"></a>
- * <h2>History</h2>
- * <p>Compatibility notes are provided in this document, describing how file
- * formats have changed from prior versions:</p>
- * <ul>
- * <li>In version 2.1, the file format was changed to allow lock-less commits (ie,
- * no more commit lock). The change is fully backwards compatible: you can open a
- * pre-2.1 index for searching or adding/deleting of docs. When the new segments
- * file is saved (committed), it will be written in the new file format (meaning
- * no specific "upgrade" process is needed). But note that once a commit has
- * occurred, pre-2.1 Lucene will not be able to read the index.</li>
- * <li>In version 2.3, the file format was changed to allow segments to share a
- * single set of doc store (vectors & stored fields) files. This allows for
- * faster indexing in certain cases. The change is fully backwards compatible (in
- * the same way as the lock-less commits change in 2.1).</li>
- * <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not
- * Java's modified UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">
- * LUCENE-510</a> for details.</li>
- * <li>In version 2.9, an optional opaque Map<String,String> CommitUserData
- * may be passed to IndexWriter's commit methods (and later retrieved), which is
- * recorded in the segments_N file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">
- * LUCENE-1382</a> for details. Also,
- * diagnostics were added to each segment written recording details about why it
- * was written (due to flush, merge; which OS/JRE was used; etc.). See issue
- * <a href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.</li>
- * <li>In version 3.0, compressed fields are no longer written to the index (they
- * can still be read, but on merge the new segment will write them, uncompressed).
- * See issue <a href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a>
- * for details.</li>
- * <li>In version 3.1, segments records the code version that created them. See
- * <a href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
- * Additionally segments track explicitly whether or not they have term vectors.
- * See <a href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a>
- * for details.</li>
- * <li>In version 3.2, numeric fields are written as natively to stored fields
- * file, previously they were stored in text format only.</li>
- * <li>In version 3.4, fields can omit position data while still indexing term
- * frequencies.</li>
- * <li>In version 4.0, the format of the inverted index became extensible via
- * the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
- * ({@code DocValues}) was introduced. Normalization factors need no longer be a
- * single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
- * Terms need not be unicode strings, they can be any byte sequence. Term offsets
- * can optionally be indexed into the postings lists. Payloads can be stored in the
- * term vectors.</li>
- * <li>In version 4.1, the format of the postings list changed to use either
- * of FOR compression or variable-byte encoding, depending upon the frequency
- * of the term. Terms appearing only once were changed to inline directly into
- * the term dictionary. Stored fields are compressed by default. </li>
- * <li>In version 4.2, term vectors are compressed by default. DocValues has
- * a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
- * on multi-valued fields.</li>
- * <li>In version 4.5, DocValues were extended to explicitly represent missing values.</li>
- * <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
- * allow updating NumericDocValues fields.</li>
- * <li>In version 4.8, checksum footers were added to the end of each index file
- * for improved data integrity. Specifically, the last 8 bytes of every index file
- * contain the zlib-crc32 checksum of the file.</li>
- * <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
- * that is suitable for faceting/sorting/analytics.
- * </li>
- * </ul>
- * <a name="Limitations"></a>
- * <h2>Limitations</h2>
- * <div>
- * <p>Lucene uses a Java <code>int</code> to refer to
- * document numbers, and the index file format uses an <code>Int32</code>
- * on-disk to store document numbers. This is a limitation
- * of both the index file format and the current implementation. Eventually these
- * should be replaced with either <code>UInt64</code> values, or
- * better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.</p>
- * </div>
+ * Components from the Lucene 5.0 index format
+ * See {@link org.apache.lucene.codecs.lucene53} for an overview
+ * of the index format.
  */
 package org.apache.lucene.codecs.lucene50;
@ -0,0 +1,169 @@
|
|||
package org.apache.lucene.codecs.lucene53;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompoundFormat;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.NormsFormat;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
||||
|
||||
/**
|
||||
* Implements the Lucene 5.3 index format, with configurable per-field postings
|
||||
* and docvalues formats.
|
||||
* <p>
|
||||
* If you want to reuse functionality of this codec in another codec, extend
|
||||
* {@link FilterCodec}.
|
||||
*
|
||||
* @see org.apache.lucene.codecs.lucene53 package documentation for file format details.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class Lucene53Codec extends Codec {
|
||||
private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
|
||||
private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
|
||||
private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
|
||||
private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
|
||||
private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();
|
||||
|
||||
private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
|
||||
@Override
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
return Lucene53Codec.this.getPostingsFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
|
||||
@Override
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
return Lucene53Codec.this.getDocValuesFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final StoredFieldsFormat storedFieldsFormat;
|
||||
|
||||
/**
|
||||
* Instantiates a new codec.
|
||||
*/
|
||||
public Lucene53Codec() {
|
||||
this(Mode.BEST_SPEED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new codec, specifying the stored fields compression
|
||||
* mode to use.
|
||||
* @param mode stored fields compression mode to use for newly
|
||||
* flushed/merged segments.
|
||||
*/
|
||||
public Lucene53Codec(Mode mode) {
|
||||
super("Lucene53");
|
||||
this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final StoredFieldsFormat storedFieldsFormat() {
|
||||
return storedFieldsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final TermVectorsFormat termVectorsFormat() {
|
||||
return vectorsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final PostingsFormat postingsFormat() {
|
||||
return postingsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final FieldInfosFormat fieldInfosFormat() {
|
||||
return fieldInfosFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final LiveDocsFormat liveDocsFormat() {
|
||||
return liveDocsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final CompoundFormat compoundFormat() {
|
||||
return compoundFormat;
|
||||
}
|
||||
|
||||
  /** Returns the postings format that should be used for writing
   *  new segments of <code>field</code>.
   *
   *  The default implementation always returns "Lucene50".
   *  <p>
   *  <b>WARNING:</b> if you subclass, you are responsible for index
   *  backwards compatibility: future versions of Lucene are only
   *  guaranteed to be able to read the default implementation.
   */
  public PostingsFormat getPostingsFormatForField(String field) {
    return defaultFormat;
  }

  /** Returns the docvalues format that should be used for writing
   *  new segments of <code>field</code>.
   *
   *  The default implementation always returns "Lucene50".
   *  <p>
   *  <b>WARNING:</b> if you subclass, you are responsible for index
   *  backwards compatibility: future versions of Lucene are only
   *  guaranteed to be able to read the default implementation.
   */
  public DocValuesFormat getDocValuesFormatForField(String field) {
    return defaultDVFormat;
  }

  @Override
  public final DocValuesFormat docValuesFormat() {
    return docValuesFormat;
  }

  private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene50");
  private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene50");

  private final NormsFormat normsFormat = new Lucene53NormsFormat();

  @Override
  public final NormsFormat normsFormat() {
    return normsFormat;
  }
}
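As an aside, a minimal sketch of how an application opts into this codec explicitly rather than relying on the default. The analyzer and directory choices here are arbitrary illustrations, not part of this change (StandardAnalyzer ships in the analyzers-common module):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.lucene53.Lucene53Codec;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class CodecUsageSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    // Pick the codec explicitly; BEST_SPEED matches the no-arg constructor.
    iwc.setCodec(new Lucene53Codec(Mode.BEST_SPEED));
    try (IndexWriter writer = new IndexWriter(dir, iwc)) {
      // add documents as usual; norms are written by Lucene53NormsFormat
    }
  }
}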
@@ -0,0 +1,153 @@
package org.apache.lucene.codecs.lucene53;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;

import static org.apache.lucene.codecs.lucene53.Lucene53NormsFormat.VERSION_CURRENT;

/**
 * Writer for {@link Lucene53NormsFormat}
 */
class Lucene53NormsConsumer extends NormsConsumer {
  IndexOutput data, meta;
  final int maxDoc;

  Lucene53NormsConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    boolean success = false;
    try {
      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
      data = state.directory.createOutput(dataName, state.context);
      CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
      meta = state.directory.createOutput(metaName, state.context);
      CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      maxDoc = state.segmentInfo.maxDoc();
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  @Override
  public void addNormsField(FieldInfo field, Iterable<Number> values) throws IOException {
    meta.writeVInt(field.number);
    long minValue = Long.MAX_VALUE;
    long maxValue = Long.MIN_VALUE;
    int count = 0;

    for (Number nv : values) {
      if (nv == null) {
        throw new IllegalStateException("illegal norms data for field " + field.name + ", got null for value: " + count);
      }
      final long v = nv.longValue();
      minValue = Math.min(minValue, v);
      maxValue = Math.max(maxValue, v);
      count++;
    }

    if (count != maxDoc) {
      throw new IllegalStateException("illegal norms data for field " + field.name + ", expected count=" + maxDoc + ", got=" + count);
    }

    if (minValue == maxValue) {
      addConstant(minValue);
    } else if (minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
      addByte1(values);
    } else if (minValue >= Short.MIN_VALUE && maxValue <= Short.MAX_VALUE) {
      addByte2(values);
    } else if (minValue >= Integer.MIN_VALUE && maxValue <= Integer.MAX_VALUE) {
      addByte4(values);
    } else {
      addByte8(values);
    }
  }

  private void addConstant(long constant) throws IOException {
    meta.writeByte((byte) 0);
    meta.writeLong(constant);
  }

  private void addByte1(Iterable<Number> values) throws IOException {
    meta.writeByte((byte) 1);
    meta.writeLong(data.getFilePointer());

    for (Number value : values) {
      data.writeByte(value.byteValue());
    }
  }

  private void addByte2(Iterable<Number> values) throws IOException {
    meta.writeByte((byte) 2);
    meta.writeLong(data.getFilePointer());

    for (Number value : values) {
      data.writeShort(value.shortValue());
    }
  }

  private void addByte4(Iterable<Number> values) throws IOException {
    meta.writeByte((byte) 4);
    meta.writeLong(data.getFilePointer());

    for (Number value : values) {
      data.writeInt(value.intValue());
    }
  }

  private void addByte8(Iterable<Number> values) throws IOException {
    meta.writeByte((byte) 8);
    meta.writeLong(data.getFilePointer());

    for (Number value : values) {
      data.writeLong(value.longValue());
    }
  }

  @Override
  public void close() throws IOException {
    boolean success = false;
    try {
      if (meta != null) {
        meta.writeVInt(-1); // write EOF marker
        CodecUtil.writeFooter(meta); // write checksum
      }
      if (data != null) {
        CodecUtil.writeFooter(data); // write checksum
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(data, meta);
      } else {
        IOUtils.closeWhileHandlingException(data, meta);
      }
      meta = data = null;
    }
  }
}
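The width selection in addNormsField above can be restated as a standalone rule. The helper below is illustrative only; the name is mine, not part of the patch:

// Narrowest fixed width able to hold every value in [minValue, maxValue];
// 0 means the field is constant and the value lives in the metadata entry.
static int bytesPerValue(long minValue, long maxValue) {
  if (minValue == maxValue) {
    return 0;
  } else if (minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
    return 1;
  } else if (minValue >= Short.MIN_VALUE && maxValue <= Short.MAX_VALUE) {
    return 2;
  } else if (minValue >= Integer.MIN_VALUE && maxValue <= Integer.MAX_VALUE) {
    return 4;
  } else {
    return 8;
  }
}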
@@ -0,0 +1,91 @@
package org.apache.lucene.codecs.lucene53;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;

/**
 * Lucene 5.3 Score normalization format.
 * <p>
 * Encodes normalization values by encoding each value with the minimum
 * number of bytes needed to represent the range (which can be zero).
 * <p>
 * Files:
 * <ol>
 *   <li><tt>.nvd</tt>: Norms data</li>
 *   <li><tt>.nvm</tt>: Norms metadata</li>
 * </ol>
 * <ol>
 *   <li><a name="nvm"></a>
 *   <p>The Norms metadata or .nvm file.</p>
 *   <p>For each norms field, this stores metadata, such as the offset into the
 *      Norms data (.nvd)</p>
 *   <p>Norms metadata (.nvm) --> Header,<Entry><sup>NumFields</sup>,Footer</p>
 *   <ul>
 *     <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
 *     <li>Entry --> FieldNumber, BytesPerValue, Offset</li>
 *     <li>FieldNumber --> {@link DataOutput#writeVInt vInt}</li>
 *     <li>BytesPerValue --> {@link DataOutput#writeByte byte}</li>
 *     <li>Offset --> {@link DataOutput#writeLong Int64}</li>
 *     <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
 *   </ul>
 *   <p>FieldNumber of -1 indicates the end of metadata.</p>
 *   <p>Offset is the pointer to the start of the data in the norms data (.nvd), or the singleton value
 *      when BytesPerValue = 0</p>
 *   <li><a name="nvd"></a>
 *   <p>The Norms data or .nvd file.</p>
 *   <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>
 *   <p>Norms data (.nvd) --> Header,< Data ><sup>NumFields</sup>,Footer</p>
 *   <ul>
 *     <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
 *     <li>Data --> {@link DataOutput#writeByte(byte) byte}<sup>MaxDoc * BytesPerValue</sup></li>
 *     <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
 *   </ul>
 * </ol>
 * @lucene.experimental
 */
public class Lucene53NormsFormat extends NormsFormat {

  /** Sole Constructor */
  public Lucene53NormsFormat() {}

  @Override
  public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
    return new Lucene53NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }

  @Override
  public NormsProducer normsProducer(SegmentReadState state) throws IOException {
    return new Lucene53NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }

  private static final String DATA_CODEC = "Lucene53NormsData";
  private static final String DATA_EXTENSION = "nvd";
  private static final String METADATA_CODEC = "Lucene53NormsMetadata";
  private static final String METADATA_EXTENSION = "nvm";
  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;
}
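Per the layout documented above, the .nvm payload between header and footer is a flat run of (FieldNumber, BytesPerValue, Offset) entries ended by a vInt of -1. A hedged sketch of a reader for just that entry stream, simplified from the real readFields loop in the producer below (which additionally validates the fields); the helper name is mine:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.store.DataInput;

class NormsMetaSketch {
  // maps field number -> { bytesPerValue, offset }
  static Map<Integer, long[]> readEntries(DataInput meta) throws IOException {
    Map<Integer, long[]> entries = new HashMap<>();
    for (int field = meta.readVInt(); field != -1; field = meta.readVInt()) {
      byte bytesPerValue = meta.readByte(); // 0, 1, 2, 4 or 8
      long offset = meta.readLong();        // .nvd pointer, or the constant itself when width is 0
      entries.put(field, new long[] { bytesPerValue, offset });
    }
    return entries;
  }
}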
@@ -0,0 +1,208 @@
package org.apache.lucene.codecs.lucene53;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.codecs.lucene53.Lucene53NormsFormat.VERSION_CURRENT;
import static org.apache.lucene.codecs.lucene53.Lucene53NormsFormat.VERSION_START;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.util.IOUtils;

/**
 * Reader for {@link Lucene53NormsFormat}
 */
class Lucene53NormsProducer extends NormsProducer {
  // metadata maps (just file pointers and minimal stuff)
  private final Map<Integer,NormsEntry> norms = new HashMap<>();
  private final IndexInput data;
  private final int maxDoc;

  Lucene53NormsProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    maxDoc = state.segmentInfo.maxDoc();
    String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
    int version = -1;

    // read in the entries from the metadata file.
    try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) {
      Throwable priorE = null;
      try {
        version = CodecUtil.checkIndexHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
        readFields(in, state.fieldInfos);
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        CodecUtil.checkFooter(in, priorE);
      }
    }

    String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
    data = state.directory.openInput(dataName, state.context);
    boolean success = false;
    try {
      final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch: meta=" + version + ",data=" + version2, data);
      }

      // NOTE: data file is too costly to verify checksum against all the bytes on open,
      // but for now we at least verify proper structure of the checksum footer: which looks
      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
      // such as file truncation.
      CodecUtil.retrieveChecksum(data);

      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this.data);
      }
    }
  }

  private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
    int fieldNumber = meta.readVInt();
    while (fieldNumber != -1) {
      FieldInfo info = infos.fieldInfo(fieldNumber);
      if (info == null) {
        throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
      } else if (!info.hasNorms()) {
        throw new CorruptIndexException("Invalid field: " + info.name, meta);
      }
      NormsEntry entry = new NormsEntry();
      entry.bytesPerValue = meta.readByte();
      switch (entry.bytesPerValue) {
        case 0: case 1: case 2: case 4: case 8:
          break;
        default:
          throw new CorruptIndexException("Invalid bytesPerValue: " + entry.bytesPerValue + ", field: " + info.name, meta);
      }
      entry.offset = meta.readLong();
      norms.put(info.number, entry);
      fieldNumber = meta.readVInt();
    }
  }

  @Override
  public NumericDocValues getNorms(FieldInfo field) throws IOException {
    final NormsEntry entry = norms.get(field.number);

    if (entry.bytesPerValue == 0) {
      final long value = entry.offset;
      return new NumericDocValues() {
        @Override
        public long get(int docID) {
          return value;
        }
      };
    }

    RandomAccessInput slice;
    synchronized (data) {
      switch (entry.bytesPerValue) {
        case 1:
          slice = data.randomAccessSlice(entry.offset, maxDoc);
          return new NumericDocValues() {
            @Override
            public long get(int docID) {
              try {
                return slice.readByte(docID);
              } catch (IOException e) {
                throw new RuntimeException(e);
              }
            }
          };
        case 2:
          slice = data.randomAccessSlice(entry.offset, maxDoc * 2L);
          return new NumericDocValues() {
            @Override
            public long get(int docID) {
              try {
                return slice.readShort(((long)docID) << 1L);
              } catch (IOException e) {
                throw new RuntimeException(e);
              }
            }
          };
        case 4:
          slice = data.randomAccessSlice(entry.offset, maxDoc * 4L);
          return new NumericDocValues() {
            @Override
            public long get(int docID) {
              try {
                return slice.readInt(((long)docID) << 2L);
              } catch (IOException e) {
                throw new RuntimeException(e);
              }
            }
          };
        case 8:
          slice = data.randomAccessSlice(entry.offset, maxDoc * 8L);
          return new NumericDocValues() {
            @Override
            public long get(int docID) {
              try {
                return slice.readLong(((long)docID) << 3L);
              } catch (IOException e) {
                throw new RuntimeException(e);
              }
            }
          };
        default:
          throw new AssertionError();
      }
    }
  }

  @Override
  public void close() throws IOException {
    data.close();
  }

  @Override
  public long ramBytesUsed() {
    return 64L * norms.size(); // good enough
  }

  @Override
  public void checkIntegrity() throws IOException {
    CodecUtil.checksumEntireFile(data);
  }

  static class NormsEntry {
    byte bytesPerValue;
    long offset;
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "(fields=" + norms.size() + ")";
  }
}
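Each per-width reader above turns a norm lookup into fixed-stride pointer arithmetic on the RandomAccessInput slice; the shift amounts (0, 1, 2, 3) are just log2 of the value width. A one-method restatement, with a name of my own choosing:

// docID -> byte address within the slice for a field of the given width.
static long addressOf(int docID, int bytesPerValue) {
  // width 1 -> shift 0, 2 -> 1, 4 -> 2, 8 -> 3
  int shift = 31 - Integer.numberOfLeadingZeros(bytesPerValue);
  return ((long) docID) << shift;
}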
@@ -0,0 +1,401 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Lucene 5.3 file format.
 *
 * <h1>Apache Lucene - Index File Formats</h1>
 * <div>
 * <ul>
 * <li><a href="#Introduction">Introduction</a></li>
 * <li><a href="#Definitions">Definitions</a>
 * <ul>
 * <li><a href="#Inverted_Indexing">Inverted Indexing</a></li>
 * <li><a href="#Types_of_Fields">Types of Fields</a></li>
 * <li><a href="#Segments">Segments</a></li>
 * <li><a href="#Document_Numbers">Document Numbers</a></li>
 * </ul>
 * </li>
 * <li><a href="#Overview">Index Structure Overview</a></li>
 * <li><a href="#File_Naming">File Naming</a></li>
 * <li><a href="#file-names">Summary of File Extensions</a>
 * <ul>
 * <li><a href="#Lock_File">Lock File</a></li>
 * <li><a href="#History">History</a></li>
 * <li><a href="#Limitations">Limitations</a></li>
 * </ul>
 * </li>
 * </ul>
 * </div>
 * <a name="Introduction"></a>
 * <h2>Introduction</h2>
 * <div>
 * <p>This document defines the index file formats used in this version of Lucene.
 * If you are using a different version of Lucene, please consult the copy of
 * <code>docs/</code> that was distributed with
 * the version you are using.</p>
 * <p>Apache Lucene is written in Java, but several efforts are underway to write
 * <a href="http://wiki.apache.org/lucene-java/LuceneImplementations">versions of
 * Lucene in other programming languages</a>. If these versions are to remain
 * compatible with Apache Lucene, then a language-independent definition of the
 * Lucene index format is required. This document thus attempts to provide a
 * complete and independent definition of the Apache Lucene file formats.</p>
 * <p>As Lucene evolves, this document should evolve. Versions of Lucene in
 * different programming languages should endeavor to agree on file formats, and
 * generate new versions of this document.</p>
 * </div>
 * <a name="Definitions"></a>
 * <h2>Definitions</h2>
 * <div>
 * <p>The fundamental concepts in Lucene are index, document, field and term.</p>
 * <p>An index contains a sequence of documents.</p>
 * <ul>
 * <li>A document is a sequence of fields.</li>
 * <li>A field is a named sequence of terms.</li>
 * <li>A term is a sequence of bytes.</li>
 * </ul>
 * <p>The same sequence of bytes in two different fields is considered a different
 * term. Thus terms are represented as a pair: the string naming the field, and the
 * bytes within the field.</p>
 * <a name="Inverted_Indexing"></a>
 * <h3>Inverted Indexing</h3>
 * <p>The index stores statistics about terms in order to make term-based search
 * more efficient. Lucene's index falls into the family of indexes known as an
 * <i>inverted index.</i> This is because it can list, for a term, the documents
 * that contain it. This is the inverse of the natural relationship, in which
 * documents list terms.</p>
 * <a name="Types_of_Fields"></a>
 * <h3>Types of Fields</h3>
 * <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored
 * in the index literally, in a non-inverted manner. Fields that are inverted are
 * called <i>indexed</i>. A field may be both stored and indexed.</p>
 * <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the
 * text of a field may be used literally as a term to be indexed. Most fields are
 * tokenized, but sometimes it is useful for certain identifier fields to be
 * indexed literally.</p>
 * <p>See the {@link org.apache.lucene.document.Field Field}
 * java docs for more information on Fields.</p>
 * <a name="Segments"></a>
 * <h3>Segments</h3>
 * <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>.
 * Each segment is a fully independent index, which could be searched separately.
 * Indexes evolve by:</p>
 * <ol>
 * <li>Creating new segments for newly added documents.</li>
 * <li>Merging existing segments.</li>
 * </ol>
 * <p>Searches may involve multiple segments and/or multiple indexes, each index
 * potentially composed of a set of segments.</p>
 * <a name="Document_Numbers"></a>
 * <h3>Document Numbers</h3>
 * <p>Internally, Lucene refers to documents by an integer <i>document number</i>.
 * The first document added to an index is numbered zero, and each subsequent
 * document added gets a number one greater than the previous.</p>
 * <p>Note that a document's number may change, so caution should be taken when
 * storing these numbers outside of Lucene. In particular, numbers may change in
 * the following situations:</p>
 * <ul>
 * <li>
 * <p>The numbers stored in each segment are unique only within the segment, and
 * must be converted before they can be used in a larger context. The standard
 * technique is to allocate each segment a range of values, based on the range of
 * numbers used in that segment. To convert a document number from a segment to an
 * external value, the segment's <i>base</i> document number is added. To convert
 * an external value back to a segment-specific value, the segment is identified
 * by the range that the external value is in, and the segment's base value is
 * subtracted. For example two five document segments might be combined, so that
 * the first segment has a base value of zero, and the second of five. Document
 * three from the second segment would have an external value of eight.</p>
 * </li>
 * <li>
 * <p>When documents are deleted, gaps are created in the numbering. These are
 * eventually removed as the index evolves through merging. Deleted documents are
 * dropped when segments are merged. A freshly-merged segment thus has no gaps in
 * its numbering.</p>
 * </li>
 * </ul>
 * </div>
 * <a name="Overview"></a>
 * <h2>Index Structure Overview</h2>
 * <div>
 * <p>Each segment index maintains the following:</p>
 * <ul>
 * <li>
 * {@link org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat Segment info}.
 * This contains metadata about a segment, such as the number of documents and
 * what files it uses.
 * </li>
 * <li>
 * {@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Field names}.
 * This contains the set of field names used in the index.
 * </li>
 * <li>
 * {@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Stored Field values}.
 * This contains, for each document, a list of attribute-value pairs, where the attributes
 * are field names. These are used to store auxiliary information about the document, such as
 * its title, url, or an identifier to access a database. The set of stored fields are what is
 * returned for each hit when searching. This is keyed by document number.
 * </li>
 * <li>
 * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term dictionary}.
 * A dictionary containing all of the terms used in all of the
 * indexed fields of all of the documents. The dictionary also contains the number
 * of documents which contain the term, and pointers to the term's frequency and
 * proximity data.
 * </li>
 * <li>
 * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Frequency data}.
 * For each term in the dictionary, the numbers of all the
 * documents that contain that term, and the frequency of the term in that
 * document, unless frequencies are omitted (IndexOptions.DOCS_ONLY)
 * </li>
 * <li>
 * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Proximity data}.
 * For each term in the dictionary, the positions that the
 * term occurs in each document. Note that this will not exist if all fields in
 * all documents omit position data.
 * </li>
 * <li>
 * {@link org.apache.lucene.codecs.lucene53.Lucene53NormsFormat Normalization factors}.
 * For each field in each document, a value is stored
 * that is multiplied into the score for hits on that field.
 * </li>
 * <li>
 * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}.
 * For each field in each document, the term vector (sometimes
 * called document vector) may be stored. A term vector consists of term text and
 * term frequency. To add Term Vectors to your index see the
 * {@link org.apache.lucene.document.Field Field} constructors
 * </li>
 * <li>
 * {@link org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat Per-document values}.
 * Like stored values, these are also keyed by document
 * number, but are generally intended to be loaded into main memory for fast
 * access. Whereas stored values are generally intended for summary results from
 * searches, per-document values are useful for things like scoring factors.
 * </li>
 * <li>
 * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
 * An optional file indicating which documents are live.
 * </li>
 * </ul>
 * <p>Details on each of these are provided in their linked pages.</p>
 * </div>
 * <a name="File_Naming"></a>
 * <h2>File Naming</h2>
 * <div>
 * <p>All files belonging to a segment have the same name with varying extensions.
 * The extensions correspond to the different file formats described below. When
 * using the Compound File format (default in 1.4 and greater) these files (except
 * for the Segment info file, the Lock file, and Deleted documents file) are collapsed
 * into a single .cfs file (see below for details)</p>
 * <p>Typically, all segments in an index are stored in a single directory,
 * although this is not required.</p>
 * <p>As of version 2.1 (lock-less commits), file names are never re-used.
 * That is, when any file is saved
 * to the Directory it is given a never before used filename. This is achieved
 * using a simple generations approach. For example, the first segments file is
 * segments_1, then segments_2, etc. The generation is a sequential long integer
 * represented in alpha-numeric (base 36) form.</p>
 * </div>
 * <a name="file-names"></a>
 * <h2>Summary of File Extensions</h2>
 * <div>
 * <p>The following table summarizes the names and extensions of the files in
 * Lucene:</p>
 * <table cellspacing="1" cellpadding="4" summary="lucene filenames by extension">
 * <tr>
 * <th>Name</th>
 * <th>Extension</th>
 * <th>Brief Description</th>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
 * <td>segments_N</td>
 * <td>Stores information about a commit point</td>
 * </tr>
 * <tr>
 * <td><a href="#Lock_File">Lock File</a></td>
 * <td>write.lock</td>
 * <td>The Write lock prevents multiple IndexWriters from writing to the same
 * file.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat Segment Info}</td>
 * <td>.si</td>
 * <td>Stores metadata about a segment</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File}</td>
 * <td>.cfs, .cfe</td>
 * <td>An optional "virtual" file consisting of all the other index files for
 * systems that frequently run out of file handles.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Fields}</td>
 * <td>.fnm</td>
 * <td>Stores information about the fields</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Index}</td>
 * <td>.fdx</td>
 * <td>Contains pointers to field data</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Data}</td>
 * <td>.fdt</td>
 * <td>The stored fields for documents</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Dictionary}</td>
 * <td>.tim</td>
 * <td>The term dictionary, stores term info</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Index}</td>
 * <td>.tip</td>
 * <td>The index into the Term Dictionary</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Frequencies}</td>
 * <td>.doc</td>
 * <td>Contains the list of docs which contain each term along with frequency</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Positions}</td>
 * <td>.pos</td>
 * <td>Stores position information about where a term occurs in the index</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Payloads}</td>
 * <td>.pay</td>
 * <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene53.Lucene53NormsFormat Norms}</td>
 * <td>.nvd, .nvm</td>
 * <td>Encodes length and boost factors for docs and fields</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat Per-Document Values}</td>
 * <td>.dvd, .dvm</td>
 * <td>Encodes additional scoring factors or other per-document information.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index}</td>
 * <td>.tvx</td>
 * <td>Stores offset into the document data file</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Documents}</td>
 * <td>.tvd</td>
 * <td>Contains information about each document that has term vectors</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Fields}</td>
 * <td>.tvf</td>
 * <td>The field level info about term vectors</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}</td>
 * <td>.liv</td>
 * <td>Info about what files are live</td>
 * </tr>
 * </table>
 * </div>
 * <a name="Lock_File"></a>
 * <h2>Lock File</h2>
 * The write lock, which is stored in the index directory by default, is named
 * "write.lock". If the lock directory is different from the index directory then
 * the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
 * derived from the full path to the index directory. When this file is present, a
 * writer is currently modifying the index (adding or removing documents). This
 * lock file ensures that only one writer is modifying the index at a time.
 * <a name="History"></a>
 * <h2>History</h2>
 * <p>Compatibility notes are provided in this document, describing how file
 * formats have changed from prior versions:</p>
 * <ul>
 * <li>In version 2.1, the file format was changed to allow lock-less commits (ie,
 * no more commit lock). The change is fully backwards compatible: you can open a
 * pre-2.1 index for searching or adding/deleting of docs. When the new segments
 * file is saved (committed), it will be written in the new file format (meaning
 * no specific "upgrade" process is needed). But note that once a commit has
 * occurred, pre-2.1 Lucene will not be able to read the index.</li>
 * <li>In version 2.3, the file format was changed to allow segments to share a
 * single set of doc store (vectors & stored fields) files. This allows for
 * faster indexing in certain cases. The change is fully backwards compatible (in
 * the same way as the lock-less commits change in 2.1).</li>
 * <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not
 * Java's modified UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">
 * LUCENE-510</a> for details.</li>
 * <li>In version 2.9, an optional opaque Map<String,String> CommitUserData
 * may be passed to IndexWriter's commit methods (and later retrieved), which is
 * recorded in the segments_N file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">
 * LUCENE-1382</a> for details. Also,
 * diagnostics were added to each segment written recording details about why it
 * was written (due to flush, merge; which OS/JRE was used; etc.). See issue
 * <a href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.</li>
 * <li>In version 3.0, compressed fields are no longer written to the index (they
 * can still be read, but on merge the new segment will write them, uncompressed).
 * See issue <a href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a>
 * for details.</li>
 * <li>In version 3.1, segments record the code version that created them. See
 * <a href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
 * Additionally segments track explicitly whether or not they have term vectors.
 * See <a href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a>
 * for details.</li>
 * <li>In version 3.2, numeric fields are written natively to the stored fields
 * file; previously they were stored in text format only.</li>
 * <li>In version 3.4, fields can omit position data while still indexing term
 * frequencies.</li>
 * <li>In version 4.0, the format of the inverted index became extensible via
 * the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
 * ({@code DocValues}) was introduced. Normalization factors need no longer be a
 * single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
 * Terms need not be unicode strings, they can be any byte sequence. Term offsets
 * can optionally be indexed into the postings lists. Payloads can be stored in the
 * term vectors.</li>
 * <li>In version 4.1, the format of the postings list changed to use either
 * FOR compression or variable-byte encoding, depending upon the frequency
 * of the term. Terms appearing only once were changed to inline directly into
 * the term dictionary. Stored fields are compressed by default.</li>
 * <li>In version 4.2, term vectors are compressed by default. DocValues has
 * a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
 * on multi-valued fields.</li>
 * <li>In version 4.5, DocValues were extended to explicitly represent missing values.</li>
 * <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
 * allow updating NumericDocValues fields.</li>
 * <li>In version 4.8, checksum footers were added to the end of each index file
 * for improved data integrity. Specifically, the last 8 bytes of every index file
 * contain the zlib-crc32 checksum of the file.</li>
 * <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
 * that is suitable for faceting/sorting/analytics.
 * </li>
 * </ul>
 * <a name="Limitations"></a>
 * <h2>Limitations</h2>
 * <div>
 * <p>Lucene uses a Java <code>int</code> to refer to
 * document numbers, and the index file format uses an <code>Int32</code>
 * on-disk to store document numbers. This is a limitation
 * of both the index file format and the current implementation. Eventually these
 * should be replaced with either <code>UInt64</code> values, or
 * better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.</p>
 * </div>
 */
package org.apache.lucene.codecs.lucene53;
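A tiny sketch of the document-number rebasing described under Document Numbers above. The names are illustrative only; Lucene's actual bookkeeping lives in its composite reader implementations:

class DocNumberSketch {
  // Two five-document segments with bases 0 and 5: internal doc 3 of the
  // second segment has external number 5 + 3 = 8, matching the example above.
  static int toExternal(int segmentBase, int internalDocID) {
    return segmentBase + internalDocID;
  }

  static int toInternal(int segmentBase, int externalDocID) {
    return externalDocID - segmentBase;
  }
}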
@@ -54,7 +54,7 @@ abstract class ByteBufferIndexInput extends IndexInput implements RandomAccessInput
     if (buffers.length == 1) {
       return new SingleBufferImpl(resourceDescription, buffers[0], length, chunkSizePower, cleaner, clones);
     } else {
-      return new DefaultImpl(resourceDescription, buffers, length, chunkSizePower, cleaner, clones);
+      return new MultiBufferImpl(resourceDescription, buffers, 0, length, chunkSizePower, cleaner, clones);
     }
   }
 
@@ -301,9 +301,7 @@ abstract class ByteBufferIndexInput extends IndexInput implements RandomAccessInput
       newBuffers[0].position(offset);
       return new SingleBufferImpl(newResourceDescription, newBuffers[0].slice(), length, chunkSizePower, this.cleaner, this.clones);
     } else {
-      return (offset == 0) ?
-        new DefaultImpl(newResourceDescription, newBuffers, length, chunkSizePower, cleaner, clones) :
-        new WithOffsetImpl(newResourceDescription, newBuffers, offset, length, chunkSizePower, cleaner, clones);
+      return new MultiBufferImpl(newResourceDescription, newBuffers, offset, length, chunkSizePower, cleaner, clones);
     }
   }
 
@@ -388,21 +386,6 @@ abstract class ByteBufferIndexInput extends IndexInput implements RandomAccessInput
     void freeBuffer(ByteBufferIndexInput parent, ByteBuffer b) throws IOException;
   }
 
-  /** Default implementation of ByteBufferIndexInput, supporting multiple buffers, but no offset. */
-  static final class DefaultImpl extends ByteBufferIndexInput {
-
-    DefaultImpl(String resourceDescription, ByteBuffer[] buffers, long length, int chunkSizePower,
-        BufferCleaner cleaner, WeakIdentityMap<ByteBufferIndexInput,Boolean> clones) {
-      super(resourceDescription, buffers, length, chunkSizePower, cleaner, clones);
-      try {
-        seek(0L);
-      } catch (IOException ioe) {
-        throw new AssertionError(ioe);
-      }
-    }
-
-  }
-
   /** Optimization of ByteBufferIndexInput for when there is only one buffer */
   static final class SingleBufferImpl extends ByteBufferIndexInput {
 
@@ -502,10 +485,10 @@ abstract class ByteBufferIndexInput extends IndexInput implements RandomAccessInput
   }
 
   /** This class adds offset support to ByteBufferIndexInput, which is needed for slices. */
-  static final class WithOffsetImpl extends ByteBufferIndexInput {
+  static final class MultiBufferImpl extends ByteBufferIndexInput {
     private final int offset;
 
-    WithOffsetImpl(String resourceDescription, ByteBuffer[] buffers, int offset, long length, int chunkSizePower,
+    MultiBufferImpl(String resourceDescription, ByteBuffer[] buffers, int offset, long length, int chunkSizePower,
         BufferCleaner cleaner, WeakIdentityMap<ByteBufferIndexInput,Boolean> clones) {
       super(resourceDescription, buffers, length, chunkSizePower, cleaner, clones);
       this.offset = offset;
@@ -518,10 +501,7 @@ abstract class ByteBufferIndexInput extends IndexInput implements RandomAccessInput
 
     @Override
     public void seek(long pos) throws IOException {
-      // necessary in case offset != 0 and pos < 0, but pos >= -offset
-      if (pos < 0L) {
-        throw new IllegalArgumentException("Seeking to negative position: " + this);
-      }
+      assert pos >= 0L;
       super.seek(pos + offset);
     }
 
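The consolidation above folds DefaultImpl and WithOffsetImpl into one MultiBufferImpl whose seek simply rebases positions by the slice offset. A toy model of that invariant (not Lucene code; the buffer chunking is omitted):

class SliceSeekSketch {
  // Position p within a slice created at sliceOffset maps to absolute
  // position p + sliceOffset, which is all the seek override does before
  // delegating to the shared multi-buffer logic.
  static long absolutePosition(long slicePos, long sliceOffset) {
    assert slicePos >= 0L; // mirrors the assertion that replaced the explicit check
    return slicePos + sliceOffset;
  }
}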
@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-org.apache.lucene.codecs.lucene50.Lucene50Codec
+org.apache.lucene.codecs.lucene53.Lucene53Codec
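Swapping the entry in this META-INF/services file is what makes the new name resolvable at runtime through Java's SPI machinery; a hedged usage sketch:

import org.apache.lucene.codecs.Codec;

public class CodecLookupSketch {
  public static void main(String[] args) {
    // Resolved by scanning META-INF/services registrations like the one
    // above; an unknown name results in an IllegalArgumentException.
    Codec codec = Codec.forName("Lucene53");
    System.out.println(codec.getName()); // Lucene53
  }
}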
@@ -19,6 +19,7 @@ package org.apache.lucene.codecs.lucene50;
 
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
+import org.apache.lucene.codecs.lucene53.Lucene53Codec;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.index.BaseStoredFieldsFormatTestCase;
@@ -33,7 +34,7 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 public class TestLucene50StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
   @Override
   protected Codec getCodec() {
-    return new Lucene50Codec(Mode.BEST_COMPRESSION);
+    return new Lucene53Codec(Mode.BEST_COMPRESSION);
   }
 
   /**
@@ -44,7 +45,7 @@ public class TestLucene50StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
     Directory dir = newDirectory();
     for (int i = 0; i < 10; i++) {
       IndexWriterConfig iwc = newIndexWriterConfig();
-      iwc.setCodec(new Lucene50Codec(RandomPicks.randomFrom(random(), Mode.values())));
+      iwc.setCodec(new Lucene53Codec(RandomPicks.randomFrom(random(), Mode.values())));
       IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig());
       Document doc = new Document();
       doc.add(new StoredField("field1", "value1"));
@@ -71,7 +72,7 @@ public class TestLucene50StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
 
   public void testInvalidOptions() throws Exception {
     try {
-      new Lucene50Codec(null);
+      new Lucene53Codec(null);
       fail("didn't hit exception");
     } catch (NullPointerException expected) {
       // expected
@@ -0,0 +1,34 @@
package org.apache.lucene.codecs.lucene53;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseNormsFormatTestCase;

/**
 * Tests Lucene53NormsFormat
 */
public class TestLucene53NormsFormat extends BaseNormsFormatTestCase {
  private final Codec codec = new Lucene53Codec();

  @Override
  protected Codec getCodec() {
    return codec;
  }

}
@@ -51,7 +51,7 @@ import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
 // disk (but, should run successfully). Best to run w/
 // -Dtests.codec=<current codec>, and w/ plenty of RAM, eg:
 //
-//   ant test -Dtests.monster=true -Dtests.heapsize=8g -Dtests.codec=Lucene50 -Dtestcase=Test2BTerms
+//   ant test -Dtests.monster=true -Dtests.heapsize=8g -Dtests.codec=Lucene53 -Dtestcase=Test2BTerms
 //
 @SuppressCodecs({ "SimpleText", "Memory", "Direct" })
 @Monster("very slow, use 5g minimum heap")
@@ -386,7 +386,7 @@ public class TestMultiMMap extends BaseDirectoryTestCase {
     ii.seek(0L);
 
     // check impl (we must check size < chunksize: currently, if size==chunkSize, we get 2 buffers, the second one empty:
-    assertTrue((size < chunkSize) ? (ii instanceof ByteBufferIndexInput.SingleBufferImpl) : (ii instanceof ByteBufferIndexInput.DefaultImpl));
+    assertTrue((size < chunkSize) ? (ii instanceof ByteBufferIndexInput.SingleBufferImpl) : (ii instanceof ByteBufferIndexInput.MultiBufferImpl));
 
     // clone tests:
     assertSame(ii.getClass(), ii.clone().getClass());
@@ -394,7 +394,7 @@ public class TestMultiMMap extends BaseDirectoryTestCase {
     // slice test (offset 0)
     int sliceSize = random().nextInt(size);
     IndexInput slice = ii.slice("slice", 0, sliceSize);
-    assertTrue((sliceSize < chunkSize) ? (slice instanceof ByteBufferIndexInput.SingleBufferImpl) : (slice instanceof ByteBufferIndexInput.DefaultImpl));
+    assertTrue((sliceSize < chunkSize) ? (slice instanceof ByteBufferIndexInput.SingleBufferImpl) : (slice instanceof ByteBufferIndexInput.MultiBufferImpl));
 
     // slice test (offset > 0 )
     int offset = random().nextInt(size - 1) + 1;
@@ -403,10 +403,8 @@ public class TestMultiMMap extends BaseDirectoryTestCase {
     //System.out.println(offset + "/" + sliceSize + " chunkSize=" + chunkSize + " " + slice.getClass());
     if (offset % chunkSize + sliceSize < chunkSize) {
       assertTrue(slice instanceof ByteBufferIndexInput.SingleBufferImpl);
-    } else if (offset % chunkSize == 0) {
-      assertTrue(slice instanceof ByteBufferIndexInput.DefaultImpl);
     } else {
-      assertTrue(slice instanceof ByteBufferIndexInput.WithOffsetImpl);
+      assertTrue(slice instanceof ByteBufferIndexInput.MultiBufferImpl);
     }
 
     ii.close();
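The assertions above lean on the edge case called out in the comment: a file of exactly chunkSize bytes still maps to two buffers, the second one empty. A sketch of that arithmetic (simplified; the real computation lives in the directory's mmap code):

class ChunkCountSketch {
  // Buffers needed for `size` bytes in chunks of 2^chunkSizePower bytes;
  // size == chunkSize yields 2 (second buffer empty), size < chunkSize yields 1.
  static int bufferCount(long size, int chunkSizePower) {
    return (int) (size >>> chunkSizePower) + 1;
  }
}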
@@ -19,7 +19,7 @@ package org.apache.lucene.bkdtree;
 
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
-import org.apache.lucene.codecs.lucene50.Lucene50Codec;
+import org.apache.lucene.codecs.lucene53.Lucene53Codec;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.NumericDocValuesField;
@@ -369,7 +369,7 @@ public class TestBKDTree extends LuceneTestCase {
       iwc.setMaxBufferedDocs(lats.length/100);
     }
     final DocValuesFormat dvFormat = new BKDTreeDocValuesFormat(maxPointsInLeaf, maxPointsSortInHeap);
-    Codec codec = new Lucene50Codec() {
+    Codec codec = new Lucene53Codec() {
       @Override
       public DocValuesFormat getDocValuesFormatForField(String field) {
         if (field.equals("point")) {
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene50.Lucene50Codec;
+import org.apache.lucene.codecs.lucene53.Lucene53Codec;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.IntField;
@@ -626,7 +626,7 @@ public class TestSuggestField extends LuceneTestCase {
   static IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, final Set<String> suggestFields) {
     IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer);
     iwc.setMergePolicy(newLogMergePolicy());
-    Codec filterCodec = new Lucene50Codec() {
+    Codec filterCodec = new Lucene53Codec() {
       PostingsFormat postingsFormat = new Completion50PostingsFormat();
 
       @Override
@@ -34,8 +34,8 @@ import org.apache.lucene.codecs.asserting.AssertingDocValuesFormat;
 import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
 import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec;
 import org.apache.lucene.codecs.compressing.CompressingCodec;
-import org.apache.lucene.codecs.lucene50.Lucene50Codec;
 import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene53.Lucene53Codec;
 import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
 import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
 import org.apache.lucene.index.RandomCodec;
@@ -184,8 +184,8 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
       codec = new AssertingCodec();
     } else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
       codec = CompressingCodec.randomInstance(random);
-    } else if ("Lucene50".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene50"))) {
-      codec = new Lucene50Codec(RandomPicks.randomFrom(random, Lucene50StoredFieldsFormat.Mode.values()));
+    } else if ("Lucene53".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene53"))) {
+      codec = new Lucene53Codec(RandomPicks.randomFrom(random, Lucene50StoredFieldsFormat.Mode.values()));
     } else if (!"random".equals(TEST_CODEC)) {
       codec = Codec.forName(TEST_CODEC);
     } else if ("random".equals(TEST_POSTINGSFORMAT)) {
@@ -54,9 +54,9 @@ import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.asserting.AssertingCodec;
 import org.apache.lucene.codecs.blockterms.LuceneFixedGap;
 import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
-import org.apache.lucene.codecs.lucene50.Lucene50Codec;
 import org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat;
 import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat;
+import org.apache.lucene.codecs.lucene53.Lucene53Codec;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 import org.apache.lucene.document.BinaryDocValuesField;
@@ -881,7 +881,7 @@ public final class TestUtil {
    * This may be different than {@link Codec#getDefault()} because that is randomized.
    */
   public static Codec getDefaultCodec() {
-    return new Lucene50Codec();
+    return new Lucene53Codec();
   }
 
   /**
@@ -3,7 +3,7 @@ package org.apache.solr.core;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene50.Lucene50Codec;
+import org.apache.lucene.codecs.lucene53.Lucene53Codec;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.schema.SchemaField;
 import org.apache.solr.util.plugin.SolrCoreAware;
@@ -51,7 +51,7 @@ public class SchemaCodecFactory extends CodecFactory implements SolrCoreAware {
   @Override
   public void init(NamedList args) {
     super.init(args);
-    codec = new Lucene50Codec() {
+    codec = new Lucene53Codec() {
       @Override
       public PostingsFormat getPostingsFormatForField(String field) {
         final SchemaField schemaField = core.getLatestSchema().getFieldOrNull(field);