mirror of https://github.com/apache/lucene.git
Merge branch 'upgrade-master-to-8' of https://github.com/anshumg/lucene-solr for 7.0 release (closes #119)
This commit is contained in:
commit 8218a5b2c6
@@ -3,6 +3,9 @@ Lucene Change Log
 For more information on past and future Lucene versions, please see:
 http://s.apache.org/luceneversions
 
+======================= Lucene 8.0.0 =======================
+(No Changes)
+
 ======================= Lucene 7.0.0 =======================
 
 New Features
@@ -107,7 +107,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
 
   public void testFactoryHtmlStripClassicFolding() throws Exception {
     CustomAnalyzer a = CustomAnalyzer.builder()
-        .withDefaultMatchVersion(Version.LUCENE_6_0_0)
+        .withDefaultMatchVersion(Version.LUCENE_7_0_0)
         .addCharFilter(HTMLStripCharFilterFactory.class)
         .withTokenizer(ClassicTokenizerFactory.class)
         .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true")
@@ -126,7 +126,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
     assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
     assertEquals(100, a.getPositionIncrementGap("dummy"));
     assertEquals(1000, a.getOffsetGap("dummy"));
-    assertSame(Version.LUCENE_6_0_0, a.getVersion());
+    assertSame(Version.LUCENE_7_0_0, a.getVersion());
 
     assertAnalyzesTo(a, "<p>foo bar</p> FOO BAR",
         new String[] { "foo", "bar", "foo", "bar" },
@@ -139,7 +139,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
 
   public void testHtmlStripClassicFolding() throws Exception {
     CustomAnalyzer a = CustomAnalyzer.builder()
-        .withDefaultMatchVersion(Version.LUCENE_6_0_0)
+        .withDefaultMatchVersion(Version.LUCENE_7_0_0)
         .addCharFilter("htmlstrip")
         .withTokenizer("classic")
         .addTokenFilter("asciifolding", "preserveOriginal", "true")
@@ -158,7 +158,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
     assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
     assertEquals(100, a.getPositionIncrementGap("dummy"));
     assertEquals(1000, a.getOffsetGap("dummy"));
-    assertSame(Version.LUCENE_6_0_0, a.getVersion());
+    assertSame(Version.LUCENE_7_0_0, a.getVersion());
 
     assertAnalyzesTo(a, "<p>foo bar</p> FOO BAR",
         new String[] { "foo", "bar", "foo", "bar" },
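The hunks above only bump CustomAnalyzer's default match version from 6.0 to 7.0; the builder chain itself is unchanged. For reference, the same chain can be used outside the test. This is a minimal sketch assuming Lucene 7.x on the classpath; the field name "body" and the sample text are illustrative only:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class CustomAnalyzerExample {
  public static void main(String[] args) throws Exception {
    // Same chain as the test: strip HTML, tokenize with the classic tokenizer,
    // then ASCII-fold while preserving the original tokens.
    CustomAnalyzer analyzer = CustomAnalyzer.builder()
        .withDefaultMatchVersion(Version.LUCENE_7_0_0)
        .addCharFilter("htmlstrip")
        .withTokenizer("classic")
        .addTokenFilter("asciifolding", "preserveOriginal", "true")
        .build();
    try (TokenStream ts = analyzer.tokenStream("body", "<p>foo bar</p> FOO BAR")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString());
      }
      ts.end();
    }
  }
}

Note that this sketch omits the lowercase filter, so the upper-case tokens pass through unchanged; the full test additionally wires in a LowerCaseFilterFactory, which is why its expected output is all lowercase.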
@@ -1,90 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene50;


import java.io.IOException;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo; // javadocs
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Version;

/**
 * Lucene 5.0 Segment info format.
 * @deprecated Only for reading old 5.0-6.0 segments
 */
@Deprecated
public class Lucene50SegmentInfoFormat extends SegmentInfoFormat {

  /** Sole constructor. */
  public Lucene50SegmentInfoFormat() {
  }

  @Override
  public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) throws IOException {
    final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene50SegmentInfoFormat.SI_EXTENSION);
    try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) {
      Throwable priorE = null;
      SegmentInfo si = null;
      try {
        CodecUtil.checkIndexHeader(input, Lucene50SegmentInfoFormat.CODEC_NAME,
                                          Lucene50SegmentInfoFormat.VERSION_START,
                                          Lucene50SegmentInfoFormat.VERSION_CURRENT,
                                          segmentID, "");
        final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt());

        final int docCount = input.readInt();
        if (docCount < 0) {
          throw new CorruptIndexException("invalid docCount: " + docCount, input);
        }
        final boolean isCompoundFile = input.readByte() == SegmentInfo.YES;

        final Map<String,String> diagnostics = input.readMapOfStrings();
        final Set<String> files = input.readSetOfStrings();
        final Map<String,String> attributes = input.readMapOfStrings();

        si = new SegmentInfo(dir, version, null, segment, docCount, isCompoundFile, null, diagnostics, segmentID, attributes, null);
        si.setFiles(files);
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        CodecUtil.checkFooter(input, priorE);
      }
      return si;
    }
  }

  @Override
  public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException {
    throw new UnsupportedOperationException("this codec can only be used for reading");
  }

  /** File extension used to store {@link SegmentInfo}. */
  public final static String SI_EXTENSION = "si";
  static final String CODEC_NAME = "Lucene50SegmentInfo";
  static final int VERSION_SAFE_MAPS = 1;
  static final int VERSION_START = VERSION_SAFE_MAPS;
  static final int VERSION_CURRENT = VERSION_SAFE_MAPS;
}
@@ -1,25 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<head>
   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Lucene 5.0 file format.
</body>
</html>
@@ -1,91 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene53;


import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;

/**
 * Lucene 5.3 Score normalization format.
 * <p>
 * Encodes normalization values by encoding each value with the minimum
 * number of bytes needed to represent the range (which can be zero).
 * <p>
 * Files:
 * <ol>
 *   <li><tt>.nvd</tt>: Norms data</li>
 *   <li><tt>.nvm</tt>: Norms metadata</li>
 * </ol>
 * <ol>
 *   <li><a name="nvm"></a>
 *   <p>The Norms metadata or .nvm file.</p>
 *   <p>For each norms field, this stores metadata, such as the offset into the
 *      Norms data (.nvd)</p>
 *   <p>Norms metadata (.dvm) --> Header,<Entry><sup>NumFields</sup>,Footer</p>
 *   <ul>
 *     <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
 *     <li>Entry --> FieldNumber,BytesPerValue, Address</li>
 *     <li>FieldNumber --> {@link DataOutput#writeVInt vInt}</li>
 *     <li>BytesPerValue --> {@link DataOutput#writeByte byte}</li>
 *     <li>Offset --> {@link DataOutput#writeLong Int64}</li>
 *     <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
 *   </ul>
 *   <p>FieldNumber of -1 indicates the end of metadata.</p>
 *   <p>Offset is the pointer to the start of the data in the norms data (.nvd), or the singleton value
 *      when BytesPerValue = 0</p>
 *   <li><a name="nvd"></a>
 *   <p>The Norms data or .nvd file.</p>
 *   <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>
 *   <p>Norms data (.nvd) --> Header,< Data ><sup>NumFields</sup>,Footer</p>
 *   <ul>
 *     <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
 *     <li>Data --> {@link DataOutput#writeByte(byte) byte}<sup>MaxDoc * BytesPerValue</sup></li>
 *     <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
 *   </ul>
 * </ol>
 * @lucene.experimental
 */
public class Lucene53NormsFormat extends NormsFormat {

  /** Sole Constructor */
  public Lucene53NormsFormat() {}

  @Override
  public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
    throw new UnsupportedOperationException("This format can only be used for reading");
  }

  @Override
  public NormsProducer normsProducer(SegmentReadState state) throws IOException {
    return new Lucene53NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }

  static final String DATA_CODEC = "Lucene53NormsData";
  static final String DATA_EXTENSION = "nvd";
  static final String METADATA_CODEC = "Lucene53NormsMetadata";
  static final String METADATA_EXTENSION = "nvm";
  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;
}
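The format above records one byte width (0, 1, 2, 4 or 8) per norms field, and the width determines how a reader addresses the .nvd data. The following is a condensed, illustrative sketch of that dispatch; it mirrors the switch in the Lucene53NormsProducer deleted just below, assuming the caller already opened the field's RandomAccessInput slice and read its NormsEntry (bytesPerValue, offset) from the .nvm metadata:

import java.io.IOException;

import org.apache.lucene.store.RandomAccessInput;

final class NormValueReader {
  // bytesPerValue == 0 means the value recorded in the metadata is itself the
  // single norm shared by all documents; otherwise the norm for docID is read
  // from the data slice at position docID * bytesPerValue.
  static long readNorm(RandomAccessInput slice, byte bytesPerValue, long offsetOrConstant, int docID)
      throws IOException {
    switch (bytesPerValue) {
      case 0: return offsetOrConstant;
      case 1: return slice.readByte(docID);
      case 2: return slice.readShort(((long) docID) << 1);
      case 4: return slice.readInt(((long) docID) << 2);
      case 8: return slice.readLong(((long) docID) << 3);
      default: throw new AssertionError("invalid bytesPerValue: " + bytesPerValue);
    }
  }
}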
@@ -1,236 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene53;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.NormsProducer;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.RandomAccessInput;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene53.Lucene53NormsFormat.VERSION_CURRENT;
|
||||
import static org.apache.lucene.codecs.lucene53.Lucene53NormsFormat.VERSION_START;
|
||||
|
||||
/**
|
||||
* Reader for {@link Lucene53NormsFormat}
|
||||
*/
|
||||
class Lucene53NormsProducer extends NormsProducer {
|
||||
// metadata maps (just file pointers and minimal stuff)
|
||||
private final Map<Integer,NormsEntry> norms = new HashMap<>();
|
||||
private final IndexInput data;
|
||||
private final int maxDoc;
|
||||
|
||||
Lucene53NormsProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
|
||||
maxDoc = state.segmentInfo.maxDoc();
|
||||
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
|
||||
int version = -1;
|
||||
|
||||
// read in the entries from the metadata file.
|
||||
try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) {
|
||||
Throwable priorE = null;
|
||||
try {
|
||||
version = CodecUtil.checkIndexHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
readFields(in, state.fieldInfos);
|
||||
} catch (Throwable exception) {
|
||||
priorE = exception;
|
||||
} finally {
|
||||
CodecUtil.checkFooter(in, priorE);
|
||||
}
|
||||
}
|
||||
|
||||
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
|
||||
data = state.directory.openInput(dataName, state.context);
|
||||
boolean success = false;
|
||||
try {
|
||||
final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
if (version != version2) {
|
||||
throw new CorruptIndexException("Format versions mismatch: meta=" + version + ",data=" + version2, data);
|
||||
}
|
||||
|
||||
// NOTE: data file is too costly to verify checksum against all the bytes on open,
|
||||
// but for now we at least verify proper structure of the checksum footer: which looks
|
||||
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
|
||||
// such as file truncation.
|
||||
CodecUtil.retrieveChecksum(data);
|
||||
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(this.data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
|
||||
int fieldNumber = meta.readVInt();
|
||||
while (fieldNumber != -1) {
|
||||
FieldInfo info = infos.fieldInfo(fieldNumber);
|
||||
if (info == null) {
|
||||
throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
|
||||
} else if (!info.hasNorms()) {
|
||||
throw new CorruptIndexException("Invalid field: " + info.name, meta);
|
||||
}
|
||||
NormsEntry entry = new NormsEntry();
|
||||
entry.bytesPerValue = meta.readByte();
|
||||
switch (entry.bytesPerValue) {
|
||||
case 0: case 1: case 2: case 4: case 8:
|
||||
break;
|
||||
default:
|
||||
throw new CorruptIndexException("Invalid bytesPerValue: " + entry.bytesPerValue + ", field: " + info.name, meta);
|
||||
}
|
||||
entry.offset = meta.readLong();
|
||||
norms.put(info.number, entry);
|
||||
fieldNumber = meta.readVInt();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public NumericDocValues getNorms(FieldInfo field) throws IOException {
|
||||
final NormsEntry entry = norms.get(field.number);
|
||||
|
||||
if (entry.bytesPerValue == 0) {
|
||||
final long value = entry.offset;
|
||||
return new NormsIterator(maxDoc) {
|
||||
@Override
|
||||
public long longValue() {
|
||||
return value;
|
||||
}
|
||||
};
|
||||
} else {
|
||||
RandomAccessInput slice;
|
||||
synchronized (data) {
|
||||
switch (entry.bytesPerValue) {
|
||||
case 1:
|
||||
slice = data.randomAccessSlice(entry.offset, maxDoc);
|
||||
return new NormsIterator(maxDoc) {
|
||||
@Override
|
||||
public long longValue() throws IOException {
|
||||
return slice.readByte(docID);
|
||||
}
|
||||
};
|
||||
case 2:
|
||||
slice = data.randomAccessSlice(entry.offset, maxDoc * 2L);
|
||||
return new NormsIterator(maxDoc) {
|
||||
@Override
|
||||
public long longValue() throws IOException {
|
||||
return slice.readShort(((long)docID) << 1L);
|
||||
}
|
||||
};
|
||||
case 4:
|
||||
slice = data.randomAccessSlice(entry.offset, maxDoc * 4L);
|
||||
return new NormsIterator(maxDoc) {
|
||||
@Override
|
||||
public long longValue() throws IOException {
|
||||
return slice.readInt(((long)docID) << 2L);
|
||||
}
|
||||
};
|
||||
case 8:
|
||||
slice = data.randomAccessSlice(entry.offset, maxDoc * 8L);
|
||||
return new NormsIterator(maxDoc) {
|
||||
@Override
|
||||
public long longValue() throws IOException {
|
||||
return slice.readLong(((long)docID) << 3L);
|
||||
}
|
||||
};
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
data.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
return 64L * norms.size(); // good enough
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkIntegrity() throws IOException {
|
||||
CodecUtil.checksumEntireFile(data);
|
||||
}
|
||||
|
||||
static class NormsEntry {
|
||||
byte bytesPerValue;
|
||||
long offset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + "(fields=" + norms.size() + ")";
|
||||
}
|
||||
|
||||
private static abstract class NormsIterator extends NumericDocValues {
|
||||
private final int maxDoc;
|
||||
protected int docID = -1;
|
||||
|
||||
public NormsIterator(int maxDoc) {
|
||||
this.maxDoc = maxDoc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return docID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() {
|
||||
docID++;
|
||||
if (docID == maxDoc) {
|
||||
docID = NO_MORE_DOCS;
|
||||
}
|
||||
return docID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) {
|
||||
docID = target;
|
||||
if (docID >= maxDoc) {
|
||||
docID = NO_MORE_DOCS;
|
||||
}
|
||||
return docID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean advanceExact(int target) throws IOException {
|
||||
docID = target;
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
// TODO
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -1,23 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Components from the Lucene 5.3 index format
 * See {@link org.apache.lucene.codecs.lucene53} for an overview
 * of the index format.
 */
package org.apache.lucene.codecs.lucene53;
@@ -1,797 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene54;
|
||||
|
||||
|
||||
import java.io.Closeable; // javadocs
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.codecs.LegacyDocValuesIterables;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.LongsRef;
|
||||
import org.apache.lucene.util.MathUtil;
|
||||
import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
|
||||
import org.apache.lucene.util.PagedBytes;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.packed.DirectMonotonicWriter;
|
||||
import org.apache.lucene.util.packed.DirectWriter;
|
||||
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat.*;
|
||||
|
||||
/** writer for {@link Lucene54DocValuesFormat} */
|
||||
final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Closeable {
|
||||
|
||||
enum NumberType {
|
||||
/** Dense ordinals */
|
||||
ORDINAL,
|
||||
/** Random long values */
|
||||
VALUE;
|
||||
}
|
||||
|
||||
IndexOutput data, meta;
|
||||
final int maxDoc;
|
||||
|
||||
/** expert: Creates a new writer */
|
||||
public Lucene54DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
|
||||
boolean success = false;
|
||||
try {
|
||||
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
|
||||
data = state.directory.createOutput(dataName, state.context);
|
||||
CodecUtil.writeIndexHeader(data, dataCodec, Lucene54DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
|
||||
meta = state.directory.createOutput(metaName, state.context);
|
||||
CodecUtil.writeIndexHeader(meta, metaCodec, Lucene54DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
maxDoc = state.segmentInfo.maxDoc();
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
|
||||
addNumericField(field, LegacyDocValuesIterables.numericIterable(field, valuesProducer, maxDoc), NumberType.VALUE);
|
||||
}
|
||||
|
||||
void addNumericField(FieldInfo field, Iterable<Number> values, NumberType numberType) throws IOException {
|
||||
long count = 0;
|
||||
long minValue = Long.MAX_VALUE;
|
||||
long maxValue = Long.MIN_VALUE;
|
||||
long gcd = 0;
|
||||
long missingCount = 0;
|
||||
long zeroCount = 0;
|
||||
// TODO: more efficient?
|
||||
HashSet<Long> uniqueValues = null;
|
||||
long missingOrdCount = 0;
|
||||
if (numberType == NumberType.VALUE) {
|
||||
uniqueValues = new HashSet<>();
|
||||
|
||||
for (Number nv : values) {
|
||||
final long v;
|
||||
if (nv == null) {
|
||||
v = 0;
|
||||
missingCount++;
|
||||
zeroCount++;
|
||||
} else {
|
||||
v = nv.longValue();
|
||||
if (v == 0) {
|
||||
zeroCount++;
|
||||
}
|
||||
}
|
||||
|
||||
if (gcd != 1) {
|
||||
if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) {
|
||||
// in that case v - minValue might overflow and make the GCD computation return
|
||||
// wrong results. Since these extreme values are unlikely, we just discard
|
||||
// GCD computation for them
|
||||
gcd = 1;
|
||||
} else if (count != 0) { // minValue needs to be set first
|
||||
gcd = MathUtil.gcd(gcd, v - minValue);
|
||||
}
|
||||
}
|
||||
|
||||
minValue = Math.min(minValue, v);
|
||||
maxValue = Math.max(maxValue, v);
|
||||
|
||||
if (uniqueValues != null) {
|
||||
if (uniqueValues.add(v)) {
|
||||
if (uniqueValues.size() > 256) {
|
||||
uniqueValues = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
++count;
|
||||
}
|
||||
} else {
|
||||
for (Number nv : values) {
|
||||
long v = nv.longValue();
|
||||
if (v == -1L) {
|
||||
missingOrdCount++;
|
||||
}
|
||||
minValue = Math.min(minValue, v);
|
||||
maxValue = Math.max(maxValue, v);
|
||||
++count;
|
||||
}
|
||||
}
|
||||
|
||||
final long delta = maxValue - minValue;
|
||||
final int deltaBitsRequired = DirectWriter.unsignedBitsRequired(delta);
|
||||
final int tableBitsRequired = uniqueValues == null
|
||||
? Integer.MAX_VALUE
|
||||
: DirectWriter.bitsRequired(uniqueValues.size() - 1);
|
||||
|
||||
final boolean sparse; // 1% of docs or less have a value
|
||||
switch (numberType) {
|
||||
case VALUE:
|
||||
sparse = (double) missingCount / count >= 0.99;
|
||||
break;
|
||||
case ORDINAL:
|
||||
sparse = (double) missingOrdCount / count >= 0.99;
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
|
||||
final int format;
|
||||
if (uniqueValues != null
|
||||
&& count <= Integer.MAX_VALUE
|
||||
&& (uniqueValues.size() == 1
|
||||
|| (uniqueValues.size() == 2 && missingCount > 0 && zeroCount == missingCount))) {
|
||||
// either one unique value C or two unique values: "missing" and C
|
||||
format = CONST_COMPRESSED;
|
||||
} else if (sparse && count >= 1024) {
|
||||
// require at least 1024 docs to avoid flipping back and forth when doing NRT search
|
||||
format = SPARSE_COMPRESSED;
|
||||
} else if (uniqueValues != null && tableBitsRequired < deltaBitsRequired) {
|
||||
format = TABLE_COMPRESSED;
|
||||
} else if (gcd != 0 && gcd != 1) {
|
||||
final long gcdDelta = (maxValue - minValue) / gcd;
|
||||
final long gcdBitsRequired = DirectWriter.unsignedBitsRequired(gcdDelta);
|
||||
format = gcdBitsRequired < deltaBitsRequired ? GCD_COMPRESSED : DELTA_COMPRESSED;
|
||||
} else {
|
||||
format = DELTA_COMPRESSED;
|
||||
}
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene54DocValuesFormat.NUMERIC);
|
||||
meta.writeVInt(format);
|
||||
if (format == SPARSE_COMPRESSED) {
|
||||
meta.writeLong(data.getFilePointer());
|
||||
final long numDocsWithValue;
|
||||
switch (numberType) {
|
||||
case VALUE:
|
||||
numDocsWithValue = count - missingCount;
|
||||
break;
|
||||
case ORDINAL:
|
||||
numDocsWithValue = count - missingOrdCount;
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
final long maxDoc = writeSparseMissingBitset(values, numberType, numDocsWithValue);
|
||||
assert maxDoc == count;
|
||||
} else if (missingCount == 0) {
|
||||
meta.writeLong(ALL_LIVE);
|
||||
} else if (missingCount == count) {
|
||||
meta.writeLong(ALL_MISSING);
|
||||
} else {
|
||||
meta.writeLong(data.getFilePointer());
|
||||
writeMissingBitset(values);
|
||||
}
|
||||
meta.writeLong(data.getFilePointer());
|
||||
meta.writeVLong(count);
|
||||
|
||||
switch (format) {
|
||||
case CONST_COMPRESSED:
|
||||
// write the constant (nonzero value in the n=2 case, singleton value otherwise)
|
||||
meta.writeLong(minValue < 0 ? Collections.min(uniqueValues) : Collections.max(uniqueValues));
|
||||
break;
|
||||
case GCD_COMPRESSED:
|
||||
meta.writeLong(minValue);
|
||||
meta.writeLong(gcd);
|
||||
final long maxDelta = (maxValue - minValue) / gcd;
|
||||
final int bits = DirectWriter.unsignedBitsRequired(maxDelta);
|
||||
meta.writeVInt(bits);
|
||||
final DirectWriter quotientWriter = DirectWriter.getInstance(data, count, bits);
|
||||
for (Number nv : values) {
|
||||
long value = nv == null ? 0 : nv.longValue();
|
||||
quotientWriter.add((value - minValue) / gcd);
|
||||
}
|
||||
quotientWriter.finish();
|
||||
break;
|
||||
case DELTA_COMPRESSED:
|
||||
final long minDelta = delta < 0 ? 0 : minValue;
|
||||
meta.writeLong(minDelta);
|
||||
meta.writeVInt(deltaBitsRequired);
|
||||
final DirectWriter writer = DirectWriter.getInstance(data, count, deltaBitsRequired);
|
||||
for (Number nv : values) {
|
||||
long v = nv == null ? 0 : nv.longValue();
|
||||
writer.add(v - minDelta);
|
||||
}
|
||||
writer.finish();
|
||||
break;
|
||||
case TABLE_COMPRESSED:
|
||||
final Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
|
||||
Arrays.sort(decode);
|
||||
final HashMap<Long,Integer> encode = new HashMap<>();
|
||||
meta.writeVInt(decode.length);
|
||||
for (int i = 0; i < decode.length; i++) {
|
||||
meta.writeLong(decode[i]);
|
||||
encode.put(decode[i], i);
|
||||
}
|
||||
meta.writeVInt(tableBitsRequired);
|
||||
final DirectWriter ordsWriter = DirectWriter.getInstance(data, count, tableBitsRequired);
|
||||
for (Number nv : values) {
|
||||
ordsWriter.add(encode.get(nv == null ? 0 : nv.longValue()));
|
||||
}
|
||||
ordsWriter.finish();
|
||||
break;
|
||||
case SPARSE_COMPRESSED:
|
||||
final Iterable<Number> filteredMissingValues;
|
||||
switch (numberType) {
|
||||
case VALUE:
|
||||
meta.writeByte((byte) 0);
|
||||
filteredMissingValues = new Iterable<Number>() {
|
||||
@Override
|
||||
public Iterator<Number> iterator() {
|
||||
return StreamSupport
|
||||
.stream(values.spliterator(), false)
|
||||
.filter(value -> value != null)
|
||||
.iterator();
|
||||
}
|
||||
};
|
||||
break;
|
||||
case ORDINAL:
|
||||
meta.writeByte((byte) 1);
|
||||
filteredMissingValues = new Iterable<Number>() {
|
||||
@Override
|
||||
public Iterator<Number> iterator() {
|
||||
return StreamSupport
|
||||
.stream(values.spliterator(), false)
|
||||
.filter(value -> value.longValue() != -1L)
|
||||
.iterator();
|
||||
}
|
||||
};
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
// Write non-missing values as a numeric field
|
||||
addNumericField(field, filteredMissingValues, numberType);
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
meta.writeLong(data.getFilePointer());
|
||||
}
|
||||
|
||||
// TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
|
||||
// but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode)
|
||||
void writeMissingBitset(Iterable<?> values) throws IOException {
|
||||
byte bits = 0;
|
||||
int count = 0;
|
||||
for (Object v : values) {
|
||||
if (count == 8) {
|
||||
data.writeByte(bits);
|
||||
count = 0;
|
||||
bits = 0;
|
||||
}
|
||||
if (v != null) {
|
||||
bits |= 1 << (count & 7);
|
||||
}
|
||||
count++;
|
||||
}
|
||||
if (count > 0) {
|
||||
data.writeByte(bits);
|
||||
}
|
||||
}
|
||||
|
||||
long writeSparseMissingBitset(Iterable<Number> values, NumberType numberType, long numDocsWithValue) throws IOException {
|
||||
meta.writeVLong(numDocsWithValue);
|
||||
|
||||
// Write doc IDs that have a value
|
||||
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||
final DirectMonotonicWriter docIdsWriter = DirectMonotonicWriter.getInstance(meta, data, numDocsWithValue, DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||
long docID = 0;
|
||||
for (Number nv : values) {
|
||||
switch (numberType) {
|
||||
case VALUE:
|
||||
if (nv != null) {
|
||||
docIdsWriter.add(docID);
|
||||
}
|
||||
break;
|
||||
case ORDINAL:
|
||||
if (nv.longValue() != -1L) {
|
||||
docIdsWriter.add(docID);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
docID++;
|
||||
}
|
||||
docIdsWriter.finish();
|
||||
return docID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
|
||||
addBinaryField(field, LegacyDocValuesIterables.binaryIterable(field, valuesProducer, maxDoc));
|
||||
}
|
||||
|
||||
private void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
|
||||
// write the byte[] data
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene54DocValuesFormat.BINARY);
|
||||
int minLength = Integer.MAX_VALUE;
|
||||
int maxLength = Integer.MIN_VALUE;
|
||||
final long startFP = data.getFilePointer();
|
||||
long count = 0;
|
||||
long missingCount = 0;
|
||||
for(BytesRef v : values) {
|
||||
final int length;
|
||||
if (v == null) {
|
||||
length = 0;
|
||||
missingCount++;
|
||||
} else {
|
||||
length = v.length;
|
||||
}
|
||||
minLength = Math.min(minLength, length);
|
||||
maxLength = Math.max(maxLength, length);
|
||||
if (v != null) {
|
||||
data.writeBytes(v.bytes, v.offset, v.length);
|
||||
}
|
||||
count++;
|
||||
}
|
||||
meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED);
|
||||
if (missingCount == 0) {
|
||||
meta.writeLong(ALL_LIVE);
|
||||
} else if (missingCount == count) {
|
||||
meta.writeLong(ALL_MISSING);
|
||||
} else {
|
||||
meta.writeLong(data.getFilePointer());
|
||||
writeMissingBitset(values);
|
||||
}
|
||||
meta.writeVInt(minLength);
|
||||
meta.writeVInt(maxLength);
|
||||
meta.writeVLong(count);
|
||||
meta.writeLong(startFP);
|
||||
|
||||
// if minLength == maxLength, it's a fixed-length byte[], we are done (the addresses are implicit)
|
||||
// otherwise, we need to record the length fields...
|
||||
if (minLength != maxLength) {
|
||||
meta.writeLong(data.getFilePointer());
|
||||
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||
|
||||
final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, data, count + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||
long addr = 0;
|
||||
writer.add(addr);
|
||||
for (BytesRef v : values) {
|
||||
if (v != null) {
|
||||
addr += v.length;
|
||||
}
|
||||
writer.add(addr);
|
||||
}
|
||||
writer.finish();
|
||||
meta.writeLong(data.getFilePointer());
|
||||
}
|
||||
}
|
||||
|
||||
/** expert: writes a value dictionary for a sorted/sortedset field */
|
||||
private void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
|
||||
// first check if it's a "fixed-length" terms dict, and compressibility if so
|
||||
int minLength = Integer.MAX_VALUE;
|
||||
int maxLength = Integer.MIN_VALUE;
|
||||
long numValues = 0;
|
||||
BytesRefBuilder previousValue = new BytesRefBuilder();
|
||||
long prefixSum = 0; // only valid for fixed-width data, as we have a choice there
|
||||
for (BytesRef v : values) {
|
||||
minLength = Math.min(minLength, v.length);
|
||||
maxLength = Math.max(maxLength, v.length);
|
||||
if (minLength == maxLength) {
|
||||
int termPosition = (int) (numValues & INTERVAL_MASK);
|
||||
if (termPosition == 0) {
|
||||
// first term in block, save it away to compare against the last term later
|
||||
previousValue.copyBytes(v);
|
||||
} else if (termPosition == INTERVAL_COUNT - 1) {
|
||||
// last term in block, accumulate shared prefix against first term
|
||||
prefixSum += StringHelper.bytesDifference(previousValue.get(), v);
|
||||
}
|
||||
}
|
||||
numValues++;
|
||||
}
|
||||
// for fixed width data, look at the avg(shared prefix) before deciding how to encode:
|
||||
// prefix compression "costs" worst case 2 bytes per term because we must store suffix lengths.
|
||||
// so if we share at least 3 bytes on average, always compress.
|
||||
if (minLength == maxLength && prefixSum <= 3*(numValues >> INTERVAL_SHIFT)) {
|
||||
// no index needed: not very compressible, direct addressing by mult
|
||||
addBinaryField(field, values);
|
||||
} else if (numValues < REVERSE_INTERVAL_COUNT) {
|
||||
// low cardinality: waste a few KB of ram, but can't really use fancy index etc
|
||||
addBinaryField(field, values);
|
||||
} else {
|
||||
assert numValues > 0; // we don't have to handle the empty case
|
||||
// header
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene54DocValuesFormat.BINARY);
|
||||
meta.writeVInt(BINARY_PREFIX_COMPRESSED);
|
||||
meta.writeLong(-1L);
|
||||
// now write the bytes: sharing prefixes within a block
|
||||
final long startFP = data.getFilePointer();
|
||||
// currently, we have to store the delta from expected for every 1/nth term
|
||||
// we could avoid this, but it's not much and less overall RAM than the previous approach!
|
||||
RAMOutputStream addressBuffer = new RAMOutputStream();
|
||||
MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, MONOTONIC_BLOCK_SIZE);
|
||||
// buffers up 16 terms
|
||||
RAMOutputStream bytesBuffer = new RAMOutputStream();
|
||||
// buffers up block header
|
||||
RAMOutputStream headerBuffer = new RAMOutputStream();
|
||||
BytesRefBuilder lastTerm = new BytesRefBuilder();
|
||||
lastTerm.grow(maxLength);
|
||||
long count = 0;
|
||||
int suffixDeltas[] = new int[INTERVAL_COUNT];
|
||||
for (BytesRef v : values) {
|
||||
int termPosition = (int) (count & INTERVAL_MASK);
|
||||
if (termPosition == 0) {
|
||||
termAddresses.add(data.getFilePointer() - startFP);
|
||||
// abs-encode first term
|
||||
headerBuffer.writeVInt(v.length);
|
||||
headerBuffer.writeBytes(v.bytes, v.offset, v.length);
|
||||
lastTerm.copyBytes(v);
|
||||
} else {
|
||||
// prefix-code: we only share at most 255 characters, to encode the length as a single
|
||||
// byte and have random access. Larger terms just get less compression.
|
||||
int sharedPrefix = Math.min(255, StringHelper.bytesDifference(lastTerm.get(), v));
|
||||
bytesBuffer.writeByte((byte) sharedPrefix);
|
||||
bytesBuffer.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix);
|
||||
// we can encode one smaller, because terms are unique.
|
||||
suffixDeltas[termPosition] = v.length - sharedPrefix - 1;
|
||||
}
|
||||
|
||||
count++;
|
||||
// flush block
|
||||
if ((count & INTERVAL_MASK) == 0) {
|
||||
flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas);
|
||||
}
|
||||
}
|
||||
// flush trailing crap
|
||||
int leftover = (int) (count & INTERVAL_MASK);
|
||||
if (leftover > 0) {
|
||||
Arrays.fill(suffixDeltas, leftover, suffixDeltas.length, 0);
|
||||
flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas);
|
||||
}
|
||||
final long indexStartFP = data.getFilePointer();
|
||||
// write addresses of indexed terms
|
||||
termAddresses.finish();
|
||||
addressBuffer.writeTo(data);
|
||||
addressBuffer = null;
|
||||
termAddresses = null;
|
||||
meta.writeVInt(minLength);
|
||||
meta.writeVInt(maxLength);
|
||||
meta.writeVLong(count);
|
||||
meta.writeLong(startFP);
|
||||
meta.writeLong(indexStartFP);
|
||||
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
||||
meta.writeVInt(MONOTONIC_BLOCK_SIZE);
|
||||
addReverseTermIndex(field, values, maxLength);
|
||||
}
|
||||
}
|
||||
// writes term dictionary "block"
|
||||
// first term is absolute encoded as vint length + bytes.
|
||||
// lengths of subsequent N terms are encoded as either N bytes or N shorts.
|
||||
// in the double-byte case, the first byte is indicated with -1.
|
||||
// subsequent terms are encoded as byte suffixLength + bytes.
|
||||
private void flushTermsDictBlock(RAMOutputStream headerBuffer, RAMOutputStream bytesBuffer, int suffixDeltas[]) throws IOException {
|
||||
boolean twoByte = false;
|
||||
for (int i = 1; i < suffixDeltas.length; i++) {
|
||||
if (suffixDeltas[i] > 254) {
|
||||
twoByte = true;
|
||||
}
|
||||
}
|
||||
if (twoByte) {
|
||||
headerBuffer.writeByte((byte)255);
|
||||
for (int i = 1; i < suffixDeltas.length; i++) {
|
||||
headerBuffer.writeShort((short) suffixDeltas[i]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 1; i < suffixDeltas.length; i++) {
|
||||
headerBuffer.writeByte((byte) suffixDeltas[i]);
|
||||
}
|
||||
}
|
||||
headerBuffer.writeTo(data);
|
||||
headerBuffer.reset();
|
||||
bytesBuffer.writeTo(data);
|
||||
bytesBuffer.reset();
|
||||
}
|
||||
|
||||
// writes reverse term index: used for binary searching a term into a range of 64 blocks
|
||||
// for every 64 blocks (1024 terms) we store a term, trimming any suffix unnecessary for comparison
|
||||
// terms are written as a contiguous byte[], but never spanning 2^15 byte boundaries.
|
||||
private void addReverseTermIndex(FieldInfo field, final Iterable<BytesRef> values, int maxLength) throws IOException {
|
||||
long count = 0;
|
||||
BytesRefBuilder priorTerm = new BytesRefBuilder();
|
||||
priorTerm.grow(maxLength);
|
||||
BytesRef indexTerm = new BytesRef();
|
||||
long startFP = data.getFilePointer();
|
||||
PagedBytes pagedBytes = new PagedBytes(15);
|
||||
MonotonicBlockPackedWriter addresses = new MonotonicBlockPackedWriter(data, MONOTONIC_BLOCK_SIZE);
|
||||
|
||||
for (BytesRef b : values) {
|
||||
int termPosition = (int) (count & REVERSE_INTERVAL_MASK);
|
||||
if (termPosition == 0) {
|
||||
int len = StringHelper.sortKeyLength(priorTerm.get(), b);
|
||||
indexTerm.bytes = b.bytes;
|
||||
indexTerm.offset = b.offset;
|
||||
indexTerm.length = len;
|
||||
addresses.add(pagedBytes.copyUsingLengthPrefix(indexTerm));
|
||||
} else if (termPosition == REVERSE_INTERVAL_MASK) {
|
||||
priorTerm.copyBytes(b);
|
||||
}
|
||||
count++;
|
||||
}
|
||||
addresses.finish();
|
||||
long numBytes = pagedBytes.getPointer();
|
||||
pagedBytes.freeze(true);
|
||||
PagedBytesDataInput in = pagedBytes.getDataInput();
|
||||
meta.writeLong(startFP);
|
||||
data.writeVLong(numBytes);
|
||||
data.copyBytes(in, numBytes);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene54DocValuesFormat.SORTED);
|
||||
addTermsDict(field, LegacyDocValuesIterables.valuesIterable(valuesProducer.getSorted(field)));
|
||||
addNumericField(field, LegacyDocValuesIterables.sortedOrdIterable(valuesProducer, field, maxDoc), NumberType.ORDINAL);
|
||||
}
|
||||
|
||||
private void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> ords) throws IOException {
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene54DocValuesFormat.SORTED);
|
||||
addTermsDict(field, values);
|
||||
addNumericField(field, ords, NumberType.ORDINAL);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addSortedNumericField(FieldInfo field, final DocValuesProducer valuesProducer) throws IOException {
|
||||
|
||||
final Iterable<Number> docToValueCount = LegacyDocValuesIterables.sortedNumericToDocCount(valuesProducer, field, maxDoc);
|
||||
final Iterable<Number> values = LegacyDocValuesIterables.sortedNumericToValues(valuesProducer, field);
|
||||
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene54DocValuesFormat.SORTED_NUMERIC);
|
||||
if (isSingleValued(docToValueCount)) {
|
||||
meta.writeVInt(SORTED_SINGLE_VALUED);
|
||||
// The field is single-valued, we can encode it as NUMERIC
|
||||
addNumericField(field, singletonView(docToValueCount, values, null), NumberType.VALUE);
|
||||
} else {
|
||||
final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToValueCount, values);
|
||||
if (uniqueValueSets != null) {
|
||||
meta.writeVInt(SORTED_SET_TABLE);
|
||||
|
||||
// write the set_id -> values mapping
|
||||
writeDictionary(uniqueValueSets);
|
||||
|
||||
// write the doc -> set_id as a numeric field
|
||||
addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), NumberType.ORDINAL);
|
||||
} else {
|
||||
meta.writeVInt(SORTED_WITH_ADDRESSES);
|
||||
// write the stream of values as a numeric field
|
||||
addNumericField(field, values, NumberType.VALUE);
|
||||
// write the doc -> ord count as a absolute index to the stream
|
||||
addOrdIndex(field, docToValueCount);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
|
||||
|
||||
Iterable<BytesRef> values = LegacyDocValuesIterables.valuesIterable(valuesProducer.getSortedSet(field));
|
||||
Iterable<Number> docToOrdCount = LegacyDocValuesIterables.sortedSetOrdCountIterable(valuesProducer, field, maxDoc);
|
||||
Iterable<Number> ords = LegacyDocValuesIterables.sortedSetOrdsIterable(valuesProducer, field);
|
||||
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene54DocValuesFormat.SORTED_SET);
|
||||
|
||||
if (isSingleValued(docToOrdCount)) {
|
||||
meta.writeVInt(SORTED_SINGLE_VALUED);
|
||||
// The field is single-valued, we can encode it as SORTED
|
||||
addSortedField(field, values, singletonView(docToOrdCount, ords, -1L));
|
||||
} else {
|
||||
final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToOrdCount, ords);
|
||||
if (uniqueValueSets != null) {
|
||||
meta.writeVInt(SORTED_SET_TABLE);
|
||||
|
||||
// write the set_id -> ords mapping
|
||||
writeDictionary(uniqueValueSets);
|
||||
|
||||
// write the ord -> byte[] as a binary field
|
||||
addTermsDict(field, values);
|
||||
|
||||
// write the doc -> set_id as a numeric field
|
||||
addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), NumberType.ORDINAL);
|
||||
} else {
|
||||
meta.writeVInt(SORTED_WITH_ADDRESSES);
|
||||
|
||||
// write the ord -> byte[] as a binary field
|
||||
addTermsDict(field, values);
|
||||
|
||||
// write the stream of ords as a numeric field
|
||||
// NOTE: we could return an iterator that delta-encodes these within a doc
|
||||
addNumericField(field, ords, NumberType.ORDINAL);
|
||||
|
||||
// write the doc -> ord count as a absolute index to the stream
|
||||
addOrdIndex(field, docToOrdCount);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private SortedSet<LongsRef> uniqueValueSets(Iterable<Number> docToValueCount, Iterable<Number> values) {
|
||||
Set<LongsRef> uniqueValueSet = new HashSet<>();
|
||||
LongsRef docValues = new LongsRef(256);
|
||||
|
||||
Iterator<Number> valueCountIterator = docToValueCount.iterator();
|
||||
Iterator<Number> valueIterator = values.iterator();
|
||||
int totalDictSize = 0;
|
||||
while (valueCountIterator.hasNext()) {
|
||||
docValues.length = valueCountIterator.next().intValue();
|
||||
if (docValues.length > 256) {
|
||||
return null;
|
||||
}
|
||||
for (int i = 0; i < docValues.length; ++i) {
|
||||
docValues.longs[i] = valueIterator.next().longValue();
|
||||
}
|
||||
if (uniqueValueSet.contains(docValues)) {
|
||||
continue;
|
||||
}
|
||||
totalDictSize += docValues.length;
|
||||
if (totalDictSize > 256) {
|
||||
return null;
|
||||
}
|
||||
uniqueValueSet.add(new LongsRef(Arrays.copyOf(docValues.longs, docValues.length), 0, docValues.length));
|
||||
}
|
||||
assert valueIterator.hasNext() == false;
|
||||
return new TreeSet<>(uniqueValueSet);
|
||||
}
|
||||
|
||||
private void writeDictionary(SortedSet<LongsRef> uniqueValueSets) throws IOException {
|
||||
int lengthSum = 0;
|
||||
for (LongsRef longs : uniqueValueSets) {
|
||||
lengthSum += longs.length;
|
||||
}
|
||||
|
||||
meta.writeInt(lengthSum);
|
||||
for (LongsRef valueSet : uniqueValueSets) {
|
||||
for (int i = 0; i < valueSet.length; ++i) {
|
||||
meta.writeLong(valueSet.longs[valueSet.offset + i]);
|
||||
}
|
||||
}
|
||||
|
||||
meta.writeInt(uniqueValueSets.size());
|
||||
for (LongsRef valueSet : uniqueValueSets) {
|
||||
meta.writeInt(valueSet.length);
|
||||
}
|
||||
}
|
||||
|
||||
private Iterable<Number> docToSetId(SortedSet<LongsRef> uniqueValueSets, Iterable<Number> docToValueCount, Iterable<Number> values) {
|
||||
final Map<LongsRef, Integer> setIds = new HashMap<>();
|
||||
int i = 0;
|
||||
for (LongsRef set : uniqueValueSets) {
|
||||
setIds.put(set, i++);
|
||||
}
|
||||
assert i == uniqueValueSets.size();
|
||||
|
||||
return new Iterable<Number>() {
|
||||
|
||||
@Override
|
||||
public Iterator<Number> iterator() {
|
||||
final Iterator<Number> valueCountIterator = docToValueCount.iterator();
|
||||
final Iterator<Number> valueIterator = values.iterator();
|
||||
final LongsRef docValues = new LongsRef(256);
|
||||
return new Iterator<Number>() {
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return valueCountIterator.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Number next() {
|
||||
docValues.length = valueCountIterator.next().intValue();
|
||||
for (int i = 0; i < docValues.length; ++i) {
|
||||
docValues.longs[i] = valueIterator.next().longValue();
|
||||
}
|
||||
final Integer id = setIds.get(docValues);
|
||||
assert id != null;
|
||||
return id;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// writes addressing information as MONOTONIC_COMPRESSED integer
|
||||
private void addOrdIndex(FieldInfo field, Iterable<Number> values) throws IOException {
|
||||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene54DocValuesFormat.NUMERIC);
|
||||
meta.writeVInt(MONOTONIC_COMPRESSED);
|
||||
meta.writeLong(-1L);
|
||||
meta.writeLong(data.getFilePointer());
|
||||
meta.writeVLong(maxDoc);
|
||||
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||
|
||||
final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, data, maxDoc + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||
long addr = 0;
|
||||
writer.add(addr);
|
||||
for (Number v : values) {
|
||||
addr += v.longValue();
|
||||
writer.add(addr);
|
||||
}
|
||||
writer.finish();
|
||||
meta.writeLong(data.getFilePointer());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
boolean success = false;
|
||||
try {
|
||||
if (meta != null) {
|
||||
meta.writeVInt(-1); // write EOF marker
|
||||
CodecUtil.writeFooter(meta); // write checksum
|
||||
}
|
||||
if (data != null) {
|
||||
CodecUtil.writeFooter(data); // write checksum
|
||||
}
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(data, meta);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(data, meta);
|
||||
}
|
||||
meta = data = null;
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -1,186 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene54;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat;
|
||||
import org.apache.lucene.index.DocValuesType;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
import org.apache.lucene.util.packed.DirectWriter;
|
||||
|
||||
/**
|
||||
* Lucene 5.4 DocValues format.
|
||||
* <p>
|
||||
* Encodes the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) with these strategies:
|
||||
* <p>
|
||||
* {@link DocValuesType#NUMERIC NUMERIC}:
|
||||
* <ul>
|
||||
* <li>Delta-compressed: per-document integers written as deltas from the minimum value,
|
||||
* compressed with bitpacking. For more information, see {@link DirectWriter}.
|
||||
* <li>Table-compressed: when the number of unique values is very small (< 256), and
|
||||
* when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
|
||||
* a lookup table is written instead. Each per-document entry is instead the ordinal
|
||||
* to this table, and those ordinals are compressed with bitpacking ({@link DirectWriter}).
|
||||
* <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest
|
||||
* common denominator (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
|
||||
* <li>Monotonic-compressed: when all numbers are monotonically increasing offsets, they are written
|
||||
* as blocks of bitpacked integers, encoding the deviation from the expected delta.
|
||||
* <li>Const-compressed: when there is only one possible non-missing value, only the missing
|
||||
* bitset is encoded.
|
||||
* <li>Sparse-compressed: only documents with a value are stored, and lookups are performed
|
||||
* using binary search.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#BINARY BINARY}:
|
||||
* <ul>
|
||||
* <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
|
||||
* Each document's value can be addressed directly with multiplication ({@code docID * length}).
|
||||
* <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
|
||||
* for each document. The addresses are written as Monotonic-compressed numerics.
|
||||
* <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written
|
||||
* completely and other values sharing prefixes. chunk addresses are written as Monotonic-compressed
|
||||
* numerics. A reverse lookup index is written from a portion of every 1024th term.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#SORTED SORTED}:
|
||||
* <ul>
|
||||
* <li>Sorted: a mapping of ordinals to deduplicated terms is written as Binary,
|
||||
* along with the per-document ordinals written using one of the numeric strategies above.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#SORTED_SET SORTED_SET}:
|
||||
* <ul>
|
||||
* <li>Single: if all documents have 0 or 1 value, then data are written like SORTED.
|
||||
* <li>SortedSet table: when there are few unique sets of values (< 256) then each set is assigned
|
||||
* an id, a lookup table is written and the mapping from document to set id is written using the
|
||||
* numeric strategies above.
|
||||
* <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary,
|
||||
* an ordinal list and per-document index into this list are written using the numeric strategies
|
||||
* above.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}:
|
||||
* <ul>
|
||||
* <li>Single: if all documents have 0 or 1 value, then data are written like NUMERIC.
|
||||
* <li>SortedSet table: when there are few unique sets of values (< 256) then each set is assigned
|
||||
* an id, a lookup table is written and the mapping from document to set id is written using the
|
||||
* numeric strategies above.
|
||||
* <li>SortedNumeric: a value list and per-document index into this list are written using the numeric
|
||||
* strategies above.
|
||||
* </ul>
|
||||
* <p>
|
||||
* Files:
|
||||
* <ol>
|
||||
* <li><tt>.dvd</tt>: DocValues data</li>
|
||||
* <li><tt>.dvm</tt>: DocValues metadata</li>
|
||||
* </ol>
|
||||
* @lucene.experimental
|
||||
* @deprecated Use {@link Lucene70DocValuesFormat}.
|
||||
*/
|
||||
@Deprecated
|
||||
public final class Lucene54DocValuesFormat extends DocValuesFormat {
|
||||
|
||||
/** Sole Constructor */
|
||||
public Lucene54DocValuesFormat() {
|
||||
super("Lucene54");
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
return new Lucene54DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
return new Lucene54DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
|
||||
}
|
||||
|
||||
static final String DATA_CODEC = "Lucene54DocValuesData";
|
||||
static final String DATA_EXTENSION = "dvd";
|
||||
static final String META_CODEC = "Lucene54DocValuesMetadata";
|
||||
static final String META_EXTENSION = "dvm";
|
||||
static final int VERSION_START = 0;
|
||||
static final int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
// indicates docvalues type
|
||||
static final byte NUMERIC = 0;
|
||||
static final byte BINARY = 1;
|
||||
static final byte SORTED = 2;
|
||||
static final byte SORTED_SET = 3;
|
||||
static final byte SORTED_NUMERIC = 4;
|
||||
|
||||
// address terms in blocks of 16 terms
|
||||
static final int INTERVAL_SHIFT = 4;
|
||||
static final int INTERVAL_COUNT = 1 << INTERVAL_SHIFT;
|
||||
static final int INTERVAL_MASK = INTERVAL_COUNT - 1;
|
||||
|
||||
// build reverse index from every 1024th term
|
||||
static final int REVERSE_INTERVAL_SHIFT = 10;
|
||||
static final int REVERSE_INTERVAL_COUNT = 1 << REVERSE_INTERVAL_SHIFT;
|
||||
static final int REVERSE_INTERVAL_MASK = REVERSE_INTERVAL_COUNT - 1;
|
||||
|
||||
// for conversion from reverse index to block
|
||||
static final int BLOCK_INTERVAL_SHIFT = REVERSE_INTERVAL_SHIFT - INTERVAL_SHIFT;
|
||||
static final int BLOCK_INTERVAL_COUNT = 1 << BLOCK_INTERVAL_SHIFT;
|
||||
static final int BLOCK_INTERVAL_MASK = BLOCK_INTERVAL_COUNT - 1;
|
||||
|
||||
/** Compressed using packed blocks of ints. */
|
||||
static final int DELTA_COMPRESSED = 0;
|
||||
/** Compressed by computing the GCD. */
|
||||
static final int GCD_COMPRESSED = 1;
|
||||
/** Compressed by giving IDs to unique values. */
|
||||
static final int TABLE_COMPRESSED = 2;
|
||||
/** Compressed with monotonically increasing values */
|
||||
static final int MONOTONIC_COMPRESSED = 3;
|
||||
/** Compressed with constant value (uses only missing bitset) */
|
||||
static final int CONST_COMPRESSED = 4;
|
||||
/** Compressed with sparse arrays. */
|
||||
static final int SPARSE_COMPRESSED = 5;
|
||||
|
||||
/** Uncompressed binary, written directly (fixed length). */
|
||||
static final int BINARY_FIXED_UNCOMPRESSED = 0;
|
||||
/** Uncompressed binary, written directly (variable length). */
|
||||
static final int BINARY_VARIABLE_UNCOMPRESSED = 1;
|
||||
/** Compressed binary with shared prefixes */
|
||||
static final int BINARY_PREFIX_COMPRESSED = 2;
|
||||
|
||||
/** Standard storage for sorted set values with 1 level of indirection:
|
||||
* {@code docId -> address -> ord}. */
|
||||
static final int SORTED_WITH_ADDRESSES = 0;
|
||||
/** Single-valued sorted set values, encoded as sorted values, so no level
|
||||
* of indirection: {@code docId -> ord}. */
|
||||
static final int SORTED_SINGLE_VALUED = 1;
|
||||
/** Compressed giving IDs to unique sets of values:
|
||||
* {@code docId -> setId -> ords} */
|
||||
static final int SORTED_SET_TABLE = 2;
|
||||
|
||||
/** placeholder for missing offset that means there are no missing values */
|
||||
static final int ALL_LIVE = -1;
|
||||
/** placeholder for missing offset that means all values are missing */
|
||||
static final int ALL_MISSING = -2;
|
||||
|
||||
// addressing uses 16k blocks
|
||||
static final int MONOTONIC_BLOCK_SIZE = 16384;
|
||||
static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16;
|
||||
}
|
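To make the numeric strategies described in the javadoc above concrete, here is a minimal standalone sketch of the GCD-compressed idea: values that share a large common divisor (day-granularity timestamps, for example) reduce to small quotients that a bitpacker such as DirectWriter can store compactly. The class below is hypothetical and illustrative only; it is not the Lucene54 consumer.

import java.math.BigInteger;
import java.util.Arrays;

// Illustrative sketch of the "GCD-compressed" numeric strategy: compute the
// greatest common divisor of the deltas from the minimum value, then store
// only the small quotients (the real format bitpacks them with DirectWriter).
public class GcdCompressionSketch {
  public static void main(String[] args) {
    long[] values = { 86400000L * 10, 86400000L * 12, 86400000L * 47 };  // day-aligned millis
    long min = Arrays.stream(values).min().getAsLong();

    long gcd = 0;
    for (long v : values) {
      gcd = BigInteger.valueOf(gcd).gcd(BigInteger.valueOf(v - min)).longValue();
    }

    for (long v : values) {
      long quotient = gcd == 0 ? 0 : (v - min) / gcd;
      System.out.println(v + " -> min + " + quotient + " * " + gcd);
    }
  }
}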
File diff suppressed because it is too large
|
@ -1,403 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Lucene 5.4 file format.
|
||||
*
|
||||
* <h1>Apache Lucene - Index File Formats</h1>
|
||||
* <div>
|
||||
* <ul>
|
||||
* <li><a href="#Introduction">Introduction</a></li>
|
||||
* <li><a href="#Definitions">Definitions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Inverted_Indexing">Inverted Indexing</a></li>
|
||||
* <li><a href="#Types_of_Fields">Types of Fields</a></li>
|
||||
* <li><a href="#Segments">Segments</a></li>
|
||||
* <li><a href="#Document_Numbers">Document Numbers</a></li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* <li><a href="#Overview">Index Structure Overview</a></li>
|
||||
* <li><a href="#File_Naming">File Naming</a></li>
|
||||
* <li><a href="#file-names">Summary of File Extensions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Lock_File">Lock File</a></li>
|
||||
* <li><a href="#History">History</a></li>
|
||||
* <li><a href="#Limitations">Limitations</a></li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* </ul>
|
||||
* </div>
|
||||
* <a name="Introduction"></a>
|
||||
* <h2>Introduction</h2>
|
||||
* <div>
|
||||
* <p>This document defines the index file formats used in this version of Lucene.
|
||||
* If you are using a different version of Lucene, please consult the copy of
|
||||
* <code>docs/</code> that was distributed with
|
||||
* the version you are using.</p>
|
||||
* <p>Apache Lucene is written in Java, but several efforts are underway to write
|
||||
* <a href="http://wiki.apache.org/lucene-java/LuceneImplementations">versions of
|
||||
* Lucene in other programming languages</a>. If these versions are to remain
|
||||
* compatible with Apache Lucene, then a language-independent definition of the
|
||||
* Lucene index format is required. This document thus attempts to provide a
|
||||
* complete and independent definition of the Apache Lucene file formats.</p>
|
||||
* <p>As Lucene evolves, this document should evolve. Versions of Lucene in
|
||||
* different programming languages should endeavor to agree on file formats, and
|
||||
* generate new versions of this document.</p>
|
||||
* </div>
|
||||
* <a name="Definitions"></a>
|
||||
* <h2>Definitions</h2>
|
||||
* <div>
|
||||
* <p>The fundamental concepts in Lucene are index, document, field and term.</p>
|
||||
* <p>An index contains a sequence of documents.</p>
|
||||
* <ul>
|
||||
* <li>A document is a sequence of fields.</li>
|
||||
* <li>A field is a named sequence of terms.</li>
|
||||
* <li>A term is a sequence of bytes.</li>
|
||||
* </ul>
|
||||
* <p>The same sequence of bytes in two different fields is considered a different
|
||||
* term. Thus terms are represented as a pair: the string naming the field, and the
|
||||
* bytes within the field.</p>
|
||||
* <a name="Inverted_Indexing"></a>
|
||||
* <h3>Inverted Indexing</h3>
|
||||
* <p>The index stores statistics about terms in order to make term-based search
|
||||
* more efficient. Lucene's index falls into the family of indexes known as an
|
||||
* <i>inverted index.</i> This is because it can list, for a term, the documents
|
||||
* that contain it. This is the inverse of the natural relationship, in which
|
||||
* documents list terms.</p>
|
||||
* <a name="Types_of_Fields"></a>
|
||||
* <h3>Types of Fields</h3>
|
||||
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored
|
||||
* in the index literally, in a non-inverted manner. Fields that are inverted are
|
||||
* called <i>indexed</i>. A field may be both stored and indexed.</p>
|
||||
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the
|
||||
* text of a field may be used literally as a term to be indexed. Most fields are
|
||||
* tokenized, but sometimes it is useful for certain identifier fields to be
|
||||
* indexed literally.</p>
|
||||
* <p>See the {@link org.apache.lucene.document.Field Field}
|
||||
* java docs for more information on Fields.</p>
|
||||
* <a name="Segments"></a>
|
||||
* <h3>Segments</h3>
|
||||
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>.
|
||||
* Each segment is a fully independent index, which could be searched separately.
|
||||
* Indexes evolve by:</p>
|
||||
* <ol>
|
||||
* <li>Creating new segments for newly added documents.</li>
|
||||
* <li>Merging existing segments.</li>
|
||||
* </ol>
|
||||
* <p>Searches may involve multiple segments and/or multiple indexes, each index
|
||||
* potentially composed of a set of segments.</p>
|
||||
* <a name="Document_Numbers"></a>
|
||||
* <h3>Document Numbers</h3>
|
||||
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>.
|
||||
* The first document added to an index is numbered zero, and each subsequent
|
||||
* document added gets a number one greater than the previous.</p>
|
||||
* <p>Note that a document's number may change, so caution should be taken when
|
||||
* storing these numbers outside of Lucene. In particular, numbers may change in
|
||||
* the following situations:</p>
|
||||
* <ul>
|
||||
* <li>
|
||||
* <p>The numbers stored in each segment are unique only within the segment, and
|
||||
* must be converted before they can be used in a larger context. The standard
|
||||
* technique is to allocate each segment a range of values, based on the range of
|
||||
* numbers used in that segment. To convert a document number from a segment to an
|
||||
* external value, the segment's <i>base</i> document number is added. To convert
|
||||
* an external value back to a segment-specific value, the segment is identified
|
||||
* by the range that the external value is in, and the segment's base value is
|
||||
* subtracted. For example two five document segments might be combined, so that
|
||||
* the first segment has a base value of zero, and the second of five. Document
|
||||
* three from the second segment would have an external value of eight.</p>
|
||||
* </li>
|
||||
* <li>
|
||||
* <p>When documents are deleted, gaps are created in the numbering. These are
|
||||
* eventually removed as the index evolves through merging. Deleted documents are
|
||||
* dropped when segments are merged. A freshly-merged segment thus has no gaps in
|
||||
* its numbering.</p>
|
||||
* </li>
|
||||
* </ul>
|
||||
* </div>
|
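The base-offset arithmetic described above can be sketched in a few lines; the class and method names below are hypothetical, not Lucene API, and simply reproduce the worked example of two five-document segments.

import java.util.Arrays;

// Sketch of converting between segment-local and index-wide document numbers.
// Two five-document segments have bases {0, 5}; local doc 3 of the second
// segment therefore has the external number 3 + 5 = 8.
public class DocNumberSketch {
  static int toExternal(int segment, int localDoc, int[] bases) {
    return bases[segment] + localDoc;
  }

  static int[] toSegmentLocal(int externalDoc, int[] bases, int[] sizes) {
    for (int i = 0; i < bases.length; i++) {
      if (externalDoc < bases[i] + sizes[i]) {
        return new int[] { i, externalDoc - bases[i] };
      }
    }
    throw new IllegalArgumentException("doc out of range: " + externalDoc);
  }

  public static void main(String[] args) {
    int[] bases = { 0, 5 };
    int[] sizes = { 5, 5 };
    System.out.println(toExternal(1, 3, bases));                          // 8
    System.out.println(Arrays.toString(toSegmentLocal(8, bases, sizes))); // [1, 3]
  }
}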
||||
* <a name="Overview"></a>
|
||||
* <h2>Index Structure Overview</h2>
|
||||
* <div>
|
||||
* <p>Each segment index maintains the following:</p>
|
||||
* <ul>
|
||||
* <li>
|
||||
* {@link org.apache.lucene.codecs.lucene62.Lucene62SegmentInfoFormat Segment info}.
|
||||
* This contains metadata about a segment, such as the number of documents,
|
||||
 *   and what files it uses.
|
||||
* </li>
|
||||
* <li>
|
||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Field names}.
|
||||
* This contains the set of field names used in the index.
|
||||
* </li>
|
||||
* <li>
|
||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Stored Field values}.
|
||||
* This contains, for each document, a list of attribute-value pairs, where the attributes
|
||||
* are field names. These are used to store auxiliary information about the document, such as
|
||||
 *   its title, url, or an identifier to access a database. The set of stored fields is what is
|
||||
* returned for each hit when searching. This is keyed by document number.
|
||||
* </li>
|
||||
* <li>
|
||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term dictionary}.
|
||||
* A dictionary containing all of the terms used in all of the
|
||||
* indexed fields of all of the documents. The dictionary also contains the number
|
||||
* of documents which contain the term, and pointers to the term's frequency and
|
||||
* proximity data.
|
||||
* </li>
|
||||
* <li>
|
||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Frequency data}.
|
||||
* For each term in the dictionary, the numbers of all the
|
||||
* documents that contain that term, and the frequency of the term in that
|
||||
* document, unless frequencies are omitted (IndexOptions.DOCS_ONLY)
|
||||
* </li>
|
||||
* <li>
|
||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Proximity data}.
|
||||
* For each term in the dictionary, the positions that the
|
||||
* term occurs in each document. Note that this will not exist if all fields in
|
||||
* all documents omit position data.
|
||||
* </li>
|
||||
* <li>
|
||||
* {@link org.apache.lucene.codecs.lucene53.Lucene53NormsFormat Normalization factors}.
|
||||
* For each field in each document, a value is stored
|
||||
* that is multiplied into the score for hits on that field.
|
||||
* </li>
|
||||
* <li>
|
||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}.
|
||||
* For each field in each document, the term vector (sometimes
|
||||
* called document vector) may be stored. A term vector consists of term text and
|
||||
* term frequency. To add Term Vectors to your index see the
|
||||
* {@link org.apache.lucene.document.Field Field} constructors
|
||||
* </li>
|
||||
* <li>
|
||||
* {@link org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat Per-document values}.
|
||||
* Like stored values, these are also keyed by document
|
||||
* number, but are generally intended to be loaded into main memory for fast
|
||||
* access. Whereas stored values are generally intended for summary results from
|
||||
* searches, per-document values are useful for things like scoring factors.
|
||||
* </li>
|
||||
* <li>
|
||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
|
||||
* An optional file indicating which documents are live.
|
||||
* </li>
|
||||
* </ul>
|
||||
* <p>Details on each of these are provided in their linked pages.</p>
|
||||
* </div>
|
||||
* <a name="File_Naming"></a>
|
||||
* <h2>File Naming</h2>
|
||||
* <div>
|
||||
* <p>All files belonging to a segment have the same name with varying extensions.
|
||||
* The extensions correspond to the different file formats described below. When
|
||||
* using the Compound File format (default in 1.4 and greater) these files (except
|
||||
* for the Segment info file, the Lock file, and Deleted documents file) are collapsed
|
||||
* into a single .cfs file (see below for details)</p>
|
||||
* <p>Typically, all segments in an index are stored in a single directory,
|
||||
* although this is not required.</p>
|
||||
* <p>As of version 2.1 (lock-less commits), file names are never re-used.
|
||||
* That is, when any file is saved
|
||||
* to the Directory it is given a never before used filename. This is achieved
|
||||
* using a simple generations approach. For example, the first segments file is
|
||||
* segments_1, then segments_2, etc. The generation is a sequential long integer
|
||||
* represented in alpha-numeric (base 36) form.</p>
|
||||
* </div>
|
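The generation-based naming above can be demonstrated with a one-line helper; this is a standalone sketch, not Lucene code, but it follows the stated convention of rendering the sequential generation in base 36.

// Sketch of the base-36 generation naming used for segments_N files.
public class SegmentsFileNameSketch {
  static String segmentsFileName(long generation) {
    return "segments_" + Long.toString(generation, Character.MAX_RADIX);  // radix 36
  }

  public static void main(String[] args) {
    for (long gen : new long[] { 1, 2, 10, 36, 100 }) {
      System.out.println(gen + " -> " + segmentsFileName(gen));
    }
    // segments_1, segments_2, segments_a, segments_10, segments_2s
  }
}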
||||
* <a name="file-names"></a>
|
||||
* <h2>Summary of File Extensions</h2>
|
||||
* <div>
|
||||
* <p>The following table summarizes the names and extensions of the files in
|
||||
* Lucene:</p>
|
||||
* <table cellspacing="1" cellpadding="4" summary="lucene filenames by extension">
|
||||
* <tr>
|
||||
* <th>Name</th>
|
||||
* <th>Extension</th>
|
||||
* <th>Brief Description</th>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
|
||||
* <td>segments_N</td>
|
||||
* <td>Stores information about a commit point</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td><a href="#Lock_File">Lock File</a></td>
|
||||
* <td>write.lock</td>
|
||||
* <td>The Write lock prevents multiple IndexWriters from writing to the same
|
||||
* file.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene62.Lucene62SegmentInfoFormat Segment Info}</td>
|
||||
* <td>.si</td>
|
||||
* <td>Stores metadata about a segment</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File}</td>
|
||||
* <td>.cfs, .cfe</td>
|
||||
* <td>An optional "virtual" file consisting of all the other index files for
|
||||
* systems that frequently run out of file handles.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Fields}</td>
|
||||
* <td>.fnm</td>
|
||||
* <td>Stores information about the fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Index}</td>
|
||||
* <td>.fdx</td>
|
||||
* <td>Contains pointers to field data</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Data}</td>
|
||||
* <td>.fdt</td>
|
||||
* <td>The stored fields for documents</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Dictionary}</td>
|
||||
* <td>.tim</td>
|
||||
* <td>The term dictionary, stores term info</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Index}</td>
|
||||
* <td>.tip</td>
|
||||
* <td>The index into the Term Dictionary</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Frequencies}</td>
|
||||
* <td>.doc</td>
|
||||
* <td>Contains the list of docs which contain each term along with frequency</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Positions}</td>
|
||||
* <td>.pos</td>
|
||||
* <td>Stores position information about where a term occurs in the index</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Payloads}</td>
|
||||
* <td>.pay</td>
|
||||
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene53.Lucene53NormsFormat Norms}</td>
|
||||
* <td>.nvd, .nvm</td>
|
||||
* <td>Encodes length and boost factors for docs and fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat Per-Document Values}</td>
|
||||
* <td>.dvd, .dvm</td>
|
||||
* <td>Encodes additional scoring factors or other per-document information.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index}</td>
|
||||
* <td>.tvx</td>
|
||||
* <td>Stores offset into the document data file</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Documents}</td>
|
||||
* <td>.tvd</td>
|
||||
* <td>Contains information about each document that has term vectors</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Fields}</td>
|
||||
* <td>.tvf</td>
|
||||
* <td>The field level info about term vectors</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}</td>
|
||||
* <td>.liv</td>
|
||||
 * <td>Info about what documents are live</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
* </div>
|
||||
* <a name="Lock_File"></a>
|
||||
* <h2>Lock File</h2>
|
||||
* The write lock, which is stored in the index directory by default, is named
|
||||
* "write.lock". If the lock directory is different from the index directory then
|
||||
* the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
|
||||
* derived from the full path to the index directory. When this file is present, a
|
||||
* writer is currently modifying the index (adding or removing documents). This
|
||||
* lock file ensures that only one writer is modifying the index at a time.
|
||||
* <a name="History"></a>
|
||||
* <h2>History</h2>
|
||||
* <p>Compatibility notes are provided in this document, describing how file
|
||||
* formats have changed from prior versions:</p>
|
||||
* <ul>
|
||||
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie,
|
||||
* no more commit lock). The change is fully backwards compatible: you can open a
|
||||
* pre-2.1 index for searching or adding/deleting of docs. When the new segments
|
||||
* file is saved (committed), it will be written in the new file format (meaning
|
||||
* no specific "upgrade" process is needed). But note that once a commit has
|
||||
* occurred, pre-2.1 Lucene will not be able to read the index.</li>
|
||||
* <li>In version 2.3, the file format was changed to allow segments to share a
|
||||
* single set of doc store (vectors & stored fields) files. This allows for
|
||||
* faster indexing in certain cases. The change is fully backwards compatible (in
|
||||
* the same way as the lock-less commits change in 2.1).</li>
|
||||
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not
|
||||
* Java's modified UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">
|
||||
* LUCENE-510</a> for details.</li>
|
||||
* <li>In version 2.9, an optional opaque Map<String,String> CommitUserData
|
||||
* may be passed to IndexWriter's commit methods (and later retrieved), which is
|
||||
* recorded in the segments_N file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">
|
||||
* LUCENE-1382</a> for details. Also,
|
||||
* diagnostics were added to each segment written recording details about why it
|
||||
* was written (due to flush, merge; which OS/JRE was used; etc.). See issue
|
||||
* <a href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.</li>
|
||||
* <li>In version 3.0, compressed fields are no longer written to the index (they
|
||||
* can still be read, but on merge the new segment will write them, uncompressed).
|
||||
* See issue <a href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a>
|
||||
* for details.</li>
|
||||
 * <li>In version 3.1, segments record the code version that created them. See
|
||||
* <a href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
|
||||
* Additionally segments track explicitly whether or not they have term vectors.
|
||||
* See <a href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a>
|
||||
* for details.</li>
|
||||
 * <li>In version 3.2, numeric fields are written natively to the stored fields
|
||||
 * file; previously they were stored in text format only.</li>
|
||||
* <li>In version 3.4, fields can omit position data while still indexing term
|
||||
* frequencies.</li>
|
||||
* <li>In version 4.0, the format of the inverted index became extensible via
|
||||
* the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
|
||||
* ({@code DocValues}) was introduced. Normalization factors need no longer be a
|
||||
* single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
|
||||
* Terms need not be unicode strings, they can be any byte sequence. Term offsets
|
||||
* can optionally be indexed into the postings lists. Payloads can be stored in the
|
||||
* term vectors.</li>
|
||||
* <li>In version 4.1, the format of the postings list changed to use either
|
||||
* of FOR compression or variable-byte encoding, depending upon the frequency
|
||||
* of the term. Terms appearing only once were changed to inline directly into
|
||||
* the term dictionary. Stored fields are compressed by default. </li>
|
||||
* <li>In version 4.2, term vectors are compressed by default. DocValues has
|
||||
 * a new multi-valued type (SortedSet) that can be used for faceting/grouping/joining
|
||||
* on multi-valued fields.</li>
|
||||
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.</li>
|
||||
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
|
||||
* allow updating NumericDocValues fields.</li>
|
||||
* <li>In version 4.8, checksum footers were added to the end of each index file
|
||||
* for improved data integrity. Specifically, the last 8 bytes of every index file
|
||||
* contain the zlib-crc32 checksum of the file.</li>
|
||||
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
|
||||
* that is suitable for faceting/sorting/analytics.
|
||||
* <li>In version 5.4, DocValues have been improved to store more information on disk:
|
||||
* addresses for binary fields and ord indexes for multi-valued fields.
|
||||
* </li>
|
||||
* </ul>
|
||||
* <a name="Limitations"></a>
|
||||
* <h2>Limitations</h2>
|
||||
* <div>
|
||||
* <p>Lucene uses a Java <code>int</code> to refer to
|
||||
* document numbers, and the index file format uses an <code>Int32</code>
|
||||
* on-disk to store document numbers. This is a limitation
|
||||
* of both the index file format and the current implementation. Eventually these
|
||||
* should be replaced with either <code>UInt64</code> values, or
|
||||
* better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.</p>
|
||||
* </div>
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene54;
|
|
@ -1,177 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene60;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompoundFormat;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.NormsFormat;
|
||||
import org.apache.lucene.codecs.PointsFormat;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene53.Lucene53NormsFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
||||
|
||||
/**
|
||||
* Implements the Lucene 6.0 index format, with configurable per-field postings
|
||||
* and docvalues formats.
|
||||
* <p>
|
||||
* If you want to reuse functionality of this codec in another codec, extend
|
||||
* {@link FilterCodec}.
|
||||
*
|
||||
* @see org.apache.lucene.codecs.lucene60 package documentation for file format details.
|
||||
*
|
||||
* @lucene.experimental
|
||||
* @deprecated Only for 6.0 back compat
|
||||
*/
|
||||
@Deprecated
|
||||
public class Lucene60Codec extends Codec {
|
||||
private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
|
||||
private final FieldInfosFormat fieldInfosFormat = new Lucene60FieldInfosFormat();
|
||||
private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
|
||||
private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
|
||||
private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();
|
||||
|
||||
private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
|
||||
@Override
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
return Lucene60Codec.this.getPostingsFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
|
||||
@Override
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
return Lucene60Codec.this.getDocValuesFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final StoredFieldsFormat storedFieldsFormat;
|
||||
|
||||
/**
|
||||
* Instantiates a new codec.
|
||||
*/
|
||||
public Lucene60Codec() {
|
||||
this(Mode.BEST_SPEED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new codec, specifying the stored fields compression
|
||||
* mode to use.
|
||||
* @param mode stored fields compression mode to use for newly
|
||||
* flushed/merged segments.
|
||||
*/
|
||||
public Lucene60Codec(Mode mode) {
|
||||
super("Lucene60");
|
||||
this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final StoredFieldsFormat storedFieldsFormat() {
|
||||
return storedFieldsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final TermVectorsFormat termVectorsFormat() {
|
||||
return vectorsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final PostingsFormat postingsFormat() {
|
||||
return postingsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final FieldInfosFormat fieldInfosFormat() {
|
||||
return fieldInfosFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final LiveDocsFormat liveDocsFormat() {
|
||||
return liveDocsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final CompoundFormat compoundFormat() {
|
||||
return compoundFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final PointsFormat pointsFormat() {
|
||||
return new Lucene60PointsFormat();
|
||||
}
|
||||
|
||||
/** Returns the postings format that should be used for writing
|
||||
* new segments of <code>field</code>.
|
||||
*
|
||||
* The default implementation always returns "Lucene50".
|
||||
* <p>
|
||||
* <b>WARNING:</b> if you subclass, you are responsible for index
|
||||
   * backwards compatibility: future versions of Lucene are only
|
||||
* guaranteed to be able to read the default implementation.
|
||||
*/
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
return defaultFormat;
|
||||
}
|
||||
|
||||
/** Returns the docvalues format that should be used for writing
|
||||
* new segments of <code>field</code>.
|
||||
*
|
||||
* The default implementation always returns "Lucene54".
|
||||
* <p>
|
||||
* <b>WARNING:</b> if you subclass, you are responsible for index
|
||||
   * backwards compatibility: future versions of Lucene are only
|
||||
* guaranteed to be able to read the default implementation.
|
||||
*/
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
return defaultDVFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final DocValuesFormat docValuesFormat() {
|
||||
return docValuesFormat;
|
||||
}
|
||||
|
||||
private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene50");
|
||||
private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene54");
|
||||
|
||||
private final NormsFormat normsFormat = new Lucene53NormsFormat();
|
||||
|
||||
@Override
|
||||
public NormsFormat normsFormat() {
|
||||
return normsFormat;
|
||||
}
|
||||
}
|
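The two per-field hooks above are the intended extension points of this codec. A hedged sketch of overriding one of them follows; "MyFastFormat" and the field name "popularity" are placeholders, not formats or fields that ship with Lucene, and in practice one would usually override the current default codec rather than this deprecated back-compat one. The backwards-compatibility warning in the javadoc applies.

import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.lucene60.Lucene60Codec;

// Hypothetical subclass routing a single field to a different docvalues format.
public class PerFieldExampleCodec extends Lucene60Codec {
  private final DocValuesFormat fastFormat = DocValuesFormat.forName("MyFastFormat");

  @Override
  public DocValuesFormat getDocValuesFormatForField(String field) {
    if ("popularity".equals(field)) {
      return fastFormat;                              // custom format for this field
    }
    return super.getDocValuesFormatForField(field);   // default: "Lucene54"
  }
}

The subclass would then be installed through IndexWriterConfig.setCodec before opening the IndexWriter.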
|
@ -1,25 +0,0 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
Lucene 6.0 file format.
|
||||
</body>
|
||||
</html>
|
|
@ -1,176 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene62;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompoundFormat;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.NormsFormat;
|
||||
import org.apache.lucene.codecs.PointsFormat;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
|
||||
import org.apache.lucene.codecs.lucene53.Lucene53NormsFormat;
|
||||
import org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.lucene60.Lucene60PointsFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
||||
|
||||
/**
|
||||
* Implements the Lucene 6.2 index format, with configurable per-field postings
|
||||
* and docvalues formats.
|
||||
* <p>
|
||||
* If you want to reuse functionality of this codec in another codec, extend
|
||||
* {@link FilterCodec}.
|
||||
*
|
||||
* @see org.apache.lucene.codecs.lucene60 package documentation for file format details.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class Lucene62Codec extends Codec {
|
||||
private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
|
||||
private final FieldInfosFormat fieldInfosFormat = new Lucene60FieldInfosFormat();
|
||||
private final SegmentInfoFormat segmentInfosFormat = new Lucene62SegmentInfoFormat();
|
||||
private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
|
||||
private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();
|
||||
|
||||
private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
|
||||
@Override
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
return Lucene62Codec.this.getPostingsFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
|
||||
@Override
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
return Lucene62Codec.this.getDocValuesFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final StoredFieldsFormat storedFieldsFormat;
|
||||
|
||||
/**
|
||||
* Instantiates a new codec.
|
||||
*/
|
||||
public Lucene62Codec() {
|
||||
this(Mode.BEST_SPEED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new codec, specifying the stored fields compression
|
||||
* mode to use.
|
||||
* @param mode stored fields compression mode to use for newly
|
||||
* flushed/merged segments.
|
||||
*/
|
||||
public Lucene62Codec(Mode mode) {
|
||||
super("Lucene62");
|
||||
this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final StoredFieldsFormat storedFieldsFormat() {
|
||||
return storedFieldsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final TermVectorsFormat termVectorsFormat() {
|
||||
return vectorsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final PostingsFormat postingsFormat() {
|
||||
return postingsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final FieldInfosFormat fieldInfosFormat() {
|
||||
return fieldInfosFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final LiveDocsFormat liveDocsFormat() {
|
||||
return liveDocsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final CompoundFormat compoundFormat() {
|
||||
return compoundFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final PointsFormat pointsFormat() {
|
||||
return new Lucene60PointsFormat();
|
||||
}
|
||||
|
||||
/** Returns the postings format that should be used for writing
|
||||
* new segments of <code>field</code>.
|
||||
*
|
||||
* The default implementation always returns "Lucene50".
|
||||
* <p>
|
||||
* <b>WARNING:</b> if you subclass, you are responsible for index
|
||||
   * backwards compatibility: future versions of Lucene are only
|
||||
* guaranteed to be able to read the default implementation.
|
||||
*/
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
return defaultFormat;
|
||||
}
|
||||
|
||||
/** Returns the docvalues format that should be used for writing
|
||||
* new segments of <code>field</code>.
|
||||
*
|
||||
* The default implementation always returns "Lucene54".
|
||||
* <p>
|
||||
* <b>WARNING:</b> if you subclass, you are responsible for index
|
||||
   * backwards compatibility: future versions of Lucene are only
|
||||
* guaranteed to be able to read the default implementation.
|
||||
*/
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
return defaultDVFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final DocValuesFormat docValuesFormat() {
|
||||
return docValuesFormat;
|
||||
}
|
||||
|
||||
private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene50");
|
||||
private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene54");
|
||||
|
||||
private final NormsFormat normsFormat = new Lucene53NormsFormat();
|
||||
|
||||
@Override
|
||||
public NormsFormat normsFormat() {
|
||||
return normsFormat;
|
||||
}
|
||||
}
|
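The Mode parameter accepted by the constructor above selects the stored-fields compression trade-off (BEST_SPEED by default). A hedged usage sketch, assuming this back-compat codec is still on the classpath:

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.lucene62.Lucene62Codec;
import org.apache.lucene.index.IndexWriterConfig;

// Sketch: choose smaller stored fields (.fdt) at the cost of slower writes.
public class StoredFieldsModeSketch {
  public static void main(String[] args) {
    Codec codec = new Lucene62Codec(Mode.BEST_COMPRESSION);
    IndexWriterConfig config = new IndexWriterConfig();
    config.setCodec(codec);
    System.out.println(config.getCodec().getName());
  }
}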
|
@ -1,25 +0,0 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
Lucene 6.2 file format.
|
||||
</body>
|
||||
</html>
|
|
@ -1,138 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.index;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.SuppressForbidden;
|
||||
|
||||
/**
|
||||
* Command-line tool that reads from a source index and
|
||||
* writes to a dest index, correcting any broken offsets
|
||||
* in the process.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class FixBrokenOffsets {
|
||||
public SegmentInfos infos;
|
||||
|
||||
FSDirectory fsDir;
|
||||
|
||||
Path dir;
|
||||
|
||||
@SuppressForbidden(reason = "System.out required: command line tool")
|
||||
public static void main(String[] args) throws IOException {
|
||||
if (args.length < 2) {
|
||||
System.err.println("Usage: FixBrokenOffsetse <srcDir> <destDir>");
|
||||
return;
|
||||
}
|
||||
Path srcPath = Paths.get(args[0]);
|
||||
if (!Files.exists(srcPath)) {
|
||||
throw new RuntimeException("srcPath " + srcPath.toAbsolutePath() + " doesn't exist");
|
||||
}
|
||||
Path destPath = Paths.get(args[1]);
|
||||
if (Files.exists(destPath)) {
|
||||
throw new RuntimeException("destPath " + destPath.toAbsolutePath() + " already exists; please remove it and re-run");
|
||||
}
|
||||
Directory srcDir = FSDirectory.open(srcPath);
|
||||
DirectoryReader reader = DirectoryReader.open(srcDir);
|
||||
|
||||
List<LeafReaderContext> leaves = reader.leaves();
|
||||
CodecReader[] filtered = new CodecReader[leaves.size()];
|
||||
for(int i=0;i<leaves.size();i++) {
|
||||
filtered[i] = SlowCodecReaderWrapper.wrap(new FilterLeafReader(leaves.get(i).reader()) {
|
||||
@Override
|
||||
public Fields getTermVectors(int docID) throws IOException {
|
||||
Fields termVectors = in.getTermVectors(docID);
|
||||
if (termVectors == null) {
|
||||
return null;
|
||||
}
|
||||
return new FilterFields(termVectors) {
|
||||
@Override
|
||||
public Terms terms(String field) throws IOException {
|
||||
return new FilterTerms(super.terms(field)) {
|
||||
@Override
|
||||
public TermsEnum iterator() throws IOException {
|
||||
return new FilterTermsEnum(super.iterator()) {
|
||||
@Override
|
||||
public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
|
||||
return new FilterPostingsEnum(super.postings(reuse, flags)) {
|
||||
int nextLastStartOffset = 0;
|
||||
int lastStartOffset = 0;
|
||||
|
||||
@Override
|
||||
public int nextPosition() throws IOException {
|
||||
int pos = super.nextPosition();
|
||||
lastStartOffset = nextLastStartOffset;
|
||||
nextLastStartOffset = startOffset();
|
||||
return pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
int offset = super.startOffset();
|
||||
if (offset < lastStartOffset) {
|
||||
offset = lastStartOffset;
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
int offset = super.endOffset();
|
||||
if (offset < lastStartOffset) {
|
||||
offset = lastStartOffset;
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public CacheHelper getCoreCacheHelper() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CacheHelper getReaderCacheHelper() {
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
Directory destDir = FSDirectory.open(destPath);
|
||||
// We need to maintain the same major version
|
||||
int createdMajor = SegmentInfos.readLatestCommit(srcDir).getIndexCreatedVersionMajor();
|
||||
new SegmentInfos(createdMajor).commit(destDir);
|
||||
IndexWriter writer = new IndexWriter(destDir, new IndexWriterConfig());
|
||||
writer.addIndexes(filtered);
|
||||
IOUtils.close(writer, reader, srcDir, destDir);
|
||||
}
|
||||
}
|
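A hedged invocation sketch for the tool above; the paths are placeholders, and the destination directory must not exist yet, as the checks in main() enforce.

import org.apache.lucene.index.FixBrokenOffsets;

// Programmatic invocation of the command-line tool with placeholder paths.
public class FixOffsetsExample {
  public static void main(String[] args) throws Exception {
    FixBrokenOffsets.main(new String[] { "/tmp/srcIndex", "/tmp/destIndex" });
  }
}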
|
@ -1,27 +0,0 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<!-- not a package-info.java, because we already defined this package in core/ -->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<title>Tools for handling backwards compatibility issues with indices.</title>
|
||||
</head>
|
||||
<body>
|
||||
Tools for handling backwards compatibility issues with indices.
|
||||
</body>
|
||||
</html>
|
|
@ -13,5 +13,3 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
org.apache.lucene.codecs.lucene60.Lucene60Codec
|
||||
org.apache.lucene.codecs.lucene62.Lucene62Codec
|
||||
|
|
|
@ -13,4 +13,3 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat
|
||||
|
|
|
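The service entries above register the codec and docvalues format classes for name-based lookup. A minimal sketch of that lookup, assuming the module containing these classes (backward-codecs in 7.0) is on the classpath:

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;

// SPI lookup: resolves the classes registered in the META-INF/services files above.
public class CodecLookupSketch {
  public static void main(String[] args) {
    Codec codec = Codec.forName("Lucene62");
    DocValuesFormat dvFormat = DocValuesFormat.forName("Lucene54");
    System.out.println(codec.getName() + " / " + dvFormat.getName());
  }
}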
@ -1,125 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene50;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentInfo; // javadocs
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Read-write version of 5.0 SegmentInfoFormat for testing
|
||||
* @deprecated for test purposes only
|
||||
*/
|
||||
@Deprecated
|
||||
public class Lucene50RWSegmentInfoFormat extends Lucene50SegmentInfoFormat {
|
||||
|
||||
/** Sole constructor. */
|
||||
public Lucene50RWSegmentInfoFormat() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) throws IOException {
|
||||
final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene50SegmentInfoFormat.SI_EXTENSION);
|
||||
try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) {
|
||||
Throwable priorE = null;
|
||||
SegmentInfo si = null;
|
||||
try {
|
||||
CodecUtil.checkIndexHeader(input, Lucene50SegmentInfoFormat.CODEC_NAME,
|
||||
Lucene50SegmentInfoFormat.VERSION_START,
|
||||
Lucene50SegmentInfoFormat.VERSION_CURRENT,
|
||||
segmentID, "");
|
||||
final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt());
|
||||
|
||||
final int docCount = input.readInt();
|
||||
if (docCount < 0) {
|
||||
throw new CorruptIndexException("invalid docCount: " + docCount, input);
|
||||
}
|
||||
final boolean isCompoundFile = input.readByte() == SegmentInfo.YES;
|
||||
|
||||
final Map<String,String> diagnostics = input.readMapOfStrings();
|
||||
final Set<String> files = input.readSetOfStrings();
|
||||
final Map<String,String> attributes = input.readMapOfStrings();
|
||||
|
||||
si = new SegmentInfo(dir, version, null, segment, docCount, isCompoundFile, null, diagnostics, segmentID, attributes, null);
|
||||
si.setFiles(files);
|
||||
} catch (Throwable exception) {
|
||||
priorE = exception;
|
||||
} finally {
|
||||
CodecUtil.checkFooter(input, priorE);
|
||||
}
|
||||
return si;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException {
|
||||
final String fileName = IndexFileNames.segmentFileName(si.name, "", Lucene50SegmentInfoFormat.SI_EXTENSION);
|
||||
|
||||
assert si.getIndexSort() == null;
|
||||
|
||||
try (IndexOutput output = dir.createOutput(fileName, ioContext)) {
|
||||
// Only add the file once we've successfully created it, else IFD assert can trip:
|
||||
si.addFile(fileName);
|
||||
CodecUtil.writeIndexHeader(output,
|
||||
Lucene50SegmentInfoFormat.CODEC_NAME,
|
||||
Lucene50SegmentInfoFormat.VERSION_CURRENT,
|
||||
si.getId(),
|
||||
"");
|
||||
Version version = si.getVersion();
|
||||
if (version.major < 5) {
|
||||
throw new IllegalArgumentException("invalid major version: should be >= 5 but got: " + version.major + " segment=" + si);
|
||||
}
|
||||
// Write the Lucene version that created this segment, since 3.1
|
||||
output.writeInt(version.major);
|
||||
output.writeInt(version.minor);
|
||||
output.writeInt(version.bugfix);
|
||||
assert version.prerelease == 0;
|
||||
output.writeInt(si.maxDoc());
|
||||
|
||||
output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO));
|
||||
output.writeMapOfStrings(si.getDiagnostics());
|
||||
Set<String> files = si.files();
|
||||
for (String file : files) {
|
||||
if (!IndexFileNames.parseSegmentName(file).equals(si.name)) {
|
||||
throw new IllegalArgumentException("invalid files: expected segment=" + si.name + ", got=" + files);
|
||||
}
|
||||
}
|
||||
output.writeSetOfStrings(files);
|
||||
output.writeMapOfStrings(si.getAttributes());
|
||||
|
||||
CodecUtil.writeFooter(output);
|
||||
}
|
||||
}
|
||||
|
||||
/** File extension used to store {@link SegmentInfo}. */
|
||||
public final static String SI_EXTENSION = "si";
|
||||
static final String CODEC_NAME = "Lucene50SegmentInfo";
|
||||
static final int VERSION_SAFE_MAPS = 1;
|
||||
static final int VERSION_START = VERSION_SAFE_MAPS;
|
||||
static final int VERSION_CURRENT = VERSION_SAFE_MAPS;
|
||||
}
|
|
@@ -1,50 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene50;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene60.Lucene60RWCodec;
import org.apache.lucene.index.BaseSegmentInfoFormatTestCase;
import org.apache.lucene.util.Version;

public class TestLucene50SegmentInfoFormat extends BaseSegmentInfoFormatTestCase {

  @Override
  protected Codec getCodec() {
    return new Lucene60RWCodec();
  }

  @Override
  protected int getCreatedVersionMajor() {
    return Version.LUCENE_6_0_0.major;
  }

  @Override
  protected Version[] getVersions() {
    return new Version[] { Version.LUCENE_6_0_0 };
  }

  @Override
  protected boolean supportsIndexSort() {
    return false;
  }

  @Override
  protected boolean supportsMinVersion() {
    return false;
  }
}

@@ -1,159 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene53;


import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.LegacyDocValuesIterables;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;

import static org.apache.lucene.codecs.lucene53.Lucene53NormsFormat.VERSION_CURRENT;

/**
 * Writer for {@link Lucene53NormsFormat}
 */
class Lucene53NormsConsumer extends NormsConsumer {
  IndexOutput data, meta;
  final int maxDoc;

  Lucene53NormsConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    boolean success = false;
    try {
      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
      data = state.directory.createOutput(dataName, state.context);
      CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
      meta = state.directory.createOutput(metaName, state.context);
      CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      maxDoc = state.segmentInfo.maxDoc();
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  @Override
  public void addNormsField(FieldInfo field, NormsProducer normsProducer) throws IOException {
    addNormsField(field, LegacyDocValuesIterables.normsIterable(field, normsProducer, maxDoc));
  }

  private void addNormsField(FieldInfo field, Iterable<Number> values) throws IOException {
    meta.writeVInt(field.number);
    long minValue = Long.MAX_VALUE;
    long maxValue = Long.MIN_VALUE;
    int count = 0;

    for (Number nv : values) {
      if (nv == null) {
        throw new IllegalStateException("illegal norms data for field " + field.name + ", got null for value: " + count);
      }
      final long v = nv.longValue();
      minValue = Math.min(minValue, v);
      maxValue = Math.max(maxValue, v);
      count++;
    }

    if (count != maxDoc) {
      throw new IllegalStateException("illegal norms data for field " + field.name + ", expected count=" + maxDoc + ", got=" + count);
    }

    if (minValue == maxValue) {
      addConstant(minValue);
    } else if (minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
      addByte1(values);
    } else if (minValue >= Short.MIN_VALUE && maxValue <= Short.MAX_VALUE) {
      addByte2(values);
    } else if (minValue >= Integer.MIN_VALUE && maxValue <= Integer.MAX_VALUE) {
      addByte4(values);
    } else {
      addByte8(values);
    }
  }

  private void addConstant(long constant) throws IOException {
    meta.writeByte((byte) 0);
    meta.writeLong(constant);
  }

  private void addByte1(Iterable<Number> values) throws IOException {
    meta.writeByte((byte) 1);
    meta.writeLong(data.getFilePointer());

    for (Number value : values) {
      data.writeByte(value.byteValue());
    }
  }

  private void addByte2(Iterable<Number> values) throws IOException {
    meta.writeByte((byte) 2);
    meta.writeLong(data.getFilePointer());

    for (Number value : values) {
      data.writeShort(value.shortValue());
    }
  }

  private void addByte4(Iterable<Number> values) throws IOException {
    meta.writeByte((byte) 4);
    meta.writeLong(data.getFilePointer());

    for (Number value : values) {
      data.writeInt(value.intValue());
    }
  }

  private void addByte8(Iterable<Number> values) throws IOException {
    meta.writeByte((byte) 8);
    meta.writeLong(data.getFilePointer());

    for (Number value : values) {
      data.writeLong(value.longValue());
    }
  }

  @Override
  public void close() throws IOException {
    boolean success = false;
    try {
      if (meta != null) {
        meta.writeVInt(-1); // write EOF marker
        CodecUtil.writeFooter(meta); // write checksum
      }
      if (data != null) {
        CodecUtil.writeFooter(data); // write checksum
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(data, meta);
      } else {
        IOUtils.closeWhileHandlingException(data, meta);
      }
      meta = data = null;
    }
  }
}

@@ -1,31 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene53;

import java.io.IOException;

import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.index.SegmentWriteState;

public class Lucene53RWNormsFormat extends Lucene53NormsFormat {

  @Override
  public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
    return new Lucene53NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }

}

@@ -1,44 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene53;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene62.Lucene62RWCodec;
import org.apache.lucene.index.BaseNormsFormatTestCase;
import org.apache.lucene.util.Version;

/**
 * Tests Lucene53NormsFormat
 */
public class TestLucene53NormsFormat extends BaseNormsFormatTestCase {
  private final Codec codec = new Lucene62RWCodec();

  @Override
  protected int getCreatedVersionMajor() {
    return Version.LUCENE_6_2_0.major;
  }

  @Override
  protected Codec getCodec() {
    return codec;
  }

  @Override
  protected boolean codecSupportsSparsity() {
    return false;
  }
}

@ -1,640 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene54;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.asserting.AssertingCodec;
|
||||
import org.apache.lucene.codecs.lucene54.Lucene54DocValuesProducer.SparseNumericDocValues;
|
||||
import org.apache.lucene.codecs.lucene54.Lucene54DocValuesProducer.SparseNumericDocValuesRandomAccessWrapper;
|
||||
import org.apache.lucene.document.BinaryDocValuesField;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.NumericDocValuesField;
|
||||
import org.apache.lucene.document.SortedDocValuesField;
|
||||
import org.apache.lucene.document.SortedNumericDocValuesField;
|
||||
import org.apache.lucene.document.SortedSetDocValuesField;
|
||||
import org.apache.lucene.document.StoredField;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase;
|
||||
import org.apache.lucene.index.BinaryDocValues;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.DocValues;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.SerialMergeScheduler;
|
||||
import org.apache.lucene.index.SortedDocValues;
|
||||
import org.apache.lucene.index.SortedNumericDocValues;
|
||||
import org.apache.lucene.index.SortedSetDocValues;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum.SeekStatus;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMFile;
|
||||
import org.apache.lucene.store.RAMInputStream;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.apache.lucene.util.LongValues;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
/**
|
||||
* Tests Lucene54DocValuesFormat
|
||||
*/
|
||||
public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
|
||||
private final Codec codec = TestUtil.alwaysDocValuesFormat(new Lucene54DocValuesFormat());
|
||||
|
||||
@Override
|
||||
protected Codec getCodec() {
|
||||
return codec;
|
||||
}
|
||||
|
||||
// TODO: these big methods can easily blow up some of the other ram-hungry codecs...
|
||||
// for now just keep them here, as we want to test this for this format.
|
||||
|
||||
@Slow
|
||||
public void testSortedSetVariableLengthBigVsStoredFields() throws Exception {
|
||||
int numIterations = atLeast(1);
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16, 100);
|
||||
}
|
||||
}
|
||||
|
||||
@Nightly
|
||||
public void testSortedSetVariableLengthManyVsStoredFields() throws Exception {
|
||||
int numIterations = atLeast(1);
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16, 100);
|
||||
}
|
||||
}
|
||||
|
||||
@Slow
|
||||
public void testSortedVariableLengthBigVsStoredFields() throws Exception {
|
||||
int numIterations = atLeast(1);
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
doTestSortedVsStoredFields(atLeast(300), 1d, 1, 32766);
|
||||
}
|
||||
}
|
||||
|
||||
@Nightly
|
||||
public void testSortedVariableLengthManyVsStoredFields() throws Exception {
|
||||
int numIterations = atLeast(1);
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
doTestSortedVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1d, 1, 500);
|
||||
}
|
||||
}
|
||||
|
||||
@Slow
|
||||
public void testTermsEnumFixedWidth() throws Exception {
|
||||
int numIterations = atLeast(1);
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 10, 10);
|
||||
}
|
||||
}
|
||||
|
||||
@Slow
|
||||
public void testTermsEnumVariableWidth() throws Exception {
|
||||
int numIterations = atLeast(1);
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 1, 500);
|
||||
}
|
||||
}
|
||||
|
||||
@Nightly
|
||||
public void testTermsEnumRandomMany() throws Exception {
|
||||
int numIterations = atLeast(1);
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), 1, 500);
|
||||
}
|
||||
}
|
||||
|
||||
@Slow
|
||||
public void testSparseDocValuesVsStoredFields() throws Exception {
|
||||
int numIterations = atLeast(1);
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
doTestSparseDocValuesVsStoredFields();
|
||||
}
|
||||
}
|
||||
|
||||
private void doTestSparseDocValuesVsStoredFields() throws Exception {
|
||||
final long[] values = new long[TestUtil.nextInt(random(), 1, 500)];
|
||||
for (int i = 0; i < values.length; ++i) {
|
||||
values[i] = random().nextLong();
|
||||
}
|
||||
|
||||
Directory dir = newFSDirectory(createTempDir());
|
||||
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
|
||||
conf.setMergeScheduler(new SerialMergeScheduler());
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
|
||||
|
||||
// sparse compression is only enabled if less than 1% of docs have a value
|
||||
final int avgGap = 100;
|
||||
|
||||
final int numDocs = atLeast(200);
|
||||
for (int i = random().nextInt(avgGap * 2); i >= 0; --i) {
|
||||
writer.addDocument(new Document());
|
||||
}
|
||||
final int maxNumValuesPerDoc = random().nextBoolean() ? 1 : TestUtil.nextInt(random(), 2, 5);
|
||||
for (int i = 0; i < numDocs; ++i) {
|
||||
Document doc = new Document();
|
||||
|
||||
// single-valued
|
||||
long docValue = values[random().nextInt(values.length)];
|
||||
doc.add(new NumericDocValuesField("numeric", docValue));
|
||||
doc.add(new SortedDocValuesField("sorted", new BytesRef(Long.toString(docValue))));
|
||||
doc.add(new BinaryDocValuesField("binary", new BytesRef(Long.toString(docValue))));
|
||||
doc.add(new StoredField("value", docValue));
|
||||
|
||||
// multi-valued
|
||||
final int numValues = TestUtil.nextInt(random(), 1, maxNumValuesPerDoc);
|
||||
for (int j = 0; j < numValues; ++j) {
|
||||
docValue = values[random().nextInt(values.length)];
|
||||
doc.add(new SortedNumericDocValuesField("sorted_numeric", docValue));
|
||||
doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef(Long.toString(docValue))));
|
||||
doc.add(new StoredField("values", docValue));
|
||||
}
|
||||
|
||||
writer.addDocument(doc);
|
||||
|
||||
// add a gap
|
||||
for (int j = TestUtil.nextInt(random(), 0, avgGap * 2); j >= 0; --j) {
|
||||
writer.addDocument(new Document());
|
||||
}
|
||||
}
|
||||
|
||||
if (random().nextBoolean()) {
|
||||
writer.forceMerge(1);
|
||||
}
|
||||
|
||||
final IndexReader indexReader = writer.getReader();
|
||||
TestUtil.checkReader(indexReader);
|
||||
writer.close();
|
||||
|
||||
for (LeafReaderContext context : indexReader.leaves()) {
|
||||
final LeafReader reader = context.reader();
|
||||
final NumericDocValues numeric = DocValues.getNumeric(reader, "numeric");
|
||||
|
||||
final SortedDocValues sorted = DocValues.getSorted(reader, "sorted");
|
||||
|
||||
final BinaryDocValues binary = DocValues.getBinary(reader, "binary");
|
||||
|
||||
final SortedNumericDocValues sortedNumeric = DocValues.getSortedNumeric(reader, "sorted_numeric");
|
||||
|
||||
final SortedSetDocValues sortedSet = DocValues.getSortedSet(reader, "sorted_set");
|
||||
|
||||
for (int i = 0; i < reader.maxDoc(); ++i) {
|
||||
final Document doc = reader.document(i);
|
||||
final IndexableField valueField = doc.getField("value");
|
||||
final Long value = valueField == null ? null : valueField.numericValue().longValue();
|
||||
|
||||
if (value == null) {
|
||||
assertTrue(numeric.docID() + " vs " + i, numeric.docID() < i);
|
||||
} else {
|
||||
assertEquals(i, numeric.nextDoc());
|
||||
assertEquals(i, binary.nextDoc());
|
||||
assertEquals(i, sorted.nextDoc());
|
||||
assertEquals(value.longValue(), numeric.longValue());
|
||||
assertTrue(sorted.ordValue() >= 0);
|
||||
assertEquals(new BytesRef(Long.toString(value)), sorted.lookupOrd(sorted.ordValue()));
|
||||
assertEquals(new BytesRef(Long.toString(value)), binary.binaryValue());
|
||||
}
|
||||
|
||||
final IndexableField[] valuesFields = doc.getFields("values");
|
||||
if (valuesFields.length == 0) {
|
||||
assertTrue(sortedNumeric.docID() + " vs " + i, sortedNumeric.docID() < i);
|
||||
} else {
|
||||
final Set<Long> valueSet = new HashSet<>();
|
||||
for (IndexableField sf : valuesFields) {
|
||||
valueSet.add(sf.numericValue().longValue());
|
||||
}
|
||||
|
||||
assertEquals(i, sortedNumeric.nextDoc());
|
||||
assertEquals(valuesFields.length, sortedNumeric.docValueCount());
|
||||
for (int j = 0; j < sortedNumeric.docValueCount(); ++j) {
|
||||
assertTrue(valueSet.contains(sortedNumeric.nextValue()));
|
||||
}
|
||||
assertEquals(i, sortedSet.nextDoc());
|
||||
int sortedSetCount = 0;
|
||||
while (true) {
|
||||
long ord = sortedSet.nextOrd();
|
||||
if (ord == SortedSetDocValues.NO_MORE_ORDS) {
|
||||
break;
|
||||
}
|
||||
assertTrue(valueSet.contains(Long.parseLong(sortedSet.lookupOrd(ord).utf8ToString())));
|
||||
sortedSetCount++;
|
||||
}
|
||||
assertEquals(valueSet.size(), sortedSetCount);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
indexReader.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
// TODO: try to refactor this and some termsenum tests into the base class.
|
||||
// to do this we need to fix the test class to get a DVF not a Codec so we can setup
|
||||
// the postings format correctly.
|
||||
private void doTestTermsEnumRandom(int numDocs, int minLength, int maxLength) throws Exception {
|
||||
Directory dir = newFSDirectory(createTempDir());
|
||||
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
|
||||
conf.setMergeScheduler(new SerialMergeScheduler());
|
||||
// set to duel against a codec which has ordinals:
|
||||
final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
|
||||
final DocValuesFormat dv = new Lucene54DocValuesFormat();
|
||||
conf.setCodec(new AssertingCodec() {
|
||||
@Override
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
return pf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
return dv;
|
||||
}
|
||||
});
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
|
||||
|
||||
// index some docs
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
Document doc = new Document();
|
||||
Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
|
||||
doc.add(idField);
|
||||
final int length = TestUtil.nextInt(random(), minLength, maxLength);
|
||||
int numValues = random().nextInt(17);
|
||||
// create a random list of strings
|
||||
List<String> values = new ArrayList<>();
|
||||
for (int v = 0; v < numValues; v++) {
|
||||
values.add(TestUtil.randomSimpleString(random(), minLength, length));
|
||||
}
|
||||
|
||||
// add in any order to the indexed field
|
||||
ArrayList<String> unordered = new ArrayList<>(values);
|
||||
Collections.shuffle(unordered, random());
|
||||
for (String v : values) {
|
||||
doc.add(newStringField("indexed", v, Field.Store.NO));
|
||||
}
|
||||
|
||||
// add in any order to the dv field
|
||||
ArrayList<String> unordered2 = new ArrayList<>(values);
|
||||
Collections.shuffle(unordered2, random());
|
||||
for (String v : unordered2) {
|
||||
doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
|
||||
}
|
||||
|
||||
writer.addDocument(doc);
|
||||
if (random().nextInt(31) == 0) {
|
||||
writer.commit();
|
||||
}
|
||||
}
|
||||
|
||||
// delete some docs
|
||||
int numDeletions = random().nextInt(numDocs/10);
|
||||
for (int i = 0; i < numDeletions; i++) {
|
||||
int id = random().nextInt(numDocs);
|
||||
writer.deleteDocuments(new Term("id", Integer.toString(id)));
|
||||
}
|
||||
|
||||
// compare per-segment
|
||||
DirectoryReader ir = writer.getReader();
|
||||
for (LeafReaderContext context : ir.leaves()) {
|
||||
LeafReader r = context.reader();
|
||||
Terms terms = r.terms("indexed");
|
||||
if (terms != null) {
|
||||
SortedSetDocValues ssdv = r.getSortedSetDocValues("dv");
|
||||
assertEquals(terms.size(), ssdv.getValueCount());
|
||||
TermsEnum expected = terms.iterator();
|
||||
TermsEnum actual = r.getSortedSetDocValues("dv").termsEnum();
|
||||
assertEquals(terms.size(), expected, actual);
|
||||
|
||||
doTestSortedSetEnumAdvanceIndependently(ssdv);
|
||||
}
|
||||
}
|
||||
ir.close();
|
||||
|
||||
writer.forceMerge(1);
|
||||
|
||||
// now compare again after the merge
|
||||
ir = writer.getReader();
|
||||
LeafReader ar = getOnlyLeafReader(ir);
|
||||
Terms terms = ar.terms("indexed");
|
||||
if (terms != null) {
|
||||
assertEquals(terms.size(), ar.getSortedSetDocValues("dv").getValueCount());
|
||||
TermsEnum expected = terms.iterator();
|
||||
TermsEnum actual = ar.getSortedSetDocValues("dv").termsEnum();
|
||||
assertEquals(terms.size(), expected, actual);
|
||||
}
|
||||
ir.close();
|
||||
|
||||
writer.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
private void assertEquals(long numOrds, TermsEnum expected, TermsEnum actual) throws Exception {
|
||||
BytesRef ref;
|
||||
|
||||
// sequential next() through all terms
|
||||
while ((ref = expected.next()) != null) {
|
||||
assertEquals(ref, actual.next());
|
||||
assertEquals(expected.ord(), actual.ord());
|
||||
assertEquals(expected.term(), actual.term());
|
||||
}
|
||||
assertNull(actual.next());
|
||||
|
||||
// sequential seekExact(ord) through all terms
|
||||
for (long i = 0; i < numOrds; i++) {
|
||||
expected.seekExact(i);
|
||||
actual.seekExact(i);
|
||||
assertEquals(expected.ord(), actual.ord());
|
||||
assertEquals(expected.term(), actual.term());
|
||||
}
|
||||
|
||||
// sequential seekExact(BytesRef) through all terms
|
||||
for (long i = 0; i < numOrds; i++) {
|
||||
expected.seekExact(i);
|
||||
assertTrue(actual.seekExact(expected.term()));
|
||||
assertEquals(expected.ord(), actual.ord());
|
||||
assertEquals(expected.term(), actual.term());
|
||||
}
|
||||
|
||||
// sequential seekCeil(BytesRef) through all terms
|
||||
for (long i = 0; i < numOrds; i++) {
|
||||
expected.seekExact(i);
|
||||
assertEquals(SeekStatus.FOUND, actual.seekCeil(expected.term()));
|
||||
assertEquals(expected.ord(), actual.ord());
|
||||
assertEquals(expected.term(), actual.term());
|
||||
}
|
||||
|
||||
// random seekExact(ord)
|
||||
for (long i = 0; i < numOrds; i++) {
|
||||
long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1);
|
||||
expected.seekExact(randomOrd);
|
||||
actual.seekExact(randomOrd);
|
||||
assertEquals(expected.ord(), actual.ord());
|
||||
assertEquals(expected.term(), actual.term());
|
||||
}
|
||||
|
||||
// random seekExact(BytesRef)
|
||||
for (long i = 0; i < numOrds; i++) {
|
||||
long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1);
|
||||
expected.seekExact(randomOrd);
|
||||
actual.seekExact(expected.term());
|
||||
assertEquals(expected.ord(), actual.ord());
|
||||
assertEquals(expected.term(), actual.term());
|
||||
}
|
||||
|
||||
// random seekCeil(BytesRef)
|
||||
for (long i = 0; i < numOrds; i++) {
|
||||
BytesRef target = new BytesRef(TestUtil.randomUnicodeString(random()));
|
||||
SeekStatus expectedStatus = expected.seekCeil(target);
|
||||
assertEquals(expectedStatus, actual.seekCeil(target));
|
||||
if (expectedStatus != SeekStatus.END) {
|
||||
assertEquals(expected.ord(), actual.ord());
|
||||
assertEquals(expected.term(), actual.term());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testSparseLongValues() throws IOException {
|
||||
final int iters = atLeast(5);
|
||||
for (int iter = 0; iter < iters; ++iter) {
|
||||
final int numDocs = TestUtil.nextInt(random(), 0, 100);
|
||||
final int[] docIds = new int[numDocs];
|
||||
final long[] values = new long[numDocs];
|
||||
final int maxDoc;
|
||||
if (numDocs == 0) {
|
||||
maxDoc = 1 + random().nextInt(10);
|
||||
} else {
|
||||
docIds[0] = random().nextInt(10);
|
||||
for (int i = 1; i < docIds.length; ++i) {
|
||||
docIds[i] = docIds[i - 1] + 1 + random().nextInt(100);
|
||||
}
|
||||
maxDoc = docIds[numDocs - 1] + 1 + random().nextInt(10);
|
||||
}
|
||||
for (int i = 0; i < values.length; ++i) {
|
||||
values[i] = random().nextLong();
|
||||
}
|
||||
final long missingValue = random().nextLong();
|
||||
final LongValues docIdsValues = new LongValues() {
|
||||
@Override
|
||||
public long get(long index) {
|
||||
return docIds[Math.toIntExact(index)];
|
||||
}
|
||||
};
|
||||
final LongValues valuesValues = new LongValues() {
|
||||
@Override
|
||||
public long get(long index) {
|
||||
return values[Math.toIntExact(index)];
|
||||
}
|
||||
};
|
||||
final SparseNumericDocValues sparseValues = new SparseNumericDocValues(numDocs, docIdsValues, valuesValues);
|
||||
|
||||
// sequential access
|
||||
assertEquals(-1, sparseValues.docID());
|
||||
for (int i = 0; i < docIds.length; ++i) {
|
||||
assertEquals(docIds[i], sparseValues.nextDoc());
|
||||
}
|
||||
assertEquals(DocIdSetIterator.NO_MORE_DOCS, sparseValues.nextDoc());
|
||||
|
||||
// advance
|
||||
for (int i = 0; i < 2000; ++i) {
|
||||
final int target = TestUtil.nextInt(random(), 0, maxDoc);
|
||||
int index = Arrays.binarySearch(docIds, target);
|
||||
if (index < 0) {
|
||||
index = -1 - index;
|
||||
}
|
||||
sparseValues.reset();
|
||||
if (index > 0) {
|
||||
assertEquals(docIds[index - 1], sparseValues.advance(Math.toIntExact(docIds[index - 1])));
|
||||
}
|
||||
if (index == docIds.length) {
|
||||
assertEquals(DocIdSetIterator.NO_MORE_DOCS, sparseValues.advance(target));
|
||||
} else {
|
||||
assertEquals(docIds[index], sparseValues.advance(target));
|
||||
}
|
||||
}
|
||||
|
||||
// advanceExact
|
||||
for (int i = 0; i < 2000; ++i) {
|
||||
sparseValues.reset();
|
||||
if (random().nextBoolean() && docIds.length > 0) {
|
||||
sparseValues.advance(docIds[TestUtil.nextInt(random(), 0, docIds.length - 1)]);
|
||||
}
|
||||
|
||||
final int target = TestUtil.nextInt(random(), Math.max(0, sparseValues.docID()), maxDoc - 1);
|
||||
final boolean exists = sparseValues.advanceExact(target);
|
||||
|
||||
final int index = Arrays.binarySearch(docIds, target);
|
||||
assertEquals(index >= 0, exists);
|
||||
assertEquals(target, sparseValues.docID());
|
||||
|
||||
final boolean exists2 = sparseValues.advanceExact(target);
|
||||
assertEquals(index >= 0, exists2);
|
||||
assertEquals(target, sparseValues.docID());
|
||||
|
||||
final int nextIndex = index >= 0 ? index + 1 : -1 - index;
|
||||
if (nextIndex >= docIds.length) {
|
||||
assertEquals(DocIdSetIterator.NO_MORE_DOCS, sparseValues.nextDoc());
|
||||
} else {
|
||||
assertEquals(docIds[nextIndex], sparseValues.nextDoc());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
final SparseNumericDocValuesRandomAccessWrapper raWrapper = new SparseNumericDocValuesRandomAccessWrapper(sparseValues, missingValue);
|
||||
|
||||
// random-access
|
||||
for (int i = 0; i < 2000; ++i) {
|
||||
final int docId = TestUtil.nextInt(random(), 0, maxDoc - 1);
|
||||
final int idx = Arrays.binarySearch(docIds, docId);
|
||||
final long value = raWrapper.get(docId);
|
||||
if (idx >= 0) {
|
||||
assertEquals(values[idx], value);
|
||||
} else {
|
||||
assertEquals(missingValue, value);
|
||||
}
|
||||
}
|
||||
|
||||
// sequential access
|
||||
for (int docId = 0; docId < maxDoc; docId += random().nextInt(3)) {
|
||||
final int idx = Arrays.binarySearch(docIds, docId);
|
||||
final long value = raWrapper.get(docId);
|
||||
if (idx >= 0) {
|
||||
assertEquals(values[idx], value);
|
||||
} else {
|
||||
assertEquals(missingValue, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Slow
|
||||
public void testSortedSetAroundBlockSize() throws IOException {
|
||||
final int frontier = 1 << Lucene54DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
|
||||
for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
|
||||
final Directory dir = newDirectory();
|
||||
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
|
||||
RAMFile buffer = new RAMFile();
|
||||
RAMOutputStream out = new RAMOutputStream(buffer, false);
|
||||
Document doc = new Document();
|
||||
SortedSetDocValuesField field1 = new SortedSetDocValuesField("sset", new BytesRef());
|
||||
doc.add(field1);
|
||||
SortedSetDocValuesField field2 = new SortedSetDocValuesField("sset", new BytesRef());
|
||||
doc.add(field2);
|
||||
for (int i = 0; i < maxDoc; ++i) {
|
||||
BytesRef s1 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
|
||||
BytesRef s2 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
|
||||
field1.setBytesValue(s1);
|
||||
field2.setBytesValue(s2);
|
||||
w.addDocument(doc);
|
||||
Set<BytesRef> set = new TreeSet<>(Arrays.asList(s1, s2));
|
||||
out.writeVInt(set.size());
|
||||
for (BytesRef ref : set) {
|
||||
out.writeVInt(ref.length);
|
||||
out.writeBytes(ref.bytes, ref.offset, ref.length);
|
||||
}
|
||||
}
|
||||
out.close();
|
||||
w.forceMerge(1);
|
||||
DirectoryReader r = DirectoryReader.open(w);
|
||||
w.close();
|
||||
LeafReader sr = getOnlyLeafReader(r);
|
||||
assertEquals(maxDoc, sr.maxDoc());
|
||||
SortedSetDocValues values = sr.getSortedSetDocValues("sset");
|
||||
assertNotNull(values);
|
||||
RAMInputStream in = new RAMInputStream("", buffer);
|
||||
BytesRefBuilder b = new BytesRefBuilder();
|
||||
for (int i = 0; i < maxDoc; ++i) {
|
||||
assertEquals(i, values.nextDoc());
|
||||
final int numValues = in.readVInt();
|
||||
|
||||
for (int j = 0; j < numValues; ++j) {
|
||||
b.setLength(in.readVInt());
|
||||
b.grow(b.length());
|
||||
in.readBytes(b.bytes(), 0, b.length());
|
||||
assertEquals(b.get(), values.lookupOrd(values.nextOrd()));
|
||||
}
|
||||
|
||||
assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
|
||||
}
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Slow
|
||||
public void testSortedNumericAroundBlockSize() throws IOException {
|
||||
final int frontier = 1 << Lucene54DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
|
||||
for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
|
||||
final Directory dir = newDirectory();
|
||||
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
|
||||
RAMFile buffer = new RAMFile();
|
||||
RAMOutputStream out = new RAMOutputStream(buffer, false);
|
||||
Document doc = new Document();
|
||||
SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("snum", 0L);
|
||||
doc.add(field1);
|
||||
SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("snum", 0L);
|
||||
doc.add(field2);
|
||||
for (int i = 0; i < maxDoc; ++i) {
|
||||
long s1 = random().nextInt(100);
|
||||
long s2 = random().nextInt(100);
|
||||
field1.setLongValue(s1);
|
||||
field2.setLongValue(s2);
|
||||
w.addDocument(doc);
|
||||
out.writeVLong(Math.min(s1, s2));
|
||||
out.writeVLong(Math.max(s1, s2));
|
||||
}
|
||||
out.close();
|
||||
w.forceMerge(1);
|
||||
DirectoryReader r = DirectoryReader.open(w);
|
||||
w.close();
|
||||
LeafReader sr = getOnlyLeafReader(r);
|
||||
assertEquals(maxDoc, sr.maxDoc());
|
||||
SortedNumericDocValues values = sr.getSortedNumericDocValues("snum");
|
||||
assertNotNull(values);
|
||||
RAMInputStream in = new RAMInputStream("", buffer);
|
||||
for (int i = 0; i < maxDoc; ++i) {
|
||||
assertEquals(i, values.nextDoc());
|
||||
assertEquals(2, values.docValueCount());
|
||||
assertEquals(in.readVLong(), values.nextValue());
|
||||
assertEquals(in.readVLong(), values.nextValue());
|
||||
}
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -1,38 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene60;

import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.lucene50.Lucene50RWSegmentInfoFormat;
import org.apache.lucene.codecs.lucene53.Lucene53RWNormsFormat;

public class Lucene60RWCodec extends Lucene60Codec {

  private final SegmentInfoFormat segmentInfoFormat = new Lucene50RWSegmentInfoFormat();
  private final NormsFormat normsFormat = new Lucene53RWNormsFormat();

  @Override
  public SegmentInfoFormat segmentInfoFormat() {
    return segmentInfoFormat;
  }

  @Override
  public NormsFormat normsFormat() {
    return normsFormat;
  }
}

@@ -1,44 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene62;

import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.lucene53.Lucene53RWNormsFormat;
import org.apache.lucene.codecs.lucene62.Lucene62Codec;

/**
 * Read-write version of 6.2 codec for testing
 * @deprecated for test purposes only
 */
@Deprecated
public class Lucene62RWCodec extends Lucene62Codec {

  private final SegmentInfoFormat segmentInfoFormat = new Lucene62RWSegmentInfoFormat();
  private final NormsFormat normsFormat = new Lucene53RWNormsFormat();

  @Override
  public SegmentInfoFormat segmentInfoFormat() {
    return segmentInfoFormat;
  }

  @Override
  public NormsFormat normsFormat() {
    return normsFormat;
  }

}

@@ -1,193 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene62;

import java.io.IOException;
import java.util.Set;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSelector;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Version;

/**
 * Read-write version of 6.2 SegmentInfoFormat for testing
 * @deprecated for test purposes only
 */
@Deprecated
public class Lucene62RWSegmentInfoFormat extends Lucene62SegmentInfoFormat {

  @Override
  public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException {
    final String fileName = IndexFileNames.segmentFileName(si.name, "", Lucene62SegmentInfoFormat.SI_EXTENSION);

    try (IndexOutput output = dir.createOutput(fileName, ioContext)) {
      // Only add the file once we've successfully created it, else IFD assert can trip:
      si.addFile(fileName);
      CodecUtil.writeIndexHeader(output,
                                 Lucene62SegmentInfoFormat.CODEC_NAME,
                                 Lucene62SegmentInfoFormat.VERSION_CURRENT,
                                 si.getId(),
                                 "");
      Version version = si.getVersion();
      if (version.major < 5) {
        throw new IllegalArgumentException("invalid major version: should be >= 5 but got: " + version.major + " segment=" + si);
      }
      // Write the Lucene version that created this segment, since 3.1
      output.writeInt(version.major);
      output.writeInt(version.minor);
      output.writeInt(version.bugfix);
      assert version.prerelease == 0;
      output.writeInt(si.maxDoc());

      output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO));
      output.writeMapOfStrings(si.getDiagnostics());
      Set<String> files = si.files();
      for (String file : files) {
        if (!IndexFileNames.parseSegmentName(file).equals(si.name)) {
          throw new IllegalArgumentException("invalid files: expected segment=" + si.name + ", got=" + files);
        }
      }
      output.writeSetOfStrings(files);
      output.writeMapOfStrings(si.getAttributes());

      Sort indexSort = si.getIndexSort();
      int numSortFields = indexSort == null ? 0 : indexSort.getSort().length;
      output.writeVInt(numSortFields);
      for (int i = 0; i < numSortFields; ++i) {
        SortField sortField = indexSort.getSort()[i];
        SortField.Type sortType = sortField.getType();
        output.writeString(sortField.getField());
        int sortTypeID;
        switch (sortField.getType()) {
          case STRING:
            sortTypeID = 0;
            break;
          case LONG:
            sortTypeID = 1;
            break;
          case INT:
            sortTypeID = 2;
            break;
          case DOUBLE:
            sortTypeID = 3;
            break;
          case FLOAT:
            sortTypeID = 4;
            break;
          case CUSTOM:
            if (sortField instanceof SortedSetSortField) {
              sortTypeID = 5;
              sortType = SortField.Type.STRING;
            } else if (sortField instanceof SortedNumericSortField) {
              sortTypeID = 6;
              sortType = ((SortedNumericSortField) sortField).getNumericType();
            } else {
              throw new IllegalStateException("Unexpected SortedNumericSortField " + sortField);
            }
            break;
          default:
            throw new IllegalStateException("Unexpected sort type: " + sortField.getType());
        }
        output.writeVInt(sortTypeID);
        if (sortTypeID == 5) {
          SortedSetSortField ssf = (SortedSetSortField) sortField;
          if (ssf.getSelector() == SortedSetSelector.Type.MIN) {
            output.writeByte((byte) 0);
          } else if (ssf.getSelector() == SortedSetSelector.Type.MAX) {
            output.writeByte((byte) 1);
          } else if (ssf.getSelector() == SortedSetSelector.Type.MIDDLE_MIN) {
            output.writeByte((byte) 2);
          } else if (ssf.getSelector() == SortedSetSelector.Type.MIDDLE_MAX) {
            output.writeByte((byte) 3);
          } else {
            throw new IllegalStateException("Unexpected SortedSetSelector type: " + ssf.getSelector());
          }
        } else if (sortTypeID == 6) {
          SortedNumericSortField snsf = (SortedNumericSortField) sortField;
          if (snsf.getNumericType() == SortField.Type.LONG) {
            output.writeByte((byte) 0);
          } else if (snsf.getNumericType() == SortField.Type.INT) {
            output.writeByte((byte) 1);
          } else if (snsf.getNumericType() == SortField.Type.DOUBLE) {
            output.writeByte((byte) 2);
          } else if (snsf.getNumericType() == SortField.Type.FLOAT) {
            output.writeByte((byte) 3);
          } else {
            throw new IllegalStateException("Unexpected SortedNumericSelector type: " + snsf.getNumericType());
          }
          if (snsf.getSelector() == SortedNumericSelector.Type.MIN) {
            output.writeByte((byte) 0);
          } else if (snsf.getSelector() == SortedNumericSelector.Type.MAX) {
            output.writeByte((byte) 1);
          } else {
            throw new IllegalStateException("Unexpected sorted numeric selector type: " + snsf.getSelector());
          }
        }
        output.writeByte((byte) (sortField.getReverse() ? 0 : 1));

        // write missing value
        Object missingValue = sortField.getMissingValue();
        if (missingValue == null) {
          output.writeByte((byte) 0);
        } else {
          switch(sortType) {
            case STRING:
              if (missingValue == SortField.STRING_LAST) {
                output.writeByte((byte) 1);
              } else if (missingValue == SortField.STRING_FIRST) {
                output.writeByte((byte) 2);
              } else {
                throw new AssertionError("unrecognized missing value for STRING field \"" + sortField.getField() + "\": " + missingValue);
              }
              break;
            case LONG:
              output.writeByte((byte) 1);
              output.writeLong(((Long) missingValue).longValue());
              break;
            case INT:
              output.writeByte((byte) 1);
              output.writeInt(((Integer) missingValue).intValue());
              break;
            case DOUBLE:
              output.writeByte((byte) 1);
              output.writeLong(Double.doubleToLongBits(((Double) missingValue).doubleValue()));
              break;
            case FLOAT:
              output.writeByte((byte) 1);
              output.writeInt(Float.floatToIntBits(((Float) missingValue).floatValue()));
              break;
            default:
              throw new IllegalStateException("Unexpected sort type: " + sortField.getType());
          }
        }
      }

      CodecUtil.writeFooter(output);
    }
  }

}

@@ -1,48 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.lucene62;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseSegmentInfoFormatTestCase;
import org.apache.lucene.util.Version;

/**
 * Tests Lucene62SegmentInfoFormat
 */
public class TestLucene62SegmentInfoFormat extends BaseSegmentInfoFormatTestCase {

  @Override
  protected int getCreatedVersionMajor() {
    return Version.LUCENE_6_2_0.major;
  }

  @Override
  protected Version[] getVersions() {
    return new Version[] { Version.LUCENE_6_2_0 };
  }

  @Override
  protected Codec getCodec() {
    return new Lucene62RWCodec();
  }

  @Override
  protected boolean supportsMinVersion() {
    return false;
  }
}

@@ -276,30 +276,6 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
  }

  final static String[] oldNames = {
    "6.0.0-cfs",
    "6.0.0-nocfs",
    "6.0.1-cfs",
    "6.0.1-nocfs",
    "6.1.0-cfs",
    "6.1.0-nocfs",
    "6.2.0-cfs",
    "6.2.0-nocfs",
    "6.2.1-cfs",
    "6.2.1-nocfs",
    "6.3.0-cfs",
    "6.3.0-nocfs",
    "6.4.0-cfs",
    "6.4.0-nocfs",
    "6.4.1-cfs",
    "6.4.1-nocfs",
    "6.4.2-cfs",
    "6.4.2-nocfs",
    "6.5.0-cfs",
    "6.5.0-nocfs",
    "6.5.1-cfs",
    "6.5.1-nocfs",
    "6.6.0-cfs",
    "6.6.0-nocfs"
  };

  final String[] unsupportedNames = {

@@ -432,7 +408,31 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
    "5.5.3-cfs",
    "5.5.3-nocfs",
    "5.5.4-cfs",
    "5.5.4-nocfs"
    "5.5.4-nocfs",
    "6.0.0-cfs",
    "6.0.0-nocfs",
    "6.0.1-cfs",
    "6.0.1-nocfs",
    "6.1.0-cfs",
    "6.1.0-nocfs",
    "6.2.0-cfs",
    "6.2.0-nocfs",
    "6.2.1-cfs",
    "6.2.1-nocfs",
    "6.3.0-cfs",
    "6.3.0-nocfs",
    "6.4.0-cfs",
    "6.4.0-nocfs",
    "6.4.1-cfs",
    "6.4.1-nocfs",
    "6.4.2-cfs",
    "6.4.2-nocfs",
    "6.5.0-cfs",
    "6.5.0-nocfs",
    "6.5.1-cfs",
    "6.5.1-nocfs",
    "6.6.0-cfs",
    "6.6.0-nocfs"
  };

  // TODO: on 6.0.0 release, gen the single segment indices and add here:

@@ -1436,9 +1436,10 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
    }
  }

  public static final String emptyIndex = "empty.6.0.0.zip";
  public static final String emptyIndex = "empty.7.0.0.zip";

  public void testUpgradeEmptyOldIndex() throws Exception {
    assumeTrue("Reenable when 7.0 is released", false);
    Path oldIndexDir = createTempDir("emptyIndex");
    TestUtil.unzip(getDataInputStream(emptyIndex), oldIndexDir);
    Directory dir = newFSDirectory(oldIndexDir);

@@ -1450,9 +1451,10 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
    dir.close();
  }

  public static final String moreTermsIndex = "moreterms.6.0.0.zip";
  public static final String moreTermsIndex = "moreterms.7.0.0.zip";

  public void testMoreTerms() throws Exception {
    assumeTrue("Reenable when 7.0 is released", false);
    Path oldIndexDir = createTempDir("moreterms");
    TestUtil.unzip(getDataInputStream(moreTermsIndex), oldIndexDir);
    Directory dir = newFSDirectory(oldIndexDir);

@@ -1462,7 +1464,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
    dir.close();
  }

  public static final String dvUpdatesIndex = "dvupdates.6.0.0.zip";
  public static final String dvUpdatesIndex = "dvupdates.7.0.0.zip";

  private void assertNumericDocValues(LeafReader r, String f, String cf) throws IOException {
    NumericDocValues ndvf = r.getNumericDocValues(f);

@@ -1495,8 +1497,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
    }
    reader.close();
  }


  public void testDocValuesUpdates() throws Exception {
    assumeTrue("Reenable when 7.0 is released", false);
    Path oldIndexDir = createTempDir("dvupdates");
    TestUtil.unzip(getDataInputStream(dvUpdatesIndex), oldIndexDir);
    Directory dir = newFSDirectory(oldIndexDir);

@@ -1559,7 +1562,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
  }

  public void testSortedIndex() throws Exception {
    String[] versions = new String[] {"6.2.0", "6.2.1", "6.3.0"};
    assumeTrue("Reenable when 7.0 is released", false);
    String[] versions = new String[] {};
    for(String version : versions) {
      Path path = createTempDir("sorted");
      InputStream resource = TestBackwardsCompatibility.class.getResourceAsStream("sorted." + version + ".zip");

@@ -1,112 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.List;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

public class TestFixBrokenOffsets extends LuceneTestCase {

  // Run this in Lucene 6.x:
  //
  // ant test -Dtestcase=TestFixBrokenOffsets -Dtestmethod=testCreateBrokenOffsetsIndex -Dtests.codec=default -Dtests.useSecurityManager=false
  /*
  public void testCreateBrokenOffsetsIndex() throws IOException {

    Path indexDir = Paths.get("/tmp/brokenoffsets");
    Files.deleteIfExists(indexDir);
    Directory dir = newFSDirectory(indexDir);
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());

    Document doc = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_STORED);
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);
    fieldType.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", "bar", fieldType);
    field.setTokenStream(new CannedTokenStream(new Token("foo", 10, 13), new Token("foo", 7, 9)));
    doc.add(field);
    writer.addDocument(doc);
    writer.commit();

    // 2nd segment
    doc = new Document();
    field = new Field("foo", "bar", fieldType);
    field.setTokenStream(new CannedTokenStream(new Token("bar", 15, 17), new Token("bar", 1, 5)));
    doc.add(field);
    writer.addDocument(doc);

    writer.close();

    dir.close();
  }
  */

  public void testFixBrokenOffsetsIndex() throws IOException {
    InputStream resource = getClass().getResourceAsStream("index.630.brokenoffsets.zip");
    assertNotNull("Broken offsets index not found", resource);
    Path path = createTempDir("brokenoffsets");
    TestUtil.unzip(resource, path);
    Directory dir = newFSDirectory(path);

    // OK: index is 6.3.0 so offsets not checked:
    TestUtil.checkIndex(dir);

    MockDirectoryWrapper tmpDir = newMockDirectory();
    tmpDir.setCheckIndexOnClose(false);
    IndexWriter w = new IndexWriter(tmpDir, new IndexWriterConfig());
    IndexWriter finalW = w;
    IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> finalW.addIndexes(dir));
    assertTrue(e.getMessage(), e.getMessage().startsWith("Cannot use addIndexes(Directory) with indexes that have been created by a different Lucene version."));
    w.close();
    // OK: addIndexes(Directory...) refuses to execute if the index creation version is different so broken offsets are not carried over
    tmpDir.close();

    final MockDirectoryWrapper tmpDir2 = newMockDirectory();
    tmpDir2.setCheckIndexOnClose(false);
    w = new IndexWriter(tmpDir2, new IndexWriterConfig());
    DirectoryReader reader = DirectoryReader.open(dir);
    List<LeafReaderContext> leaves = reader.leaves();
    CodecReader[] codecReaders = new CodecReader[leaves.size()];
    for(int i=0;i<leaves.size();i++) {
      codecReaders[i] = (CodecReader) leaves.get(i).reader();
    }
    IndexWriter finalW2 = w;
    e = expectThrows(IllegalArgumentException.class, () -> finalW2.addIndexes(codecReaders));
    assertEquals("Cannot merge a segment that has been created with major version 6 into this index which has been created by major version 7", e.getMessage());
    reader.close();
    w.close();
    tmpDir2.close();

    // Now run the tool and confirm the broken offsets are fixed:
    Path path2 = createTempDir("fixedbrokenoffsets").resolve("subdir");
    FixBrokenOffsets.main(new String[] {path.toString(), path2.toString()});
    Directory tmpDir3 = FSDirectory.open(path2);
    TestUtil.checkIndex(tmpDir3);
    tmpDir3.close();

    dir.close();
  }
}

@ -29,14 +29,15 @@ import org.apache.lucene.util.Version;
public class TestIndexWriterOnOldIndex extends LuceneTestCase {

public void testOpenModeAndCreatedVersion() throws IOException {
InputStream resource = getClass().getResourceAsStream("index.single-empty-doc.630.zip");
assumeTrue("Reenable when 7.0 is released", false);
InputStream resource = getClass().getResourceAsStream("unsupported.index.single-empty-doc.7.0.0.zip");
assertNotNull(resource);
Path path = createTempDir();
TestUtil.unzip(resource, path);
Directory dir = newFSDirectory(path);
for (OpenMode openMode : OpenMode.values()) {
Directory tmpDir = newDirectory(dir);
assertEquals(6 /** 6.3.0 */, SegmentInfos.readLatestCommit(tmpDir).getIndexCreatedVersionMajor());
assertEquals(7 /** 7.0.0 */, SegmentInfos.readLatestCommit(tmpDir).getIndexCreatedVersionMajor());
IndexWriter w = new IndexWriter(tmpDir, newIndexWriterConfig().setOpenMode(openMode));
w.commit();
w.close();
@ -45,7 +46,7 @@ public class TestIndexWriterOnOldIndex extends LuceneTestCase {
assertEquals(Version.LATEST.major, SegmentInfos.readLatestCommit(tmpDir).getIndexCreatedVersionMajor());
break;
default:
assertEquals(6 /** 6.3.0 */, SegmentInfos.readLatestCommit(tmpDir).getIndexCreatedVersionMajor());
assertEquals(7 /** 7.0.0 */, SegmentInfos.readLatestCommit(tmpDir).getIndexCreatedVersionMajor());
}
tmpDir.close();
}
@ -59,6 +59,7 @@ public class TestManyPointsInOldIndex extends LuceneTestCase {
}

public void testCheckOldIndex() throws IOException {
assumeTrue("Reenable when 7.0 is released", false);
Path path = createTempDir("manypointsindex");
InputStream resource = getClass().getResourceAsStream("manypointsindex.zip");
assertNotNull("manypointsindex not found", resource);
Binary file not shown.
@ -744,13 +744,13 @@ public final class CheckIndex implements Closeable {
segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast);

// Test the Term Index
segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose, failFast, version);
segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose, failFast);

// Test Stored Fields
segInfoStat.storedFieldStatus = testStoredFields(reader, infoStream, failFast);

// Test Term Vectors
segInfoStat.termVectorStatus = testTermVectors(reader, infoStream, verbose, crossCheckTermVectors, failFast, version);
segInfoStat.termVectorStatus = testTermVectors(reader, infoStream, verbose, crossCheckTermVectors, failFast);

// Test Docvalues
segInfoStat.docValuesStatus = testDocValues(reader, infoStream, failFast);
@ -1209,7 +1209,7 @@ public final class CheckIndex implements Closeable {
* checks Fields api is consistent with itself.
* searcher is optional, to verify with queries. Can be null.
*/
private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose, Version version) throws IOException {
private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose) throws IOException {
// TODO: we should probably return our own stats thing...?!
long startNS;
if (doPrint) {
@ -1465,20 +1465,17 @@ public final class CheckIndex implements Closeable {
if (hasOffsets) {
int startOffset = postings.startOffset();
int endOffset = postings.endOffset();
// In Lucene 7 we fixed IndexWriter to also enforce term vector offsets
if (isVectors == false || version.onOrAfter(Version.LUCENE_7_0_0)) {
if (startOffset < 0) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
}
if (startOffset < lastOffset) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset + "; consider using the FixBrokenOffsets tool in Lucene's backward-codecs module to correct your index");
}
if (endOffset < 0) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
}
if (endOffset < startOffset) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
}
if (startOffset < 0) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
}
if (startOffset < lastOffset) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset + "; consider using the FixBrokenOffsets tool in Lucene's backward-codecs module to correct your index");
}
if (endOffset < 0) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
}
if (endOffset < startOffset) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
}
lastOffset = startOffset;
}
@ -1745,15 +1742,15 @@ public final class CheckIndex implements Closeable {
|
|||
* Test the term index.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream, Version version) throws IOException {
|
||||
return testPostings(reader, infoStream, false, false, version);
|
||||
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream) throws IOException {
|
||||
return testPostings(reader, infoStream, false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the term index.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream, boolean verbose, boolean failFast, Version version) throws IOException {
|
||||
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream, boolean verbose, boolean failFast) throws IOException {
|
||||
|
||||
// TODO: we should go and verify term vectors match, if
|
||||
// crossCheckTermVectors is on...
|
||||
|
@ -1768,7 +1765,7 @@ public final class CheckIndex implements Closeable {
|
|||
|
||||
final Fields fields = reader.getPostingsReader().getMergeInstance();
|
||||
final FieldInfos fieldInfos = reader.getFieldInfos();
|
||||
status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, true, false, infoStream, verbose, version);
|
||||
status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, true, false, infoStream, verbose);
|
||||
} catch (Throwable e) {
|
||||
if (failFast) {
|
||||
throw IOUtils.rethrowAlways(e);
|
||||
|
@ -2377,15 +2374,15 @@ public final class CheckIndex implements Closeable {
|
|||
* Test term vectors.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, Version version) throws IOException {
|
||||
return testTermVectors(reader, infoStream, false, false, false, version);
|
||||
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream) throws IOException {
|
||||
return testTermVectors(reader, infoStream, false, false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test term vectors.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast, Version version) throws IOException {
|
||||
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast) throws IOException {
|
||||
long startNS = System.nanoTime();
|
||||
final Status.TermVectorStatus status = new Status.TermVectorStatus();
|
||||
final FieldInfos fieldInfos = reader.getFieldInfos();
|
||||
|
@ -2425,7 +2422,7 @@ public final class CheckIndex implements Closeable {
|
|||
|
||||
if (tfv != null) {
|
||||
// First run with no deletions:
|
||||
checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose, version);
|
||||
checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose);
|
||||
|
||||
// Only agg stats if the doc is live:
|
||||
final boolean doStats = liveDocs == null || liveDocs.get(j);
|
||||
|
|
|
@ -42,7 +42,7 @@ public class IndexFormatTooOldException extends IOException {
* @lucene.internal */
public IndexFormatTooOldException(String resourceDescription, String reason) {
super("Format version is not supported (resource " + resourceDescription + "): " +
reason + ". This version of Lucene only supports indexes created with release 6.0 and later.");
reason + ". This version of Lucene only supports indexes created with release 7.0 and later.");
this.resourceDescription = resourceDescription;
this.reason = reason;
this.version = null;
@ -1170,9 +1170,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
}
}

/** Confirms that the incoming index sort (if any) matches the existing index sort (if any).
* This is unfortunately just best effort, because it could be the old index only has unsorted flushed segments built
* before {@link Version#LUCENE_6_5_0} (flushed segments are sorted in Lucene 7.0). */
/** Confirms that the incoming index sort (if any) matches the existing index sort (if any). */
private void validateIndexSort() throws CorruptIndexException {
Sort indexSort = config.getIndexSort();
if (indexSort != null) {
@ -1180,7 +1178,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
Sort segmentIndexSort = info.info.getIndexSort();
if (segmentIndexSort != null && indexSort.equals(segmentIndexSort) == false) {
throw new IllegalArgumentException("cannot change previous indexSort=" + segmentIndexSort + " (from segment=" + info + ") to new indexSort=" + indexSort);
} else if (segmentIndexSort == null && info.info.getVersion().onOrAfter(Version.LUCENE_6_5_0)) {
} else if (segmentIndexSort == null) {
// Flushed segments are not sorted if they were built with a version prior to 6.5.0
throw new CorruptIndexException("segment not sorted with indexSort=" + segmentIndexSort, info.info.toString());
}
@ -312,7 +312,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
CodecUtil.checkIndexHeaderSuffix(input, Long.toString(generation, Character.MAX_RADIX));

Version luceneVersion = Version.fromBits(input.readVInt(), input.readVInt(), input.readVInt());
if (luceneVersion.onOrAfter(Version.LUCENE_6_0_0) == false) {
if (luceneVersion.onOrAfter(Version.LUCENE_7_0_0) == false) {
// TODO: should we check indexCreatedVersion instead?
throw new IndexFormatTooOldException(input, "this index is too old (version: " + luceneVersion + ")");
}
@ -118,16 +118,9 @@ public class BM25Similarity extends Similarity {
|
|||
}
|
||||
|
||||
/** Cache of decoded bytes. */
|
||||
private static final float[] OLD_LENGTH_TABLE = new float[256];
|
||||
private static final float[] LENGTH_TABLE = new float[256];
|
||||
|
||||
static {
|
||||
for (int i = 1; i < 256; i++) {
|
||||
float f = SmallFloat.byte315ToFloat((byte)i);
|
||||
OLD_LENGTH_TABLE[i] = 1.0f / (f*f);
|
||||
}
|
||||
OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
|
||||
|
||||
for (int i = 0; i < 256; i++) {
|
||||
LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
|
||||
}
|
||||
|
@ -137,12 +130,7 @@ public class BM25Similarity extends Similarity {
@Override
public final long computeNorm(FieldInvertState state) {
final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor();
if (indexCreatedVersionMajor >= 7) {
return SmallFloat.intToByte4(numTerms);
} else {
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
}
return SmallFloat.intToByte4(numTerms);
}

/**
@ -205,19 +193,17 @@ public class BM25Similarity extends Similarity {
|
|||
Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
|
||||
float avgdl = avgFieldLength(collectionStats);
|
||||
|
||||
float[] oldCache = new float[256];
|
||||
float[] cache = new float[256];
|
||||
for (int i = 0; i < cache.length; i++) {
|
||||
oldCache[i] = k1 * ((1 - b) + b * OLD_LENGTH_TABLE[i] / avgdl);
|
||||
cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl);
|
||||
}
|
||||
return new BM25Stats(collectionStats.field(), boost, idf, avgdl, oldCache, cache);
|
||||
return new BM25Stats(collectionStats.field(), boost, idf, avgdl, cache);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
|
||||
BM25Stats bm25stats = (BM25Stats) stats;
|
||||
return new BM25DocScorer(bm25stats, context.reader().getMetaData().getCreatedVersionMajor(), context.reader().getNormValues(bm25stats.field));
|
||||
return new BM25DocScorer(bm25stats, context.reader().getNormValues(bm25stats.field));
|
||||
}
|
||||
|
||||
private class BM25DocScorer extends SimScorer {
|
||||
|
@ -229,17 +215,12 @@ public class BM25Similarity extends Similarity {
|
|||
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
|
||||
private final float[] cache;
|
||||
|
||||
BM25DocScorer(BM25Stats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
|
||||
BM25DocScorer(BM25Stats stats, NumericDocValues norms) throws IOException {
|
||||
this.stats = stats;
|
||||
this.weightValue = stats.weight * (k1 + 1);
|
||||
this.norms = norms;
|
||||
if (indexCreatedVersionMajor >= 7) {
|
||||
lengthCache = LENGTH_TABLE;
|
||||
cache = stats.cache;
|
||||
} else {
|
||||
lengthCache = OLD_LENGTH_TABLE;
|
||||
cache = stats.oldCache;
|
||||
}
|
||||
lengthCache = LENGTH_TABLE;
|
||||
cache = stats.cache;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -287,16 +268,15 @@ public class BM25Similarity extends Similarity {
|
|||
/** field name, for pulling norms */
|
||||
private final String field;
|
||||
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl)
|
||||
* for both OLD_LENGTH_TABLE and LENGTH_TABLE */
|
||||
private final float[] oldCache, cache;
|
||||
* for LENGTH_TABLE */
|
||||
private final float[] cache;
|
||||
|
||||
BM25Stats(String field, float boost, Explanation idf, float avgdl, float[] oldCache, float[] cache) {
|
||||
BM25Stats(String field, float boost, Explanation idf, float avgdl, float[] cache) {
|
||||
this.field = field;
|
||||
this.boost = boost;
|
||||
this.idf = idf;
|
||||
this.avgdl = avgdl;
|
||||
this.weight = idf.getValue() * boost;
|
||||
this.oldCache = oldCache;
|
||||
this.cache = cache;
|
||||
}
|
||||
|
||||
|
|
|
@ -191,7 +191,6 @@ public abstract class SimilarityBase extends Similarity {
|
|||
|
||||
@Override
|
||||
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
|
||||
int indexCreatedVersionMajor = context.reader().getMetaData().getCreatedVersionMajor();
|
||||
if (stats instanceof MultiSimilarity.MultiStats) {
|
||||
// a multi term query (e.g. phrase). return the summation,
|
||||
// scoring almost as if it were boolean query
|
||||
|
@ -199,12 +198,12 @@ public abstract class SimilarityBase extends Similarity {
|
|||
SimScorer subScorers[] = new SimScorer[subStats.length];
|
||||
for (int i = 0; i < subScorers.length; i++) {
|
||||
BasicStats basicstats = (BasicStats) subStats[i];
|
||||
subScorers[i] = new BasicSimScorer(basicstats, indexCreatedVersionMajor, context.reader().getNormValues(basicstats.field));
|
||||
subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
|
||||
}
|
||||
return new MultiSimilarity.MultiSimScorer(subScorers);
|
||||
} else {
|
||||
BasicStats basicstats = (BasicStats) stats;
|
||||
return new BasicSimScorer(basicstats, indexCreatedVersionMajor, context.reader().getNormValues(basicstats.field));
|
||||
return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -218,16 +217,9 @@ public abstract class SimilarityBase extends Similarity {
|
|||
// ------------------------------ Norm handling ------------------------------
|
||||
|
||||
/** Cache of decoded bytes. */
|
||||
private static final float[] OLD_LENGTH_TABLE = new float[256];
|
||||
private static final float[] LENGTH_TABLE = new float[256];
|
||||
|
||||
static {
|
||||
for (int i = 1; i < 256; i++) {
|
||||
float f = SmallFloat.byte315ToFloat((byte)i);
|
||||
OLD_LENGTH_TABLE[i] = 1.0f / (f*f);
|
||||
}
|
||||
OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
|
||||
|
||||
for (int i = 0; i < 256; i++) {
|
||||
LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
|
||||
}
|
||||
|
@ -241,12 +233,7 @@ public abstract class SimilarityBase extends Similarity {
|
|||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
else
|
||||
numTerms = state.getLength();
|
||||
int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor();
|
||||
if (indexCreatedVersionMajor >= 7) {
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
} else {
|
||||
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
|
||||
}
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
}
|
||||
|
||||
// ----------------------------- Static methods ------------------------------
|
||||
|
@ -268,12 +255,10 @@ public abstract class SimilarityBase extends Similarity {
|
|||
final class BasicSimScorer extends SimScorer {
|
||||
private final BasicStats stats;
|
||||
private final NumericDocValues norms;
|
||||
private final float[] normCache;
|
||||
|
||||
BasicSimScorer(BasicStats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
|
||||
BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException {
|
||||
this.stats = stats;
|
||||
this.norms = norms;
|
||||
this.normCache = indexCreatedVersionMajor >= 7 ? LENGTH_TABLE : OLD_LENGTH_TABLE;
|
||||
}
|
||||
|
||||
float getLengthValue(int doc) throws IOException {
|
||||
|
@ -281,7 +266,7 @@ public abstract class SimilarityBase extends Similarity {
|
|||
return 1F;
|
||||
}
|
||||
if (norms.advanceExact(doc)) {
|
||||
return normCache[Byte.toUnsignedInt((byte) norms.longValue())];
|
||||
return LENGTH_TABLE[Byte.toUnsignedInt((byte) norms.longValue())];
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -376,15 +376,6 @@ import org.apache.lucene.util.SmallFloat;
|
|||
*/
|
||||
public abstract class TFIDFSimilarity extends Similarity {
|
||||
|
||||
/** Cache of decoded bytes. */
|
||||
static final float[] OLD_NORM_TABLE = new float[256];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < 256; i++) {
|
||||
OLD_NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sole constructor. (For invocation by subclass
|
||||
* constructors, typically implicit.)
|
||||
|
@ -516,11 +507,7 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
else
|
||||
numTerms = state.getLength();
|
||||
if (state.getIndexCreatedVersionMajor() >= 7) {
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
} else {
|
||||
return SmallFloat.floatToByte315(lengthNorm(numTerms));
|
||||
}
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
}
|
||||
|
||||
/** Computes the amount of a sloppy phrase match, based on an edit distance.
|
||||
|
@ -569,14 +556,8 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
@Override
|
||||
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
|
||||
IDFStats idfstats = (IDFStats) stats;
|
||||
final float[] normTable;
|
||||
if (context.reader().getMetaData().getCreatedVersionMajor() >= 7) {
|
||||
// the norms only encode the length, we need a translation table that depends on how lengthNorm is implemented
|
||||
normTable = idfstats.normTable;
|
||||
} else {
|
||||
// the norm is directly encoded in the index
|
||||
normTable = OLD_NORM_TABLE;
|
||||
}
|
||||
// the norms only encode the length, we need a translation table that depends on how lengthNorm is implemented
|
||||
final float[] normTable = idfstats.normTable;
|
||||
return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field), normTable);
|
||||
}
|
||||
|
||||
|
|
|
@ -32,104 +32,21 @@ import java.util.Locale;
|
|||
*/
|
||||
public final class Version {
|
||||
|
||||
/** Match settings and bugs in Lucene's 6.0 release.
|
||||
* @deprecated (7.0.0) Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_0_0 = new Version(6, 0, 0);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 6.0.1 release.
|
||||
* @deprecated Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_0_1 = new Version(6, 0, 1);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 6.1.0 release.
|
||||
* @deprecated Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_1_0 = new Version(6, 1, 0);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 6.2.0 release.
|
||||
* @deprecated Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_2_0 = new Version(6, 2, 0);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 6.2.1 release.
|
||||
* @deprecated Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_2_1 = new Version(6, 2, 1);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 6.3.0 release.
|
||||
* @deprecated Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_3_0 = new Version(6, 3, 0);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 6.4.0 release.
|
||||
* @deprecated Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_4_0 = new Version(6, 4, 0);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 6.4.1 release.
|
||||
* @deprecated Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_4_1 = new Version(6, 4, 1);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 6.4.2 release.
|
||||
* @deprecated Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_4_2 = new Version(6, 4, 2);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 6.5.0 release.
|
||||
* @deprecated Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_5_0 = new Version(6, 5, 0);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 6.5.1 release.
|
||||
* @deprecated Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_5_1 = new Version(6, 5, 1);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 6.6.0 release.
|
||||
* @deprecated Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_6_0 = new Version(6, 6, 0);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 6.7.0 release.
|
||||
* @deprecated Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_7_0 = new Version(6, 7, 0);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 7.0.0 release.
|
||||
* <p>
|
||||
* Use this to get the latest & greatest settings, bug
|
||||
* fixes, etc, for Lucene.
|
||||
* @deprecated (8.0.0) Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_7_0_0 = new Version(7, 0, 0);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 8.0.0 release.
|
||||
* <p>
|
||||
* Use this to get the latest & greatest settings, bug
|
||||
* fixes, etc, for Lucene.
|
||||
*/
|
||||
public static final Version LUCENE_8_0_0 = new Version(8, 0, 0);
|
||||
|
||||
// To add a new version:
|
||||
// * Only add above this comment
|
||||
// * If the new version is the newest, change LATEST below and deprecate the previous LATEST
|
||||
|
@ -149,7 +66,7 @@ public final class Version {
* some defaults may have changed and may break functionality
* in your application.
*/
public static final Version LATEST = LUCENE_7_0_0;
public static final Version LATEST = LUCENE_8_0_0;

/**
* Constant for backwards compatibility.
@ -32,8 +32,8 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
public void testIllegalCreatedVersion() {
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> new SegmentInfos(5));
|
||||
assertEquals("indexCreatedVersionMajor must be >= 6, got: 5", e.getMessage());
|
||||
e = expectThrows(IllegalArgumentException.class, () -> new SegmentInfos(8));
|
||||
assertEquals("indexCreatedVersionMajor is in the future: 8", e.getMessage());
|
||||
e = expectThrows(IllegalArgumentException.class, () -> new SegmentInfos(Version.LATEST.major + 1));
|
||||
assertEquals("indexCreatedVersionMajor is in the future: " + (Version.LATEST.major + 1), e.getMessage());
|
||||
}
|
||||
|
||||
// LUCENE-5954
|
||||
|
@ -56,7 +56,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
Codec codec = Codec.getDefault();
|
||||
|
||||
SegmentInfos sis = new SegmentInfos(Version.LATEST.major);
|
||||
SegmentInfo info = new SegmentInfo(dir, Version.LUCENE_7_0_0, Version.LUCENE_7_0_0, "_0", 1, false, Codec.getDefault(),
|
||||
SegmentInfo info = new SegmentInfo(dir, Version.LUCENE_8_0_0, Version.LUCENE_8_0_0, "_0", 1, false, Codec.getDefault(),
|
||||
Collections.<String,String>emptyMap(), id, Collections.<String,String>emptyMap(), null);
|
||||
info.setFiles(Collections.<String>emptySet());
|
||||
codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);
|
||||
|
@ -65,7 +65,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
sis.add(commitInfo);
|
||||
sis.commit(dir);
|
||||
sis = SegmentInfos.readLatestCommit(dir);
|
||||
assertEquals(Version.LUCENE_7_0_0, sis.getMinSegmentLuceneVersion());
|
||||
assertEquals(Version.LUCENE_8_0_0, sis.getMinSegmentLuceneVersion());
|
||||
assertEquals(Version.LATEST, sis.getCommitLuceneVersion());
|
||||
dir.close();
|
||||
}
|
||||
|
@ -78,14 +78,14 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
Codec codec = Codec.getDefault();
|
||||
|
||||
SegmentInfos sis = new SegmentInfos(Version.LATEST.major);
|
||||
SegmentInfo info = new SegmentInfo(dir, Version.LUCENE_7_0_0, Version.LUCENE_7_0_0, "_0", 1, false, Codec.getDefault(),
|
||||
SegmentInfo info = new SegmentInfo(dir, Version.LUCENE_8_0_0, Version.LUCENE_8_0_0, "_0", 1, false, Codec.getDefault(),
|
||||
Collections.<String,String>emptyMap(), id, Collections.<String,String>emptyMap(), null);
|
||||
info.setFiles(Collections.<String>emptySet());
|
||||
codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);
|
||||
SegmentCommitInfo commitInfo = new SegmentCommitInfo(info, 0, -1, -1, -1);
|
||||
sis.add(commitInfo);
|
||||
|
||||
info = new SegmentInfo(dir, Version.LUCENE_7_0_0, Version.LUCENE_7_0_0, "_1", 1, false, Codec.getDefault(),
|
||||
info = new SegmentInfo(dir, Version.LUCENE_8_0_0, Version.LUCENE_8_0_0, "_1", 1, false, Codec.getDefault(),
|
||||
Collections.<String,String>emptyMap(), id, Collections.<String,String>emptyMap(), null);
|
||||
info.setFiles(Collections.<String>emptySet());
|
||||
codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);
|
||||
|
@ -94,7 +94,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
|
||||
sis.commit(dir);
|
||||
sis = SegmentInfos.readLatestCommit(dir);
|
||||
assertEquals(Version.LUCENE_7_0_0, sis.getMinSegmentLuceneVersion());
|
||||
assertEquals(Version.LUCENE_8_0_0, sis.getMinSegmentLuceneVersion());
|
||||
assertEquals(Version.LATEST, sis.getCommitLuceneVersion());
|
||||
dir.close();
|
||||
}
|
||||
|
|
|
@ -17,24 +17,8 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.SegmentInfos;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
public class TestBM25Similarity extends LuceneTestCase {
|
||||
|
||||
|
@ -77,32 +61,6 @@ public class TestBM25Similarity extends LuceneTestCase {
|
|||
assertTrue(expected.getMessage().contains("illegal b value"));
|
||||
}
|
||||
|
||||
public void testLengthEncodingBackwardCompatibility() throws IOException {
|
||||
Similarity similarity = new BM25Similarity();
|
||||
for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major}) {
|
||||
for (int length : new int[] {1, 2, 4}) { // these length values are encoded accurately on both cases
|
||||
Directory dir = newDirectory();
|
||||
// set the version on the directory
|
||||
new SegmentInfos(indexCreatedVersionMajor).commit(dir);
|
||||
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity));
|
||||
Document doc = new Document();
|
||||
String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
|
||||
doc.add(new TextField("foo", value, Store.NO));
|
||||
w.addDocument(doc);
|
||||
IndexReader reader = DirectoryReader.open(w);
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
searcher.setSimilarity(similarity);
|
||||
Explanation expl = searcher.explain(new TermQuery(new Term("foo", "b")), 0);
|
||||
Explanation docLen = findExplanation(expl, "fieldLength");
|
||||
assertNotNull(docLen);
|
||||
assertEquals(docLen.toString(), length, (int) docLen.getValue());
|
||||
w.close();
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static Explanation findExplanation(Explanation expl, String text) {
|
||||
if (expl.getDescription().equals(text)) {
|
||||
return expl;
|
||||
|
|
|
@ -19,29 +19,24 @@ package org.apache.lucene.search.similarities;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.MultiReader;
|
||||
import org.apache.lucene.index.SegmentInfos;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.DisjunctionMaxQuery;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity.IDFStats;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
@ -163,16 +158,6 @@ public class TestClassicSimilarity extends LuceneTestCase {
|
|||
|
||||
public void testSaneNormValues() throws IOException {
|
||||
ClassicSimilarity sim = new ClassicSimilarity();
|
||||
for (int i = 0; i < 256; i++) {
|
||||
float boost = TFIDFSimilarity.OLD_NORM_TABLE[i];
|
||||
assertFalse("negative boost: " + boost + ", byte=" + i, boost < 0.0f);
|
||||
assertFalse("inf bost: " + boost + ", byte=" + i, Float.isInfinite(boost));
|
||||
assertFalse("nan boost for byte=" + i, Float.isNaN(boost));
|
||||
if (i > 0) {
|
||||
assertTrue("boost is not increasing: " + boost + ",byte=" + i, boost > TFIDFSimilarity.OLD_NORM_TABLE[i-1]);
|
||||
}
|
||||
}
|
||||
|
||||
TFIDFSimilarity.IDFStats stats = (IDFStats) sim.computeWeight(1f, new IndexSearcher(new MultiReader()).collectionStatistics("foo"));
|
||||
for (int i = 0; i < 256; i++) {
|
||||
float boost = stats.normTable[i];
|
||||
|
@ -185,46 +170,6 @@ public class TestClassicSimilarity extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testNormEncodingBackwardCompatibility() throws IOException {
|
||||
Similarity similarity = new ClassicSimilarity();
|
||||
for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major}) {
|
||||
for (int length : new int[] {1, 4, 16 }) { // these length values are encoded accurately on both cases
|
||||
Directory dir = newDirectory();
|
||||
// set the version on the directory
|
||||
new SegmentInfos(indexCreatedVersionMajor).commit(dir);
|
||||
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity));
|
||||
Document doc = new Document();
|
||||
String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
|
||||
doc.add(new TextField("foo", value, Store.NO));
|
||||
w.addDocument(doc);
|
||||
IndexReader reader = DirectoryReader.open(w);
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
searcher.setSimilarity(similarity);
|
||||
Explanation expl = searcher.explain(new TermQuery(new Term("foo", "b")), 0);
|
||||
Explanation fieldNorm = findExplanation(expl, "fieldNorm");
|
||||
assertNotNull(fieldNorm);
|
||||
assertEquals(fieldNorm.toString(), 1/Math.sqrt(length), fieldNorm.getValue(), 0f);
|
||||
w.close();
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static Explanation findExplanation(Explanation expl, String text) {
|
||||
if (expl.getDescription().startsWith(text)) {
|
||||
return expl;
|
||||
} else {
|
||||
for (Explanation sub : expl.getDetails()) {
|
||||
Explanation match = findExplanation(sub, text);
|
||||
if (match != null) {
|
||||
return match;
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public void testSameNormsAsBM25() {
|
||||
ClassicSimilarity sim1 = new ClassicSimilarity();
|
||||
BM25Similarity sim2 = new BM25Similarity();
|
||||
|
|
|
@ -20,23 +20,16 @@ package org.apache.lucene.search.similarities;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.SegmentInfos;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
|
@ -44,14 +37,11 @@ import org.apache.lucene.search.Query;
|
|||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimWeight;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
||||
|
||||
/**
|
||||
* Tests the {@link SimilarityBase}-based Similarities. Contains unit tests and
|
||||
* integration tests for all Similarities and correctness tests for a select
|
||||
|
@ -609,33 +599,4 @@ public class TestSimilarityBase extends LuceneTestCase {
|
|||
actual.setDiscountOverlaps(true);
|
||||
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
|
||||
}
|
||||
|
||||
public void testLengthEncodingBackwardCompatibility() throws IOException {
|
||||
Similarity similarity = RandomPicks.randomFrom(random(), sims);
|
||||
for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major}) {
|
||||
for (int length : new int[] {1, 2, 4}) { // these length values are encoded accurately on both cases
|
||||
Directory dir = newDirectory();
|
||||
// set the version on the directory
|
||||
new SegmentInfos(indexCreatedVersionMajor).commit(dir);
|
||||
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity));
|
||||
Document doc = new Document();
|
||||
String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
|
||||
doc.add(new TextField("foo", value, Store.NO));
|
||||
w.addDocument(doc);
|
||||
IndexReader reader = DirectoryReader.open(w);
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
searcher.setSimilarity(similarity);
|
||||
Term term = new Term("foo", "b");
|
||||
TermContext context = TermContext.build(reader.getContext(), term);
|
||||
SimWeight simWeight = similarity.computeWeight(1f, searcher.collectionStatistics("foo"), searcher.termStatistics(term, context));
|
||||
SimilarityBase.BasicSimScorer simScorer = (SimilarityBase.BasicSimScorer) similarity.simScorer(simWeight, reader.leaves().get(0));
|
||||
float docLength = simScorer.getLengthValue(0);
|
||||
assertEquals(length, (int) docLength);
|
||||
|
||||
w.close();
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -32,25 +32,26 @@ public class TestVersion extends LuceneTestCase {
|
|||
assertTrue("LATEST must be always onOrAfter("+v+")", Version.LATEST.onOrAfter(v));
|
||||
}
|
||||
}
|
||||
assertTrue(Version.LUCENE_7_0_0.onOrAfter(Version.LUCENE_6_0_0));;
|
||||
assertTrue(Version.LUCENE_8_0_0.onOrAfter(Version.LUCENE_7_0_0));;
|
||||
}
|
||||
|
||||
public void testToString() {
|
||||
assertEquals("6.0.0", Version.LUCENE_6_0_0.toString());
|
||||
assertEquals("7.0.0", Version.LUCENE_7_0_0.toString());
|
||||
assertEquals("8.0.0", Version.LUCENE_8_0_0.toString());
|
||||
}
|
||||
|
||||
public void testParseLeniently() throws Exception {
|
||||
assertEquals(Version.LUCENE_6_0_0, Version.parseLeniently("6.0"));
|
||||
assertEquals(Version.LUCENE_6_0_0, Version.parseLeniently("6.0.0"));
|
||||
assertEquals(Version.LUCENE_6_0_0, Version.parseLeniently("LUCENE_60"));
|
||||
assertEquals(Version.LUCENE_6_0_0, Version.parseLeniently("LUCENE_6_0"));
|
||||
assertEquals(Version.LUCENE_6_0_0, Version.parseLeniently("LUCENE_6_0_0"));
|
||||
assertEquals(Version.LUCENE_7_0_0, Version.parseLeniently("7.0"));
|
||||
assertEquals(Version.LUCENE_7_0_0, Version.parseLeniently("7.0.0"));
|
||||
assertEquals(Version.LUCENE_7_0_0, Version.parseLeniently("LUCENE_70"));
|
||||
assertEquals(Version.LUCENE_7_0_0, Version.parseLeniently("LUCENE_7_0"));
|
||||
assertEquals(Version.LUCENE_7_0_0, Version.parseLeniently("LUCENE_7_0_0"));
|
||||
assertEquals(Version.LUCENE_8_0_0, Version.parseLeniently("8.0"));
|
||||
assertEquals(Version.LUCENE_8_0_0, Version.parseLeniently("8.0.0"));
|
||||
assertEquals(Version.LUCENE_8_0_0, Version.parseLeniently("LUCENE_80"));
|
||||
assertEquals(Version.LUCENE_8_0_0, Version.parseLeniently("LUCENE_8_0"));
|
||||
assertEquals(Version.LUCENE_8_0_0, Version.parseLeniently("LUCENE_8_0_0"));
|
||||
|
||||
assertEquals(Version.LATEST, Version.parseLeniently("LATEST"));
|
||||
assertEquals(Version.LATEST, Version.parseLeniently("latest"));
|
||||
assertEquals(Version.LATEST, Version.parseLeniently("LUCENE_CURRENT"));
|
||||
|
@ -74,9 +75,9 @@ public class TestVersion extends LuceneTestCase {
|
|||
assertTrue(expected.getMessage().contains("LUCENE61"));
|
||||
|
||||
expected = expectThrows(ParseException.class, () -> {
|
||||
Version.parseLeniently("LUCENE_6.0.0");
|
||||
Version.parseLeniently("LUCENE_7.0.0");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("LUCENE_6.0.0"));
|
||||
assertTrue(expected.getMessage().contains("LUCENE_7.0.0"));
|
||||
}
|
||||
|
||||
public void testParseLenientlyOnAllConstants() throws Exception {
|
||||
|
@ -94,8 +95,8 @@ public class TestVersion extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testParse() throws Exception {
|
||||
assertEquals(Version.LUCENE_6_0_0, Version.parse("6.0.0"));
|
||||
assertEquals(Version.LUCENE_7_0_0, Version.parse("7.0.0"));
|
||||
assertEquals(Version.LUCENE_8_0_0, Version.parse("8.0.0"));
|
||||
|
||||
// Version does not pass judgement on the major version:
|
||||
assertEquals(1, Version.parse("1.0").major);
|
||||
|
@ -103,69 +104,69 @@ public class TestVersion extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testForwardsCompatibility() throws Exception {
|
||||
assertTrue(Version.parse("6.10.20").onOrAfter(Version.LUCENE_6_0_0));
|
||||
assertTrue(Version.parse("7.10.20").onOrAfter(Version.LUCENE_7_0_0));
|
||||
}
|
||||
|
||||
public void testParseExceptions() {
|
||||
ParseException expected = expectThrows(ParseException.class, () -> {
|
||||
Version.parse("LUCENE_6_0_0");
|
||||
Version.parse("LUCENE_7_0_0");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("LUCENE_6_0_0"));
|
||||
assertTrue(expected.getMessage().contains("LUCENE_7_0_0"));
|
||||
|
||||
expected = expectThrows(ParseException.class, () -> {
|
||||
Version.parse("6.256");
|
||||
Version.parse("7.256");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("6.256"));
|
||||
assertTrue(expected.getMessage().contains("7.256"));
|
||||
|
||||
expected = expectThrows(ParseException.class, () -> {
|
||||
Version.parse("6.-1");
|
||||
Version.parse("7.-1");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("6.-1"));
|
||||
assertTrue(expected.getMessage().contains("7.-1"));
|
||||
|
||||
expected = expectThrows(ParseException.class, () -> {
|
||||
Version.parse("6.1.256");
|
||||
Version.parse("7.1.256");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("6.1.256"));
|
||||
assertTrue(expected.getMessage().contains("7.1.256"));
|
||||
|
||||
expected = expectThrows(ParseException.class, () -> {
|
||||
Version.parse("6.1.-1");
|
||||
Version.parse("7.1.-1");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("6.1.-1"));
|
||||
assertTrue(expected.getMessage().contains("7.1.-1"));
|
||||
|
||||
expected = expectThrows(ParseException.class, () -> {
|
||||
Version.parse("6.1.1.3");
|
||||
Version.parse("7.1.1.3");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("6.1.1.3"));
|
||||
assertTrue(expected.getMessage().contains("7.1.1.3"));
|
||||
|
||||
expected = expectThrows(ParseException.class, () -> {
|
||||
Version.parse("6.1.1.-1");
|
||||
Version.parse("7.1.1.-1");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("6.1.1.-1"));
|
||||
assertTrue(expected.getMessage().contains("7.1.1.-1"));
|
||||
|
||||
expected = expectThrows(ParseException.class, () -> {
|
||||
Version.parse("6.1.1.1");
|
||||
Version.parse("7.1.1.1");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("6.1.1.1"));
|
||||
assertTrue(expected.getMessage().contains("7.1.1.1"));
|
||||
|
||||
expected = expectThrows(ParseException.class, () -> {
|
||||
Version.parse("6.1.1.2");
|
||||
Version.parse("7.1.1.2");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("6.1.1.2"));
|
||||
assertTrue(expected.getMessage().contains("7.1.1.2"));
|
||||
|
||||
expected = expectThrows(ParseException.class, () -> {
|
||||
Version.parse("6.0.0.0");
|
||||
Version.parse("7.0.0.0");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("6.0.0.0"));
|
||||
assertTrue(expected.getMessage().contains("7.0.0.0"));
|
||||
|
||||
expected = expectThrows(ParseException.class, () -> {
|
||||
Version.parse("6.0.0.1.42");
|
||||
Version.parse("7.0.0.1.42");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("6.0.0.1.42"));
|
||||
assertTrue(expected.getMessage().contains("7.0.0.1.42"));
|
||||
|
||||
expected = expectThrows(ParseException.class, () -> {
|
||||
Version.parse("6..0.1");
|
||||
Version.parse("7..0.1");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("6..0.1"));
|
||||
assertTrue(expected.getMessage().contains("7..0.1"));
|
||||
}
|
||||
|
||||
public void testDeprecations() throws Exception {
|
||||
|
|
|
@ -334,9 +334,9 @@ public final class TestUtil {
CheckIndex.testLiveDocs(codecReader, infoStream, true);
CheckIndex.testFieldInfos(codecReader, infoStream, true);
CheckIndex.testFieldNorms(codecReader, infoStream, true);
CheckIndex.testPostings(codecReader, infoStream, false, true, Version.LUCENE_7_0_0);
CheckIndex.testPostings(codecReader, infoStream, false, true);
CheckIndex.testStoredFields(codecReader, infoStream, true);
CheckIndex.testTermVectors(codecReader, infoStream, false, crossCheckTermVectors, true, Version.LUCENE_7_0_0);
CheckIndex.testTermVectors(codecReader, infoStream, false, crossCheckTermVectors, true);
CheckIndex.testDocValues(codecReader, infoStream, true);
CheckIndex.testPoints(codecReader, infoStream, true);
@ -2,7 +2,7 @@

# RELEASE MANAGER must change this file after creating a release and
# enter new base version (format "x.y.z", no prefix/appendix):
version.base=7.0.0
version.base=8.0.0

# Other version property defaults, don't change:
version.suffix=SNAPSHOT
@ -16,6 +16,23 @@ In this release, there is an example Solr server including a bundled
servlet container in the directory named "example".
See the Quick Start guide at http://lucene.apache.org/solr/quickstart.html

==================  8.0.0 ==================

Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.

Versions of Major Components
---------------------
Apache Tika 1.13
Carrot2 3.15.0
Velocity 1.7 and Velocity Tools 2.0
Apache UIMA 2.3.1
Apache ZooKeeper 3.4.10
Jetty 9.3.14.v20161028


(No Changes)


==================  7.0.0 ==================

Versions of Major Components
@ -421,10 +421,10 @@ public final class FieldTypePluginLoader
Version version = (configuredVersion != null) ?
Config.parseLuceneVersionString(configuredVersion) : schema.getDefaultLuceneMatchVersion();

if (!version.onOrAfter(Version.LUCENE_6_0_0)) {
if (!version.onOrAfter(Version.LUCENE_7_0_0)) {
log.warn(pluginClassName + " is using deprecated " + version +
" emulation. You should at some point declare and reindex to at least 6.0, because " +
"5.x emulation is deprecated and will be removed in 7.0");
" emulation. You should at some point declare and reindex to at least 7.0, because " +
"6.x emulation is deprecated and will be removed in 8.0");
}
return version;
}
@ -68,7 +68,6 @@ import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.response.SchemaXmlWriter;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.search.similarities.ClassicSimilarityFactory;
import org.apache.solr.search.similarities.SchemaSimilarityFactory;
import org.apache.solr.util.DOMUtil;
import org.apache.solr.util.plugin.SolrCoreAware;
@ -475,8 +474,7 @@ public class IndexSchema {
Node node = (Node) xpath.evaluate(expression, document, XPathConstants.NODE);
similarityFactory = readSimilarity(loader, node);
if (similarityFactory == null) {
final boolean modernSim = getDefaultLuceneMatchVersion().onOrAfter(Version.LUCENE_6_0_0);
final Class simClass = modernSim ? SchemaSimilarityFactory.class : ClassicSimilarityFactory.class;
final Class<?> simClass = SchemaSimilarityFactory.class;
// use the loader to ensure proper SolrCoreAware handling
similarityFactory = loader.newInstance(simClass.getName(), SimilarityFactory.class);
similarityFactory.init(new ModifiableSolrParams());
@ -15,7 +15,7 @@
* limitations under the License.
*/
package org.apache.solr.schema;
import org.apache.lucene.util.Version;

import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.core.PluginInfo;
@ -65,12 +65,7 @@ public abstract class IndexSchemaFactory implements NamedListInitializedPlugin {
factory = config.getResourceLoader().newInstance(info.className, IndexSchemaFactory.class);
factory.init(info.initArgs);
} else {
if (config.luceneMatchVersion.onOrAfter(Version.LUCENE_6_0_0)) {
// ManagedIndexSchemaFactory is SolrCoreAware so we must create using the resource loader
factory = config.getResourceLoader().newInstance(ManagedIndexSchemaFactory.class.getName(), IndexSchemaFactory.class);
} else {
factory = new ClassicIndexSchemaFactory();
}
factory = config.getResourceLoader().newInstance(ManagedIndexSchemaFactory.class.getName(), IndexSchemaFactory.class);
}
IndexSchema schema = factory.create(resourceName, config);
return schema;
@ -114,9 +114,7 @@ public class SchemaSimilarityFactory extends SimilarityFactory implements SolrCo
Similarity defaultSim = null;
if (null == defaultSimFromFieldType) {
// nothing configured, choose a sensible implicit default...
defaultSim = this.core.getSolrConfig().luceneMatchVersion.onOrAfter(Version.LUCENE_6_0_0)
? new BM25Similarity()
: new ClassicSimilarity();
defaultSim = new BM25Similarity();
} else {
FieldType defSimFT = core.getLatestSchema().getFieldTypeByName(defaultSimFromFieldType);
if (null == defSimFT) {
@ -35,7 +35,7 @@
that you fully re-index after changing this setting as it can
affect both how text is indexed and queried.
-->
<luceneMatchVersion>7.0.0</luceneMatchVersion>
<luceneMatchVersion>8.0.0</luceneMatchVersion>

<!-- <lib/> directives can be used to instruct Solr to load any Jars
identified and use them to resolve any "plugins" specified in
@ -16,7 +16,6 @@
*/
package org.apache.solr.search.similarities;

import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.junit.After;

@ -40,13 +39,4 @@ public class TestNonDefinedSimilarityFactory extends BaseSimilarityTestCase {
BM25Similarity sim = getSimilarity("text", BM25Similarity.class);
assertEquals(0.75F, sim.getB(), 0.0F);
}

public void testClassic() throws Exception {
// any value below 6.0 should have this behavior
System.setProperty("tests.luceneMatchVersion", "5.3");
initCore("solrconfig-basic.xml","schema-tiny.xml");
ClassicSimilarity sim = getSimilarity("text", ClassicSimilarity.class);
assertEquals(true, sim.getDiscountOverlaps());
System.clearProperty("tests.luceneMatchVersion");
}
}
@ -36,7 +36,7 @@
|
|||
that you fully re-index after changing this setting as it can
|
||||
affect both how text is indexed and queried.
|
||||
-->
|
||||
<luceneMatchVersion>7.0.0</luceneMatchVersion>
|
||||
<luceneMatchVersion>8.0.0</luceneMatchVersion>
|
||||
|
||||
<lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-.*\.jar"/>
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@
|
|||
that you fully re-index after changing this setting as it can
|
||||
affect both how text is indexed and queried.
|
||||
-->
|
||||
<luceneMatchVersion>7.0.0</luceneMatchVersion>
|
||||
<luceneMatchVersion>8.0.0</luceneMatchVersion>
|
||||
|
||||
<!-- <lib/> directives can be used to instruct Solr to load any Jars
|
||||
identified and use them to resolve any "plugins" specified in
|
||||
|
|
|
@ -35,7 +35,7 @@
|
|||
that you fully re-index after changing this setting as it can
|
||||
affect both how text is indexed and queried.
|
||||
-->
|
||||
<luceneMatchVersion>7.0.0</luceneMatchVersion>
|
||||
<luceneMatchVersion>8.0.0</luceneMatchVersion>
|
||||
|
||||
<!-- <lib/> directives can be used to instruct Solr to load any Jars
|
||||
identified and use them to resolve any "plugins" specified in
|
||||
|
|
|
@ -35,7 +35,7 @@
|
|||
that you fully re-index after changing this setting as it can
|
||||
affect both how text is indexed and queried.
|
||||
-->
|
||||
<luceneMatchVersion>7.0.0</luceneMatchVersion>
|
||||
<luceneMatchVersion>8.0.0</luceneMatchVersion>
|
||||
|
||||
<!-- <lib/> directives can be used to instruct Solr to load any Jars
|
||||
identified and use them to resolve any "plugins" specified in
|
||||
|
|
|
@ -36,7 +36,7 @@
|
|||
that you fully re-index after changing this setting as it can
|
||||
affect both how text is indexed and queried.
|
||||
-->
|
||||
<luceneMatchVersion>7.0.0</luceneMatchVersion>
|
||||
<luceneMatchVersion>8.0.0</luceneMatchVersion>
|
||||
|
||||
<!-- Load Data Import Handler and Apache Tika (extraction) libraries -->
|
||||
<lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-.*\.jar"/>
|
||||
|
|
|
@ -35,7 +35,7 @@
|
|||
that you fully re-index after changing this setting as it can
|
||||
affect both how text is indexed and queried.
|
||||
-->
|
||||
<luceneMatchVersion>7.0.0</luceneMatchVersion>
|
||||
<luceneMatchVersion>8.0.0</luceneMatchVersion>
|
||||
|
||||
<!-- <lib/> directives can be used to instruct Solr to load any Jars
|
||||
identified and use them to resolve any "plugins" specified in
|
||||
|
|
|
@ -35,7 +35,7 @@
|
|||
that you fully re-index after changing this setting as it can
|
||||
affect both how text is indexed and queried.
|
||||
-->
|
||||
<luceneMatchVersion>7.0.0</luceneMatchVersion>
|
||||
<luceneMatchVersion>8.0.0</luceneMatchVersion>
|
||||
|
||||
<!-- <lib/> directives can be used to instruct Solr to load any Jars
|
||||
identified and use them to resolve any "plugins" specified in
|
||||
|
|
|
@ -35,7 +35,7 @@
|
|||
that you fully re-index after changing this setting as it can
|
||||
affect both how text is indexed and queried.
|
||||
-->
|
||||
<luceneMatchVersion>7.0.0</luceneMatchVersion>
|
||||
<luceneMatchVersion>8.0.0</luceneMatchVersion>
|
||||
|
||||
<!-- <lib/> directives can be used to instruct Solr to load any Jars
|
||||
identified and use them to resolve any "plugins" specified in
|
||||
|
|