LUCENE-5296: add DirectDocValuesFormat
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1537105 13f79535-47bb-0310-9956-ffa450edef68
parent fe4699b6b0
commit 47480a8496
lucene/CHANGES.txt
@@ -128,6 +128,9 @@ New Features
 
 * LUCENE-5302: Make StemmerOverrideMap's methods public (Alan Woodward)
 
+* LUCENE-5296: Add DirectDocValuesFormat, which holds all doc values
+  in heap as uncompressed java native arrays.  (Mike McCandless)
+
 Bug Fixes
 
 * LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead
DirectDocValuesConsumer.java (new file)
@@ -0,0 +1,299 @@
package org.apache.lucene.codecs.memory;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.VERSION_CURRENT;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.BYTES;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.SORTED;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.SORTED_SET;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.NUMBER;

/**
 * Writer for {@link DirectDocValuesFormat}
 */

class DirectDocValuesConsumer extends DocValuesConsumer {
  final IndexOutput data, meta;
  final int maxDoc;

  DirectDocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    maxDoc = state.segmentInfo.getDocCount();
    boolean success = false;
    try {
      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
      data = state.directory.createOutput(dataName, state.context);
      CodecUtil.writeHeader(data, dataCodec, VERSION_CURRENT);
      String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
      meta = state.directory.createOutput(metaName, state.context);
      CodecUtil.writeHeader(meta, metaCodec, VERSION_CURRENT);
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  @Override
  public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(NUMBER);
    addNumericFieldValues(field, values);
  }

  private void addNumericFieldValues(FieldInfo field, Iterable<Number> values) throws IOException {
    meta.writeLong(data.getFilePointer());
    long minValue = Long.MAX_VALUE;
    long maxValue = Long.MIN_VALUE;
    boolean missing = false;

    long count = 0;
    for (Number nv : values) {
      if (nv != null) {
        long v = nv.longValue();
        minValue = Math.min(minValue, v);
        maxValue = Math.max(maxValue, v);
      } else {
        missing = true;
      }
      count++;
      if (count >= DirectDocValuesFormat.MAX_SORTED_SET_ORDS) {
        throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + DirectDocValuesFormat.MAX_SORTED_SET_ORDS + " values/total ords");
      }
    }
    meta.writeInt((int) count);

    if (missing) {
      long start = data.getFilePointer();
      writeMissingBitset(values);
      meta.writeLong(start);
      meta.writeLong(data.getFilePointer() - start);
    } else {
      meta.writeLong(-1L);
    }

    byte byteWidth;
    if (minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
      byteWidth = 1;
    } else if (minValue >= Short.MIN_VALUE && maxValue <= Short.MAX_VALUE) {
      byteWidth = 2;
    } else if (minValue >= Integer.MIN_VALUE && maxValue <= Integer.MAX_VALUE) {
      byteWidth = 4;
    } else {
      byteWidth = 8;
    }
    meta.writeByte(byteWidth);

    for (Number nv : values) {
      long v;
      if (nv != null) {
        v = nv.longValue();
      } else {
        v = 0;
      }

      switch(byteWidth) {
      case 1:
        data.writeByte((byte) v);
        break;
      case 2:
        data.writeShort((short) v);
        break;
      case 4:
        data.writeInt((int) v);
        break;
      case 8:
        data.writeLong(v);
        break;
      }
    }
  }

  @Override
  public void close() throws IOException {
    boolean success = false;
    try {
      if (meta != null) {
        meta.writeVInt(-1); // write EOF marker
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(data, meta);
      } else {
        IOUtils.closeWhileHandlingException(data, meta);
      }
    }
  }

  @Override
  public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(BYTES);
    addBinaryFieldValues(field, values);
  }

  private void addBinaryFieldValues(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
    // write the byte[] data
    final long startFP = data.getFilePointer();
    boolean missing = false;
    long totalBytes = 0;
    int count = 0;
    for(BytesRef v : values) {
      if (v != null) {
        data.writeBytes(v.bytes, v.offset, v.length);
        totalBytes += v.length;
        if (totalBytes > DirectDocValuesFormat.MAX_TOTAL_BYTES_LENGTH) {
          throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, cannot have more than DirectDocValuesFormat.MAX_TOTAL_BYTES_LENGTH (" + DirectDocValuesFormat.MAX_TOTAL_BYTES_LENGTH + ") bytes");
        }
      } else {
        missing = true;
      }
      count++;
    }

    meta.writeLong(startFP);
    meta.writeInt((int) totalBytes);
    meta.writeInt(count);
    if (missing) {
      long start = data.getFilePointer();
      writeMissingBitset(values);
      meta.writeLong(start);
      meta.writeLong(data.getFilePointer() - start);
    } else {
      meta.writeLong(-1L);
    }

    int addr = 0;
    for (BytesRef v : values) {
      data.writeInt(addr);
      if (v != null) {
        addr += v.length;
      }
    }
    data.writeInt(addr);
  }

  // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
  // but this is very simple, and algorithms only check this for values of 0 anyway (doesn't slow down normal decode)
  void writeMissingBitset(Iterable<?> values) throws IOException {
    long bits = 0;
    int count = 0;
    for (Object v : values) {
      if (count == 64) {
        data.writeLong(bits);
        count = 0;
        bits = 0;
      }
      if (v != null) {
        bits |= 1L << (count & 0x3f);
      }
      count++;
    }
    if (count > 0) {
      data.writeLong(bits);
    }
  }

  @Override
  public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(SORTED);

    // write the ordinals as numerics
    addNumericFieldValues(field, docToOrd);

    // write the values as binary
    addBinaryFieldValues(field, values);
  }

  // note: this might not be the most efficient... but it's fairly simple
  @Override
  public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, final Iterable<Number> docToOrdCount, final Iterable<Number> ords) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(SORTED_SET);

    // First write docToOrdCounts, except we "aggregate" the
    // counts so they turn into addresses, and add a final
    // value = the total aggregate:
    addNumericFieldValues(field, new Iterable<Number>() {

      // Just aggregates the count values so they become
      // "addresses", and adds one more value in the end
      // (the final sum):

      @Override
      public Iterator<Number> iterator() {
        final Iterator<Number> iter = docToOrdCount.iterator();

        return new Iterator<Number>() {

          long sum;
          boolean ended;

          @Override
          public boolean hasNext() {
            return iter.hasNext() || !ended;
          }

          @Override
          public Number next() {
            long toReturn = sum;

            if (iter.hasNext()) {
              Number n = iter.next();
              if (n != null) {
                sum += n.longValue();
              }
            } else if (!ended) {
              ended = true;
            } else {
              assert false;
            }

            return toReturn;
          }

          @Override
          public void remove() {
            throw new UnsupportedOperationException();
          }
        };
      }
    });

    // Write ordinals for all docs, appended into one big
    // numerics:
    addNumericFieldValues(field, ords);

    // write the values as binary
    addBinaryFieldValues(field, values);
  }
}
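The numeric path above makes two compact decisions worth seeing in isolation: it scans min/max once to pick the narrowest array width (1, 2, 4, or 8 bytes), and it packs one present/missing bit per document into 64-bit words. The standalone sketch below (not part of the commit; pickByteWidth and packMissingBitset are illustrative names, not Lucene APIs) mirrors that logic:

import java.util.ArrayList;
import java.util.List;

public class DirectEncodingSketch {

  // Mirrors addNumericFieldValues: pick the narrowest width covering the range.
  static byte pickByteWidth(long minValue, long maxValue) {
    if (minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
      return 1;
    } else if (minValue >= Short.MIN_VALUE && maxValue <= Short.MAX_VALUE) {
      return 2;
    } else if (minValue >= Integer.MIN_VALUE && maxValue <= Integer.MAX_VALUE) {
      return 4;
    } else {
      return 8;
    }
  }

  // Mirrors writeMissingBitset: one bit per doc, flushed every 64 docs.
  static List<Long> packMissingBitset(Long[] values) {
    List<Long> words = new ArrayList<Long>();
    long bits = 0;
    int count = 0;
    for (Long v : values) {
      if (count == 64) {
        words.add(bits);
        count = 0;
        bits = 0;
      }
      if (v != null) {
        bits |= 1L << (count & 0x3f);
      }
      count++;
    }
    if (count > 0) {
      words.add(bits);
    }
    return words;
  }

  public static void main(String[] args) {
    // {-3, 100, 20000} overflows byte range but fits a short[]: width 2.
    System.out.println(pickByteWidth(-3, 20000));                     // 2
    // docs 0 and 2 present, doc 1 missing: a single word 0b101 = 5.
    System.out.println(packMissingBitset(new Long[] {7L, null, 9L})); // [5]
  }
}

Packing 64 docs per long keeps the missing bitset at roughly maxDoc/8 bytes, which is why the producer below can slurp it back with length >> 3 readLong calls.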
DirectDocValuesFormat.java (new file)
@@ -0,0 +1,78 @@
package org.apache.lucene.codecs.memory;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.ArrayUtil;

/** In-memory docvalues format that does no (or very little)
 *  compression. Indexed values are stored on disk, but
 *  then at search time all values are loaded into memory as
 *  simple java arrays. For numeric values, it uses
 *  byte[], short[], int[], long[] as necessary to fit the
 *  range of the values. For binary values, there is an int
 *  (4 bytes) overhead per value.
 *
 *  <p>Limitations:
 *  <ul>
 *    <li>For binary and sorted fields the total space
 *        required for all binary values cannot exceed about
 *        2.1 GB (see {@link #MAX_TOTAL_BYTES_LENGTH}).</li>
 *
 *    <li>For sorted set fields, the sum of the size of each
 *        document's set of values cannot exceed about 2.1 B
 *        (see {@link #MAX_SORTED_SET_ORDS}).</li>
 *  </ul> */

public class DirectDocValuesFormat extends DocValuesFormat {

  /** The sum of all byte lengths for binary field, or for
   *  the unique values in sorted or sorted set fields, cannot
   *  exceed this. */
  public final static int MAX_TOTAL_BYTES_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH;

  /** The sum of the number of values across all documents
   *  in a sorted set field cannot exceed this. */
  public final static int MAX_SORTED_SET_ORDS = ArrayUtil.MAX_ARRAY_LENGTH;

  /** Sole constructor. */
  public DirectDocValuesFormat() {
    super("Direct");
  }

  @Override
  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    return new DirectDocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }

  @Override
  public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
    return new DirectDocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }

  static final String DATA_CODEC = "DirectDocValuesData";
  static final String DATA_EXTENSION = "dvdd";
  static final String METADATA_CODEC = "DirectDocValuesMetadata";
  static final String METADATA_EXTENSION = "dvdm";
}
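As a usage note, not part of this commit: one plausible way to route a single field through the new format is Lucene's per-field codec override. The sketch below assumes the Lucene45Codec override hook available on trunk at the time, plus a made-up field name "popularity"; treat it as an illustration, not the commit's own wiring:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.lucene45.Lucene45Codec;
import org.apache.lucene.codecs.memory.DirectDocValuesFormat;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.util.Version;

public class DirectFormatUsageSketch {
  static IndexWriterConfig newConfig() {
    final DocValuesFormat direct = new DirectDocValuesFormat();
    Codec codec = new Lucene45Codec() {
      @Override
      public DocValuesFormat getDocValuesFormatForField(String field) {
        // Only the (hypothetical) "popularity" field pays the heap cost;
        // every other field keeps the codec default:
        return "popularity".equals(field) ? direct : super.getDocValuesFormatForField(field);
      }
    };
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_45);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_45, analyzer);
    iwc.setCodec(codec);
    return iwc;
  }
}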
DirectDocValuesProducer.java (new file)
@@ -0,0 +1,478 @@
package org.apache.lucene.codecs.memory;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * Reader for {@link DirectDocValuesFormat}
 */

class DirectDocValuesProducer extends DocValuesProducer {
  // metadata maps (just file pointers and minimal stuff)
  private final Map<Integer,NumericEntry> numerics = new HashMap<Integer,NumericEntry>();
  private final Map<Integer,BinaryEntry> binaries = new HashMap<Integer,BinaryEntry>();
  private final Map<Integer,SortedEntry> sorteds = new HashMap<Integer,SortedEntry>();
  private final Map<Integer,SortedSetEntry> sortedSets = new HashMap<Integer,SortedSetEntry>();
  private final IndexInput data;

  // ram instances we have already loaded
  private final Map<Integer,NumericDocValues> numericInstances =
      new HashMap<Integer,NumericDocValues>();
  private final Map<Integer,BinaryDocValues> binaryInstances =
      new HashMap<Integer,BinaryDocValues>();
  private final Map<Integer,SortedDocValues> sortedInstances =
      new HashMap<Integer,SortedDocValues>();
  private final Map<Integer,SortedSetRawValues> sortedSetInstances =
      new HashMap<Integer,SortedSetRawValues>();
  private final Map<Integer,Bits> docsWithFieldInstances = new HashMap<Integer,Bits>();

  private final int maxDoc;

  static final byte NUMBER = 0;
  static final byte BYTES = 1;
  static final byte SORTED = 2;
  static final byte SORTED_SET = 3;

  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;

  DirectDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    maxDoc = state.segmentInfo.getDocCount();
    String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
    // read in the entries from the metadata file.
    IndexInput in = state.directory.openInput(metaName, state.context);
    boolean success = false;
    final int version;
    try {
      version = CodecUtil.checkHeader(in, metaCodec,
                                      VERSION_START,
                                      VERSION_CURRENT);
      readFields(in);

      success = true;
    } finally {
      if (success) {
        IOUtils.close(in);
      } else {
        IOUtils.closeWhileHandlingException(in);
      }
    }

    success = false;
    try {
      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
      data = state.directory.openInput(dataName, state.context);
      final int version2 = CodecUtil.checkHeader(data, dataCodec,
                                                 VERSION_START,
                                                 VERSION_CURRENT);
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }

      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this.data);
      }
    }
  }

  private NumericEntry readNumericEntry(IndexInput meta) throws IOException {
    NumericEntry entry = new NumericEntry();
    entry.offset = meta.readLong();
    entry.count = meta.readInt();
    entry.missingOffset = meta.readLong();
    if (entry.missingOffset != -1) {
      entry.missingBytes = meta.readLong();
    } else {
      entry.missingBytes = 0;
    }
    entry.byteWidth = meta.readByte();

    return entry;
  }

  private BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
    BinaryEntry entry = new BinaryEntry();
    entry.offset = meta.readLong();
    entry.numBytes = meta.readInt();
    entry.count = meta.readInt();
    entry.missingOffset = meta.readLong();
    if (entry.missingOffset != -1) {
      entry.missingBytes = meta.readLong();
    } else {
      entry.missingBytes = 0;
    }

    return entry;
  }

  private SortedEntry readSortedEntry(IndexInput meta) throws IOException {
    SortedEntry entry = new SortedEntry();
    entry.docToOrd = readNumericEntry(meta);
    entry.values = readBinaryEntry(meta);
    return entry;
  }

  private SortedSetEntry readSortedSetEntry(IndexInput meta) throws IOException {
    SortedSetEntry entry = new SortedSetEntry();
    entry.docToOrdAddress = readNumericEntry(meta);
    entry.ords = readNumericEntry(meta);
    entry.values = readBinaryEntry(meta);
    return entry;
  }

  private void readFields(IndexInput meta) throws IOException {
    int fieldNumber = meta.readVInt();
    while (fieldNumber != -1) {
      int fieldType = meta.readByte();
      if (fieldType == NUMBER) {
        numerics.put(fieldNumber, readNumericEntry(meta));
      } else if (fieldType == BYTES) {
        binaries.put(fieldNumber, readBinaryEntry(meta));
      } else if (fieldType == SORTED) {
        sorteds.put(fieldNumber, readSortedEntry(meta));
      } else if (fieldType == SORTED_SET) {
        sortedSets.put(fieldNumber, readSortedSetEntry(meta));
      } else {
        throw new CorruptIndexException("invalid entry type: " + fieldType + ", input=" + meta);
      }
      fieldNumber = meta.readVInt();
    }
  }

  @Override
  public long ramBytesUsed() {
    // TODO: optimize me
    return RamUsageEstimator.sizeOf(this);
  }

  @Override
  public synchronized NumericDocValues getNumeric(FieldInfo field) throws IOException {
    NumericDocValues instance = numericInstances.get(field.number);
    if (instance == null) {
      // Lazy load
      instance = loadNumeric(numerics.get(field.number));
      numericInstances.put(field.number, instance);
    }
    return instance;
  }

  private NumericDocValues loadNumeric(NumericEntry entry) throws IOException {
    data.seek(entry.offset + entry.missingBytes);
    switch (entry.byteWidth) {
    case 1:
      {
        final byte[] values = new byte[entry.count];
        for(int i=0;i<entry.count;i++) {
          values[i] = data.readByte();
        }
        return new NumericDocValues() {
          @Override
          public long get(int idx) {
            return values[idx];
          }
        };
      }

    case 2:
      {
        final short[] values = new short[entry.count];
        for(int i=0;i<entry.count;i++) {
          values[i] = data.readShort();
        }
        return new NumericDocValues() {
          @Override
          public long get(int idx) {
            return values[idx];
          }
        };
      }

    case 4:
      {
        final int[] values = new int[entry.count];
        for(int i=0;i<entry.count;i++) {
          values[i] = data.readInt();
        }
        return new NumericDocValues() {
          @Override
          public long get(int idx) {
            return values[idx];
          }
        };
      }

    case 8:
      {
        final long[] values = new long[entry.count];
        for(int i=0;i<entry.count;i++) {
          values[i] = data.readLong();
        }
        return new NumericDocValues() {
          @Override
          public long get(int idx) {
            return values[idx];
          }
        };
      }

    default:
      throw new AssertionError();
    }
  }

  @Override
  public synchronized BinaryDocValues getBinary(FieldInfo field) throws IOException {
    BinaryDocValues instance = binaryInstances.get(field.number);
    if (instance == null) {
      // Lazy load
      instance = loadBinary(binaries.get(field.number));
      binaryInstances.put(field.number, instance);
    }
    return instance;
  }

  private BinaryDocValues loadBinary(BinaryEntry entry) throws IOException {
    data.seek(entry.offset);
    final byte[] bytes = new byte[entry.numBytes];
    data.readBytes(bytes, 0, entry.numBytes);
    data.seek(entry.offset + entry.numBytes + entry.missingBytes);

    final int[] address = new int[entry.count+1];
    for(int i=0;i<entry.count;i++) {
      address[i] = data.readInt();
    }
    address[entry.count] = data.readInt();

    return new BinaryDocValues() {
      @Override
      public void get(int docID, BytesRef result) {
        result.bytes = bytes;
        result.offset = address[docID];
        result.length = address[docID+1] - result.offset;
      }
    };
  }

  @Override
  public synchronized SortedDocValues getSorted(FieldInfo field) throws IOException {
    SortedDocValues instance = sortedInstances.get(field.number);
    if (instance == null) {
      // Lazy load
      instance = loadSorted(field);
      sortedInstances.put(field.number, instance);
    }
    return instance;
  }

  private SortedDocValues loadSorted(FieldInfo field) throws IOException {
    final SortedEntry entry = sorteds.get(field.number);
    final NumericDocValues docToOrd = loadNumeric(entry.docToOrd);
    final BinaryDocValues values = loadBinary(entry.values);

    return new SortedDocValues() {

      @Override
      public int getOrd(int docID) {
        return (int) docToOrd.get(docID);
      }

      @Override
      public void lookupOrd(int ord, BytesRef result) {
        values.get(ord, result);
      }

      @Override
      public int getValueCount() {
        return entry.values.count;
      }

      // Leave lookupTerm to super's binary search

      // Leave termsEnum to super
    };
  }

  @Override
  public synchronized SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
    SortedSetRawValues instance = sortedSetInstances.get(field.number);
    final SortedSetEntry entry = sortedSets.get(field.number);
    if (instance == null) {
      // Lazy load
      instance = loadSortedSet(entry);
      sortedSetInstances.put(field.number, instance);
    }

    final NumericDocValues docToOrdAddress = instance.docToOrdAddress;
    final NumericDocValues ords = instance.ords;
    final BinaryDocValues values = instance.values;

    // Must make a new instance since the iterator has state:
    return new SortedSetDocValues() {
      int ordUpto;
      int ordLimit;

      @Override
      public long nextOrd() {
        if (ordUpto == ordLimit) {
          return NO_MORE_ORDS;
        } else {
          return ords.get(ordUpto++);
        }
      }

      @Override
      public void setDocument(int docID) {
        ordUpto = (int) docToOrdAddress.get(docID);
        ordLimit = (int) docToOrdAddress.get(docID+1);
      }

      @Override
      public void lookupOrd(long ord, BytesRef result) {
        values.get((int) ord, result);
      }

      @Override
      public long getValueCount() {
        return entry.values.count;
      }

      // Leave lookupTerm to super's binary search

      // Leave termsEnum to super
    };
  }

  private SortedSetRawValues loadSortedSet(SortedSetEntry entry) throws IOException {
    SortedSetRawValues instance = new SortedSetRawValues();
    instance.docToOrdAddress = loadNumeric(entry.docToOrdAddress);
    instance.ords = loadNumeric(entry.ords);
    instance.values = loadBinary(entry.values);
    return instance;
  }

  private Bits getMissingBits(int fieldNumber, final long offset, final long length) throws IOException {
    if (offset == -1) {
      return new Bits.MatchAllBits(maxDoc);
    } else {
      Bits instance;
      synchronized(this) {
        instance = docsWithFieldInstances.get(fieldNumber);
        if (instance == null) {
          IndexInput data = this.data.clone();
          data.seek(offset);
          assert length % 8 == 0;
          long bits[] = new long[(int) length >> 3];
          for (int i = 0; i < bits.length; i++) {
            bits[i] = data.readLong();
          }
          instance = new FixedBitSet(bits, maxDoc);
          docsWithFieldInstances.put(fieldNumber, instance);
        }
      }
      return instance;
    }
  }

  @Override
  public Bits getDocsWithField(FieldInfo field) throws IOException {
    switch(field.getDocValuesType()) {
    case SORTED_SET:
      return new SortedSetDocsWithField(getSortedSet(field), maxDoc);
    case SORTED:
      return new SortedDocsWithField(getSorted(field), maxDoc);
    case BINARY:
      BinaryEntry be = binaries.get(field.number);
      return getMissingBits(field.number, be.missingOffset, be.missingBytes);
    case NUMERIC:
      NumericEntry ne = numerics.get(field.number);
      return getMissingBits(field.number, ne.missingOffset, ne.missingBytes);
    default:
      throw new AssertionError();
    }
  }

  @Override
  public void close() throws IOException {
    data.close();
  }

  static class SortedSetRawValues {
    NumericDocValues docToOrdAddress;
    NumericDocValues ords;
    BinaryDocValues values;
  }

  static class NumericEntry {
    long offset;
    int count;
    long missingOffset;
    long missingBytes;
    byte byteWidth;
    int packedIntsVersion;
  }

  static class BinaryEntry {
    long offset;
    long missingOffset;
    long missingBytes;
    int count;
    int numBytes;
    int minLength;
    int maxLength;
    int packedIntsVersion;
    int blockSize;
  }

  static class SortedEntry {
    NumericEntry docToOrd;
    BinaryEntry values;
  }

  static class SortedSetEntry {
    NumericEntry docToOrdAddress;
    NumericEntry ords;
    BinaryEntry values;
  }

  static class FSTEntry {
    long offset;
    long numOrds;
  }
}
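For a rough sense of the heap cost once these loadNumeric/loadBinary calls run: a numeric field costs about count * byteWidth bytes, and a binary field adds an int[count+1] address table (about 4 bytes per document) on top of the concatenated values. The figures in the sketch below are illustrative assumptions, not measurements:

public class DirectHeapEstimateSketch {
  public static void main(String[] args) {
    long docs = 10000000L;  // assume a 10M-doc segment
    long byteWidth = 2;     // assume the writer chose short[] for the value range
    // Numeric field: one array element per document.
    System.out.println("numeric field: ~" + (docs * byteWidth) + " bytes");
    // Binary field: concatenated values plus the per-doc address table.
    long totalValueBytes = 50000000L;  // assume ~5 bytes of value per doc
    System.out.println("binary field: ~" + (totalValueBytes + 4 * (docs + 1)) + " bytes");
  }
}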
META-INF/services/org.apache.lucene.codecs.DocValuesFormat
@@ -15,4 +15,5 @@
 org.apache.lucene.codecs.diskdv.DiskDocValuesFormat
 org.apache.lucene.codecs.memory.MemoryDocValuesFormat
+org.apache.lucene.codecs.memory.DirectDocValuesFormat
 org.apache.lucene.codecs.simpletext.SimpleTextDocValuesFormat
 
TestDirectDocValuesFormat.java (new file)
@@ -0,0 +1,34 @@
package org.apache.lucene.codecs.memory;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseDocValuesFormatTestCase;
import org.apache.lucene.util._TestUtil;

/**
 * Tests DirectDocValuesFormat
 */
public class TestDirectDocValuesFormat extends BaseDocValuesFormatTestCase {
  private final Codec codec = _TestUtil.alwaysDocValuesFormat(new DirectDocValuesFormat());

  @Override
  protected Codec getCodec() {
    return codec;
  }
}
BinaryDocValuesWriter.java
@@ -24,11 +24,12 @@ import java.util.NoSuchElementException;
 import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.OpenBitSet;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.PagedBytes;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.packed.AppendingDeltaPackedLongBuffer;
 import org.apache.lucene.util.packed.PackedInts;
 
@@ -36,12 +37,8 @@ import org.apache.lucene.util.packed.PackedInts;
  * segment flushes. */
 class BinaryDocValuesWriter extends DocValuesWriter {
 
-  /** Maximum length for a binary field; we set this to "a
-   * bit" below Integer.MAX_VALUE because the exact max
-   * allowed byte[] is JVM dependent, so we want to avoid
-   * a case where a large value worked in one JVM but
-   * failed later at search time with a different JVM. */
-  private static final int MAX_LENGTH = Integer.MAX_VALUE-256;
+  /** Maximum length for a binary field. */
+  private static final int MAX_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH;
 
   // 32 KB block sizes for PagedBytes storage:
   private final static int BLOCK_BITS = 15;
ArrayUtil.java
@@ -28,6 +28,14 @@ import java.util.Comparator;
 
 public final class ArrayUtil {
 
+  /** Maximum length for an array; we set this to "a
+   * bit" below Integer.MAX_VALUE because the exact max
+   * allowed byte[] is JVM dependent, so we want to avoid
+   * a case where a large value worked during indexing on
+   * one JVM but failed later at search time with a
+   * different JVM. */
+  public static final int MAX_ARRAY_LENGTH = Integer.MAX_VALUE - 256;
+
   private ArrayUtil() {} // no instance
 
   /*
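A hedged sketch of how calling code might lean on the new constant; requireArraySize is an illustrative helper, not a Lucene API:

import org.apache.lucene.util.ArrayUtil;

public class MaxArrayLengthSketch {
  // Fail fast with a clear message instead of a JVM-specific OutOfMemoryError
  // when a requested allocation creeps up on Integer.MAX_VALUE.
  static int requireArraySize(long needed) {
    if (needed > ArrayUtil.MAX_ARRAY_LENGTH) {
      throw new IllegalArgumentException(
          "needed " + needed + " exceeds ArrayUtil.MAX_ARRAY_LENGTH");
    }
    return (int) needed;
  }

  public static void main(String[] args) {
    byte[] buf = new byte[requireArraySize(1024)];
    System.out.println(buf.length);
  }
}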
BaseDocValuesFormatTestCase.java
@@ -186,6 +186,44 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
     directory.close();
   }
 
+  public void testTwoBinaryValues() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
+    Document doc = new Document();
+    String longTerm = "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm";
+    String text = "This is the text to be indexed. " + longTerm;
+    doc.add(newTextField("fieldname", text, Field.Store.YES));
+    doc.add(new BinaryDocValuesField("dv1", new BytesRef(longTerm)));
+    doc.add(new BinaryDocValuesField("dv2", new BytesRef(text)));
+    iwriter.addDocument(doc);
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader = DirectoryReader.open(directory); // read-only=true
+    IndexSearcher isearcher = new IndexSearcher(ireader);
+
+    assertEquals(1, isearcher.search(new TermQuery(new Term("fieldname", longTerm)), 1).totalHits);
+    Query query = new TermQuery(new Term("fieldname", "text"));
+    TopDocs hits = isearcher.search(query, null, 1);
+    assertEquals(1, hits.totalHits);
+    // Iterate through the results:
+    for (int i = 0; i < hits.scoreDocs.length; i++) {
+      StoredDocument hitDoc = isearcher.doc(hits.scoreDocs[i].doc);
+      assertEquals(text, hitDoc.get("fieldname"));
+      assert ireader.leaves().size() == 1;
+      BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv1");
+      BytesRef scratch = new BytesRef();
+      dv.get(hits.scoreDocs[i].doc, scratch);
+      assertEquals(new BytesRef(longTerm), scratch);
+      dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv2");
+      dv.get(hits.scoreDocs[i].doc, scratch);
+      assertEquals(new BytesRef(text), scratch);
+    }
+
+    ireader.close();
+    directory.close();
+  }
+
   public void testTwoFieldsMixed() throws IOException {
     Directory directory = newDirectory();
     RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
BaseDocValuesFormatTestCase.java
@@ -2943,12 +2981,12 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
       if (values.length > 0) {
         assertNotNull(sortedSet);
         sortedSet.setDocument(j);
-        for (int i = 0; i < values.length; i++) {
+        for (int k = 0; k < values.length; k++) {
           long ord = sortedSet.nextOrd();
           assertTrue(ord != SortedSetDocValues.NO_MORE_ORDS);
           BytesRef value = new BytesRef();
           sortedSet.lookupOrd(ord, value);
-          assertEquals(values[i], value.utf8ToString());
+          assertEquals(values[k], value.utf8ToString());
         }
         assertEquals(SortedSetDocValues.NO_MORE_ORDS, sortedSet.nextOrd());
         assertTrue(sortedSetBits.get(j));