add in-ram sortedbytes

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1433250 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-01-15 00:51:19 +00:00
parent 964960eccf
commit 6dcc657d91
4 changed files with 91 additions and 12 deletions

View File

@ -144,7 +144,7 @@ public class Lucene41Codec extends Codec {
private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41");
// nocommit
private final SimpleDocValuesFormat defaultDVFormat = SimpleDocValuesFormat.forName("Disk");
private final SimpleDocValuesFormat defaultDVFormat = SimpleDocValuesFormat.forName("Lucene41");
private final SimpleNormsFormat simpleNormsFormat = new Lucene41SimpleNormsFormat();

View File

@ -30,6 +30,12 @@ import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedInts.FormatAndBits;
@ -50,7 +56,6 @@ class Lucene41SimpleDocValuesConsumer extends SimpleDVConsumer {
static final byte FST = 2;
final IndexOutput data, meta;
final int maxDoc;
Lucene41SimpleDocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
boolean success = false;
@ -61,7 +66,6 @@ class Lucene41SimpleDocValuesConsumer extends SimpleDVConsumer {
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
meta = state.directory.createOutput(metaName, state.context);
CodecUtil.writeHeader(meta, metaCodec, VERSION_CURRENT);
maxDoc = state.segmentInfo.getDocCount();
success = true;
} finally {
if (!success) {
@ -217,7 +221,24 @@ class Lucene41SimpleDocValuesConsumer extends SimpleDVConsumer {
/**
 * Writes a sorted field in two parts: the per-document ordinals are written
 * through {@link #addNumericField}, and the values are written as an FST
 * into the data file.
 *
 * <p>Metadata written for the FST part, in order: the field number (vInt),
 * the {@code FST} entry-type byte, the data file pointer at which the FST
 * starts (long), and finally the number of values added (vInt).
 *
 * @param field    field being written; only {@code field.number} is used directly here
 * @param values   the values; each one is added to the FST with the next
 *                 ordinal (starting at 0) as its output.
 *                 NOTE(review): presumably supplied in sorted order, as FST
 *                 builders usually require -- confirm against the caller
 * @param docToOrd per-document ordinals, passed unchanged to {@code addNumericField}
 * @throws IOException if writing to the meta or data output fails
 */
@Override
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
// NOTE(review): this throw looks like the pre-change method body that the diff
// view shows next to the added lines (the markers were stripped); as written
// here it precedes everything below -- confirm it is the removed line.
throw new AssertionError();
// write the ordinals as numerics
addNumericField(field, docToOrd);
// write the values as FST
meta.writeVInt(field.number);
meta.writeByte(FST);
// record where the FST starts in the data file
meta.writeLong(data.getFilePointer());
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs);
// scratch is handed to Util.toIntsRef on every iteration instead of a fresh IntsRef
IntsRef scratch = new IntsRef();
long ord = 0;
for (BytesRef v : values) {
// the FST output of each value is its ordinal
builder.add(Util.toIntsRef(v, scratch), ord);
ord++;
}
FST<Long> fst = builder.finish();
fst.save(data);
// after the loop, ord equals the number of values added; it is narrowed to int here
meta.writeVInt((int)ord);
}
// nocommit: can/should we make override merge + make it smarter to pull the values

View File

@ -24,6 +24,7 @@ import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.SimpleDVProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
@ -33,6 +34,12 @@ import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.PackedInts;
class Lucene41SimpleDocValuesProducer extends SimpleDVProducer {
@ -51,6 +58,9 @@ class Lucene41SimpleDocValuesProducer extends SimpleDVProducer {
private final Map<Integer,BinaryDocValues> binaryInstances =
new HashMap<Integer,BinaryDocValues>();
private final Map<Integer,FST<Long>> fstInstances =
new HashMap<Integer,FST<Long>>();
Lucene41SimpleDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
// read in the entries from the metadata file.
@ -96,6 +106,13 @@ class Lucene41SimpleDocValuesProducer extends SimpleDVProducer {
entry.minLength = meta.readVInt();
entry.maxLength = meta.readVInt();
binaries.put(fieldNumber, entry);
} else if (fieldType == Lucene41SimpleDocValuesConsumer.FST) {
FSTEntry entry = new FSTEntry();
entry.offset = meta.readLong();
entry.numOrds = meta.readVInt();
fsts.put(fieldNumber, entry);
} else {
throw new CorruptIndexException("invalid entry type: " + fieldType + ", input=" + meta);
}
fieldNumber = meta.readVInt();
}
@ -113,7 +130,6 @@ class Lucene41SimpleDocValuesProducer extends SimpleDVProducer {
private NumericDocValues loadNumeric(FieldInfo field) throws IOException {
NumericEntry entry = numerics.get(field.number);
final IndexInput data = this.data.clone();
data.seek(entry.offset);
if (entry.tableized) {
int size = data.readVInt();
@ -154,7 +170,6 @@ class Lucene41SimpleDocValuesProducer extends SimpleDVProducer {
private BinaryDocValues loadBinary(FieldInfo field) throws IOException {
BinaryEntry entry = binaries.get(field.number);
final IndexInput data = this.data.clone();
data.seek(entry.offset);
assert entry.numBytes < Integer.MAX_VALUE; // nocommit
final byte[] bytes = new byte[(int)entry.numBytes];
@ -184,11 +199,49 @@ class Lucene41SimpleDocValuesProducer extends SimpleDVProducer {
}
}
/**
 * Returns a {@link SortedDocValues} view for the given field, backed by the
 * field's numeric doc-to-ordinal values and an FST of the values.
 *
 * <p>The FST is loaded from {@code data} at the entry's recorded offset on
 * first use and cached in {@code fstInstances}; the lookup and load happen
 * while holding this producer's monitor. The reader, arcs and scratch
 * {@code IntsRef} are allocated on every call and captured by the returned
 * instance.
 *
 * @param field the field to read; {@code field.number} keys both the entry and the cache
 * @return a view whose ordinals come from {@code getNumeric(field)} and whose
 *         value count is the entry's {@code numOrds}
 * @throws IOException if seeking or reading the FST fails
 */
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
// NOTE(review): this throw looks like the pre-change method body that the diff
// view shows next to the added lines (the markers were stripped); as written
// here it precedes everything below -- confirm it is the removed line.
throw new AssertionError();
final FSTEntry entry = fsts.get(field.number);
FST<Long> instance;
synchronized(this) {
instance = fstInstances.get(field.number);
if (instance == null) {
// first request for this field: read the FST from the shared data input
data.seek(entry.offset);
instance = new FST<Long>(data, PositiveIntOutputs.getSingleton(true));
fstInstances.put(field.number, instance);
}
}
final NumericDocValues docToOrd = getNumeric(field);
final FST<Long> fst = instance;
// per-thread resources
final BytesReader in = fst.getBytesReader(0);
final Arc<Long> firstArc = new Arc<Long>();
final Arc<Long> scratchArc = new Arc<Long>();
final IntsRef scratchInts = new IntsRef();
return new SortedDocValues() {
@Override
public int getOrd(int docID) {
// the numeric value stored for the document is its ordinal, narrowed to int
return (int) docToOrd.get(docID);
}
@Override
public void lookupOrd(int ord, BytesRef result) {
try {
// reset the captured reader and first arc before each lookup
in.setPosition(0);
fst.getFirstArc(firstArc);
// presumably resolves the value whose FST output equals ord; the result is copied into 'result'
Util.toBytesRef(Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts), result);
} catch (IOException bogus) {
// lookupOrd declares no checked exception, so the IOException is rethrown unchecked
throw new RuntimeException(bogus);
}
}
@Override
public int getValueCount() {
return entry.numOrds;
}
};
}
@Override
@ -212,5 +265,4 @@ class Lucene41SimpleDocValuesProducer extends SimpleDVProducer {
long offset;
int numOrds;
}
}

View File

@ -110,6 +110,12 @@ public final class Util {
final IntsRef result = new IntsRef();
return getByOutput(fst, targetOutput, in, arc, scratchArc, result);
}
/** Expert: like {@link Util#getByOutput(FST, long)} except reusing the caller-supplied BytesReader, Arc instances and IntsRef result. */
// nocommit
public static IntsRef getByOutput(FST<Long> fst, long targetOutput, BytesReader in, Arc<Long> arc, Arc<Long> scratchArc, IntsRef result) throws IOException {
long output = arc.output;
int upto = 0;