Compress PagedBytesAtomicFieldData's termOrdToBytesOffset.

Using MonotonicAppendingLongBuffer instead of a GrowableWriter should help
save several bits per value, especially when the bytes to store have similar
lengths.

Closes #3186
This commit is contained in:
Adrien Grand 2013-06-14 19:16:12 +02:00
parent 25f19f8b87
commit a30d58aae2
2 changed files with 17 additions and 27 deletions

View File

@@ -22,8 +22,7 @@ package org.elasticsearch.index.fielddata.plain;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes; import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.PagedBytes.Reader; import org.apache.lucene.util.PagedBytes.Reader;
import org.apache.lucene.util.packed.GrowableWriter; import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
import org.apache.lucene.util.packed.PackedInts;
import org.elasticsearch.index.fielddata.AtomicFieldData; import org.elasticsearch.index.fielddata.AtomicFieldData;
import org.elasticsearch.index.fielddata.ScriptDocValues; import org.elasticsearch.index.fielddata.ScriptDocValues;
import org.elasticsearch.index.fielddata.ordinals.EmptyOrdinals; import org.elasticsearch.index.fielddata.ordinals.EmptyOrdinals;
@@ -40,14 +39,14 @@ public class PagedBytesAtomicFieldData implements AtomicFieldData.WithOrdinals<S
// 0 ordinal in values means no value (its null) // 0 ordinal in values means no value (its null)
private final PagedBytes.Reader bytes; private final PagedBytes.Reader bytes;
private final PackedInts.Reader termOrdToBytesOffset; private final MonotonicAppendingLongBuffer termOrdToBytesOffset;
protected final Ordinals ordinals; protected final Ordinals ordinals;
private volatile int[] hashes; private volatile int[] hashes;
private long size = -1; private long size = -1;
private final long readerBytesSize; private final long readerBytesSize;
public PagedBytesAtomicFieldData(PagedBytes.Reader bytes, long readerBytesSize, PackedInts.Reader termOrdToBytesOffset, Ordinals ordinals) { public PagedBytesAtomicFieldData(PagedBytes.Reader bytes, long readerBytesSize, MonotonicAppendingLongBuffer termOrdToBytesOffset, Ordinals ordinals) {
this.bytes = bytes; this.bytes = bytes;
this.termOrdToBytesOffset = termOrdToBytesOffset; this.termOrdToBytesOffset = termOrdToBytesOffset;
this.ordinals = ordinals; this.ordinals = ordinals;
@@ -88,7 +87,7 @@ public class PagedBytesAtomicFieldData implements AtomicFieldData.WithOrdinals<S
private final int[] getHashes() { private final int[] getHashes() {
if (hashes == null) { if (hashes == null) {
int numberOfValues = termOrdToBytesOffset.size(); int numberOfValues = (int) termOrdToBytesOffset.size();
int[] hashes = new int[numberOfValues]; int[] hashes = new int[numberOfValues];
BytesRef scratch = new BytesRef(); BytesRef scratch = new BytesRef();
for (int i = 0; i < numberOfValues; i++) { for (int i = 0; i < numberOfValues; i++) {
@@ -121,12 +120,12 @@ public class PagedBytesAtomicFieldData implements AtomicFieldData.WithOrdinals<S
static abstract class BytesValues extends org.elasticsearch.index.fielddata.BytesValues.WithOrdinals { static abstract class BytesValues extends org.elasticsearch.index.fielddata.BytesValues.WithOrdinals {
protected final PagedBytes.Reader bytes; protected final PagedBytes.Reader bytes;
protected final PackedInts.Reader termOrdToBytesOffset; protected final MonotonicAppendingLongBuffer termOrdToBytesOffset;
protected final Ordinals.Docs ordinals; protected final Ordinals.Docs ordinals;
protected final BytesRef scratch = new BytesRef(); protected final BytesRef scratch = new BytesRef();
BytesValues(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, Ordinals.Docs ordinals) { BytesValues(PagedBytes.Reader bytes, MonotonicAppendingLongBuffer termOrdToBytesOffset, Ordinals.Docs ordinals) {
super(ordinals); super(ordinals);
this.bytes = bytes; this.bytes = bytes;
this.termOrdToBytesOffset = termOrdToBytesOffset; this.termOrdToBytesOffset = termOrdToBytesOffset;
@@ -156,7 +155,7 @@ public class PagedBytesAtomicFieldData implements AtomicFieldData.WithOrdinals<S
private final Iter.Single iter; private final Iter.Single iter;
Single(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, Ordinals.Docs ordinals) { Single(PagedBytes.Reader bytes, MonotonicAppendingLongBuffer termOrdToBytesOffset, Ordinals.Docs ordinals) {
super(bytes, termOrdToBytesOffset, ordinals); super(bytes, termOrdToBytesOffset, ordinals);
assert !ordinals.isMultiValued(); assert !ordinals.isMultiValued();
iter = newSingleIter(); iter = newSingleIter();
@@ -175,7 +174,7 @@ public class PagedBytesAtomicFieldData implements AtomicFieldData.WithOrdinals<S
static final class SingleHashed extends Single { static final class SingleHashed extends Single {
private final int[] hashes; private final int[] hashes;
SingleHashed(int[] hashes, Reader bytes, org.apache.lucene.util.packed.PackedInts.Reader termOrdToBytesOffset, Docs ordinals) { SingleHashed(int[] hashes, Reader bytes, MonotonicAppendingLongBuffer termOrdToBytesOffset, Docs ordinals) {
super(bytes, termOrdToBytesOffset, ordinals); super(bytes, termOrdToBytesOffset, ordinals);
this.hashes = hashes; this.hashes = hashes;
} }
@@ -203,7 +202,7 @@ public class PagedBytesAtomicFieldData implements AtomicFieldData.WithOrdinals<S
private final Iter.Multi iter; private final Iter.Multi iter;
Multi(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, Ordinals.Docs ordinals) { Multi(PagedBytes.Reader bytes, MonotonicAppendingLongBuffer termOrdToBytesOffset, Ordinals.Docs ordinals) {
super(bytes, termOrdToBytesOffset, ordinals); super(bytes, termOrdToBytesOffset, ordinals);
assert ordinals.isMultiValued(); assert ordinals.isMultiValued();
this.iter = newMultiIter(); this.iter = newMultiIter();
@@ -219,7 +218,7 @@ public class PagedBytesAtomicFieldData implements AtomicFieldData.WithOrdinals<S
private final int[] hashes; private final int[] hashes;
MultiHashed(int[] hashes, Reader bytes, org.apache.lucene.util.packed.PackedInts.Reader termOrdToBytesOffset, Docs ordinals) { MultiHashed(int[] hashes, Reader bytes, MonotonicAppendingLongBuffer termOrdToBytesOffset, Docs ordinals) {
super(bytes, termOrdToBytesOffset, ordinals); super(bytes, termOrdToBytesOffset, ordinals);
this.hashes = hashes; this.hashes = hashes;
} }
@@ -246,7 +245,7 @@ public class PagedBytesAtomicFieldData implements AtomicFieldData.WithOrdinals<S
static class Empty extends PagedBytesAtomicFieldData { static class Empty extends PagedBytesAtomicFieldData {
Empty(int numDocs) { Empty(int numDocs) {
super(emptyBytes(), 0, new GrowableWriter(1, 2, PackedInts.FASTEST).getMutable(), new EmptyOrdinals(numDocs)); super(emptyBytes(), 0, new MonotonicAppendingLongBuffer(), new EmptyOrdinals(numDocs));
} }
static PagedBytes.Reader emptyBytes() { static PagedBytes.Reader emptyBytes() {

View File

@@ -20,10 +20,9 @@
package org.elasticsearch.index.fielddata.plain; package org.elasticsearch.index.fielddata.plain;
import org.apache.lucene.index.*; import org.apache.lucene.index.*;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes; import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.GrowableWriter; import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedInts;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index; import org.elasticsearch.index.Index;
@@ -92,10 +91,8 @@ public class PagedBytesIndexFieldData extends AbstractBytesIndexFieldData<PagedB
startNumUniqueTerms = 1; startNumUniqueTerms = 1;
} }
// TODO: expose this as an option..., have a nice parser for it... final MonotonicAppendingLongBuffer termOrdToBytesOffset = new MonotonicAppendingLongBuffer();
float acceptableOverheadRatio = PackedInts.FAST; termOrdToBytesOffset.add(0); // first ord is reserved for missing values
GrowableWriter termOrdToBytesOffset = new GrowableWriter(startBytesBPV, 1 + startNumUniqueTerms, acceptableOverheadRatio);
boolean preDefineBitsRequired = regex == null && frequency == null; boolean preDefineBitsRequired = regex == null && frequency == null;
OrdinalsBuilder builder = new OrdinalsBuilder(terms, preDefineBitsRequired, reader.maxDoc()); OrdinalsBuilder builder = new OrdinalsBuilder(terms, preDefineBitsRequired, reader.maxDoc());
try { try {
@@ -105,13 +102,8 @@ public class PagedBytesIndexFieldData extends AbstractBytesIndexFieldData<PagedB
DocsEnum docsEnum = null; DocsEnum docsEnum = null;
for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) { for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
final int termOrd = builder.nextOrdinal(); final int termOrd = builder.nextOrdinal();
if (termOrd == termOrdToBytesOffset.size()) { assert termOrd == termOrdToBytesOffset.size();
// NOTE: this code only runs if the incoming termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
// reader impl doesn't implement
// size (which should be uncommon)
termOrdToBytesOffset = termOrdToBytesOffset.resize(ArrayUtil.oversize(1 + termOrd, 1));
}
termOrdToBytesOffset.set(termOrd, bytes.copyUsingLengthPrefix(term));
docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE); docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) { for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
builder.addDoc(docId); builder.addDoc(docId);
@@ -119,10 +111,9 @@ public class PagedBytesIndexFieldData extends AbstractBytesIndexFieldData<PagedB
} }
final long sizePointer = bytes.getPointer(); final long sizePointer = bytes.getPointer();
PagedBytes.Reader bytesReader = bytes.freeze(true); PagedBytes.Reader bytesReader = bytes.freeze(true);
PackedInts.Reader termOrdToBytesOffsetReader = termOrdToBytesOffset.getMutable();
final Ordinals ordinals = builder.build(fieldDataType.getSettings()); final Ordinals ordinals = builder.build(fieldDataType.getSettings());
return new PagedBytesAtomicFieldData(bytesReader, sizePointer, termOrdToBytesOffsetReader, ordinals); return new PagedBytesAtomicFieldData(bytesReader, sizePointer, termOrdToBytesOffset, ordinals);
} finally { } finally {
builder.close(); builder.close();
} }