Use SinglePackedOrdinals instead of SingleArrayOrdinals to reduce the memory that ordinals take for single-valued fields in field data.

Closes #3185
This commit is contained in:
Martijn van Groningen 2013-06-14 10:16:49 +02:00
parent b995abfa80
commit 8d59ed3ab0
7 changed files with 217 additions and 243 deletions

View File

@ -17,28 +17,23 @@ package org.elasticsearch.index.fielddata.ordinals;
* specific language governing permissions and limitations
* under the License.
*/
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntBlockPool;
import org.apache.lucene.util.*;
import org.apache.lucene.util.IntBlockPool.Allocator;
import org.apache.lucene.util.IntBlockPool.DirectAllocator;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.settings.Settings;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
/**
* Simple class to build document ID <-> ordinal mapping. Note: Ordinals are
* <tt>1</tt> based monotonically increasing positive integers. <tt>0</tt>
@ -46,7 +41,10 @@ import org.elasticsearch.common.settings.Settings;
*/
public final class OrdinalsBuilder implements Closeable {
private final int[] ords;
private final int maxDoc;
private int[] mvOrds;
private GrowableWriter svOrds;
private int[] offsets;
private final IntBlockPool pool;
private final IntBlockPool.SliceWriter writer;
@ -57,19 +55,35 @@ public final class OrdinalsBuilder implements Closeable {
private int numMultiValuedDocs = 0;
private int totalNumOrds = 0;
public OrdinalsBuilder(Terms terms, int maxDoc, Allocator allocator) {
this.ords = new int[maxDoc];
public OrdinalsBuilder(Terms terms, boolean preDefineBitsRequired, int maxDoc, Allocator allocator) throws IOException {
this.maxDoc = maxDoc;
// TODO: Make configurable...
float acceptableOverheadRatio = PackedInts.FAST;
if (preDefineBitsRequired) {
int numTerms = (int) terms.size();
if (numTerms == -1) {
svOrds = new GrowableWriter(1, maxDoc, acceptableOverheadRatio);
} else {
svOrds = new GrowableWriter(PackedInts.bitsRequired(numTerms), maxDoc, acceptableOverheadRatio);
}
} else {
svOrds = new GrowableWriter(1, maxDoc, acceptableOverheadRatio);
}
pool = new IntBlockPool(allocator);
reader = new IntBlockPool.SliceReader(pool);
writer = new IntBlockPool.SliceWriter(pool);
}
public OrdinalsBuilder(int maxDoc) {
this(null, maxDoc);
public OrdinalsBuilder(int maxDoc) throws IOException {
this(null, false, maxDoc);
}
public OrdinalsBuilder(Terms terms, int maxDoc) {
this(terms, maxDoc, new DirectAllocator());
public OrdinalsBuilder(Terms terms, boolean preDefineBitsRequired, int maxDoc) throws IOException {
this(terms, preDefineBitsRequired, maxDoc, new DirectAllocator());
}
public OrdinalsBuilder(Terms terms, int maxDoc) throws IOException {
this(terms, true, maxDoc, new DirectAllocator());
}
/**
@ -93,25 +107,42 @@ public final class OrdinalsBuilder implements Closeable {
*/
public OrdinalsBuilder addDoc(int doc) {
totalNumOrds++;
int docsOrd = ords[doc];
if (docsOrd == 0) {
ords[doc] = currentOrd;
numDocsWithValue++;
} else if (docsOrd > 0) {
numMultiValuedDocs++;
int offset = writer.startNewSlice();
writer.writeInt(docsOrd);
writer.writeInt(currentOrd);
if (offsets == null) {
offsets = new int[ords.length];
if (svOrds != null) {
int docsOrd = (int) svOrds.get(doc);
if (docsOrd == 0) {
svOrds.set(doc, currentOrd);
numDocsWithValue++;
} else {
// Rebuilding ords that supports mv based on sv ords.
mvOrds = new int[maxDoc];
for (int docId = 0; docId < maxDoc; docId++) {
mvOrds[docId] = (int) svOrds.get(docId);
}
svOrds = null;
}
}
if (mvOrds != null) {
int docsOrd = mvOrds[doc];
if (docsOrd == 0) {
mvOrds[doc] = currentOrd;
numDocsWithValue++;
} else if (docsOrd > 0) {
numMultiValuedDocs++;
int offset = writer.startNewSlice();
writer.writeInt(docsOrd);
writer.writeInt(currentOrd);
if (offsets == null) {
offsets = new int[mvOrds.length];
}
offsets[doc] = writer.getCurrentOffset();
mvOrds[doc] = (-1 * offset) - 1;
} else {
assert offsets != null;
writer.reset(offsets[doc]);
writer.writeInt(currentOrd);
offsets[doc] = writer.getCurrentOffset();
}
offsets[doc] = writer.getCurrentOffset();
ords[doc] = (-1 * offset) - 1;
} else {
assert offsets != null;
writer.reset(offsets[doc]);
writer.writeInt(currentOrd);
offsets[doc] = writer.getCurrentOffset();
}
return this;
}
@ -163,12 +194,22 @@ public final class OrdinalsBuilder implements Closeable {
* if every document has an ordinal associated with it this method returns <code>null</code>
*/
public FixedBitSet buildDocsWithValuesSet() {
if (numDocsWithValue == this.ords.length)
if (numDocsWithValue == maxDoc) {
return null;
final FixedBitSet bitSet = new FixedBitSet(this.ords.length);
for (int i = 0; i < ords.length; i++) {
if (ords[i] != 0) {
bitSet.set(i);
}
final FixedBitSet bitSet = new FixedBitSet(maxDoc);
if (svOrds != null) {
for (int docId = 0; docId < maxDoc; docId++) {
int ord = (int) svOrds.get(docId);
if (ord != 0) {
bitSet.set(docId);
}
}
} else {
for (int docId = 0; docId < maxDoc; docId++) {
if (mvOrds[docId] != 0) {
bitSet.set(docId);
}
}
}
return bitSet;
@ -179,15 +220,15 @@ public final class OrdinalsBuilder implements Closeable {
*/
public Ordinals build(Settings settings) {
if (numMultiValuedDocs == 0) {
return new SingleArrayOrdinals(ords, getNumOrds());
return new SinglePackedOrdinals(svOrds.getMutable(), getNumOrds());
}
final String multiOrdinals = settings.get("multi_ordinals", "sparse");
if ("flat".equals(multiOrdinals)) {
final ArrayList<int[]> ordinalBuffer = new ArrayList<int[]>();
for (int i = 0; i < ords.length; i++) {
for (int i = 0; i < mvOrds.length; i++) {
final IntsRef docOrds = docOrds(i);
while (ordinalBuffer.size() < docOrds.length) {
ordinalBuffer.add(new int[ords.length]);
ordinalBuffer.add(new int[mvOrds.length]);
}
for (int j = docOrds.offset; j < docOrds.offset+docOrds.length; j++) {
@ -211,24 +252,35 @@ public final class OrdinalsBuilder implements Closeable {
* Returns a shared {@link IntsRef} instance for the given doc ID holding all ordinals associated with it.
*/
public IntsRef docOrds(int doc) {
int docsOrd = ords[doc];
intsRef.offset = 0;
if (docsOrd == 0) {
intsRef.length = 0;
} else if (docsOrd > 0) {
intsRef.ints[0] = ords[doc];
intsRef.length = 1;
} else {
assert offsets != null;
reader.reset(-1 * (ords[doc] + 1), offsets[doc]);
int pos = 0;
while (!reader.endOfSlice()) {
if (intsRef.ints.length <= pos) {
intsRef.ints = ArrayUtil.grow(intsRef.ints, pos + 1);
}
intsRef.ints[pos++] = reader.readInt();
if (svOrds != null) {
int docsOrd = (int) svOrds.get(doc);
intsRef.offset = 0;
if (docsOrd == 0) {
intsRef.length = 0;
} else if (docsOrd > 0) {
intsRef.ints[0] = docsOrd;
intsRef.length = 1;
}
} else {
int docsOrd = mvOrds[doc];
intsRef.offset = 0;
if (docsOrd == 0) {
intsRef.length = 0;
} else if (docsOrd > 0) {
intsRef.ints[0] = mvOrds[doc];
intsRef.length = 1;
} else {
assert offsets != null;
reader.reset(-1 * (mvOrds[doc] + 1), offsets[doc]);
int pos = 0;
while (!reader.endOfSlice()) {
if (intsRef.ints.length <= pos) {
intsRef.ints = ArrayUtil.grow(intsRef.ints, pos + 1);
}
intsRef.ints[pos++] = reader.readInt();
}
intsRef.length = pos;
}
intsRef.length = pos;
}
return intsRef;
}
@ -237,7 +289,7 @@ public final class OrdinalsBuilder implements Closeable {
* Returns the maximum document ID this builder can associate with an ordinal
*/
public int maxDoc() {
return ords.length;
return maxDoc;
}
/**

View File

@ -1,147 +0,0 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.fielddata.ordinals;
import org.apache.lucene.util.IntsRef;
import org.elasticsearch.common.RamUsage;
/**
 * {@link Ordinals} implementation for single valued fields that keeps one ordinal
 * per document in a plain <code>int[]</code> indexed by document ID.
 * An ordinal of <code>0</code> indicates that the document has no value.
 */
public class SingleArrayOrdinals implements Ordinals {

    // ordinals with value 0 indicates no value
    private final int[] ordinals;
    private final int numOrds;
    private final int maxOrd;

    // Lazily computed by getMemorySizeInBytes(); -1 means "not computed yet".
    private long size = -1;

    /**
     * @param ordinals per-document ordinals indexed by document ID; the array is stored as is (not copied)
     * @param numOrds  the number of ordinals; {@link #getMaxOrd()} reports <code>numOrds + 1</code>
     */
    public SingleArrayOrdinals(int[] ordinals, int numOrds) {
        this.ordinals = ordinals;
        this.numOrds = numOrds;
        this.maxOrd = numOrds + 1;
    }

    /**
     * @return always <code>true</code>: the ordinals live in a single array
     */
    @Override
    public boolean hasSingleArrayBackingStorage() {
        return true;
    }

    /**
     * @return the internal <code>int[]</code> itself, not a copy
     */
    @Override
    public Object getBackingStorage() {
        return ordinals;
    }

    /**
     * Returns the size of the ordinals array in bytes (element bytes plus the array header).
     * The value is computed on first use and cached afterwards.
     */
    @Override
    public long getMemorySizeInBytes() {
        if (size == -1) {
            // Widen to long BEFORE multiplying: with int arithmetic the product
            // wraps around once the array holds more than Integer.MAX_VALUE / 4 entries.
            size = (long) RamUsage.NUM_BYTES_INT * ordinals.length + RamUsage.NUM_BYTES_ARRAY_HEADER;
        }
        return size;
    }

    /**
     * @return always <code>false</code>: each document has at most one ordinal
     */
    @Override
    public boolean isMultiValued() {
        return false;
    }

    /**
     * @return the length of the ordinals array
     */
    @Override
    public int getNumDocs() {
        return ordinals.length;
    }

    @Override
    public int getNumOrds() {
        return numOrds;
    }

    /**
     * @return <code>numOrds + 1</code>, as fixed in the constructor
     */
    @Override
    public int getMaxOrd() {
        return maxOrd;
    }

    /**
     * @return a new {@link Docs} view over the same ordinals array
     */
    @Override
    public Docs ordinals() {
        return new Docs(this, ordinals);
    }

    /**
     * Per-document view over the ordinals array of a {@link SingleArrayOrdinals}.
     * {@link #getOrds(int)} and {@link #getIter(int)} reuse internal scratch objects,
     * so a returned value is overwritten by the next call on the same instance.
     */
    public static class Docs implements Ordinals.Docs {

        private final SingleArrayOrdinals parent;
        private final int[] ordinals;

        // Reused by getOrds(int); holds at most one ordinal.
        private final IntsRef intsScratch = new IntsRef(1);
        // Reused by getIter(int).
        private final SingleValueIter iter = new SingleValueIter();

        /**
         * @param parent   the ordinals instance this view belongs to; size queries are delegated to it
         * @param ordinals per-document ordinals indexed by document ID
         */
        public Docs(SingleArrayOrdinals parent, int[] ordinals) {
            this.parent = parent;
            this.ordinals = ordinals;
        }

        @Override
        public Ordinals ordinals() {
            return parent;
        }

        @Override
        public int getNumDocs() {
            return parent.getNumDocs();
        }

        @Override
        public int getNumOrds() {
            return parent.getNumOrds();
        }

        @Override
        public int getMaxOrd() {
            return parent.getMaxOrd();
        }

        @Override
        public boolean isMultiValued() {
            return false;
        }

        /**
         * @return the ordinal stored for the document, <code>0</code> meaning "no value"
         */
        @Override
        public int getOrd(int docId) {
            return ordinals[docId];
        }

        /**
         * Returns the shared scratch {@link IntsRef}: empty when the document's ordinal is
         * <code>0</code>, otherwise holding exactly that one ordinal.
         */
        @Override
        public IntsRef getOrds(int docId) {
            final int ordinal = ordinals[docId];
            if (ordinal == 0) {
                intsScratch.length = 0;
            } else {
                intsScratch.ints[0] = ordinal;
                intsScratch.offset = 0;
                intsScratch.length = 1;
            }
            return intsScratch;
        }

        /**
         * Resets the reusable iterator to the document's ordinal and returns the result of that reset.
         */
        @Override
        public Iter getIter(int docId) {
            return iter.reset(ordinals[docId]);
        }
    }
}

View File

@ -18,16 +18,7 @@
*/
package org.elasticsearch.index.fielddata.plain;
import java.io.IOException;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
@ -35,20 +26,20 @@ import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.fielddata.AbstractIndexFieldData;
import org.elasticsearch.index.fielddata.AtomicFieldData;
import org.elasticsearch.index.fielddata.FieldDataType;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.ScriptDocValues;
import org.elasticsearch.index.fielddata.*;
import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource;
import org.elasticsearch.index.fielddata.fieldcomparator.SortMode;
import org.elasticsearch.index.mapper.FieldMapper.Names;
import java.io.IOException;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public abstract class AbstractBytesIndexFieldData<FD extends AtomicFieldData.WithOrdinals<ScriptDocValues.Strings>> extends AbstractIndexFieldData<FD> implements IndexFieldData.WithOrdinals<FD> {
private Settings frequency;
private Settings regex;
protected Settings frequency;
protected Settings regex;
protected AbstractBytesIndexFieldData(Index index, Settings indexSettings, Names fieldNames, FieldDataType fieldDataType,
IndexFieldDataCache cache) {

View File

@ -19,11 +19,7 @@
package org.elasticsearch.index.fielddata.plain;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
@ -67,8 +63,9 @@ public class FSTBytesIndexFieldData extends AbstractBytesIndexFieldData<FSTBytes
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
org.apache.lucene.util.fst.Builder<Long> fstBuilder = new org.apache.lucene.util.fst.Builder<Long>(INPUT_TYPE.BYTE1, outputs);
final IntsRef scratch = new IntsRef();
OrdinalsBuilder builder = new OrdinalsBuilder(terms, reader.maxDoc());
boolean preDefineBitsRequired = regex == null && frequency == null;
OrdinalsBuilder builder = new OrdinalsBuilder(terms, preDefineBitsRequired, reader.maxDoc());
try {
// we don't store an ord 0 in the FST since we could have an empty string in there and FST don't support

View File

@ -19,11 +19,7 @@
package org.elasticsearch.index.fielddata.plain;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.*;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes;
@ -100,7 +96,8 @@ public class PagedBytesIndexFieldData extends AbstractBytesIndexFieldData<PagedB
float acceptableOverheadRatio = PackedInts.FAST;
GrowableWriter termOrdToBytesOffset = new GrowableWriter(startBytesBPV, 1 + startNumUniqueTerms, acceptableOverheadRatio);
OrdinalsBuilder builder = new OrdinalsBuilder(terms, reader.maxDoc());
boolean preDefineBitsRequired = regex == null && frequency == null;
OrdinalsBuilder builder = new OrdinalsBuilder(terms, preDefineBitsRequired, reader.maxDoc());
try {
// 0 is reserved for "unset"
bytes.copyUsingLengthPrefix(new BytesRef());

View File

@ -25,6 +25,7 @@ import org.elasticsearch.index.fielddata.ordinals.Ordinals;
import org.elasticsearch.index.fielddata.ordinals.OrdinalsBuilder;
import org.testng.annotations.Test;
import java.io.IOException;
import java.util.*;
import static org.hamcrest.MatcherAssert.assertThat;
@ -43,7 +44,7 @@ public abstract class MultiOrdinalsTests {
protected abstract Ordinals creationMultiOrdinals(OrdinalsBuilder builder, ImmutableSettings.Builder settings);
@Test
public void testRandomValues() {
public void testRandomValues() throws IOException {
Random random = new Random(100);
int numDocs = 100 + random.nextInt(1000);
int numOrdinals = 1 + random.nextInt(200);

View File

@ -0,0 +1,83 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.unit.index.fielddata.ordinals;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.index.fielddata.ordinals.Ordinals;
import org.elasticsearch.index.fielddata.ordinals.OrdinalsBuilder;
import org.elasticsearch.index.fielddata.ordinals.SinglePackedOrdinals;
import org.testng.annotations.Test;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.*;
/**
 * Tests for the single valued path of {@link OrdinalsBuilder}: as long as every document
 * is added at most once the builder must produce {@link SinglePackedOrdinals}, and it must
 * stop doing so once a document receives a second ordinal.
 */
public class SingleOrdinalsTests {

    /**
     * Adds every document exactly once and checks that the built ordinals are
     * {@link SinglePackedOrdinals} reporting the same doc -> ordinal mapping that was recorded
     * while building.
     */
    @Test
    public void testSvValues() throws IOException {
        int numDocs = 1000000;
        // A new ordinal is started every numOrdinals documents.
        int numOrdinals = numDocs / 4;
        Map<Integer, Integer> controlDocToOrdinal = new HashMap<Integer, Integer>();
        OrdinalsBuilder builder = new OrdinalsBuilder(numDocs);
        try {
            int ordinal = builder.nextOrdinal();
            for (int doc = 0; doc < numDocs; doc++) {
                // Also true for doc 0, so the ordinal obtained above is advanced before first use.
                if (doc % numOrdinals == 0) {
                    ordinal = builder.nextOrdinal();
                }
                controlDocToOrdinal.put(doc, ordinal);
                builder.addDoc(doc);
            }

            Ordinals ords = builder.build(ImmutableSettings.EMPTY);
            assertThat(ords, instanceOf(SinglePackedOrdinals.class));
            Ordinals.Docs docs = ords.ordinals();

            // Actual value first, expected value in the matcher, so failure messages read correctly.
            assertThat(docs.getNumDocs(), equalTo(controlDocToOrdinal.size()));
            for (Map.Entry<Integer, Integer> entry : controlDocToOrdinal.entrySet()) {
                assertThat(docs.getOrd(entry.getKey()), equalTo(entry.getValue()));
            }
        } finally {
            // OrdinalsBuilder is Closeable; always release it, even when an assertion fails.
            builder.close();
        }
    }

    /**
     * Checks that the builder yields {@link SinglePackedOrdinals} while each document has one
     * ordinal, and a different implementation after one document is added under a second ordinal.
     */
    @Test
    public void testMvOrdinalsTrigger() throws IOException {
        int numDocs = 1000000;
        OrdinalsBuilder builder = new OrdinalsBuilder(numDocs);
        try {
            builder.nextOrdinal();
            for (int doc = 0; doc < numDocs; doc++) {
                builder.addDoc(doc);
            }

            Ordinals ords = builder.build(ImmutableSettings.EMPTY);
            assertThat(ords, instanceOf(SinglePackedOrdinals.class));

            // Give doc 0 a second ordinal; this makes the field multi valued.
            builder.nextOrdinal();
            builder.addDoc(0);
            ords = builder.build(ImmutableSettings.EMPTY);
            assertThat(ords, not(instanceOf(SinglePackedOrdinals.class)));
        } finally {
            // OrdinalsBuilder is Closeable; always release it, even when an assertion fails.
            builder.close();
        }
    }
}