From 8d59ed3ab0bf307126313db87884d9bc47326d1b Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 14 Jun 2013 10:16:49 +0200 Subject: [PATCH] Use SinglePackedOrdinals over SingleArrayOrdinals to reduce the memory ordinals take for single valued fields in field data. Closes #3185 --- .../fielddata/ordinals/OrdinalsBuilder.java | 180 +++++++++++------- .../ordinals/SingleArrayOrdinals.java | 147 -------------- .../plain/AbstractBytesIndexFieldData.java | 27 +-- .../plain/FSTBytesIndexFieldData.java | 11 +- .../plain/PagedBytesIndexFieldData.java | 9 +- .../ordinals/MultiOrdinalsTests.java | 3 +- .../ordinals/SingleOrdinalsTests.java | 83 ++++++++ 7 files changed, 217 insertions(+), 243 deletions(-) delete mode 100644 src/main/java/org/elasticsearch/index/fielddata/ordinals/SingleArrayOrdinals.java create mode 100644 src/test/java/org/elasticsearch/test/unit/index/fielddata/ordinals/SingleOrdinalsTests.java diff --git a/src/main/java/org/elasticsearch/index/fielddata/ordinals/OrdinalsBuilder.java b/src/main/java/org/elasticsearch/index/fielddata/ordinals/OrdinalsBuilder.java index 501bcd32466..8e3547618ba 100644 --- a/src/main/java/org/elasticsearch/index/fielddata/ordinals/OrdinalsBuilder.java +++ b/src/main/java/org/elasticsearch/index/fielddata/ordinals/OrdinalsBuilder.java @@ -17,28 +17,23 @@ package org.elasticsearch.index.fielddata.ordinals; * specific language governing permissions and limitations * under the License. */ -import java.io.Closeable; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Comparator; - import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FilteredTermsEnum; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefIterator; -import org.apache.lucene.util.FixedBitSet; -import org.apache.lucene.util.IntBlockPool; +import org.apache.lucene.util.*; import org.apache.lucene.util.IntBlockPool.Allocator; import org.apache.lucene.util.IntBlockPool.DirectAllocator; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.packed.GrowableWriter; +import org.apache.lucene.util.packed.PackedInts; import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.common.settings.Settings; +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; + /** * Simple class to build document ID <-> ordinal mapping. Note: Ordinals are * 1 based monotocially increasing positive integers. 0 @@ -46,7 +41,10 @@ import org.elasticsearch.common.settings.Settings; */ public final class OrdinalsBuilder implements Closeable { - private final int[] ords; + private final int maxDoc; + private int[] mvOrds; + private GrowableWriter svOrds; + private int[] offsets; private final IntBlockPool pool; private final IntBlockPool.SliceWriter writer; @@ -57,19 +55,35 @@ public final class OrdinalsBuilder implements Closeable { private int numMultiValuedDocs = 0; private int totalNumOrds = 0; - public OrdinalsBuilder(Terms terms, int maxDoc, Allocator allocator) { - this.ords = new int[maxDoc]; + public OrdinalsBuilder(Terms terms, boolean preDefineBitsRequired, int maxDoc, Allocator allocator) throws IOException { + this.maxDoc = maxDoc; + // TODO: Make configurable... + float acceptableOverheadRatio = PackedInts.FAST; + if (preDefineBitsRequired) { + int numTerms = (int) terms.size(); + if (numTerms == -1) { + svOrds = new GrowableWriter(1, maxDoc, acceptableOverheadRatio); + } else { + svOrds = new GrowableWriter(PackedInts.bitsRequired(numTerms), maxDoc, acceptableOverheadRatio); + } + } else { + svOrds = new GrowableWriter(1, maxDoc, acceptableOverheadRatio); + } pool = new IntBlockPool(allocator); reader = new IntBlockPool.SliceReader(pool); writer = new IntBlockPool.SliceWriter(pool); } - public OrdinalsBuilder(int maxDoc) { - this(null, maxDoc); + public OrdinalsBuilder(int maxDoc) throws IOException { + this(null, false, maxDoc); } - public OrdinalsBuilder(Terms terms, int maxDoc) { - this(terms, maxDoc, new DirectAllocator()); + public OrdinalsBuilder(Terms terms, boolean preDefineBitsRequired, int maxDoc) throws IOException { + this(terms, preDefineBitsRequired, maxDoc, new DirectAllocator()); + } + + public OrdinalsBuilder(Terms terms, int maxDoc) throws IOException { + this(terms, true, maxDoc, new DirectAllocator()); } /** @@ -93,25 +107,42 @@ public final class OrdinalsBuilder implements Closeable { */ public OrdinalsBuilder addDoc(int doc) { totalNumOrds++; - int docsOrd = ords[doc]; - if (docsOrd == 0) { - ords[doc] = currentOrd; - numDocsWithValue++; - } else if (docsOrd > 0) { - numMultiValuedDocs++; - int offset = writer.startNewSlice(); - writer.writeInt(docsOrd); - writer.writeInt(currentOrd); - if (offsets == null) { - offsets = new int[ords.length]; + if (svOrds != null) { + int docsOrd = (int) svOrds.get(doc); + if (docsOrd == 0) { + svOrds.set(doc, currentOrd); + numDocsWithValue++; + } else { + // Rebuilding ords that supports mv based on sv ords. + mvOrds = new int[maxDoc]; + for (int docId = 0; docId < maxDoc; docId++) { + mvOrds[docId] = (int) svOrds.get(docId); + } + svOrds = null; + } + } + + if (mvOrds != null) { + int docsOrd = mvOrds[doc]; + if (docsOrd == 0) { + mvOrds[doc] = currentOrd; + numDocsWithValue++; + } else if (docsOrd > 0) { + numMultiValuedDocs++; + int offset = writer.startNewSlice(); + writer.writeInt(docsOrd); + writer.writeInt(currentOrd); + if (offsets == null) { + offsets = new int[mvOrds.length]; + } + offsets[doc] = writer.getCurrentOffset(); + mvOrds[doc] = (-1 * offset) - 1; + } else { + assert offsets != null; + writer.reset(offsets[doc]); + writer.writeInt(currentOrd); + offsets[doc] = writer.getCurrentOffset(); } - offsets[doc] = writer.getCurrentOffset(); - ords[doc] = (-1 * offset) - 1; - } else { - assert offsets != null; - writer.reset(offsets[doc]); - writer.writeInt(currentOrd); - offsets[doc] = writer.getCurrentOffset(); } return this; } @@ -163,12 +194,22 @@ public final class OrdinalsBuilder implements Closeable { * if every document has an ordinal associated with it this method returns null */ public FixedBitSet buildDocsWithValuesSet() { - if (numDocsWithValue == this.ords.length) + if (numDocsWithValue == maxDoc) { return null; - final FixedBitSet bitSet = new FixedBitSet(this.ords.length); - for (int i = 0; i < ords.length; i++) { - if (ords[i] != 0) { - bitSet.set(i); + } + final FixedBitSet bitSet = new FixedBitSet(maxDoc); + if (svOrds != null) { + for (int docId = 0; docId < maxDoc; docId++) { + int ord = (int) svOrds.get(docId); + if (ord != 0) { + bitSet.set(docId); + } + } + } else { + for (int docId = 0; docId < maxDoc; docId++) { + if (mvOrds[docId] != 0) { + bitSet.set(docId); + } } } return bitSet; @@ -179,15 +220,15 @@ public final class OrdinalsBuilder implements Closeable { */ public Ordinals build(Settings settings) { if (numMultiValuedDocs == 0) { - return new SingleArrayOrdinals(ords, getNumOrds()); + return new SinglePackedOrdinals(svOrds.getMutable(), getNumOrds()); } final String multiOrdinals = settings.get("multi_ordinals", "sparse"); if ("flat".equals(multiOrdinals)) { final ArrayList ordinalBuffer = new ArrayList(); - for (int i = 0; i < ords.length; i++) { + for (int i = 0; i < mvOrds.length; i++) { final IntsRef docOrds = docOrds(i); while (ordinalBuffer.size() < docOrds.length) { - ordinalBuffer.add(new int[ords.length]); + ordinalBuffer.add(new int[mvOrds.length]); } for (int j = docOrds.offset; j < docOrds.offset+docOrds.length; j++) { @@ -211,24 +252,35 @@ public final class OrdinalsBuilder implements Closeable { * Returns a shared {@link IntsRef} instance for the given doc ID holding all ordinals associated with it. */ public IntsRef docOrds(int doc) { - int docsOrd = ords[doc]; - intsRef.offset = 0; - if (docsOrd == 0) { - intsRef.length = 0; - } else if (docsOrd > 0) { - intsRef.ints[0] = ords[doc]; - intsRef.length = 1; - } else { - assert offsets != null; - reader.reset(-1 * (ords[doc] + 1), offsets[doc]); - int pos = 0; - while (!reader.endOfSlice()) { - if (intsRef.ints.length <= pos) { - intsRef.ints = ArrayUtil.grow(intsRef.ints, pos + 1); - } - intsRef.ints[pos++] = reader.readInt(); + if (svOrds != null) { + int docsOrd = (int) svOrds.get(doc); + intsRef.offset = 0; + if (docsOrd == 0) { + intsRef.length = 0; + } else if (docsOrd > 0) { + intsRef.ints[0] = docsOrd; + intsRef.length = 1; + } + } else { + int docsOrd = mvOrds[doc]; + intsRef.offset = 0; + if (docsOrd == 0) { + intsRef.length = 0; + } else if (docsOrd > 0) { + intsRef.ints[0] = mvOrds[doc]; + intsRef.length = 1; + } else { + assert offsets != null; + reader.reset(-1 * (mvOrds[doc] + 1), offsets[doc]); + int pos = 0; + while (!reader.endOfSlice()) { + if (intsRef.ints.length <= pos) { + intsRef.ints = ArrayUtil.grow(intsRef.ints, pos + 1); + } + intsRef.ints[pos++] = reader.readInt(); + } + intsRef.length = pos; } - intsRef.length = pos; } return intsRef; } @@ -237,7 +289,7 @@ public final class OrdinalsBuilder implements Closeable { * Returns the maximum document ID this builder can associate with an ordinal */ public int maxDoc() { - return ords.length; + return maxDoc; } /** diff --git a/src/main/java/org/elasticsearch/index/fielddata/ordinals/SingleArrayOrdinals.java b/src/main/java/org/elasticsearch/index/fielddata/ordinals/SingleArrayOrdinals.java deleted file mode 100644 index f5052d453d3..00000000000 --- a/src/main/java/org/elasticsearch/index/fielddata/ordinals/SingleArrayOrdinals.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Licensed to ElasticSearch and Shay Banon under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. ElasticSearch licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.elasticsearch.index.fielddata.ordinals; - -import org.apache.lucene.util.IntsRef; -import org.elasticsearch.common.RamUsage; - -/** - */ -public class SingleArrayOrdinals implements Ordinals { - - // ordinals with value 0 indicates no value - private final int[] ordinals; - private final int numOrds; - private final int maxOrd; - - private long size = -1; - - public SingleArrayOrdinals(int[] ordinals, int numOrds) { - this.ordinals = ordinals; - this.numOrds = numOrds; - this.maxOrd = numOrds + 1; - } - - @Override - public boolean hasSingleArrayBackingStorage() { - return true; - } - - @Override - public Object getBackingStorage() { - return ordinals; - } - - @Override - public long getMemorySizeInBytes() { - if (size == -1) { - size = RamUsage.NUM_BYTES_INT * ordinals.length + RamUsage.NUM_BYTES_ARRAY_HEADER; - } - return size; - } - - @Override - public boolean isMultiValued() { - return false; - } - - @Override - public int getNumDocs() { - return ordinals.length; - } - - @Override - public int getNumOrds() { - return numOrds; - } - - @Override - public int getMaxOrd() { - return maxOrd; - } - - @Override - public Docs ordinals() { - return new Docs(this, ordinals); - } - - public static class Docs implements Ordinals.Docs { - - private final SingleArrayOrdinals parent; - private final int[] ordinals; - - private final IntsRef intsScratch = new IntsRef(1); - private final SingleValueIter iter = new SingleValueIter(); - - public Docs(SingleArrayOrdinals parent, int[] ordinals) { - this.parent = parent; - this.ordinals = ordinals; - } - - @Override - public Ordinals ordinals() { - return parent; - } - - @Override - public int getNumDocs() { - return parent.getNumDocs(); - } - - @Override - public int getNumOrds() { - return parent.getNumOrds(); - } - - @Override - public int getMaxOrd() { - return parent.getMaxOrd(); - } - - @Override - public boolean isMultiValued() { - return false; - } - - @Override - public int getOrd(int docId) { - return ordinals[docId]; - } - - @Override - public IntsRef getOrds(int docId) { - final int ordinal = ordinals[docId]; - if (ordinal == 0) { - intsScratch.length = 0; - } else { - intsScratch.ints[0] = ordinal; - intsScratch.offset = 0; - intsScratch.length = 1; - } - return intsScratch; - } - - @Override - public Iter getIter(int docId) { - return iter.reset(ordinals[docId]); - } - - } -} diff --git a/src/main/java/org/elasticsearch/index/fielddata/plain/AbstractBytesIndexFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/plain/AbstractBytesIndexFieldData.java index 38f50e9a039..1a5f9f96e4f 100644 --- a/src/main/java/org/elasticsearch/index/fielddata/plain/AbstractBytesIndexFieldData.java +++ b/src/main/java/org/elasticsearch/index/fielddata/plain/AbstractBytesIndexFieldData.java @@ -18,16 +18,7 @@ */ package org.elasticsearch.index.fielddata.plain; -import java.io.IOException; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.lucene.index.AtomicReader; -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.FilteredTermsEnum; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.*; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.UnicodeUtil; @@ -35,20 +26,20 @@ import org.elasticsearch.ElasticSearchException; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.Index; -import org.elasticsearch.index.fielddata.AbstractIndexFieldData; -import org.elasticsearch.index.fielddata.AtomicFieldData; -import org.elasticsearch.index.fielddata.FieldDataType; -import org.elasticsearch.index.fielddata.IndexFieldData; -import org.elasticsearch.index.fielddata.IndexFieldDataCache; -import org.elasticsearch.index.fielddata.ScriptDocValues; +import org.elasticsearch.index.fielddata.*; import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource; import org.elasticsearch.index.fielddata.fieldcomparator.SortMode; import org.elasticsearch.index.mapper.FieldMapper.Names; +import java.io.IOException; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + public abstract class AbstractBytesIndexFieldData> extends AbstractIndexFieldData implements IndexFieldData.WithOrdinals { - private Settings frequency; - private Settings regex; + protected Settings frequency; + protected Settings regex; protected AbstractBytesIndexFieldData(Index index, Settings indexSettings, Names fieldNames, FieldDataType fieldDataType, IndexFieldDataCache cache) { diff --git a/src/main/java/org/elasticsearch/index/fielddata/plain/FSTBytesIndexFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/plain/FSTBytesIndexFieldData.java index be3c7285a53..acb9480d581 100644 --- a/src/main/java/org/elasticsearch/index/fielddata/plain/FSTBytesIndexFieldData.java +++ b/src/main/java/org/elasticsearch/index/fielddata/plain/FSTBytesIndexFieldData.java @@ -19,11 +19,7 @@ package org.elasticsearch.index.fielddata.plain; -import org.apache.lucene.index.AtomicReader; -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.*; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.FST; @@ -67,8 +63,9 @@ public class FSTBytesIndexFieldData extends AbstractBytesIndexFieldData fstBuilder = new org.apache.lucene.util.fst.Builder(INPUT_TYPE.BYTE1, outputs); final IntsRef scratch = new IntsRef(); - - OrdinalsBuilder builder = new OrdinalsBuilder(terms, reader.maxDoc()); + + boolean preDefineBitsRequired = regex == null && frequency == null; + OrdinalsBuilder builder = new OrdinalsBuilder(terms, preDefineBitsRequired, reader.maxDoc()); try { // we don't store an ord 0 in the FST since we could have an empty string in there and FST don't support diff --git a/src/main/java/org/elasticsearch/index/fielddata/plain/PagedBytesIndexFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/plain/PagedBytesIndexFieldData.java index 0d80884c0b4..80797c8c433 100644 --- a/src/main/java/org/elasticsearch/index/fielddata/plain/PagedBytesIndexFieldData.java +++ b/src/main/java/org/elasticsearch/index/fielddata/plain/PagedBytesIndexFieldData.java @@ -19,11 +19,7 @@ package org.elasticsearch.index.fielddata.plain; -import org.apache.lucene.index.AtomicReader; -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.*; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PagedBytes; @@ -100,7 +96,8 @@ public class PagedBytesIndexFieldData extends AbstractBytesIndexFieldData controlDocToOrdinal = new HashMap(); + OrdinalsBuilder builder = new OrdinalsBuilder(numDocs); + int ordinal = builder.nextOrdinal(); + for (int doc = 0; doc < numDocs; doc++) { + if (doc % numOrdinals == 0) { + ordinal = builder.nextOrdinal(); + } + controlDocToOrdinal.put(doc, ordinal); + builder.addDoc(doc); + } + + Ordinals ords = builder.build(ImmutableSettings.EMPTY); + assertThat(ords, instanceOf(SinglePackedOrdinals.class)); + Ordinals.Docs docs = ords.ordinals(); + + assertThat(controlDocToOrdinal.size(), equalTo(docs.getNumDocs())); + for (Map.Entry entry : controlDocToOrdinal.entrySet()) { + assertThat(entry.getValue(), equalTo(docs.getOrd(entry.getKey()))); + } + + } + + @Test + public void testMvOrdinalsTrigger() throws IOException { + int numDocs = 1000000; + OrdinalsBuilder builder = new OrdinalsBuilder(numDocs); + builder.nextOrdinal(); + for (int doc = 0; doc < numDocs; doc++) { + builder.addDoc(doc); + } + + Ordinals ords = builder.build(ImmutableSettings.EMPTY); + assertThat(ords, instanceOf(SinglePackedOrdinals.class)); + + builder.nextOrdinal(); + builder.addDoc(0); + ords = builder.build(ImmutableSettings.EMPTY); + assertThat(ords, not(instanceOf(SinglePackedOrdinals.class))); + } + +}